# Institutional Network from Microsoft Academic Graph

## Geocode Affiliations

Attempt to automatically geocode institutions. Not all institutions will be able to be found and geocoded by our chosen (free) geocoding software. And some results will simply be wrong. To view the output of the code in the cell below, take a look at `geodata/geocoded.csv`.

In [None]:
import csv
import googlemaps

geocoded_data = {}

with open("geodata/geocoded.csv") as f:
    reader = csv.DictReader(f)
    for row in reader:
        place = row["place"]
        geocoded_data[place] = row
        del geocoded_data[place]["place"]
        
gmaps = googlemaps.Client(key='[insert key]')
path_to_data = "sourcedata/mag_sdg.csv"

with open(path_to_data) as datafile:
    
    reader = csv.DictReader(datafile)
    for row in reader:
        place = row["affiliation_name"]
        if place not in geocoded_data:
            
            try:
                # Take first result with [0]
                geocode_result = gmaps.geocode(place)[0]
                country = None
                for section in geocode_result["address_components"]:
                    if "country" in section["types"]:
                        country = section["long_name"]
                
                locdata = {
                    "address": geocode_result["formatted_address"],
                    "latitude": geocode_result["geometry"]["location"]["lat"],
                    "longitude": geocode_result["geometry"]["location"]["lng"],
                    "country": country 
                }
                geocoded_data[place] = locdata
                print(place, locdata["country"])
                
            except Exception as e:
                geocoded_data[place] = "Null"
                print("====", e)
                print("====", f"Unable to locate {place}")

In [43]:
len(geocoded_data)

4705

In [44]:
len([g for g in geocoded_data if geocoded_data[g] == "Null"])

0

In [47]:
# Output data to file.
with open("geodata/geocoded.csv", "w") as f:
    
    writer = csv.DictWriter(f, fieldnames=["place", "address", "latitude", "longitude", "country"])
    writer.writeheader()
    for g, data in geocoded_data.items():
        # print(type(data))
        if data == "Null":
            data = {"address": "", "latitude": "", "longitude": "", "country": ""}
        data["place"] = g
        writer.writerow(data)

Loading a data file that lists countries and continents ([Source](https://datahub.io/JohnSnowLabs/country-and-continent-codes-list))

In [7]:
import json
with open("sourcedata/country-and-continent-codes-list.json") as f:
    countries = json.load(f)

Creating a lookup devices for finding countries by name, then countries and other geo-information by affiliation.

In [8]:
countries_by_name = {}
for country in countries:
    country_name = country["Country_Name"].split(",")[0].strip()
    countries_by_name[country_name] = country
    
with open("sourcedata/countries_by_name.json", "w") as f:
    json.dump(countries_by_name, f)

At this point, `geocoded.csv` has been updated manually with country names not found by the Google geocoder (in this case about 250 values).

In [9]:
import csv

geo_by_affiliation = {}
with open("geodata/geocoded.csv") as f:
    reader = csv.DictReader(f)
    for row in reader:
        geo_by_affiliation[row["place"]] = row
        
import json
with open("geodata/geo_by_affiliation.json", "w") as f:
    json.dump(geo_by_affiliation, f)

### Extracting data from CSV

Fill out several dictionaries with data from the csv file.
* **data_by_id**: article information keyed by article id.
* **data_by_inst**: data about institutions keyed by the name of the institution.

In [1]:
import os
import csv
import json

data_by_id = {}
data_by_inst = {}
year_by_id = {}

missing_country_values = {
    "Kashmir": "Asia",
    "Africa": "Africa"
}

with open("geodata/geo_by_affiliation.json") as f:
    geo_by_affiliation = json.load(f)
    
with open("sourcedata/countries_by_name.json") as f:
    countries_by_name = json.load(f)

with open("sourcedata/mag_sdg.csv") as datafile:
    
    reader = csv.DictReader(datafile)
    
    for row in reader:
        
        id_ = row["paper_id"]
        author_id = row["author_id"]
        author = row["author_name"]
        inst = row["affiliation_name"]
        year = row["year"]
        year_by_id[id_] = year
        
        if id_ not in data_by_id:
            data_by_id[id_] = [inst]
        else:
            if inst not in data_by_id[id_]:
                data_by_id[id_].append(inst)
        
        if inst not in data_by_inst:
            data_by_inst[inst] = {
                "affiliation_id": [row["affiliation_id"]],
                "affiliation_name": [row["affiliation_name"]],
                "author_name": [row["author_name"]],
                "doi": [row["doi"]] if row["doi"] != "" else [],
                "paper_id": [row["paper_id"]],
                "year": [row["year"]],                
            }
            if inst in geo_by_affiliation:
                country = geo_by_affiliation[inst]["country"]
            else:
                country = ""
                print("Missing institution", inst)
                
            if country != "":
                if country in countries_by_name:
                    continent = countries_by_name[country]["Continent_Name"]
                else:
                    print("Missing country", country)
                    continent = missing_country_values.get(country, "Unknown")
            else:
                print("Missing country", country)
                country = "Unknown"
                continent = "Unknown"
            
            data_by_inst[inst]["country"] = country
            data_by_inst[inst]["continent"] = continent
        
        else:
            doi = row["doi"]
            if doi != "":
                data_by_inst[inst]["doi"].append(doi)
            for key in ["affiliation_id", "affiliation_name", "author_name", "paper_id", "year"]:
                data_by_inst[inst][key].append(row[key])
            
            

Missing country Kashmir
Missing country Kashmir
Missing country Africa
Missing country Kashmir
Missing country Kashmir
Missing country Kashmir
Missing country Kashmir


#### Check values for accuracy / highlight obvious issues

In [10]:
data_by_inst['university of tasmania'].keys()

dict_keys(['affiliation_id', 'affiliation_name', 'author_name', 'doi', 'paper_id', 'year', 'country', 'continent'])

In [11]:
list(data_by_inst.keys())[0]

'university of tasmania'

In [None]:
data_by_inst['university of tasmania']

### Building graph from institution and article data

First, create the complete graph of all institutional collaborations.

In [2]:
from itertools import combinations
import networkx as nx

G = nx.Graph()

for id_, data in data_by_id.items():
    
    for i, j in combinations(data, 2):

        if G.has_edge(i, j):
            G[i][j]["weight"] += 1
            G[i][j]["years"].append(year_by_id[id_])
            if id_ not in G[i][j]["articles"]:
                G[i][j]["articles"].append(id_)
            
        else:
            # Attempt to determine 'scale' of geographic collaboration.
            if data_by_inst[i]["country"] == "Unknown" or data_by_inst[j]["country"] == "Unknown":
                connection_type = "Unknown"
            elif data_by_inst[i]["continent"] != data_by_inst[j]["continent"]:
                connection_type = "trans-continental"
            elif data_by_inst[i]["country"] != data_by_inst[j]["country"]:
                connection_type = "international"
            elif data_by_inst[i]["country"] == data_by_inst[j]["country"]:
                connection_type = "domestic"
                        
            G.add_edge(i, j, weight=1, articles=[id_], connection=connection_type, years=[year_by_id[id_]])

for inst, data in G.nodes(data=True):
    data["affiliation"] = data_by_inst[inst]["affiliation_name"][0]
    data["affiliation_label"] = data_by_inst[inst]["affiliation_name"][0].title()
    data["papers"] = "<br>\n".join(data_by_inst[inst]["paper_id"])
    data["dois"] = "<br>\n".join(data_by_inst[inst]["doi"])
    data["name"] = "<br>\n".join(list(set(data_by_inst[inst]["author_name"])))    
    data["count"] = len(data_by_inst[inst]["paper_id"])
    data["country"] = data_by_inst[inst]["country"]
    data["continent"] = data_by_inst[inst]["continent"]
    data["domestic"] = len([x for x in G[inst] if G[inst][x]["connection"] == "domestic"])
    data["international"] = len([x for x in G[inst] if G[inst][x]["connection"] 
                 in ["trans-continental", "international"]])
    data["domestic_pct"] = int(round(data["domestic"] / (data["domestic"] + data["international"]) * 100))
    data["international_pct"] = int(round(data["international"] / (data["domestic"] + data["international"]) * 100))
    data["degree"] = data["domestic"] + data["international"]
    data["yearspan"] = sorted(data_by_inst[inst]["year"])
    max_year = max(data_by_inst[inst]["year"])
    min_year = min(data_by_inst[inst]["year"])
    if max_year == min_year:
        data["year"] = max_year
    else:
        data["year"] = "{0}–{1}".format(min_year, max_year)

In [3]:
len(G.edges)

20116

In [4]:
len(G.nodes)

3419

Create subgraph containing only those edges with a `weight` greater than 1.

In [144]:
def filter_edge(n1, n2):
    """Check if weight is larger than 1."""
    return G[n1][n2]["weight"] > 1

def filter_node(n):
    """Filter out unconnected nodes."""
    return not nx.is_isolate(view, n)

view = nx.subgraph_view(G, filter_edge=filter_edge)
subview = nx.subgraph_view(view, filter_node=filter_node)
print(len(subview.edges()), len(subview.nodes()))

for inst, data in subview.nodes(data=True):
    
    data["domestic"] = len([x for x in subview[inst] if subview[inst][x]["connection"] == "domestic"])
    data["international"] = len([x for x in subview[inst] if subview[inst][x]["connection"] 
                 in ["trans-continental", "international"]])
    data["domestic_pct"] = int(round(data["domestic"] / (data["domestic"] + data["international"]) * 100))
    data["international_pct"] = int(round(data["international"] / (data["domestic"] + data["international"]) * 100))
    data["degree"] = data["domestic"] + data["international"]

145 105


Split graph into two graphs, one spanning the years 1999–2008 and another for 2009–2018.

In [5]:
G1 = nx.Graph()
G2 = nx.Graph()

def get_index_point(values, greater_than=2008):
    """Find index in list after which values are greater than
    `greater_than` parameter."""
    index = len(values)
    for i,v in enumerate(values):
        if int(v) > greater_than:
            index = i
            break
    return index
        
for e in G.edges(data=True):

    years = e[2]["years"]
    split_index = get_index_point(years)
    if years[0:split_index]:
        g1_years = years[0:split_index]
        g1_data = {
            "weight": len(g1_years),
            "years": g1_years,
            "articles": e[2]["articles"][0:split_index],
            "connection": e[2]["connection"]
        }
        G1.add_edge(e[0], e[1], **g1_data)
    if years[split_index:]:
        g2_years = years[split_index:]
        g2_data = {
            "weight": len(g2_years),
            "years": g2_years,
            "articles": e[2]["articles"][split_index:],
            "connection": e[2]["connection"]
        }
        G2.add_edge(e[0], e[1], **g2_data)

print(len(G1.nodes), len(G2.nodes))
print(len(G1.edges), len(G2.edges))
for graph in [G1, G2]:
    for inst, data in graph.nodes(data=True):
        data["affiliation"] = data_by_inst[inst]["affiliation_name"][0]
        data["affiliation_label"] = data_by_inst[inst]["affiliation_name"][0].title()
        data["papers"] = "<br>\n".join(data_by_inst[inst]["paper_id"])
        data["dois"] = "<br>\n".join(data_by_inst[inst]["doi"])
        data["name"] = "<br>\n".join(list(set(data_by_inst[inst]["author_name"])))    
        data["count"] = len(data_by_inst[inst]["paper_id"])
        data["country"] = data_by_inst[inst]["country"]
        data["continent"] = data_by_inst[inst]["continent"]
        data["domestic"] = len([x for x in graph[inst] if graph[inst][x]["connection"] == "domestic"])
        data["international"] = len([x for x in graph[inst] if graph[inst][x]["connection"] 
                     in ["trans-continental", "international"]])
        data["domestic_pct"] = int(round(data["domestic"] / (data["domestic"] + data["international"]) * 100))
        data["international_pct"] = int(round(data["international"] / (data["domestic"] + data["international"]) * 100))
        data["degree"] = data["domestic"] + data["international"]
        data["yearspan"] = sorted(data_by_inst[inst]["year"])
        max_year = max(data_by_inst[inst]["year"])
        min_year = min(data_by_inst[inst]["year"])
        if max_year == min_year:
            data["year"] = max_year
        else:
            data["year"] = "{0}–{1}".format(min_year, max_year)

767 3257
1529 18970


Filter each graph by weight > 1

In [10]:
# G1
G1subview = None

def filter_edge(n1, n2):
    """Check if weight is larger than 1."""
    return G1[n1][n2]["weight"] > 1

def filter_node(n):
    """Filter out unconnected nodes."""
    return not nx.is_isolate(G1view, n)

G1view = nx.subgraph_view(G1, filter_edge=filter_edge)
G1subview = nx.subgraph_view(G1view, filter_node=filter_node)

for inst, data in G1subview.nodes(data=True):

    data["domestic"] = len([x for x in G1subview[inst] if G1subview[inst][x]["connection"] == "domestic"])
    data["international"] = len([x for x in G1subview[inst] if G1subview[inst][x]["connection"] 
                 in ["trans-continental", "international"]])
    data["domestic_pct"] = int(round(data["domestic"] / (data["domestic"] + data["international"]) * 100))
    data["international_pct"] = int(round(data["international"] / (data["domestic"] + data["international"]) * 100))
    data["degree"] = data["domestic"] + data["international"]

print(len(G1subview.edges()), len(G1subview.nodes()))


145 105


In [11]:
# G2

def filter_edge(n1, n2):
    """Check if weight is larger than 1."""
    return G2[n1][n2]["weight"] > 1

def filter_node(n):
    """Filter out unconnected nodes."""
    return not nx.is_isolate(G2view, n)

G2view = nx.subgraph_view(G2, filter_edge=filter_edge)
G2subview = nx.subgraph_view(G2view, filter_node=filter_node)

for inst, data in G2subview.nodes(data=True):
    
    data["domestic"] = len([x for x in G2subview[inst] if G2subview[inst][x]["connection"] == "domestic"])
    data["international"] = len([x for x in G2subview[inst] if G2subview[inst][x]["connection"] 
                 in ["trans-continental", "international"]])
    data["domestic_pct"] = int(round(data["domestic"] / (data["domestic"] + data["international"]) * 100))
    data["international_pct"] = int(round(data["international"] / (data["domestic"] + data["international"]) * 100))
    data["degree"] = data["domestic"] + data["international"]

print(len(G2subview.edges()), len(G2subview.nodes()))


2089 751


In [13]:
C_GRAPH = [G1subview, G2subview]

for g in C_GRAPH:
    print("--------------")
    print(nx.info(g))


--------------
Name: 
Type: Graph
Number of nodes: 105
Number of edges: 145
Average degree:   2.7619
--------------
Name: 
Type: Graph
Number of nodes: 751
Number of edges: 2089
Average degree:   5.5632


In [14]:
for g in C_GRAPH:
    print(nx.density(g))

0.026556776556776556
0.007417665335108744


In [15]:
from networkx import transitivity
for g in C_GRAPH:
    print(transitivity(g))

0.36705882352941177
0.19259289606292762


In [16]:
components = []
components_lens = []
for g in C_GRAPH:
    c = list(nx.connected_components(g))
    components.append(c)
    components_lens.append(len(c))
for x in components_lens:
    print(x)

20
49


In [17]:
largest = []
subgraphs = {}

for i,c in enumerate(components):
    print(i)
    largest_component = max(c, key=len)
    subgraphs[i] = C_GRAPH[i].subgraph(largest_component)
    # diameter = nx.diameter(subgraph)
    # print("Network diameter of largest component:", diameter)

for i,g in enumerate(components):
    print("Graph", i)
    for c in g:
        print(len(c))

0
1


In [19]:
from networkx.algorithms import community
mod_communities = []
for s in subgraphs:
    print("----------------------")
    cgraph = subgraphs[s]
    communities = community.greedy_modularity_communities(cgraph)
    mod_communities.append(communities)
    for i,c in enumerate(communities):
        if len(c) > 2: # Filter out modularity classes with 2 or fewer nodes
            print('Class '+str(i)+':', len(list(c)))

----------------------
Class 0: 42
Class 1: 4
Class 2: 4
Class 3: 3
Class 4: 3
----------------------
Class 0: 496
Class 1: 48
Class 2: 11
Class 3: 10
Class 4: 6
Class 5: 5
Class 6: 5
Class 7: 5
Class 8: 4
Class 9: 4
Class 10: 4
Class 11: 3
Class 12: 3
Class 13: 3
Class 14: 3
Class 15: 3


In [20]:
for j, mc in enumerate(mod_communities):
    
    modularity_dict = {} # Create a blank dictionary
    for i,c in enumerate(mc): # Loop through the list of communities, keeping track of the number for the community
        for name in c: # Loop through each person in a community
            modularity_dict[name] = i # Create an entry in the dictionary for the person, where the value is which group they belong to.

    # Now you can add modularity information like we did the other metrics
    nx.set_node_attributes(subgraphs[j], modularity_dict, 'modularity')

Create egograph.

In [27]:
node_of_interest = "world health organization"
def filter_edge_ego(n1, n2):
    """Check if weight is larger than 1."""
    return node_of_interest in [n1, n2] and G[n1][n2]["weight"] > 1

def filter_node_ego(n):
    """Filter out unconnected nodes."""
    return not nx.is_isolate(ego_view_edge, n)

ego_view_edge = nx.subgraph_view(G, filter_edge=filter_edge_ego)
ego_view = nx.subgraph_view(ego_view_edge, filter_node=filter_node_ego)
print(len(ego_view.edges()), len(ego_view.nodes()))

for inst, data in ego_view.nodes(data=True):
    
    data["domestic"] = len([x for x in ego_view[inst] if ego_view[inst][x]["connection"] == "domestic"])
    data["international"] = len([x for x in ego_view[inst] if ego_view[inst][x]["connection"] 
                 in ["trans-continental", "international"]])
    data["domestic_pct"] = int(round(data["domestic"] / (data["domestic"] + data["international"]) * 100))
    data["international_pct"] = int(round(data["international"] / (data["domestic"] + data["international"]) * 100))
    data["degree"] = data["domestic"] + data["international"]

119 120


Create subgraph containing only the most closely connected nodes.

In [41]:
from operator import itemgetter
measure_sorted = sorted(dc.items(), key=itemgetter(1), reverse=True)
nodes_to_keep = [n[0] for n in measure_sorted[:20]]

def filter_node_for_value(n):
    """Filter out unconnected nodes."""
    return n in nodes_to_keep

closeness_view = nx.subgraph_view(subview, filter_node=filter_node_for_value)

print(len(closeness_view.edges()), len(closeness_view.nodes()))

for inst, data in closeness_view.nodes(data=True):

    data["domestic"] = len([x for x in closeness_view[inst] if closeness_view[inst][x]["connection"] == "domestic"])
    data["international"] = len([x for x in closeness_view[inst] if closeness_view[inst][x]["connection"] 
                 in ["trans-continental", "international"]])
    data["domestic_pct"] = int(round(data["domestic"] / (data["domestic"] + data["international"]) * 100))
    data["international_pct"] = int(round(data["international"] / (data["domestic"] + data["international"]) * 100))
    data["degree"] = data["domestic"] + data["international"]

113 20


In [None]:
for x,y in measure_sorted:
    print(x,y)

In [19]:
len(subview.edges()), len(subview.nodes())

(2389, 828)

### Output graph data as separate files containing `nodes` and `edges` for display in `flourish.studio`.

In [22]:
# Output subview

import csv

graph_to_print = subgraphs[1]
name = "weight_1_component_1_time_2"

nx.write_edgelist(graph_to_print, "mag_output_inst/mag_inst_edges_{0}.tsv".format(name), delimiter="\t", data=["weight", "years", "connection"])
with open("mag_output_inst/mag_inst_nodes_{0}.csv".format(name), "w") as f:
    fieldnames = ["id", "country", "continent", "affiliation", "papers", "name", "year", "count",
                 "affiliation_label", "dois", 'degree', 'domestic', 'domestic_pct', 'international_pct',
                  'international', 'yearspan', 'modularity']
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    for n, data in list(graph_to_print.nodes(data=True)):
        row = data
        row["id"] = n
        writer.writerow(row)

### Centrality measures

In [None]:
cc = nx.closeness_centrality(subview)
print(cc)
bc = nx.betweenness_centrality(subview)
print(bc)

In [40]:
dc = nx.degree_centrality(subview)
print(dc)
ec = nx.eigenvector_centrality(subview)
print(ec)

{'chalmers university of technology': 0.0048367593712212815, 'sichuan university': 0.008464328899637243, 'university of california berkeley': 0.041112454655380895, 'stockholm university': 0.03264812575574365, 'university of london': 0.1414752116082225, 'makerere university': 0.022974607013301087, 'rti international': 0.013301088270858524, 'university of wisconsin madison': 0.009673518742442563, 'stanford university': 0.06166868198307134, 'university of california': 0.006045949214026602, 'portland state university': 0.0024183796856106408, 'oregon state university': 0.0048367593712212815, 'ghent university': 0.014510278113663844, 'umm al qura university': 0.0012091898428053204, 'birla institute of technology and science': 0.0012091898428053204, 'teri university': 0.0012091898428053204, 'utrecht university': 0.025392986698911726, 'kyung hee university': 0.0012091898428053204, 'purdue university': 0.0048367593712212815, 'indian agricultural research institute': 0.0012091898428053204, 'univ

In [None]:
from operator import itemgetter
from pprint import pprint

measure = ec

measure_sorted = sorted(measure.items(), key=itemgetter(1), reverse=True)
pprint(measure_sorted)

In [35]:
import csv

centralities = []

insts = list(ec.keys())

with open("mag_output_inst/centrality_by_institution.csv", "w") as f:
    
    writer = csv.DictWriter(f, fieldnames=["institution", "betweenness", "closeness", "degree", "eigenvector"])
    writer.writeheader()
    for i in insts:

        data = {
            "institution": i,
            "betweenness": bc[i],
            "closeness": cc[i],
            "degree": dc[i],
            "eigenvector": ec[i]
        }
        writer.writerow(data)