# Institutional Network from Microsoft Academic Graph

## Geocode Affiliations

Attempt to automatically geocode institutions. Not every institution can be found and geocoded by our chosen (free) geocoding software, and some results will simply be wrong. To view the output of the code in the cell below, take a look at `geodata/geocoded.csv`.

In [None]:
import csv
import googlemaps

# Cache of geocoding results keyed by institution name. Seeding it from the
# previous output lets this cell be re-run without querying the API again for
# places that were already looked up.
geocoded_data = {}

try:
    # The csv module expects files to be opened with newline="".
    with open("geodata/geocoded.csv", newline="") as f:
        for row in csv.DictReader(f):
            # The place name becomes the key, so it is dropped from the row.
            place = row.pop("place")
            geocoded_data[place] = row
except FileNotFoundError:
    # First run: there is no earlier output to reuse yet.
    print("====", "No existing geodata/geocoded.csv found; starting with an empty cache")

gmaps = googlemaps.Client(key='[insert key]')
path_to_data = "sourcedata/mag_sdg.csv"

with open(path_to_data, newline="") as datafile:

    reader = csv.DictReader(datafile)
    for row in reader:
        place = row["affiliation_name"]
        if place in geocoded_data:
            # Already cached (from the file or from an earlier row).
            continue

        # Deliberately broad best-effort: any failure (no results, API or
        # transport error) marks the place as "Null" so the loop can continue
        # and the place is not retried on later rows.
        try:
            # Take first result with [0]
            geocode_result = gmaps.geocode(place)[0]

            # The country is the address component tagged "country";
            # it stays None when the result carries no such component.
            country = None
            for section in geocode_result["address_components"]:
                if "country" in section["types"]:
                    country = section["long_name"]

            locdata = {
                "address": geocode_result["formatted_address"],
                "latitude": geocode_result["geometry"]["location"]["lat"],
                "longitude": geocode_result["geometry"]["location"]["lng"],
                "country": country,
            }
            geocoded_data[place] = locdata
            print(place, locdata["country"])

        except Exception as e:
            geocoded_data[place] = "Null"
            print("====", e)
            print("====", f"Unable to locate {place}")

In [43]:
# Number of distinct places held in the geocoding cache.
len(geocoded_data)

4705

In [44]:
# Count the places whose lookup failed and were stored as the "Null" marker.
sum(1 for result in geocoded_data.values() if result == "Null")

0

In [47]:
# Output data to file.
# newline="" is required by the csv module when writing; without it extra
# carriage returns are emitted on platforms that use \r\n line endings.
with open("geodata/geocoded.csv", "w", newline="") as f:

    writer = csv.DictWriter(f, fieldnames=["place", "address", "latitude", "longitude", "country"])
    writer.writeheader()
    for g, data in geocoded_data.items():
        if data == "Null":
            # Failed lookups are written as a row with empty geo columns.
            data = {"address": "", "latitude": "", "longitude": "", "country": ""}
        # Build a fresh row instead of adding "place" to the cached dict, so
        # geocoded_data keeps the same shape it had before this cell ran.
        writer.writerow({"place": g, **data})

Loading a data file that lists countries and continents ([Source](https://datahub.io/JohnSnowLabs/country-and-continent-codes-list))

In [7]:
import json
# Parse the country/continent list; later cells iterate over `countries` and
# read "Country_Name" and "Continent_Name" from each entry.
with open("sourcedata/country-and-continent-codes-list.json") as f:
    countries = json.load(f)

Creating lookup tables for finding countries by name, then countries and other geo-information by affiliation.

In [8]:
# Index the country entries by short name: the text before the first comma of
# "Country_Name", stripped of surrounding whitespace. When two entries share
# a short name, the later entry in `countries` replaces the earlier one.
countries_by_name = {
    entry["Country_Name"].split(",")[0].strip(): entry
    for entry in countries
}

# Persist the lookup so later cells can load it from disk.
with open("sourcedata/countries_by_name.json", "w") as f:
    json.dump(countries_by_name, f)

At this point, `geocoded.csv` has been updated manually with country names not found by the Google geocoder (in this case about 250 values).

In [9]:
import csv
import json

# Map each place name to its full CSV row (the row keeps its "place" column).
with open("geodata/geocoded.csv") as f:
    geo_by_affiliation = {row["place"]: row for row in csv.DictReader(f)}

# Save the mapping as JSON for the extraction cell further down.
with open("geodata/geo_by_affiliation.json", "w") as f:
    json.dump(geo_by_affiliation, f)

### Extracting data from CSV

Fill out several dictionaries with data from the csv file.
* **data_by_id**: the list of institutions affiliated with each article, keyed by article id.
* **data_by_inst**: data about institutions keyed by the name of the institution.

In [None]:
import os
import csv
import json

# paper id -> list of distinct institutions on that paper, in first-seen order.
data_by_id = {}
# institution name -> per-row metadata lists plus resolved country/continent.
data_by_inst = {}

# Continents for "country" values that have no entry in countries_by_name.
missing_country_values = {
    "Kashmir": "Asia",
    "Africa": "Africa"
}

with open("geodata/geo_by_affiliation.json") as f:
    geo_by_affiliation = json.load(f)

with open("sourcedata/countries_by_name.json") as f:
    countries_by_name = json.load(f)

with open("sourcedata/mag_sdg.csv") as datafile:

    for row in csv.DictReader(datafile):

        id_ = row["paper_id"]
        author_id = row["author_id"]
        author = row["author_name"]
        inst = row["affiliation_name"]

        # Record the institution against the paper, at most once per paper.
        institutions = data_by_id.setdefault(id_, [])
        if inst not in institutions:
            institutions.append(inst)

        if inst in data_by_inst:
            # Institution seen before: extend its per-row lists and move on.
            record = data_by_inst[inst]
            doi = row["doi"]
            if doi != "":
                record["doi"].append(doi)
            for key in ["affiliation_id", "affiliation_name", "author_name", "paper_id", "year"]:
                record[key].append(row[key])
            continue

        # First row for this institution: start its lists from this row.
        record = {
            "affiliation_id": [row["affiliation_id"]],
            "affiliation_name": [row["affiliation_name"]],
            "author_name": [row["author_name"]],
            "doi": [] if row["doi"] == "" else [row["doi"]],
            "paper_id": [row["paper_id"]],
            "year": [row["year"]],
        }
        data_by_inst[inst] = record

        # Country comes from the geocoded lookup; "" means it is not known.
        if inst in geo_by_affiliation:
            country = geo_by_affiliation[inst]["country"]
        else:
            country = ""
            print("Missing institution", inst)

        # Continent comes from the country table, with a small manual
        # fallback for names that the table does not contain.
        if country == "":
            print("Missing country", country)
            country = "Unknown"
            continent = "Unknown"
        elif country in countries_by_name:
            continent = countries_by_name[country]["Continent_Name"]
        else:
            print("Missing country", country)
            continent = missing_country_values.get(country, "Unknown")

        record["country"] = country
        record["continent"] = continent
            
            

#### Check values for accuracy / highlight obvious issues

In [11]:
# Spot check: list the fields stored for one institution.
data_by_inst['university of tasmania'].keys()

dict_keys(['affiliation_id', 'affiliation_name', 'author_name', 'doi', 'paper_id', 'year', 'country', 'continent'])

In [12]:
# Spot check: the first institution key that was inserted.
list(data_by_inst.keys())[0]

'university of tasmania'

In [None]:
# Spot check: show the full record collected for one institution.
data_by_inst['university of tasmania']

### Building graph from institution and article data

First, create the complete graph of all institutional collaborations.

In [70]:
from itertools import combinations
import networkx as nx

G = nx.Graph()

# One edge per pair of institutions that appear together on an article.
# `weight` counts the shared articles and `articles` lists their ids.
for id_, data in data_by_id.items():

    for i, j in combinations(data, 2):

        if G.has_edge(i, j):
            G[i][j]["weight"] += 1
            if id_ not in G[i][j]["articles"]:
                G[i][j]["articles"].append(id_)

        else:
            # Attempt to determine 'scale' of geographic collaboration.
            if data_by_inst[i]["country"] == "Unknown" or data_by_inst[j]["country"] == "Unknown":
                connection_type = "Unknown"
            elif data_by_inst[i]["continent"] != data_by_inst[j]["continent"]:
                connection_type = "trans-continental"
            elif data_by_inst[i]["country"] != data_by_inst[j]["country"]:
                connection_type = "international"
            else:
                # Same continent and same country.
                connection_type = "domestic"

            G.add_edge(i, j, weight=1, articles=[id_], connection=connection_type)

# Attach display attributes and per-node connection statistics.
for inst, data in G.nodes(data=True):
    record = data_by_inst[inst]
    data["affiliation"] = record["affiliation_name"][0]
    data["affiliation_label"] = record["affiliation_name"][0].title()
    data["papers"] = "<br>\n".join(record["paper_id"])
    data["dois"] = "<br>\n".join(record["doi"])
    data["name"] = "<br>\n".join(list(set(record["author_name"])))
    data["count"] = len(record["paper_id"])
    data["country"] = record["country"]
    data["continent"] = record["continent"]

    # Count neighbours by connection type; trans-continental links are
    # counted as international.
    neighbours = G[inst]
    data["domestic"] = sum(
        1 for attrs in neighbours.values() if attrs["connection"] == "domestic"
    )
    data["international"] = sum(
        1 for attrs in neighbours.values()
        if attrs["connection"] in ["trans-continental", "international"]
    )

    # "Unknown" connections fall in neither bucket, so a node whose links are
    # all "Unknown" has a classified total of zero. Report 0% for both shares
    # in that case instead of dividing by zero.
    classified = data["domestic"] + data["international"]
    if classified:
        data["domestic_pct"] = int(round(data["domestic"] / classified * 100))
        data["international_pct"] = int(round(data["international"] / classified * 100))
    else:
        data["domestic_pct"] = 0
        data["international_pct"] = 0
    # Degree here counts only neighbours with a known connection type.
    data["degree"] = classified

    # Years are compared exactly as stored in data_by_inst (the values read
    # from the CSV), not converted to numbers.
    max_year = max(record["year"])
    min_year = min(record["year"])
    if max_year == min_year:
        data["year"] = max_year
    else:
        data["year"] = "{0}–{1}".format(min_year, max_year)

In [71]:
# Number of institution pairs that share at least one article.
len(G.edges)

20411

In [72]:
# Number of institutions present in the full graph.
len(G.nodes)

3454

Create subgraph containing only those edges with a `weight` greater than 1.

In [77]:
def filter_edge(n1, n2):
    """Keep an edge only when its ``weight`` attribute in ``G`` exceeds 1."""
    edge_weight = G.edges[n1, n2]["weight"]
    return edge_weight > 1

def filter_node(n):
    """Keep a node only when it is not an isolate in the edge-filtered ``view``."""
    is_isolated = nx.is_isolate(view, n)
    return not is_isolated

# First drop the edges rejected by filter_edge, then drop the nodes that are
# left without any edge.
view = nx.subgraph_view(G, filter_edge=filter_edge)
subview = nx.subgraph_view(view, filter_node=filter_node)

# Recompute the connection statistics against the filtered graph.
# NOTE(review): the node attribute dicts appear to be shared with G, so this
# overwrites the values computed for the full graph. Confirm that is intended.
for inst, data in subview.nodes(data=True):

    neighbours = subview[inst]
    data["domestic"] = sum(
        1 for attrs in neighbours.values() if attrs["connection"] == "domestic"
    )
    data["international"] = sum(
        1 for attrs in neighbours.values()
        if attrs["connection"] in ["trans-continental", "international"]
    )

    # A node whose remaining links are all "Unknown" has a classified total
    # of zero. Report 0% for both shares instead of dividing by zero.
    classified = data["domestic"] + data["international"]
    if classified:
        data["domestic_pct"] = int(round(data["domestic"] / classified * 100))
        data["international_pct"] = int(round(data["international"] / classified * 100))
    else:
        data["domestic_pct"] = 0
        data["international_pct"] = 0
    data["degree"] = classified

In [74]:
# Edge and node counts of the filtered graph.
len(subview.edges()), len(subview.nodes())

(2389, 828)

In [75]:
# Sanity check: number of nodes in the filtered graph that have no edges.
len(list(nx.isolates(subview)))

0

In [None]:
# Print every institution pair whose edge weight is above 4.
for e1, e2, weight in subview.edges(data="weight"):
    if weight > 4:
        print("-------------")
        print(e1, e2)
        print(weight)

In [None]:
# NOTE(review): `data_by_author` is not defined in any cell visible in this
# notebook, so this cell raises NameError unless the name is created
# elsewhere. Confirm where it comes from, or remove the cell.
# Prints each author record that has a field holding more than two entries.
for auth_id, auth_data in data_by_author.items():
    if any(len(auth_data[key]) > 2 for key in auth_data.keys()):
        print(auth_data)

In [19]:
# Spot check: the first paper id that was inserted.
list(data_by_id.keys())[0]

'420720'

In [19]:
# Spot check: institutions recorded for one specific paper id.
data_by_id["524975"]

['kasetsart university',
 'hiroshima shudo university',
 'university of shiga prefecture']

### Output graph data as separate files containing `nodes` and `edges` for display in `flourish.studio`.

In [78]:
import csv

# Edges: tab-separated "source, target, weight, connection" lines.
nx.write_edgelist(subview, "mag_output_inst/mag_inst_edges.tsv", delimiter="\t", data=["weight", "connection"])

# Nodes: one CSV row per institution. Several fields ("papers", "dois",
# "name") contain embedded "\n", so the file must be opened with newline=""
# for the csv module to write the quoted fields correctly.
with open("mag_output_inst/mag_inst_nodes.csv", "w", newline="") as f:
    fieldnames = ["id", "country", "continent", "affiliation", "papers", "name", "year", "count",
                 "affiliation_label", "dois", 'degree', 'domestic', 'domestic_pct', 'international_pct',
                  'international']
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    for n, data in subview.nodes(data=True):
        # Build a separate row so that "id" is not added to the node's
        # attribute dict in the graph.
        row = {**data, "id": n}
        writer.writerow(row)

In [21]:
# Largest edge weight in the filtered graph.
max(weight for _, _, weight in subview.edges(data="weight"))

41

### Centrality measures

In [None]:
# Closeness and betweenness centrality for every node of the filtered graph;
# both results are printed in full.
cc = nx.closeness_centrality(subview)
print(cc)
bc = nx.betweenness_centrality(subview)
print(bc)

In [32]:
# Degree and eigenvector centrality for every node of the filtered graph.
dc = nx.degree_centrality(subview)
ec = nx.eigenvector_centrality(subview)

In [None]:
from operator import itemgetter
from pprint import pprint

# Pick the centrality measure to inspect (swap in bc, cc or dc as needed).
measure = ec

# Rank the (institution, score) pairs from highest to lowest score.
measure_sorted = list(measure.items())
measure_sorted.sort(key=itemgetter(1), reverse=True)
pprint(measure_sorted)

In [35]:
import csv

# NOTE(review): `centralities` is never populated in this cell; it is kept
# only in case a later cell reads it.
centralities = []

# Institutions are taken from the eigenvector result; the other measures are
# looked up by the same keys.
insts = list(ec)

# newline="" is required by the csv module when writing; without it extra
# carriage returns are emitted on platforms that use \r\n line endings.
with open("mag_output_inst/centrality_by_institution.csv", "w", newline="") as f:

    writer = csv.DictWriter(f, fieldnames=["institution", "betweenness", "closeness", "degree", "eigenvector"])
    writer.writeheader()
    for i in insts:

        data = {
            "institution": i,
            "betweenness": bc[i],
            "closeness": cc[i],
            "degree": dc[i],
            "eigenvector": ec[i]
        }
        writer.writerow(data)