# Co-Author Network from Microsoft Academic Graph

Set `path_to_data` in the first cell below to the actual path of the source CSV. The CSV must contain the column names referenced in that cell.

### Extracting data from CSV

This first cell will fill out several dictionaries with data from the csv file:
* **data_by_id**: article information keyed to article id.
* **data_by_author**: author data keyed to author name. Note that in our data there was a 1:1 relationship between `author_id` and the author's normalized name (read here from the `author_name` column), so the name can be used as the key.

In [16]:
import os
import csv

data_by_id = {}
data_by_author = {}

path_to_data = "sourcedata/mag_sdg.csv"

# Columns accumulated per author as de-duplicated lists. The order here is the
# key order of every author record.
_AUTHOR_FIELDS = (
    "author_name",
    "affiliation_id",
    "last_known_affiliation_id",
    "affiliation_name",
    "doi",
    "paper_id",
)


def _paper_author_entry(row):
    """Return the per-paper author record built from one CSV row."""
    return {
        "author_id": row["author_id"],
        "name": row["author_name"],
        "last_known_affiliation_id": [row["last_known_affiliation_id"]],
        "affiliation_id": [row["affiliation_id"]],
        "affiliation_name": [row["affiliation_name"]],
    }


# Read in source data and iterate row by row.
# newline="" leaves line-ending handling to the csv module, which is what it
# needs to parse quoted fields containing embedded newlines correctly.
with open(path_to_data, newline="") as datafile:
    reader = csv.DictReader(datafile)
    # Each row contains one article author; thus each article will have as many rows as authors.
    for row in reader:
        id_ = row["paper_id"]
        author = row["author_name"]

        # Store data about each author for lookup by name. A first sighting
        # starts from empty lists; every sighting then adds unseen values.
        author_record = data_by_author.setdefault(
            author, {key: [] for key in _AUTHOR_FIELDS}
        )
        for key in _AUTHOR_FIELDS:
            value = row[key]
            # A blank DOI means the paper has none. Skip it on every row, not
            # only the author's first one, so the list holds real DOIs only.
            if key == "doi" and value == "":
                continue
            if value not in author_record[key]:
                author_record[key].append(value)

        # Store data about each article for lookup by article ID.
        paper = data_by_id.get(id_)
        if paper is None:
            data_by_id[id_] = {
                "doi": row["doi"],
                "doc_type": row["doc_type"],
                "title": row["original_title"],
                "year": row["year"],
                "authors": [_paper_author_entry(row)],
                "all_author_names": [author],
            }
            continue

        # Known article, new author: add the author record and name.
        if author not in paper["all_author_names"]:
            paper["authors"].append(_paper_author_entry(row))
            paper["all_author_names"].append(author)
            continue

        # Known article, known author: the row can only contribute additional
        # affiliation data for that author.
        for known in paper["authors"]:
            if known["name"] != author:
                continue
            affiliation_id = row["affiliation_id"]
            last_known = row["last_known_affiliation_id"]
            # The affiliation name is recorded together with its id so the two
            # lists stay aligned.
            if affiliation_id not in known["affiliation_id"]:
                known["affiliation_id"].append(affiliation_id)
                known["affiliation_name"].append(row["affiliation_name"])
            if last_known not in known["last_known_affiliation_id"]:
                known["last_known_affiliation_id"].append(last_known)

#### Check values for accuracy / highlight obvious issues

In [17]:
# Number of distinct author names collected from the CSV.
len(data_by_author)

35174

In [18]:
# First author name in insertion order (iterating a dict yields its keys).
list(data_by_author)[0]

'e ngwenya'

In [19]:
# Inspect the accumulated record for a single author, looked up by name.
data_by_author['e ngwenya']

{'author_name': ['e ngwenya'],
 'affiliation_id': ['129801699'],
 'last_known_affiliation_id': ['129801699'],
 'affiliation_name': ['university of tasmania'],
 'doi': ['10.1007/978-1-4899-7439-6_4', '10.1007/978-1-4899-7439-6_3'],
 'paper_id': ['420720', '1560432272']}

Many authors have multiple institutional affiliations:

In [20]:
# Tally the authors whose record lists more than one affiliation name.
count = sum(
    1
    for record in data_by_author.values()
    if len(record["affiliation_name"]) > 1
)
print(count, "authors with multiple affiliations")

2315 authors with multiple affiliations


Compare the total number of rows to the number of unique paper ids to roughly estimate the number of authors per paper (each row is one author of one paper).

In [21]:
# Gather the paper id of every row; duplicates are kept so the list length is
# the row count and the set size is the number of distinct papers.
with open(path_to_data) as datafile:
    ids = [row["paper_id"] for row in csv.DictReader(datafile)]

print(len(set(ids)), "total papers,", len(ids), "total authors")

16699 total papers, 44373 total authors


### Building graph from author and article data

First, create the complete graph of all co-authorship.

In [22]:
from itertools import combinations
import networkx as nx

G = nx.Graph()

# Every unordered pair of authors on a paper is one co-authorship.
# For example, if the authors are Smith, Mizuno, and Garcia, the pairs are:
#     Mizuno Smith
#     Mizuno Garcia
#     Smith Garcia
# The first time a pair is seen its edge is created (which also creates any
# missing node); afterwards the existing edge is updated in place.
for paper_id, paper in data_by_id.items():
    year = paper["year"]
    for first, second in combinations(paper["all_author_names"], 2):
        if not G.has_edge(first, second):
            G.add_edge(first, second, weight=1, articles=[paper_id], years=[year])
            continue

        edge = G[first][second]
        edge["weight"] += 1
        if paper_id not in edge["articles"]:
            edge["articles"].append(paper_id)
        if year not in edge["years"]:
            edge["years"].append(year)

# Second pass: attach display attributes to every node, taken from the
# author's accumulated record. Only the first affiliation name is used.
for author, attrs in G.nodes(data=True):
    record = data_by_author[author]
    attrs.update(
        affiliation=record["affiliation_name"][0],
        papers="<br>\n".join(record["doi"]),
        name=", ".join(record["author_name"]),
        count=len(record["paper_id"]),
    )

In [23]:
# Total number of co-authorship edges in the full graph.
G.number_of_edges()

80697

In [24]:
# Total number of author nodes in the full graph.
G.number_of_nodes()

29492

Create subgraph containing only those edges with a `weight` greater than 1, that is, co-author pairs who published more than 1 paper in the dataset.

In [25]:
def filter_edge(n1, n2):
    """Return True when the edge between n1 and n2 in ``G`` has weight above 1."""
    edge_attrs = G[n1][n2]
    return edge_attrs["weight"] > 1

def filter_node(n):
    """Return True when node ``n`` still has at least one edge in ``view``."""
    # ``view`` is a module-level name resolved when this function is called,
    # so it only has to exist by the time the filter is actually used.
    isolated = nx.is_isolate(view, n)
    return not isolated

# View of G restricted to the edges accepted by filter_edge (weight > 1).
view = nx.subgraph_view(G, filter_edge=filter_edge)
# View of `view` restricted to the nodes accepted by filter_node, i.e. nodes
# that are not isolated in `view`.
subview = nx.subgraph_view(view, filter_node=filter_node)

In [26]:
# (edge count, node count) of the filtered graph.
subview.number_of_edges(), subview.number_of_nodes()

(3087, 2059)

### Centrality measures

In [13]:
from operator import itemgetter

# Score every node of the filtered graph with both centrality measures.
cc = nx.closeness_centrality(subview)
bc = nx.betweenness_centrality(subview)

# Rank the (node, score) pairs from highest to lowest score.
by_score = itemgetter(1)
bc_sorted, cc_sorted = (
    sorted(scores.items(), key=by_score, reverse=True) for scores in (bc, cc)
)

# Show the 50 nodes with the highest betweenness centrality.
for top_entry in bc_sorted[:50]:
    print(*top_entry)

joy e lawn 0.01236321467654215
jun zhu 0.006405415144562778
david sanders 0.006347757520954072
gorik ooms 0.005927307112196651
wim van damme 0.005740194542988388
robert e black 0.004718945601337216
haidong wang 0.003959569756962626
ties boerma 0.003104916131566342
zulfiqar a bhutta 0.002976106231424614
simon i hay 0.002917105449027309
jennifer bryce 0.0027605209544109338
alan d lopez 0.0023323971772294264
mickey chopra 0.0021597209331496005
martin mckee 0.0021496201786499725
lale say 0.002013575122614477
li liu 0.0017283642184230793
peter s hill 0.0017159210904241315
robert w snow 0.0016493019876191327
andrew j tatem 0.001588356712224441
colin d mathers 0.0015479434312387122
jamie perin 0.0013953046600908043
dina balabanova 0.0013011107630773679
christopher j l murray 0.0010631415286625645
bernadette daelmans 0.0009142211919973025
abdisalan m noor 0.0009087460249743345
josephine borghi 0.0008801631632582194
sonja firth 0.0008760686486322196
peter waiswa 0.0008759111673004503
robert cli

### Output graph data as separate files containing `nodes` and `edges` for display in `flourish.studio`.

Our platform for visualizing the graph, `flourish.studio`, requires edge and node lists as delimited rows. The cell below writes the edges as tab-separated values and the nodes as comma-separated values.

In [14]:
import csv

# Edge list: one "source<TAB>target<TAB>weight" line per edge, no header row.
nx.write_edgelist(subview, "mag_output_authors/mag_edges.tsv", delimiter="\t", data=["weight"])

# Node list: one CSV row per node, with the node name in the "id" column.
# newline="" is what the csv module expects for files it writes; without it,
# platforms that translate "\n" end up with an extra blank line after each row.
with open("mag_output_authors/mag_nodes.csv", "w", newline="") as f:
    fieldnames = ["id", "affiliation", "papers", "name", "count"]
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    for n, data in subview.nodes(data=True):
        # Build a fresh row instead of adding "id" to `data`: `data` is the
        # graph's own attribute dict for the node, so writing to it would
        # leave an "id" attribute on every node of G.
        writer.writerow({"id": n, **data})

In [15]:
# Largest number of shared papers between any co-author pair in the filtered graph.
max(subview[u][v]["weight"] for u, v in subview.edges)

14