# Co-Author Network from Microsoft Academic Graph

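Set `path_to_data` in the first code cell below to the path of the source CSV. The CSV must contain the columns that cell reads: `year`, `paper_id`, `author_id`, `author_name`, `affiliation_id`, `last_known_affiliation_id`, `affiliation_name`, `doi`, `doc_type`, and `original_title`.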

### Extracting data from CSV

This first cell fills two dictionaries with data from the CSV file, using only rows whose `year` is between 1999 and 2018:
* **data_by_id**: article information keyed by article id (`paper_id`).
* **data_by_author**: author information keyed by author id (`author_id`).

In [30]:
import os
import csv

# Article records keyed by paper_id; author records keyed by author_id.
data_by_id = {}
data_by_author = {}

path_to_data = "sourcedata/mag_sdg.csv"

# Read in source data and iterate row by row.
# newline="" is what the csv module asks for when handing it a file object,
# so that newlines embedded in quoted fields are parsed correctly.
with open(path_to_data, newline="") as datafile:
    reader = csv.DictReader(datafile)
    # Each row contains one article author; thus each article will have as many rows as authors.
    for row in reader:
        year = int(row["year"])
        # Compile data only if year is in range 1999 to 2018.
        if year < 1999 or year > 2018:
            continue

        id_ = row["paper_id"]
        author_id = row["author_id"]
        author_name = row["author_name"]

        # Store data about each author for lookup by author id.
        if author_id not in data_by_author:
            data_by_author[author_id] = {
                "author_name": [author_name],
                "affiliation_id": [row["affiliation_id"]],
                "last_known_affiliation_id": [row["last_known_affiliation_id"]],
                "affiliation_name": [row["affiliation_name"]],
                # Rows without a DOI carry an empty string; keep it out of the list.
                "doi": [row["doi"]] if row["doi"] != "" else [],
                "paper_id": [id_],
            }

        # If author is already in the dict, append additional affiliation and article info.
        # NOTE: each list is de-duplicated on its own, so entries at the same
        # position in different lists do not necessarily belong together.
        else:
            author_record = data_by_author[author_id]
            for key in ["author_name", "affiliation_id", "paper_id",
                        "last_known_affiliation_id", "doi", "affiliation_name"]:
                value = row[key]
                # Skip a missing DOI here too, matching how the record is created above.
                if key == "doi" and value == "":
                    continue
                if value not in author_record[key]:
                    author_record[key].append(value)

        # Store data about each article for lookup by article ID.
        if id_ not in data_by_id:
            data_by_id[id_] = {
                "doi": row["doi"],
                "doc_type": row["doc_type"],
                "title": row["original_title"],
                # Kept as the raw CSV string, not the int parsed above.
                "year": row["year"],
                "authors": [],
                "all_author_ids": [],
            }
        article = data_by_id[id_]

        if author_id not in article["all_author_ids"]:
            # First row for this author on this article.
            article["authors"].append(
                {
                    "author_id": author_id,
                    "name": author_name,
                    "last_known_affiliation_id": [row["last_known_affiliation_id"]],
                    "affiliation_id": [row["affiliation_id"]],
                    "affiliation_name": [row["affiliation_name"]],
                }
            )
            article["all_author_ids"].append(author_id)
        else:
            # Author already listed on this article: merge any new affiliation data.
            affiliation_id = row["affiliation_id"]
            last_known = row["last_known_affiliation_id"]
            normalized = row["affiliation_name"]
            for a in article["authors"]:
                if a["author_id"] != author_id:
                    continue
                # The name is appended together with its id so the two lists stay aligned.
                if affiliation_id not in a["affiliation_id"]:
                    a["affiliation_id"].append(affiliation_id)
                    a["affiliation_name"].append(normalized)

                if last_known not in a["last_known_affiliation_id"]:
                    a["last_known_affiliation_id"].append(last_known)

#### Check values for accuracy / highlight obvious issues

In [31]:
# Number of distinct author ids collected from the rows dated 1999-2018.
len(data_by_author)

35333

In [32]:
# First author id in insertion order; iterating the dict yields its keys.
list(data_by_author)[0]

'2020620411'

In [33]:
# Inspect the full record stored for one author id.
data_by_author['2020620411']

{'author_name': ['e ngwenya'],
 'affiliation_id': ['129801699'],
 'last_known_affiliation_id': ['129801699'],
 'affiliation_name': ['university of tasmania'],
 'doi': ['10.1007/978-1-4899-7439-6_4', '10.1007/978-1-4899-7439-6_3'],
 'paper_id': ['420720', '1560432272']}

Many authors have multiple institutional affiliations:

In [34]:
# Tally the authors whose de-duplicated affiliation_name list holds more than one entry.
count = sum(
    1 for record in data_by_author.values() if len(record["affiliation_name"]) > 1
)
print(count, "authors with multiple affiliations")

1834 authors with multiple affiliations


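Compare the total number of rows with the number of unique paper ids to roughly estimate authors per paper. Each row is one author of one paper, and this count covers every row in the file, not only the 1999–2018 rows used above.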

In [35]:
# Collect the paper_id of every row in the file (no year filter is applied here).
# newline="" is what the csv module asks for when handing it a file object.
with open(path_to_data, newline="") as datafile:
    ids = [row["paper_id"] for row in csv.DictReader(datafile)]

# "total authors" is the number of rows, i.e. author-paper records, not
# distinct author ids.
print(len(set(ids)), "total papers,", len(ids), "total authors")

16699 total papers, 44373 total authors


### Building graph from author and article data

First, create the complete graph of all co-authorship.

In [36]:
from itertools import combinations
import networkx as nx

G = nx.Graph()

# Every unordered pair of authors on a paper is one co-authorship. For example,
# a paper by authors 456, 789 and 123 yields three pairs:
#     456 789
#     456 123
#     789 123
# Nodes and edges are added as needed for each pair.
for paper_id, paper in data_by_id.items():
    pub_year = paper["year"]
    for first, second in combinations(paper["all_author_ids"], 2):
        if not G.has_edge(first, second):
            G.add_edge(first, second, weight=1, articles=[paper_id], years=[pub_year])
            continue

        # Pair already linked: bump the shared-paper count and record anything new.
        edge = G[first][second]
        edge["weight"] += 1
        if paper_id not in edge["articles"]:
            edge["articles"].append(paper_id)
        if pub_year not in edge["years"]:
            edge["years"].append(pub_year)

# Second pass: attach affiliation/DOI/name/paper-count attributes to every node.
for author_id, attrs in G.nodes(data=True):
    record = data_by_author[author_id]
    attrs.update(
        affiliation=record["affiliation_name"][0],
        papers="<br>\n".join(record["doi"]),
        name=", ".join(record["author_name"]),
        count=len(record["paper_id"]),
    )

In [37]:
# Number of distinct co-author pairs in the full graph.
G.number_of_edges()

79510

In [38]:
# Number of authors that appear in at least one co-author pair.
G.number_of_nodes()

29527

Create subgraph containing only those edges with a `weight` greater than 1, that is, co-author pairs who published more than 1 paper in the dataset.

In [39]:
def filter_edge(n1, n2):
    """Return True when the edge between n1 and n2 in G has a weight above 1."""
    weight = G[n1][n2]["weight"]
    return weight > 1

def filter_node(n):
    """Return True when node n still has at least one edge in `view`."""
    return view.degree(n) != 0

# Edge-filtered view of G: only edges accepted by filter_edge are visible.
view = nx.subgraph_view(G, filter_edge=filter_edge)
# Node-filtered view on top of `view`: filter_node hides nodes that are isolated in `view`.
subview = nx.subgraph_view(view, filter_node=filter_node)

In [40]:
# (edge count, node count) of the filtered subgraph.
subview.number_of_edges(), subview.number_of_nodes()

(2847, 1913)

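### Centrality Measures

Compute closeness and betweenness centrality on the filtered subgraph, then print the 50 authors (by author id) with the highest betweenness centrality.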

In [41]:
from operator import itemgetter

# Centrality scores for every node of the filtered subgraph, keyed by author id.
cc = nx.closeness_centrality(subview)
bc = nx.betweenness_centrality(subview)

# Rank (author id, score) pairs from highest to lowest score.
bc_sorted = sorted(bc.items(), key=lambda pair: pair[1], reverse=True)
cc_sorted = sorted(cc.items(), key=lambda pair: pair[1], reverse=True)

# Show the 50 authors with the highest betweenness centrality.
for author_id, score in bc_sorted[:50]:
    print(author_id, score)

1968854487 0.011934865232997301
2115856384 0.00591926503462666
2304922534 0.005856917954631874
2107543259 0.005527923769320745
2132072248 0.00540802094896536
2356932058 0.005244557645621993
2126383504 0.004198606883730748
2168746073 0.004088366886050457
2316804223 0.003653152340397471
2678471448 0.0034797144541415335
2128434403 0.0034245475754687447
2159017802 0.003057611844222723
2015186656 0.0028688250595112967
2287806044 0.002843042591996567
2250989391 0.0022606403359541434
2140640685 0.001806692913344774
2294583727 0.001671669633415001
1990686718 0.0013695210945659242
2604652503 0.0013689253538644774
2106159574 0.0012439745789659185
2138773381 0.0012248716391949851
2231847126 0.0011244897980724042
2148695815 0.0010985479145755807
2287431464 0.0009931285819099948
1881159122 0.0009700730938162741
1923876551 0.0009277930676615673
2159611613 0.0009230491896361591
299576312 0.0008464404990867873
1975427381 0.0007767187982370289
1501559239 0.0007717924633644897
2272151263 0.0006579439482

### Output graph data as separate files containing `nodes` and `edges` for display in `flourish.studio`.

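Our platform for visualizing the graph, `flourish.studio`, requires edge and node lists as comma-separated rows. Note that the cell below writes the node list as CSV (`mag_nodes.csv`) but the edge list as tab-separated values (`mag_edges.tsv`).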

In [42]:
import csv

# Create the output folder up front so both writes below have somewhere to go.
os.makedirs("mag_output_authors", exist_ok=True)

# Edge list: one tab-separated "source, target, weight" line per co-author pair.
nx.write_edgelist(subview, "mag_output_authors/mag_edges.tsv", delimiter="\t", data=["weight"])

# Node list: one CSV row per author with the attributes attached to the graph node.
# newline="" is what the csv module asks for on files it writes (otherwise an
# extra "\r" can be added on platforms that use "\r\n" line endings);
# encoding is set explicitly so author names are written the same way on
# every platform.
with open("mag_output_authors/mag_nodes.csv", "w", newline="", encoding="utf-8") as f:
    fieldnames = ["id", "affiliation", "papers", "name", "count"]
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    for n, data in subview.nodes(data=True):
        # Build a fresh row instead of adding "id" to the node's own
        # attribute dict, which would modify the graph as a side effect.
        writer.writerow({"id": n, **data})

In [43]:
# Largest number of shared papers between any single pair of co-authors.
max(weight for _, _, weight in subview.edges(data="weight"))

14