In [1]:
# import necessary stuff
import pickle
import networkx as nx
import numpy as np

In [2]:
# Notebook configuration: toggle for the optional edge-list export, the dataset
# directory (kept with its trailing slash because other paths are built by
# plain string concatenation), and the location of the crawled pickle.
EXPORT_AS_EDGE_LIST = False
dataset_path = 'data/coauthor/'
coauthor_crawled_data_file_path = '{}{}'.format(dataset_path, 'coauthor_crawled_data.p')

In [3]:
# Load the crawled co-author data from the pickle file configured above.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# this must only be run against a data file from a trusted source.
with open(coauthor_crawled_data_file_path, 'rb') as pickle_file:
    coauthor_data = pickle.load(pickle_file)

In [4]:
# define research fields and years of interest for us
years = list(range(2013, 2017))  # 2013 through 2016 inclusive
fields_of_studies = ['Machine learning']

In [5]:
# Pick out, for each field of interest, its conference mapping from the crawled data.
# The per-field entry is taken as-is (no filtering happens here); presumably the
# crawl already limited each field to its top 5 conference series -- TODO confirm.
top_5_conf_series_per_field = {
    field_of_study: coauthor_data[field_of_study]
    for field_of_study in fields_of_studies
}

In [6]:
# define networkx graph
# nx.Graph is undirected and holds at most one edge per node pair: adding an
# edge between two nodes that are already connected updates the attributes of
# the existing edge instead of creating a parallel one.
coauthor_graph = nx.Graph()

In [7]:
# Values stored in the `label` attribute of nodes ...
AUTHOR, PAPER = 'author', 'paper'
# ... and in the `label` attribute of edges.
CO_AUTHOR, REFERENCES, WRITTEN_BY = 'co_author_of', 'references', 'written_by'

In [8]:
# add authors and papers
# Walks field -> conference -> year -> paper and adds one node per distinct paper
# and one per distinct author. Node ids are the raw ids prefixed with 'P' (paper)
# or 'A' (author), presumably so the two id spaces cannot collide.
#
# The lists record the added node ids in insertion order; the sets mirror them
# so each duplicate check is a constant-time lookup instead of a scan over an
# ever-growing list (which made the whole pass quadratic in the number of nodes).
already_added_papers = []
already_added_authors = []
seen_paper_nodes = set()
seen_author_nodes = set()

for field_of_study, conferences in coauthor_data.items():
    for conference, papers_by_year in conferences.items():
        for year, papers in papers_by_year.items():
            for paper in papers:
                paper_node = 'P' + str(paper['Id'])
                # A paper encountered again (e.g. under another conference or field)
                # keeps the attributes of its first occurrence.
                if paper_node not in seen_paper_nodes:
                    coauthor_graph.add_node(paper_node, num_citations=paper['CC'],
                                            num_references=len(paper['RId']),
                                            conference=conference, field_of_study=field_of_study,
                                            label=PAPER)
                    seen_paper_nodes.add(paper_node)
                    already_added_papers.append(paper_node)
                for author in paper['authors']:
                    author_node = 'A' + str(author)
                    if author_node not in seen_author_nodes:
                        coauthor_graph.add_node(author_node, label=AUTHOR)
                        seen_author_nodes.add(author_node)
                        already_added_authors.append(author_node)

print("{} author and paper nodes in graph".format(coauthor_graph.number_of_nodes()))

30896 author and paper nodes in graph


In [9]:
# add co-author, written_by and reference edge
# Three kinds of edges are added, distinguished by their `label` attribute:
#   paper  -- paper  (REFERENCES): only when the referenced paper is itself a node
#   paper  -- author (WRITTEN_BY)
#   author -- author (CO_AUTHOR):  every pair of distinct authors of one paper
# NOTE(review): coauthor_graph is an undirected nx.Graph, so the direction of a
# citation (who references whom) is not preserved -- confirm that is intended.
for field_of_study, conferences in coauthor_data.items():
    for conference, papers_by_year in conferences.items():
        for year, papers in papers_by_year.items():
            for paper in papers:
                paper_node = 'P' + str(paper['Id'])
                for referenced_paper_id in paper['RId']:
                    referenced_node = 'P' + str(referenced_paper_id)
                    # Skip references to papers outside the crawled set so no
                    # attribute-less nodes are created implicitly by add_edge.
                    if referenced_node in coauthor_graph:
                        coauthor_graph.add_edge(paper_node, referenced_node, label=REFERENCES)
                authors = list(paper['authors'])
                for position, author in enumerate(authors):
                    author_node = 'A' + str(author)
                    coauthor_graph.add_edge(paper_node, author_node, label=WRITTEN_BY)
                    # Pair each author only with the authors listed after them: the
                    # graph is undirected, so the reverse pairing would just re-add
                    # the same edge. The inequality check still guards against an
                    # author id that appears twice on one paper (no self-loops).
                    for co_author in authors[position + 1:]:
                        if author != co_author:
                            coauthor_graph.add_edge(author_node, 'A' + str(co_author), label=CO_AUTHOR)

print("{} nodes in graph".format(coauthor_graph.number_of_nodes()))
print("{} edges in graph".format(coauthor_graph.number_of_edges()))

30896 nodes in graph
99578 edges in graph


In [10]:
# export graph as edge list to given path
# Runs only when EXPORT_AS_EDGE_LIST is set. data=False writes just the two node
# ids of every edge, so the edge `label` attribute is not part of the export.
# NOTE(review): write_edgelist separates fields with a single space unless a
# `delimiter` is passed, so despite the '.csv' name the file is not
# comma-separated -- confirm that downstream readers expect this.
if EXPORT_AS_EDGE_LIST:
    edge_list_export_path = dataset_path + 'coauthor_edgelist.csv'
    nx.write_edgelist(coauthor_graph, edge_list_export_path, data=False)

In [11]:
# Average degree over every node (papers and authors alike) in the graph.
# degree() with no arguments iterates (node, degree) pairs for all nodes.
node_degrees = np.array([degree for _node, degree in coauthor_graph.degree()], dtype=np.int64)
avg_node_degree = node_degrees.mean()
print("The avg. node degree is {}".format(np.round(avg_node_degree, decimals=2)))

The avg. node degree is 6.45


In [12]:
# Sanity check: number of distinct node ids (should equal number_of_nodes()).
len(set(coauthor_graph.nodes))

30896

In [13]:
# collect conference class label mapping
# Assigns every distinct conference id an integer class label 0..K-1 in order of
# first appearance. setdefault keeps the first label if the same conference is
# listed under more than one field; plain assignment from a running counter
# would overwrite it with a later value, leaving gaps in the label range and
# making conference_count larger than the number of distinct conferences.
conference_label_mapping = {}
for field_of_study, conferences in coauthor_data.items():
    for conference in conferences:
        conference_label_mapping.setdefault(conference, len(conference_label_mapping))

# Number of distinct conferences, i.e. number of class labels handed out.
conference_count = len(conference_label_mapping)

In [14]:
# Display the conference id -> class label mapping built in the previous cell.
conference_label_mapping

{1121227772: 4, 1127325140: 3, 1158167855: 0, 1163902177: 1, 1170695740: 2}

In [17]:
# Persist the assembled graph as a pickle in the dataset directory so it can be
# reloaded with pickle.load without rebuilding it from the crawled data.
with open(dataset_path + 'coauthor_networkx.p', 'wb') as pickle_file:
    pickle.dump(coauthor_graph, pickle_file)