In [20]:
# import necessary stuff
import pickle
import networkx as nx

In [21]:
# Notebook configuration: where the crawled data lives and whether to export.
dataset_path = 'data/coauthor/'
# Pickle file holding the crawled co-author data (read by the loading cell).
coauthor_crawled_data_file_path = '{}coauthor_json.p'.format(dataset_path)
# When True, the final cell writes the graph out as an edge list.
EXPORT_AS_EDGE_LIST = True

In [22]:
# Load the crawled co-author data from its pickle file.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only point this at a file that comes from a trusted source.
# The cells below index the result as
# coauthor_data[field_of_study][conference][year] -> iterable of paper dicts.
with open(coauthor_crawled_data_file_path, 'rb') as pickle_file:
    coauthor_data = pickle.load(pickle_file)

In [23]:
# Research fields and publication years of interest.
fields_of_studies = [
    'Machine learning',
    'Data mining',
]
# 2013 through 2016 inclusive.
# NOTE(review): `years` is not referenced by any of the cells shown here.
years = list(range(2013, 2017))

In [24]:
# Pick out the per-field entries of the crawled data for the fields of interest.
# Each value is the same object stored in coauthor_data (no copy is made).
# NOTE(review): presumably each entry holds that field's top-5 conference
# series, as the variable name suggests - confirm against the crawler output.
top_5_conf_series_per_field = {
    field: coauthor_data[field] for field in fields_of_studies
}

In [25]:
# Create the (initially empty) networkx graph that the following cells fill.
# nx.Graph is undirected and keeps at most one edge per node pair, so edge
# direction is not stored and re-adding an existing edge only updates its
# attributes.
coauthor_graph = nx.Graph()

In [26]:
# Values stored in the `label` attribute of graph elements.

# Node labels
PAPER = 'paper'
AUTHOR = 'author'

# Edge labels
WRITTEN_BY = 'written_by'    # paper -- author
CO_AUTHOR = 'co_author_of'   # author -- author
REFERENCES = 'references'    # paper -- paper

In [27]:
# Populate the graph with one node per paper and one node per author.
# Paper node ids are 'P' + paper id, author node ids are 'A' + author id.
for field_of_study, conferences in coauthor_data.items():
    for conference, papers_by_year in conferences.items():
        for papers in papers_by_year.values():
            for paper in papers:
                paper_node = 'P' + str(paper['Id'])
                # Adding a node that already exists just updates its attributes,
                # so a paper seen more than once keeps the values from the last
                # occurrence.
                coauthor_graph.add_node(
                    paper_node,
                    num_citations=paper['CC'],
                    num_references=len(paper['RId']),
                    conference=conference,
                    field_of_study=field_of_study,
                    label=PAPER,
                )
                for author in paper['authors']:
                    coauthor_graph.add_node('A' + str(author), label=AUTHOR)

node_total = coauthor_graph.number_of_nodes()
print("{} author and paper nodes in graph".format(node_total))

30896 author and paper nodes in graph


In [28]:
# Connect the nodes: paper--paper (references), paper--author (written_by)
# and author--author (co_author_of).
for conferences in coauthor_data.values():
    for papers_by_year in conferences.values():
        for papers in papers_by_year.values():
            for paper in papers:
                paper_node = 'P' + str(paper['Id'])

                # Only link to referenced papers that are themselves nodes of
                # the graph; other references are skipped.
                for referenced_paper_id in paper['RId']:
                    referenced_node = 'P' + str(referenced_paper_id)
                    if referenced_node in coauthor_graph:
                        coauthor_graph.add_edge(paper_node, referenced_node,
                                                label=REFERENCES)

                authors = paper['authors']
                for author in authors:
                    author_node = 'A' + str(author)
                    coauthor_graph.add_edge(paper_node, author_node, label=WRITTEN_BY)
                    # Every ordered pair of distinct authors is visited, so each
                    # co-author pair is added twice; add_edge is simply called
                    # again for the same two nodes with the same label.
                    for co_author in authors:
                        if co_author != author:
                            coauthor_graph.add_edge(author_node, 'A' + str(co_author),
                                                    label=CO_AUTHOR)

node_total = coauthor_graph.number_of_nodes()
edge_total = coauthor_graph.number_of_edges()
print("{} nodes in graph".format(node_total))
print("{} edges in graph".format(edge_total))

30896 nodes in graph
99578 edges in graph


In [29]:
# Optionally write the graph to disk as an edge list next to the input data.
if EXPORT_AS_EDGE_LIST:
    edge_list_export_path = '{}coauthor_edgelist.csv'.format(dataset_path)
    # data=False writes only the two node ids per line, so the node and edge
    # `label` attributes are not part of the exported file.
    # NOTE(review): no delimiter is passed, so networkx's default (a single
    # space) is used even though the file name ends in .csv.
    nx.write_edgelist(
        coauthor_graph,
        edge_list_export_path,
        data=False,
    )