In [14]:
# import necessary stuff and python-wrapper of verse
import os
import pprint
import numpy as np
import json
import sys
import pickle
import codecs
import networkx as nx
from scipy.sparse import csr_matrix

from verse.python.wrapper import VERSE
from multi_class_classification import MultiClassClassification
from multi_label_classification import MultiLabelClassification
from clustering import Clustering
from link_prediction import LinkPrediction
from experiment import Experiment

In [15]:
# Shared pretty printer for dumping nested structures in a readable form.
# indent: spaces added per nesting level; depth: nesting levels printed
# before deeper content is abbreviated.
pretty_printer_options = {'indent': 4, 'depth': 8}
pp = pprint.PrettyPrinter(**pretty_printer_options)

In [16]:
# configure telegram notifier bot
def load_telegram_config(env=None, verbose=1):
    """Build the telegram notifier configuration from environment variables.

    Credentials must never be committed with the notebook, so the bot token
    and the chat id are looked up at runtime instead of being hardcoded.

    Parameters
    ----------
    env : mapping, optional
        Mapping to read ``TELEGRAM_BOT_TOKEN`` and ``TELEGRAM_CHAT_ID`` from.
        Defaults to ``os.environ``.
    verbose : int, optional
        Value stored under the ``"verbose"`` key (default 1).

    Returns
    -------
    dict
        ``{"telegram": {"token": ..., "chat_id": ..., "verbose": ...}}``.
        ``token`` / ``chat_id`` are ``None`` when the corresponding variable
        is not set.
    """
    # Compare against None explicitly so that an (intentionally) empty
    # mapping passed by the caller is not replaced by os.environ.
    if env is None:
        env = os.environ
    return {
        "telegram": {
            "token": env.get("TELEGRAM_BOT_TOKEN"),
            "chat_id": env.get("TELEGRAM_CHAT_ID"),
            "verbose": verbose,
        }
    }


# NOTE(review): a bot token used to be hardcoded in this cell. It remains in
# the version history and should be revoked and replaced.
my_telegram_config = load_telegram_config()

In [17]:
# Flag is off in this notebook; presumably it toggles an edge-list export
# in a later cell (not visible here).
EXPORT_AS_EDGE_LIST = False

# Directory of the co-author dataset (trailing slash included) and the
# pickled crawl result stored inside it.
dataset_path = 'data/coauthor/'
coauthor_crawled_data_file_path = '{}{}'.format(dataset_path, 'coauthor_json.p')

In [18]:
# Load the crawled co-author data from its pickle dump.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only run this cell on a dump that was produced by a trusted source.
with open(coauthor_crawled_data_file_path, mode='rb') as crawl_dump:
    coauthor_data = pickle.load(crawl_dump)

In [19]:
# Research fields and publication years this analysis is interested in.
fields_of_studies = [
    'Machine learning',
    'Data mining',
]
years = list(range(2013, 2017))  # 2013 through 2016, inclusive

In [20]:
# Collect, for every field of interest, that field's entry of the crawled data.
# Each value is the same object stored in coauthor_data (no copy is made).
# NOTE(review): the name implies five conference series per field; nothing in
# this cell enforces or checks that count.
top_5_conf_series_per_field = {
    field_name: coauthor_data[field_name]
    for field_name in fields_of_studies
}

In [21]:
# define networkx graph
# Single shared graph that the following cells fill with paper and author
# nodes and the edges between them. nx.Graph is networkx's undirected graph
# class without parallel edges, so adding an edge that already exists only
# updates that edge's attributes.
coauthor_graph = nx.Graph()

In [22]:
# Values stored in the `label` attribute of graph elements.

# -- node labels
PAPER = 'paper'
AUTHOR = 'author'

# -- edge labels
REFERENCES = 'references'    # paper  -> referenced paper
WRITTEN_BY = 'written_by'    # paper  -> author
CO_AUTHOR = 'co_author_of'   # author -> author

In [23]:
# Insert one node per paper ('P' + paper id) and one per author ('A' + author id).
# Paper nodes carry citation/reference counts plus the conference and field they
# were found under; a node that is added again keeps its identity and only has
# its attributes updated.
for field_of_study, conferences_of_field in coauthor_data.items():
    for conference, papers_by_year in conferences_of_field.items():
        for year, papers_of_year in papers_by_year.items():
            for paper in papers_of_year:
                paper_node = 'P' + str(paper['Id'])
                paper_attributes = {
                    'num_citations': paper['CC'],
                    'num_references': len(paper['RId']),
                    'conference': conference,
                    'field_of_study': field_of_study,
                    'label': PAPER,
                }
                coauthor_graph.add_node(paper_node, **paper_attributes)
                for author in paper['authors']:
                    coauthor_graph.add_node('A' + str(author), label=AUTHOR)

node_total = coauthor_graph.number_of_nodes()
print("{} author and paper nodes in graph".format(node_total))

30896 author and paper nodes in graph


In [24]:
# add co-author, written_by and reference edges
#
# For every paper in the crawled data:
#   * REFERENCES edge to each referenced paper that is itself a node of the
#     graph (references pointing outside the crawled set are skipped),
#   * WRITTEN_BY edge from the paper to each of its authors,
#   * CO_AUTHOR edge between every pair of distinct authors of the paper.
#
# The graph is undirected, so each unordered author pair only needs to be
# inserted once; pairing every author with the authors listed after it avoids
# the duplicate insert of a full author x author double loop. Pairs are still
# first seen in the same order, so the resulting graph is unchanged.
for field_of_study, conferences_of_field in coauthor_data.items():
    for conference, papers_by_year in conferences_of_field.items():
        for year, papers_of_year in papers_by_year.items():
            for paper in papers_of_year:
                paper_node = 'P' + str(paper['Id'])

                for referenced_paper_id in paper['RId']:
                    referenced_node = 'P' + str(referenced_paper_id)
                    if referenced_node in coauthor_graph:
                        coauthor_graph.add_edge(paper_node, referenced_node, label=REFERENCES)

                # Materialize once so the author list can be sliced below.
                authors = list(paper['authors'])
                for position, author in enumerate(authors):
                    author_node = 'A' + str(author)
                    coauthor_graph.add_edge(paper_node, author_node, label=WRITTEN_BY)
                    for co_author in authors[position + 1:]:
                        # Guard against an author id listed twice on one paper,
                        # which would otherwise create a self-loop.
                        if author != co_author:
                            coauthor_graph.add_edge(author_node, 'A' + str(co_author), label=CO_AUTHOR)

print("{} nodes in graph".format(coauthor_graph.number_of_nodes()))
print("{} edges in graph".format(coauthor_graph.number_of_edges()))

30896 nodes in graph
99578 edges in graph


In [25]:
# Average degree over all nodes of the graph.
# Iterating the degree view yields (node, degree) pairs in node order; only
# the degrees are kept, as a 64-bit integer array.
node_degrees = np.array(
    [degree for _node, degree in coauthor_graph.degree()],
    dtype=np.int64,
)
avg_node_degree = node_degrees.mean()
rounded_avg_degree = np.round(avg_node_degree, decimals=2)
print("The avg. node degree is {}".format(rounded_avg_degree))

The avg. node degree is 6.45
