In [1]:
# import necessary stuff
import os
import pickle
import pprint
import time
from heapq import nlargest
from multiprocessing import Pool, cpu_count

import chardet
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from telegram import Bot

In [2]:
# shared pretty printer: 4-space indentation, nesting shown down to 8 levels
pp = pprint.PrettyPrinter(depth=8, indent=4)

In [3]:
# initialize telegram bot
# The credentials are read from the environment so that the bot token is never
# stored in the notebook source or its version history.
# NOTE: a token that was ever committed in plain text should be revoked and
# re-issued, because it stays readable in older revisions of the notebook.
token = os.environ.get("TELEGRAM_BOT_TOKEN")
chat_id = os.environ.get("TELEGRAM_CHAT_ID")
if not token or not chat_id:
    # fail fast with an actionable message instead of a confusing API error later
    raise RuntimeError(
        "Set the TELEGRAM_BOT_TOKEN and TELEGRAM_CHAT_ID environment variables "
        "before running this notebook."
    )
bot = Bot(token)

In [4]:
# locations of the DBIS dataset files, all relative to one dataset directory
dataset_path = 'data/net_dbis/'

# node tables: authors, conferences, papers
authors_csv_path, conferences_csv_path, papers_csv_path = (
    dataset_path + file_name
    for file_name in ('id_author.txt', 'id_conf.txt', 'paper.txt')
)

# edge tables: paper-author and paper-conference
paper_author_edges_csv_path, paper_conference_edges_csv_path = (
    dataset_path + file_name
    for file_name in ('paper_author.txt', 'paper_conf.txt')
)

In [5]:
# detect the text encoding of every dataset file
file_paths = [
    authors_csv_path,
    conferences_csv_path,
    papers_csv_path,
    paper_author_edges_csv_path,
    paper_conference_edges_csv_path,
]

# maps file path -> chardet result dict; each file is read completely as bytes
encodings = dict.fromkeys(file_paths)
for file_path in encodings:
    with open(file_path, 'rb') as f:
        encodings[file_path] = chardet.detect(f.read())

In [6]:
# store csv contents in dataframes
def _read_id_table(path, sep='\t', **read_csv_kwargs):
    """Read one headerless two-column dataset file into a DataFrame.

    Both columns are kept as strings and the file is decoded with the encoding
    that was detected for `path` (looked up in the `encodings` dict).

    Parameters:
        path: path of the file to read; must be a key of `encodings`.
        sep: column separator, a tab by default.
        **read_csv_kwargs: extra keyword arguments forwarded to `pd.read_csv`.

    Returns:
        The parsed pandas DataFrame with integer column labels 0 and 1.
    """
    return pd.read_csv(
        path,
        sep=sep,
        header=None,
        dtype={0: str, 1: str},
        encoding=encodings[path]["encoding"],
        **read_csv_kwargs
    )


authors_df = _read_id_table(authors_csv_path)
conferences_df = _read_id_table(conferences_csv_path)
# paper.txt is separated by five spaces. Separators longer than one character
# are only supported by the python parser engine, so it is requested explicitly
# instead of relying on pandas' fallback, which emits a ParserWarning.
papers_df = _read_id_table(papers_csv_path, sep='     ', engine='python')
paper_author_edges_df = _read_id_table(paper_author_edges_csv_path)
paper_conference_edges_df = _read_id_table(paper_conference_edges_csv_path)

  after removing the cwd from sys.path.


In [7]:
# make node ids unique across node types by prepending a type letter:
# 'a' for authors, 'c' for conferences, 'p' for papers
# NOTE: this rewrites the columns in place, so running the cell a second time
# prepends the letter again.
for frame, column, prefix in (
    (authors_df, 0, 'a'),
    (conferences_df, 0, 'c'),
    (papers_df, 0, 'p'),
    (paper_author_edges_df, 0, 'p'),
    (paper_author_edges_df, 1, 'a'),
    (paper_conference_edges_df, 0, 'p'),
    (paper_conference_edges_df, 1, 'c'),
):
    frame[column] = prefix + frame[column]

In [8]:
# create the (initially empty) networkx graph that will hold the author,
# paper and conference nodes; nx.Graph is networkx's undirected graph class
dbis_graph = nx.Graph()

In [9]:
# values stored in the 'label' attribute of nodes ...
AUTHOR, PAPER, CONFERENCE = 'author', 'paper', 'conference'
# ... and of edges
PUBLISHED_AT, WRITTEN_BY = 'published_at', 'written_by'

In [10]:
# insert the author, paper and conference ids as labelled nodes (in that
# order) and report the running node count after each batch
for node_ids, node_label in (
    (authors_df[0], AUTHOR),
    (papers_df[0], PAPER),
    (conferences_df[0], CONFERENCE),
):
    dbis_graph.add_nodes_from(node_ids.tolist(), label=node_label)
    print("{} nodes in graph".format(dbis_graph.number_of_nodes()))

60694 nodes in graph
133596 nodes in graph
134060 nodes in graph


In [11]:
# turn each edge dataframe into a list of (column 0, column 1) tuples
paper_author_edges = list(
    paper_author_edges_df[[0, 1]].itertuples(index=False, name=None)
)
paper_conference_edges = list(
    paper_conference_edges_df[[0, 1]].itertuples(index=False, name=None)
)

In [12]:
# connect papers to conferences with 'published_at' edges, then report the
# resulting edge and node counts
dbis_graph.add_edges_from(paper_conference_edges, label=PUBLISHED_AT)
for template, count in (
    ("{} edges in graph", dbis_graph.number_of_edges()),
    ("{} nodes in graph", dbis_graph.number_of_nodes()),
):
    print(template.format(count))

72902 edges in graph
134060 nodes in graph


In [13]:
# connect papers to authors with 'written_by' edges, then report the
# resulting edge and node counts
dbis_graph.add_edges_from(paper_author_edges, label=WRITTEN_BY)
edge_total = dbis_graph.number_of_edges()
node_total = dbis_graph.number_of_nodes()
print("{} edges in graph".format(edge_total))
print("{} nodes in graph".format(node_total))

265317 edges in graph
134060 nodes in graph


In [14]:
# rank authors by node degree and mark the low-degree ones for deletion
# The threshold is the smallest degree among the `num_of_top_k_authors` largest
# author degrees. Authors whose degree is at or below that threshold become
# delete candidates, so authors tied at the threshold are dropped as well.
num_of_top_k_authors = 5000

# author nodes in graph iteration order, with their degrees in the same order
author_nodes = [
    candidate
    for candidate in dbis_graph.nodes
    if dbis_graph.nodes[candidate]['label'] == AUTHOR
]
author_degrees = [dbis_graph.degree(candidate) for candidate in author_nodes]

top_k_author_degree_threshold = min(nlargest(num_of_top_k_authors, author_degrees))

delete_candidates = [
    candidate
    for candidate, degree in zip(author_nodes, author_degrees)
    if degree <= top_k_author_degree_threshold
]

print("{} authors with less than {} papers are delete candidates".format(len(delete_candidates),top_k_author_degree_threshold+1))

56163 authors with less than 8 papers are delete candidates


In [15]:
# drop the low-degree authors from the graph, then report what remains
dbis_graph.remove_nodes_from(delete_candidates)
print(
    "{} edges in graph".format(dbis_graph.number_of_edges()),
    "{} nodes in graph".format(dbis_graph.number_of_nodes()),
    sep="\n",
)

167795 edges in graph
77897 nodes in graph


In [20]:
# load the conference labels: a space-separated, headerless file whose first
# column is parsed as string and whose second column is parsed as integer
conference_labels_file_path = dataset_path + 'googlescholar_conference_labels.txt'
conference_labels_read_options = dict(sep=' ', header=None, dtype={0: str, 1: int})
conference_labels_df = pd.read_csv(
    conference_labels_file_path, **conference_labels_read_options
)

Unnamed: 0,0,1
0,c4789,vVLDBJ.
1,c3258,"vISWCWorkshoponTrust,Security,andReputationont..."
2,c4149,vIWRIDL
3,c3252,vSWDB
4,c3257,vSWSWPC
5,c3850,vHIKM
6,c3996,vWebMine
7,c1799,vICDEWorkshops
8,c1798,vICDE
9,c2914,vORM


In [42]:
# inner-join the conference labels with the conference table: column 0 of the
# labels frame is matched against column 1 of the conferences frame
labelled_conferences_df = pd.merge(
    left=conference_labels_df,
    right=conferences_df,
    how='inner',
    left_on=0,
    right_on=1,
    left_index=False,
    right_index=False,
    copy=True,
    indicator=False,
)
labelled_conferences_df

Unnamed: 0,key_0,0_x,1_x,0_y,1_y
0,vWWW,vWWW,8,c3771,vWWW
1,vWWW,vWWW,8,c3777,vWWW
2,vWWW,vWWW,8,c3774,vWWW
3,vWWW,vWWW,8,c3779,vWWW
4,vWWW,vWWW,8,c3775,vWWW
5,vVLDB,vVLDB,8,c3594,vVLDB
6,vIEEETrans.Knowl.DataEng.,vIEEETrans.Knowl.DataEng.,8,c4751,vIEEETrans.Knowl.DataEng.
7,vSIGMODConference,vSIGMODConference,8,c3329,vSIGMODConference
8,vWSDM,vWSDM,8,c4096,vWSDM
9,vICDE,vICDE,8,c1798,vICDE
