In [20]:
# import necessary stuff
import os
import pickle
import pprint
import time
import urllib.error
import urllib.parse
import urllib.request
import warnings
from heapq import nlargest
from multiprocessing import Pool, cpu_count
from socket import timeout

import chardet
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from telegram import Bot

In [2]:
# ignore warnings
warnings.simplefilter("ignore")

In [3]:
# shared pretty printer: 4 spaces per nesting level, containers nested
# deeper than 8 levels are abbreviated with "..."
pp = pprint.PrettyPrinter(depth=8, indent=4)

In [4]:
# initialize telegram bot
# The bot token is a secret and must not live in the notebook, so both values
# are read from the environment. Set TELEGRAM_BOT_TOKEN and TELEGRAM_CHAT_ID
# before starting the kernel. Any token that was ever committed in this cell
# should be treated as leaked and revoked.
token = os.environ.get("TELEGRAM_BOT_TOKEN")
chat_id = os.environ.get("TELEGRAM_CHAT_ID")
if not token or not chat_id:
    # fail here with a clear message rather than later inside the telegram client
    raise RuntimeError(
        "Set the TELEGRAM_BOT_TOKEN and TELEGRAM_CHAT_ID environment variables "
        "before running this notebook."
    )
bot = Bot(token)

In [5]:
# locations of the dataset files, all relative to one dataset directory
dataset_path = 'data/net_dbis/'


def _dataset_file(file_name):
    """Return the path of ``file_name`` inside ``dataset_path``."""
    return dataset_path + file_name


# node tables
authors_csv_path = _dataset_file('id_author.txt')
conferences_csv_path = _dataset_file('id_conf.txt')
papers_csv_path = _dataset_file('paper.txt')
# edge tables
paper_author_edges_csv_path = _dataset_file('paper_author.txt')
paper_conference_edges_csv_path = _dataset_file('paper_conf.txt')

In [6]:
# detect the character encoding of every dataset file
# `encodings` maps each file path to the result returned by chardet.detect();
# the loading cell reads its "encoding" entry.
encodings = {}
file_paths = [
    authors_csv_path,
    conferences_csv_path,
    papers_csv_path,
    paper_author_edges_csv_path,
    paper_conference_edges_csv_path,
]

for file_path in file_paths:
    # chardet works on raw bytes, so the whole file is read in binary mode
    with open(file_path, mode='rb') as raw_file:
        raw_bytes = raw_file.read()
    encodings[file_path] = chardet.detect(raw_bytes)

In [7]:
# store csv contents in dataframes
def _load_table(path, sep='\t'):
    """Read a header-less delimited file, forcing columns 0 and 1 to ``str``.

    The file is decoded with the encoding stored for ``path`` in ``encodings``.
    """
    return pd.read_csv(
        path,
        sep=sep,
        header=None,
        dtype={0: str, 1: str},
        encoding=encodings[path]["encoding"],
    )


authors_df = _load_table(authors_csv_path)
conferences_df = _load_table(conferences_csv_path)
# the papers file is the only one not separated by a tab: it uses five spaces
papers_df = _load_table(papers_csv_path, sep='     ')
paper_author_edges_df = _load_table(paper_author_edges_csv_path)
paper_conference_edges_df = _load_table(paper_conference_edges_csv_path)

In [8]:
# make node ids unique across node types by prefixing them with a type letter:
# 'a' = author, 'c' = conference, 'p' = paper
# NOTE: the columns are rewritten in place, so running this cell a second time
# prepends the letter again.
for frame, column, prefix in (
    (authors_df, 0, 'a'),
    (conferences_df, 0, 'c'),
    (papers_df, 0, 'p'),
    (paper_author_edges_df, 0, 'p'),
    (paper_author_edges_df, 1, 'a'),
    (paper_conference_edges_df, 0, 'p'),
    (paper_conference_edges_df, 1, 'c'),
):
    frame[column] = prefix + frame[column]

In [9]:
# define networkx graph
# nx.Graph is networkx's undirected graph class; the cells below add the
# author, paper and conference nodes and the edges between them to it.
dbis_graph = nx.Graph()

In [10]:
# values stored in the `label` attribute of graph nodes ...
AUTHOR, PAPER, CONFERENCE = 'author', 'paper', 'conference'
# ... and of graph edges
PUBLISHED_AT, WRITTEN_BY = 'published_at', 'written_by'

In [11]:
# add one labelled node per author, paper and conference (in that order) and
# report the running node total after each batch
for node_ids, node_label in (
    (authors_df[0], AUTHOR),
    (papers_df[0], PAPER),
    (conferences_df[0], CONFERENCE),
):
    dbis_graph.add_nodes_from(node_ids.tolist(), label=node_label)
    print("{} nodes in graph".format(dbis_graph.number_of_nodes()))

60694 nodes in graph
133596 nodes in graph
134060 nodes in graph


In [12]:
# turn each edge dataframe into a list of (column 0, column 1) tuples
paper_author_edges = list(
    paper_author_edges_df[[0, 1]].itertuples(index=False, name=None)
)
paper_conference_edges = list(
    paper_conference_edges_df[[0, 1]].itertuples(index=False, name=None)
)

In [13]:
# connect papers to conferences with (paper)-[published_at]-(conference) edges,
# then report the graph size
dbis_graph.add_edges_from(paper_conference_edges, label=PUBLISHED_AT)
edge_total, node_total = dbis_graph.number_of_edges(), dbis_graph.number_of_nodes()
print("{} edges in graph".format(edge_total))
print("{} nodes in graph".format(node_total))

72902 edges in graph
134060 nodes in graph


In [14]:
# connect papers to authors with (paper)-[written_by]-(author) edges,
# then report the graph size
dbis_graph.add_edges_from(paper_author_edges, label=WRITTEN_BY)
print(
    "{} edges in graph".format(dbis_graph.number_of_edges()),
    "{} nodes in graph".format(dbis_graph.number_of_nodes()),
    sep="\n",
)

265317 edges in graph
134060 nodes in graph


In [23]:
def crawl_scholar_paper(paper_index):
    """Look up the citation count of one paper on CiteSeerX.

    The title is read from the module-level ``papers_df`` (column 1), stripped
    of surrounding whitespace and of its last character, and sent as a
    ``title:(...)`` search sorted by citation count. The count is taken from
    the first "Cited by N" link in the result list.

    Args:
        paper_index: row label of the paper in ``papers_df``.

    Returns:
        int: the parsed citation count, ``-1`` if the search reported an error
        or no count could be found in the page, ``-2`` if the request timed out.

    Raises:
        urllib.error.URLError: for request failures other than a timeout.
    """
    url = "http://citeseerx.ist.psu.edu/search?q="
    q = "title%3A%28{}%29&submit=Search&sort=cite&t=doc"
    # [:-1] drops the final character of the title
    # (presumably a trailing period -- TODO confirm against the dataset)
    title = papers_df.loc[paper_index, 1].strip()[:-1]
    # quote_plus turns spaces into "+" and percent-encodes everything else, so
    # titles containing "&", "#", "+" or non-ASCII characters yield a valid URL
    url += q.format(urllib.parse.quote_plus(title))

    try:
        # the context manager closes the connection even if read() fails
        with urllib.request.urlopen(url, timeout=10) as response:
            page = response.read()
    except timeout:
        # timeout while reading the response body
        return -2
    except urllib.error.URLError as err:
        # a timeout while connecting arrives wrapped in URLError
        if isinstance(err.reason, timeout):
            return -2
        raise

    # name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed
    return _parse_citation_count(BeautifulSoup(page, "html.parser"))


def _parse_citation_count(soup):
    """Return the "Cited by N" count of the first search hit, or -1 if absent."""
    result_list = soup.find("div", id="result_list")
    if result_list is None or result_list.find("div", class_="error") is not None:
        return -1

    extras = result_list.find("div", class_="pubextras")
    link = extras.find("a", class_="citation") if extras is not None else None
    text = link.string if link is not None else None
    if not text:
        return -1

    try:
        # link text has the form "Cited by N" or "Cited by N (...)"
        return int(text.split("Cited by ")[-1].split(" (")[0])
    except ValueError:
        return -1

In [None]:
# crawl the citation counts of papers 0..499 in parallel and time the run
# papers_citations_count maps the stripped paper title to the crawled count
papers_citations_count = {}
start_time = time.time()

# the work is network-bound, hence far more worker processes than cores
with Pool(processes=40 * cpu_count()) as pool:
    # imap yields results in submission order, so position i is paper index i
    crawl_results = pool.imap(crawl_scholar_paper, range(500), chunksize=1)
    for i, result in enumerate(crawl_results):
        if not i % 10:
            print("Already crawled {} papers".format(i + 1))
        paper_title = papers_df.loc[i, 1].strip()
        papers_citations_count[paper_title] = result

end_time = time.time()
crawl_duration = round(end_time - start_time, 2)
print("Crawling took {} sec.".format(crawl_duration))

Already crawled 1 papers
Already crawled 11 papers
Already crawled 21 papers
