In [None]:
import math
import pickle
import re

import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd

In [None]:
# Jaccard coefficient (neighbourhood overlap) for edge (x,y)
def JD(G,x,y):
    """Return the overlap ratio of the neighbour sets of ``x`` and ``y``.

    The value is ``|N(x) & N(y)| / |N(x) | N(y)|``, i.e. the Jaccard
    *similarity* coefficient; despite the function's name it is not
    ``1 - similarity``.

    Parameters
    ----------
    G : object exposing ``neighbors(node)`` (e.g. a networkx graph)
    x, y : the two endpoints to compare

    Returns
    -------
    The integer ``0`` when either endpoint has no neighbours, otherwise a
    float in ``(0, 1]`` or ``0.0`` when the neighbourhoods are disjoint.
    """
    x_adj = set(G.neighbors(x))
    y_adj = set(G.neighbors(y))

    # An endpoint without neighbours cannot overlap with anything; bailing
    # out here also guarantees the denominator below is never zero.
    if not x_adj or not y_adj:
        return 0

    shared = x_adj & y_adj
    combined = x_adj | y_adj
    return len(shared) / len(combined)

# Create a page rank file from graph with pickle
def PR_file(G, path, a):
    """Compute PageRank for ``G`` and pickle the result to ``path``.

    Parameters
    ----------
    G : graph passed straight to ``nx.pagerank``
    path : destination file; opened in binary write mode, so any existing
        content is overwritten
    a : forwarded to ``nx.pagerank`` as its ``alpha`` argument

    Returns
    -------
    None. The only effect is the file written at ``path``.
    """
    pr = nx.pagerank(G, alpha=a)
    # Use a context manager so the handle is flushed and closed even if
    # pickling raises; the previous bare open() left that to the garbage
    # collector.
    with open(path, 'wb') as fh:
        pickle.dump(pr, fh)
    
# Return Adamic/Adar Index for edge (x,y)
def AdamicAdarIndex(G,x,y):
    """Return the Adamic/Adar index of the node pair ``(x, y)``.

    The index is the sum, over every common neighbour ``z`` of ``x`` and
    ``y``, of ``1 / log10(degree(z))``. The base-10 logarithm is the one the
    original code called; using the natural logarithm instead would only
    rescale every score by the same constant factor.

    Parameters
    ----------
    G : object exposing ``neighbors(node)`` and a ``degree`` mapping
        indexable by node (e.g. a networkx graph)
    x, y : the two endpoints to score

    Returns
    -------
    The integer ``0`` when no common neighbour contributes, otherwise a
    positive float.
    """
    neighs_x = set(G.neighbors(x))
    neighs_y = set(G.neighbors(y))
    common = neighs_x.intersection(neighs_y)

    score = 0
    for com in common:
        deg = G.degree[com]
        # log10(1) == 0 would divide by zero (and degree 0 is outside the
        # log's domain), so neighbours of degree <= 1 are skipped. This can
        # only occur in degenerate cases such as x == y.
        if deg > 1:
            score += 1 / math.log10(deg)
    return score

# Create a HITS file from graph with pickle
def HITS(G, path):
    """Run ``nx.hits`` on ``G`` and pickle its return value to ``path``.

    ``nx.hits`` is called with ``max_iter=100``, ``tol=1e-08``,
    ``nstart=None`` and ``normalized=True``; whatever it returns is pickled
    unchanged.

    Parameters
    ----------
    G : graph passed straight to ``nx.hits``
    path : destination file; opened in binary write mode, so any existing
        content is overwritten

    Returns
    -------
    None. The only effect is the file written at ``path``.
    """
    hits = nx.hits(G, max_iter=100, tol=1e-08, nstart=None, normalized=True)
    # Context manager guarantees the handle is flushed and closed, which the
    # previous bare open() did not.
    with open(path, 'wb') as fh:
        pickle.dump(hits, fh)

# Create a category file from '/data/categories' to dictionary with pickle, takes tsv as input
def categories(in_file, out_file, names_file='../data/numberToCategoryNameList.pkl'):
    """Index the categories found in a TSV file and pickle the lookups.

    Every non-blank line of ``in_file`` must contain an article name and a
    dot-separated category path, separated by one or more tabs, e.g.
    ``Article<TAB>subject.Science.Physics``. Each dot-separated component
    is treated as a category of its own (not just the leading one) and is
    given an integer id in order of first appearance.

    Two pickles are written:

    * ``out_file``   - dict mapping article name -> set of category ids
    * ``names_file`` - list whose element ``i`` is the name of category ``i``

    Parameters
    ----------
    in_file : path of the TSV file to read
    out_file : path of the article -> category-ids pickle
    names_file : path of the id -> category-name pickle; defaults to the
        location that used to be hard-coded

    Raises
    ------
    IndexError
        If a non-blank line has no tab-separated category column.

    Notes
    -----
    * An article that appears on several lines accumulates the categories of
      all of them; previously each new line silently discarded the earlier
      ones.
    * Blank lines are skipped instead of raising.
    """
    category_names = []   # id -> name, in order of first appearance
    category_ids = {}     # name -> id; O(1) replacement for list.index()
    cat_dict = {}         # article -> set of category ids

    with open(in_file) as fp:
        for line in fp:
            line = line.rstrip('\n')
            if not line.strip():
                continue

            fields = re.split(r'\t+', line)
            article = fields[0]
            article_ids = cat_dict.setdefault(article, set())

            # fields[1] is the dot-separated category path of the article.
            for name in fields[1].split('.'):
                if name not in category_ids:
                    category_ids[name] = len(category_names)
                    category_names.append(name)
                article_ids.add(category_ids[name])

    # Context managers make sure both pickles are flushed and closed.
    with open(out_file, 'wb') as fh:
        pickle.dump(cat_dict, fh)
    with open(names_file, 'wb') as fh:
        pickle.dump(category_names, fh)

#categories('../data/categoriesDecoded.tsv', '../data/getCategoryFromLinkDict.pkl')

In [None]:
#g = nx.read_edgelist("data/decoded.tsv")

#print(nx.info(g))
# Copy the first 80 parsed rows of the full edge list into a small sample file.
# NOTE(review): read_csv is called without header=None, so the first line of
# decoded.tsv is presumably consumed as column names and then dropped by
# header=False on write - confirm the file really starts with a header row,
# otherwise the first edge is silently lost.
pd.read_csv('data/decoded.tsv',nrows=80, delimiter='\t').to_csv('data/decodedSample.tsv', sep='\t',header=False,index=False)

#reading the edgelist in a variable using networkX
subgraph=nx.read_edgelist('data/decodedSample.tsv',delimiter='\t',create_using=nx.DiGraph())
#plotting the graph
# nx.info() was removed in NetworkX 3.0; build the summary from the graph's
# own counters so the cell runs on both old and new versions.
print('{} with {} nodes and {} edges'.format(
    type(subgraph).__name__,
    subgraph.number_of_nodes(),
    subgraph.number_of_edges(),
))
pos=nx.spring_layout(subgraph)
nx.draw(subgraph,pos,node_color='#A0CBE2',edge_color='#00bb5e',width=1,edge_cmap=plt.cm.Blues,with_labels=True)
plt.savefig("graph_sample.pdf")