In [None]:
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
import pickle
import re
import math
import random

In [None]:
# Jaccard coefficient of the neighbour sets of x and y.
# NOTE: despite the "JD" name this is the similarity |N(x) & N(y)| / |N(x) | N(y)|,
# not the distance (1 - similarity).
def JD(G,x,y):
    """Return the Jaccard coefficient of the neighbour sets of ``x`` and ``y``.

    Args:
        G: graph object exposing ``neighbors(node)``.
        x, y: the two nodes to compare.

    Returns:
        ``0`` when either node has no neighbours, otherwise the size of the
        intersection of the two neighbour sets divided by the size of their union.
    """
    nbrs_x = set(G.neighbors(x))
    nbrs_y = set(G.neighbors(y))

    # Guard first: an empty side means there is nothing to share (and the
    # union could be empty, which would divide by zero).
    if not nbrs_x or not nbrs_y:
        return 0

    shared = nbrs_x & nbrs_y
    combined = nbrs_x | nbrs_y
    return len(shared) / len(combined)

# Create a page rank file from graph with pickle
def PR_file(G, path, a = 0.85):
    """Compute PageRank for ``G`` and pickle the result to ``path``.

    Args:
        G: graph passed to ``nx.pagerank``.
        path: destination file; it is opened in binary write mode and overwritten.
        a: value forwarded to ``nx.pagerank`` as ``alpha``.
    """
    pr = nx.pagerank(G, alpha=a)
    # Use a context manager so the file is flushed and closed deterministically.
    with open(path, 'wb') as out:
        pickle.dump(pr, out)
    
# Return Adamic/Adar Index for edge (x,y)
def AdamicAdarIndex(G,x,y):
    """Return the Adamic/Adar index of the node pair ``(x, y)``.

    The index is the sum of ``1 / log(degree(w))`` over every node ``w`` that
    is a neighbour of both ``x`` and ``y``.

    Args:
        G: graph object exposing ``neighbors(node)`` and ``degree[node]``.
        x, y: the two nodes to score.

    Returns:
        ``0`` when there is no usable common neighbour, otherwise the float sum.
    """
    common = set(G.neighbors(x)).intersection(G.neighbors(y))

    score = 0
    for node in common:
        deg = G.degree[node]
        # log(1) == 0 would raise ZeroDivisionError. A shared neighbour can
        # only have degree 1 when x == y (e.g. a self-loop edge), so such
        # nodes are skipped instead of crashing the whole feature extraction.
        if deg > 1:
            score = score + (1 / math.log(deg))
    return score

# Create a HITS file from graph with pickle
def HITS(G, path):
    """Run HITS on ``G`` and pickle the raw ``nx.hits`` result to ``path``.

    Args:
        G: graph passed to ``nx.hits``.
        path: destination file; it is opened in binary write mode and overwritten.
    """
    hits = nx.hits(G, max_iter=500, tol=1e-08, nstart=None, normalized=True)
    # Use a context manager so the file is flushed and closed deterministically.
    with open(path, 'wb') as out:
        pickle.dump(hits, out)

# Create a category file from '/data/categories' to dictionary with pickle, takes tsv as input
def categories(in_file, out_file, names_file='../data/numberToCategoryNameList.pkl'):
    """Build and pickle the article -> category-id mapping from a TSV file.

    Every non-blank line of ``in_file`` is read as an article name followed by
    a tab-separated, dot-delimited category path. Each dot-separated component
    is treated as a category name and receives an integer id in order of first
    appearance.

    Args:
        in_file: path of the TSV file to read.
        out_file: path of the pickle that receives ``{article: set(category ids)}``.
        names_file: path of the pickle that receives the list of category
            names, where the list index is the category id. Defaults to the
            location the function has always written to.

    Notes:
        * An article that appears on several lines accumulates the ids from
          all of its lines (previously each new line reset the set, so only
          the last line survived).
        * Blank lines are skipped instead of raising ``IndexError``.
    """
    names = []        # category id -> category name
    ids = {}          # category name -> category id (O(1) lookup)
    cat_dict = dict()

    with open(in_file) as fp:
        for line in fp:
            if not line.strip():
                continue

            splitted = re.split(r'\t+', line)

            # article
            x = splitted[0]
            # setdefault keeps ids gathered from earlier lines of the same article
            x_ids = cat_dict.setdefault(x, set())

            # categories of article x, one id per dot-separated component
            for cat_i in splitted[1].split('.'):
                x_cat = cat_i.split('\n')[0]   # drop the trailing newline
                if x_cat not in ids:
                    ids[x_cat] = len(names)
                    names.append(x_cat)
                x_ids.add(ids[x_cat])

    with open(out_file, 'wb') as out:
        pickle.dump(cat_dict, out)
    with open(names_file, 'wb') as out:
        pickle.dump(names, out)

#categories('../data/categoriesDecoded.tsv', '../data/getCategoryFromLinkDict.pkl')

# Create X vector (tuple) for edge (x,y) in graph G
def create_X(G,x,y, categories_dict, pr, hits):
    """Build the feature tuple for the node pair ``(x, y)``.

    Args:
        G: graph handed to ``JD`` and ``AdamicAdarIndex``.
        x, y: the two endpoint nodes.
        categories_dict: mapping from node to a set of category ids; nodes
            missing from it are treated as having no categories.
        pr: mapping from node to its PageRank score.
        hits: indexable whose element ``0`` maps node -> score.
            NOTE(review): if this is the tuple pickled by ``HITS`` then
            element 0 is presumably the hub scores and the authority scores
            in element 1 are unused -- confirm that is intended.

    Returns:
        ``(no_comm, jd, aa, pr_x, pr_y, hits_x, hits_y)`` where ``no_comm`` is
        the number of category ids shared by ``x`` and ``y``.

    Raises:
        KeyError: if ``x`` or ``y`` is missing from ``pr`` or ``hits[0]``.
    """
    jd = JD(G,x,y)
    aa = AdamicAdarIndex(G,x,y)

    # Explicit defaults instead of a bare ``except: pass``: an article without
    # category data still yields 0 shared categories, but unrelated errors are
    # no longer silently swallowed.
    cats_x = categories_dict.get(x, set())
    cats_y = categories_dict.get(y, set())
    no_comm = len(cats_x.intersection(cats_y))

    pr_x = pr[x]
    pr_y = pr[y]

    hits_x = hits[0][x]
    hits_y = hits[0][y]

    return no_comm, jd, aa, pr_x, pr_y, hits_x, hits_y


In [None]:
# Read the tab-separated edge list of the original network into a directed graph
G = nx.read_edgelist(
    '../data/decoded.tsv',
    delimiter='\t',
    create_using=nx.DiGraph(),
)

In [None]:
# Compute HITS and PageRank on the full graph and cache each result as a pickle
HITS(G, path='../data/hitsOrig.pkl')
PR_file(G, path='../data/PROrig.pkl')

In [None]:
# Load the cached HITS result, PageRank scores and article -> category mapping.
# No placeholder values are assigned first: if a file is missing the cell
# fails loudly instead of leaving an empty list of the wrong type behind.
# pickle.load executes arbitrary code from the file, so only point these at
# files produced by this project.
with open('../data/hitsOrig.pkl', 'rb') as f:
    hits = pickle.load(f)

with open('../data/PROrig.pkl', 'rb') as f:
    pr = pickle.load(f)

# This file is the out_file of the categories(...) call that is commented out
# in this notebook, so it must already exist on disk before this cell runs.
with open('../data/getCategoryFromLinkDict.pkl', 'rb') as f:
    cat_dict = pickle.load(f)


In [None]:
# Positive examples: one feature vector (label 1) for every existing edge
vecs = []
ys = []
for x, y in G.edges:
    vecs.append(create_X(G, x, y, cat_dict, pr, hits))
    ys.append(1)
    

In [None]:
# Negative examples: for every node u, take the first non-adjacent targets v
# (in G.nodes() order) and append their feature vectors with label 0.
# NOTE(review): the counter runs 0..30 inclusive, so up to 31 pairs are taken
# per node -- confirm whether 30 was intended.
# NOTE: this cell appends to the existing vecs / ys, so re-running it without
# re-running the previous cell duplicates the negative examples.
for u in G.nodes():
    taken = 0
    for v in G.nodes():
        if taken > 30:
            break
        if u != v and not G.has_edge(u, v):
            vecs.append(create_X(G, u, v, cat_dict, pr, hits))
            ys.append(0)
            taken += 1


In [None]:
# Dump X and y with pickle

# Transform vecs from tuples to list
X_data = [list(item) for item in vecs]

# Context managers make sure both files are flushed and closed.
with open('../data/X_data.pkl', 'wb') as f:
    pickle.dump(X_data, f)
with open('../data/Y_data.pkl', 'wb') as f:
    pickle.dump(ys, f)

In [None]:
# Write an 80-edge sample of the edge list and draw it.
# header=None: decoded.tsv is loaded elsewhere in this notebook as a bare edge
# list, so without it pandas would consume the first edge as column names and
# header=False on write would silently drop that edge from the sample.
# Paths use '../data/' to match every other cell of this notebook.
pd.read_csv('../data/decoded.tsv', nrows=80, delimiter='\t', header=None).to_csv('../data/decodedSample.tsv', sep='\t', header=False, index=False)

#reading the edgelist in a variable using networkX
subgraph=nx.read_edgelist('../data/decodedSample.tsv',delimiter='\t',create_using=nx.DiGraph())
# nx.info() no longer exists in NetworkX 3.x; fall back to the graph's own
# summary string when it is unavailable.
print(nx.info(subgraph) if hasattr(nx, 'info') else subgraph)
#plotting the graph
pos=nx.spring_layout(subgraph)
nx.draw(subgraph,pos,node_color='#A0CBE2',edge_color='#00bb5e',width=1,edge_cmap=plt.cm.Blues,with_labels=True)
plt.savefig("graph_sample.pdf")