Importing libraries

In [1]:
import networkx as nx
import pandas as pd
from pickle import load

Reading the graph and the data to be tested

In [2]:
# Input locations, relative to the notebook's working directory.
GRAPH_PATH = "GraphMissingEdges.gml"
EDGES_TO_EVALUATE_PATH = 'edgesToEvaluate.csv'

# G: the graph read from the GML file.
# edges_te: table read from the CSV; later cells use its 'venue1' and 'venue2' columns.
G = nx.read_gml(GRAPH_PATH)
edges_te = pd.read_csv(EDGES_TO_EVALUATE_PATH)

In [3]:
# nx.info() was deprecated and then removed in NetworkX 3.0, where calling it
# raises AttributeError. str(G) produces the same one-line
# "Graph with N nodes and M edges" summary and works on both old and new releases.
str(G)

'Graph with 4575 nodes and 18991 edges'

Building the list of candidate edges (venue pairs) from the data to be tested

In [4]:
# Pair the two venue columns row by row: one (venue1, venue2) tuple per row of
# edges_te, in row order.
edge = list(zip(edges_te['venue1'], edges_te['venue2']))

Calculating Jaccard Coefficient of edges on data to be tested

In [5]:
# nx.jaccard_coefficient returns a single-use iterator of (u, v, score) triples.
# Materialise it as a list so the cell that consumes pred_jc can be re-run
# without silently producing empty columns.
pred_jc = list(nx.jaccard_coefficient(G, edge))

In [6]:
# Snapshot the (u, v, score) triples once so the three columns below stay
# aligned even when pred_jc is a one-shot iterator.
jc_triples = list(pred_jc)

node1 = [u for u, _v, _p in jc_triples]
node2 = [v for _u, v, _p in jc_triples]
jc_scores = [p for _u, _v, p in jc_triples]

# Column store that the later feature cells extend before it becomes a DataFrame.
df_dict = {
    'Node1': node1,
    'Node2': node2,
    'JC_score': jc_scores,
}

Calculating Preferential Attachment of edges on data to be tested

In [7]:
# nx.preferential_attachment returns a single-use iterator of (u, v, score)
# triples. Materialise it as a list so the cell that consumes pred_pa can be
# re-run without silently producing an empty column.
pred_pa = list(nx.preferential_attachment(G, edge))

In [8]:
# Keep only the score (third element) of each (u, v, score) triple.
pa_scores = [score for _u, _v, score in pred_pa]
df_dict['PA_score'] = pa_scores

Calculating Resource Allocation Index of edges on data to be tested

In [9]:
# nx.resource_allocation_index returns a single-use iterator of (u, v, score)
# triples. Materialise it as a list so the cell that consumes pred_ra can be
# re-run without silently producing an empty column.
pred_ra = list(nx.resource_allocation_index(G, edge))

In [10]:
# Keep only the score (third element) of each (u, v, score) triple.
ra_scores = [score for _u, _v, score in pred_ra]
df_dict['RA_score'] = ra_scores

Creating a dataframe based on Jaccard Coefficient, Preferential Attachment and Resource Allocation Index

In [11]:
# One row per candidate pair; columns are the keys accumulated in df_dict
# (Node1, Node2, JC_score, PA_score, RA_score).
df = pd.DataFrame(data=df_dict)

In [12]:
# Preview the first rows of the feature table (node pair plus the three scores).
df.head()

Unnamed: 0,Node1,Node2,JC_score,PA_score,RA_score
0,mJ_ucQ2_3hfTsmCcKb-hgw,qXGKYRwCR9SLgLl0g_9o5g,0.0,320,0.0
1,y19xFolCozaRA-gGmHwkQA,F6c3D1o9Z4Tl6cDorb3WgA,0.05,108,0.037037
2,R1GwW4C1gh2Nmue9K0WYVA,Ul6JwluSTm12PVDIqnNaTg,0.008547,3160,0.013889
3,zzBa0pQjM1gov00bXjYYXg,3D6Uck9QSdxZKFstf5DGlg,0.08,140,0.030107
4,U2d-meX4sVq0kiqcrpHt1w,vuDL_d3GYAtbvX9EJQqVog,0.0,0,0.0


In [13]:
# Summary statistics for the numeric score columns (JC_score, PA_score, RA_score).
df.describe()

Unnamed: 0,JC_score,PA_score,RA_score
count,500.0,500.0,500.0
mean,0.016974,821.512,0.03445
std,0.066921,2078.997475,0.110774
min,0.0,0.0,0.0
25%,0.0,8.0,0.0
50%,0.0,74.5,0.0
75%,0.019005,652.0,0.015632
max,1.0,24192.0,1.364406


Loading the models

In [14]:
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    loading finishes, including when unpickling raises.
    """
    with open(path, 'rb') as fh:
        return load(fh)


# SECURITY NOTE: unpickling can execute arbitrary code. Only load these files
# if they come from a trusted source (e.g. produced by your own training run).
model_knn = _load_pickle('model_knn.pkl')  # KNN model, per the notebook text
model_clf = _load_pickle('model_clf.pkl')  # Decision Tree Classifier, per the notebook text
scaler = _load_pickle('scaler.pkl')        # feature scaler; presumably fitted at training time - confirm

Selecting the feature columns and applying the scaler

In [15]:
# Score columns passed to the scaler, in this order.
# NOTE(review): presumably this must match the column order used when the
# scaler and models were fitted - confirm against the training notebook.
feature_names = ['JC_score', 'PA_score', 'RA_score']

# X holds the scaled features that are fed to the models.
X = scaler.transform(df[feature_names])

Making predictions using KNN and Decision Tree Classifier models

In [16]:
# Run both loaded models on the same scaled features (KNN first, then the tree).
y_knn, y_clf = model_knn.predict(X), model_clf.predict(X)

Generating a CSV file with the predictions

In [17]:
# Remove the two venue columns from the table that will be written out.
# NOTE: this mutates edges_te in place, so re-running this cell (or the
# edge-list cell that reads 'venue1'/'venue2') without reloading the CSV
# raises KeyError.
edges_te.drop(columns=['venue1', 'venue2'], inplace=True)

In [18]:
# Attach each model's predictions as its own column of the output table.
edges_te['KNN'], edges_te['DTC'] = y_knn, y_clf

In [19]:
# Destination of the prediction table; the DataFrame index is not written.
OUTPUT_PATH = 'leila_edgesToEvaluate.csv'
edges_te.to_csv(OUTPUT_PATH, index=False)