## Import Libraries

In [1]:
import pickle

import networkx as nx
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

## Read Data

In [2]:
# Load the pickled graph with the standard library: nx.read_gpickle was removed in
# NetworkX 3.0, so calling it fails on current releases.
# NOTE(review): unpickling can execute arbitrary code -- only load this file from a
# trusted source.
with open('datasets/email_prediction', 'rb') as graph_file:
    G = pickle.load(graph_file)

# The first CSV column is used as the index; the converter evaluates each cell's text
# so the index holds Python objects rather than strings.
# NOTE(review): eval() runs whatever expression a cell contains. If the column only
# ever holds literal tuples, ast.literal_eval would be a safer converter -- confirm
# against the data before swapping.
future_connections = pd.read_csv('datasets/Future_Connections.csv',
                                 index_col=0, converters={0: eval})

# Move the index into a regular column and call it 'Edges'.
df = future_connections.reset_index()
df = df.rename(columns={'index': 'Edges'})

## Generate Features

In [3]:
def _pair_score(index_fn, pair):
    """Return the score ``index_fn`` reports for a single node pair on ``G``.

    ``index_fn`` is called with a one-element list of pairs; the third item of
    the first result it produces is returned.
    """
    scored_pairs = list(index_fn(G, [pair]))
    return scored_pairs[0][2]


edge_pairs = df['Edges']

# Number of neighbours the two endpoints share.
df['CommonNeighbors'] = edge_pairs.apply(
    lambda pair: len(list(nx.common_neighbors(G, pair[0], pair[1]))))

# Pairwise link-prediction indices, one call per edge.
df['Jaccard'] = edge_pairs.apply(
    lambda pair: _pair_score(nx.jaccard_coefficient, pair))
df['ResourceAllocation'] = edge_pairs.apply(
    lambda pair: _pair_score(nx.resource_allocation_index, pair))

# Copy each node's 'Department' attribute into a 'community' attribute before the
# community-based index is computed below.
for node, attrs in G.nodes(data=True):
    G.nodes[node]['community'] = attrs['Department']

df['CommunityCommonNeighbors'] = edge_pairs.apply(
    lambda pair: _pair_score(nx.cn_soundarajan_hopcroft, pair))

## Generate Evaluation / Training Data

In [4]:
# True where the edge already has a known 'Future Connection' value.
has_label = df['Future Connection'].notna()

# Unlabelled rows are set aside to be scored later; labelled rows stay in `df`
# and become the training data.
df_EVAL = df[~has_label]
df = df[has_label]

feature_columns = ['CommonNeighbors', 'Jaccard', 'ResourceAllocation',
                   'CommunityCommonNeighbors']
X = df[feature_columns]
y = df['Future Connection']

## Split Training Data for Validation

In [5]:
# Split the labelled rows into fitting and validation parts. No test_size is given,
# so scikit-learn's default proportion applies; the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

## Fit RandomForestClassifier

In [6]:
# Depth-2 random forest with a fixed seed, fitted on the training split.
rf = RandomForestClassifier(max_depth=2, random_state=0)
rf = rf.fit(X_train, y_train)

## Calculate AUC Score

In [9]:
# Class probabilities for the validation split; column 1 is used as the score.
y_predict = rf.predict_proba(X_test)
positive_scores = y_predict[:, 1]

auc = roc_auc_score(y_true=y_test, y_score=positive_scores)
print(auc)

0.9012176024936611


## Make Predictions for Evaluation Data

In [10]:
# Score the unlabelled edges with the fitted forest.
eval_feature_columns = ['CommonNeighbors', 'Jaccard', 'ResourceAllocation',
                        'CommunityCommonNeighbors']
X_eval = df_EVAL[eval_feature_columns]
predictions = rf.predict_proba(X_eval)

# Printed so the reader can check which class column 1 of predict_proba refers to.
print(rf.classes_)
pred = predictions[:, 1]

# df_EVAL was produced by boolean-mask selection from another frame, so writing a
# column into it directly is a chained assignment (SettingWithCopyWarning on
# pandas < 3). assign() builds a new frame instead; the resulting df_EVAL has the
# same contents and index as before.
df_EVAL = df_EVAL.assign(**{'Future Connection': pred}).set_index('Edges')

# Final result: probability per edge, with no name on the Series or its index.
ret = df_EVAL['Future Connection']
ret.name = None
ret.index.name = None

ret

[0. 1.]


(107, 348)    0.021746
(542, 751)    0.021746
(20, 426)     0.616059
(50, 989)     0.021746
(942, 986)    0.021746
                ...   
(165, 923)    0.021746
(673, 755)    0.021746
(939, 940)    0.021746
(555, 905)    0.021746
(75, 101)     0.021746
Length: 122112, dtype: float64