## Import Libraries

In [1]:
import networkx as nx
import pickle
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Read Data

In [2]:
G = nx.read_gpickle('datasets/email_prediction')

df = pd.DataFrame(G.nodes(data=True))
df['Department'] = df[1].map(lambda x:x['Department'])
df['ManagementSalary'] = df[1].map(lambda x:x['ManagementSalary'])

## Generate Features

In [3]:
df['Clustering'] = pd.Series(nx.clustering(G))
df['DegreeCentrality'] = pd.Series(nx.degree_centrality(G))
df['ClosenessCentrality'] = pd.Series(nx.closeness_centrality(G))
df['BetweennessCentrality'] = pd.Series(nx.betweenness_centrality(G,normalized=True,endpoints=False))

del df[0]
del df[1]

## Generate Evaluation / Training Data

In [4]:
df_EVAL = df[df['ManagementSalary'].isnull()]
df = df[~df['ManagementSalary'].isnull()]

X = df[['Clustering','DegreeCentrality','ClosenessCentrality','BetweennessCentrality']]
y = df['ManagementSalary']

## Split Training Data for validation

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=1)

## Fit Random Forest Classifier

In [6]:
rf = RandomForestClassifier(max_depth=2,random_state=0).fit(X,y)

## Calculate AUC Score

In [7]:
y_predict = rf.predict_proba(X_test)
auc = roc_auc_score(y_test,y_predict[:,1])
print(auc)

0.9479035639412998


## Make prediction for evaluation data

In [8]:
X = df_EVAL[['Clustering','DegreeCentrality','ClosenessCentrality','BetweennessCentrality']]
predictions = rf.predict_proba(X)
print(rf.classes_)
pred = predictions[:,1]
df_EVAL['ManagementSalary'] = pred
ret = df_EVAL['ManagementSalary']
ret.name = None

ret

[0. 1.]


1       0.156986
2       0.871266
5       0.980515
8       0.143872
14      0.206239
          ...   
992     0.041342
994     0.041342
996     0.041342
1000    0.053944
1001    0.053944
Length: 252, dtype: float64