In [11]:
import networkx as nx
import pandas as pd
import numpy as np
import pickle
# Record the installed networkx version next to the results (this run printed 1.11).
# NOTE(review): later cells call nx.read_gpickle and nx.info, which may not exist
# in newer networkx releases - confirm before upgrading the environment.
print(nx.__version__)

1.11


---

## Company Emails

We analyze a company's email network where each node corresponds to a person at the company, and each edge indicates that at least one email has been sent between two people. Note that these edges are undirected.


### Salary Prediction

Using this network `G`, we identify the people in the network with missing values for the node attribute `ManagementSalary` and predict whether or not these individuals are receiving a management position salary.

In [12]:
# Load the email network into the module-level graph G used by every function below.
# NOTE(review): read_gpickle presumably unpickles the file, which can execute
# arbitrary code - only load files from a trusted source.
G = nx.read_gpickle('email_prediction.txt')

# Print a short summary (type, node/edge counts, average degree) as a sanity check.
print(nx.info(G))

Name: 
Type: Graph
Number of nodes: 1005
Number of edges: 16706
Average degree:  33.2458


The network G also contains the node attributes `Department` and `ManagementSalary`.

`Department` indicates the department in the company which the person belongs to, and `ManagementSalary` indicates whether that person is receiving a management position salary.

In [13]:
def createDB():
    """Build a per-node table of the known attributes of the global graph ``G``.

    Each row is labelled with its node id (not with its position in the
    iteration order of ``G``).  The feature columns added later are built from
    dicts keyed by node id and are aligned on the DataFrame index, so the
    index must be the node id for attributes and features to describe the
    same node.

    Returns
    -------
    pandas.DataFrame
        Indexed by node id (sorted ascending) with the columns
        ``Department`` and ``ManagementSalary``.

    Raises
    ------
    KeyError
        If a node has no ``Department`` or ``ManagementSalary`` attribute.
    """
    # list(...) makes this work whether nodes(data=True) returns a list or a view.
    records = list(G.nodes(data=True))
    df = pd.DataFrame(
        {
            'Department': [attrs['Department'] for _, attrs in records],
            'ManagementSalary': [attrs['ManagementSalary'] for _, attrs in records],
        },
        index=[node for node, _ in records],
        columns=['Department', 'ManagementSalary'],
    )
    # Sorting gives a deterministic row order; for node ids 0..n-1 it also
    # keeps index labels equal to row positions.
    return df.sort_index()
# Build the base attribute table from G.
df = createDB()
# Preview the first rows; missing ManagementSalary values are the ones to predict.
df.head()

Unnamed: 0,Department,ManagementSalary
0,1,0.0
1,1,
2,21,
3,21,1.0
4,21,1.0


To accomplish this salary prediction, we create a matrix of node features using networkx.<br> 
We use the following features:<br>
 -  clustering
 -  degree
 -  closeness
 -  betweenness
 -  isolated
 -  center
 -  eccentricity
 -  department

In [41]:
def numfeatureExtraction(df, isolated_eccentricity=10):
    """Add graph-derived numeric features for every node of the global graph ``G``.

    The columns are added to ``df`` in place and the same object is returned.
    All features are looked up by node id, so ``df`` must be indexed by node id.

    Parameters
    ----------
    df : pandas.DataFrame
        Node table indexed by node id.
    isolated_eccentricity : int, optional
        Value stored in ``eccentricity`` for nodes outside the largest
        connected component (default 10, the constant used originally).

    Returns
    -------
    pandas.DataFrame
        ``df`` with the extra columns ``clustering``, ``degree``,
        ``closeness``, ``betweenness``, ``isolated``, ``center`` and
        ``eccentricity``.

    Raises
    ------
    KeyError
        If an index label of ``df`` is not a node of ``G``.
    """
    ############################################################
    # Per-node centrality measures; pd.Series(dict) aligns on the node id.
    df['clustering'] = pd.Series(nx.clustering(G))
    # dict(...) accepts both a plain dict and an iterable of (node, degree) pairs.
    df['degree'] = pd.Series(dict(G.degree()))
    df['closeness'] = pd.Series(nx.closeness_centrality(G))
    betweenness = nx.betweenness_centrality(G)
    df['betweenness'] = [betweenness[node] for node in df.index]
    ############################################################
    # The original analysis notes that all but 19 nodes form one connected
    # component and that those 19 nodes have no edges at all; they are
    # flagged through their degree.
    df['isolated'] = df['degree'].eq(0).astype('int64')

    # Work on the LARGEST connected component.  connected_components yields
    # components in no guaranteed order, so pick it by size instead of
    # taking whichever comes first.
    largest_component = max(nx.connected_components(G), key=len)
    H = G.subgraph(largest_component)

    # Eccentricity is computed once and reused for the center lookup.
    ecc = nx.eccentricity(H)
    centerStage = set(nx.center(H, e=ecc))

    # Build both columns from the index so they are aligned by node id.
    df['center'] = [node in centerStage for node in df.index]
    ############################################################
    # Nodes outside the largest component have no eccentricity there.
    df['eccentricity'] = [ecc.get(node, isolated_eccentricity) for node in df.index]
    ############################################################
    return df

def catfeatureExtraction(df):
    """Assemble the numeric feature matrix used by the classifier.

    Parameters
    ----------
    df : pandas.DataFrame
        Node table containing ``Department``, ``center``, ``isolated`` and the
        numeric columns ``clustering``, ``degree``, ``closeness``,
        ``betweenness`` and ``eccentricity``.

    Returns
    -------
    numpy.ndarray
        Float matrix with one row per row of ``df``.  Columns are, in order:
        the one-hot encoded department, the ``center`` and ``isolated``
        flags, and the five scaled numeric features.
    """
    numeric_columns = ['clustering', 'degree', 'closeness', 'betweenness', 'eccentricity']
    flag_columns = ['center', 'isolated']

    ############################################################
    # One-hot encode the department; the encoder expects a 2-D column.
    department_ids = np.asarray(df['Department'].values, dtype=int).reshape(len(df), 1)
    department_onehot = OneHotEncoder().fit_transform(department_ids).toarray()
    ############################################################
    # Scale the numerical data.
    scaled_numeric = preprocessing.scale(np.array(df[numeric_columns]))
    ############################################################
    # `center` is boolean and `isolated` is integer.  Converting that mix
    # straight to an array yields dtype=object, which would make the whole
    # concatenated matrix an object array; cast to float first.
    flags = df[flag_columns].astype(float).values
    ############################################################
    # Categorical block, flag block, then the scaled numeric block.
    return np.concatenate((department_onehot, flags, scaled_numeric), axis=1)

In [47]:
# Rebuild the base table from scratch, then add the graph-derived feature columns.
df=createDB()
df=numfeatureExtraction(df)
# Preview the first rows of the enriched table.
df.head()

Unnamed: 0,Department,ManagementSalary,clustering,degree,closeness,betweenness,isolated,center,eccentricity
0,1,0.0,0.276423,44,0.421991,0.001124,0,True,4
1,1,,0.265306,52,0.42236,0.001195,0,True,4
2,21,,0.297803,95,0.46149,0.00657,0,False,5
3,21,1.0,0.38491,71,0.441663,0.001654,0,False,5
4,21,1.0,0.318691,96,0.462152,0.005547,0,False,5


A scikit-learn logistic regression classifier is trained on the nodes whose `ManagementSalary` is known; for the nodes where `ManagementSalary` is missing, we predict the probability that the node receives a management salary.

In [43]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn import linear_model, preprocessing
from sklearn.preprocessing import OneHotEncoder

def trainTestSets():
    """Split the feature matrix into labelled and unlabelled nodes.

    The node table and feature matrix are rebuilt from the global graph on
    every call.  Rows are selected with positional boolean masks, so the
    split is correct whatever the index labels of the node table are.

    Returns
    -------
    X : numpy.ndarray
        Feature rows of the nodes whose ``ManagementSalary`` is known.
    y : numpy.ndarray
        The known ``ManagementSalary`` values, in the same row order as ``X``.
    X_test : numpy.ndarray
        Feature rows of the nodes whose ``ManagementSalary`` is missing.
    """
    # Get the matrix of features created from the previous functions
    df = createDB()
    df = numfeatureExtraction(df)
    featurematrix = catfeatureExtraction(df)

    # Determine location of known and unknown salaries.  featurematrix has
    # one row per row of df, in the same order, so a positional mask applies
    # to both.
    salary = df['ManagementSalary']
    unknown = salary.isnull().values
    known = ~unknown

    # Training features and their 0/1 labels
    X = featurematrix[known, :]
    y = salary.values[known]

    # Features of the nodes whose salary has to be predicted
    X_test = featurematrix[unknown, :]

    return X, y, X_test

def salary_predictions(X, y, X_test):
    """Fit a logistic regression on ``(X, y)`` and score ``X_test``.

    Parameters
    ----------
    X : array-like
        Training feature matrix.
    y : array-like
        Training labels.
    X_test : array-like
        Feature matrix of the rows to score.

    Returns
    -------
    numpy.ndarray
        Second column of ``predict_proba(X_test)``, one value per row.
        Presumably the probability of label 1 when the labels are 0/1 -
        confirm against the classifier's ``classes_``.
    """
    # All labelled rows are used for fitting; nothing is held out here.
    classifier = linear_model.LogisticRegression()
    classifier.fit(X, y)

    class_probabilities = classifier.predict_proba(X_test)
    return class_probabilities[:, 1]


The evaluation metric for this model is the Area Under the ROC Curve (AUC).

In [45]:
#################################################
def checkAUC(test_size=0.33, random_state=42):
    """Estimate the model's AUC on a hold-out split of the labelled nodes.

    Parameters
    ----------
    test_size : float, optional
        Passed to ``train_test_split`` as the held-out share (default 0.33).
    random_state : int, optional
        Seed passed to ``train_test_split`` so the split is reproducible
        (default 42).

    Returns
    -------
    float
        ``roc_auc_score`` of the second ``predict_proba`` column on the
        held-out rows.
    """
    # Only the labelled rows are needed here; the unlabelled ones are ignored.
    X, y, _ = trainTestSets()
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    logreg = linear_model.LogisticRegression()
    logreg.fit(X_train, y_train)

    y_scores = logreg.predict_proba(X_test)
    return roc_auc_score(y_test, y_scores[:, 1])
#################################################
# Run the hold-out evaluation; the value recorded below is from a previous run of this cell.
checkAUC()
#0.9280437756497948

0.9280437756497948

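The predictions are given as the probability that the corresponding employee is receiving a management position salary. The classifier is trained on all labelled nodes and returns one probability for each node whose `ManagementSalary` is missing, in the order in which those nodes appear in the feature table. Note that the result is a NumPy array rather than a pandas Series, so it does not carry the node ids as an index.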

In [44]:
# Fit on every labelled node and score the nodes whose salary flag is missing.
X, y, X_test = trainTestSets()
predictions = salary_predictions(X, y, X_test)
# Echo the predicted probabilities as the cell output.
predictions

array([2.77194337e-02, 8.99534183e-01, 9.99999458e-01, 6.27393362e-02,
       1.95784472e-01, 1.67156013e-01, 3.49956155e-01, 3.84114325e-01,
       1.33137369e-01, 1.29071195e-01, 4.71168478e-02, 9.81086808e-02,
       1.05824777e-01, 2.80043221e-01, 6.15728473e-01, 4.37126001e-02,
       9.99937405e-01, 9.74341991e-01, 4.99869980e-02, 6.94722137e-02,
       2.37560329e-02, 1.13782242e-02, 1.45774329e-01, 6.77120603e-02,
       7.80067519e-01, 4.65907987e-03, 3.48270547e-01, 7.96577498e-01,
       5.78289484e-02, 4.14471192e-01, 1.06047140e-01, 2.55944441e-01,
       9.79341811e-02, 2.10531806e-02, 1.10259652e-01, 3.54950213e-01,
       7.12622611e-02, 8.60781937e-02, 8.09471978e-03, 5.30186418e-02,
       6.62044740e-02, 4.98256009e-01, 6.44305379e-01, 1.45773285e-01,
       1.70914738e-01, 1.59016783e-02, 7.21364413e-02, 8.37891839e-03,
       9.57917890e-01, 1.67221589e-01, 4.88623783e-01, 1.62664618e-01,
       6.16504848e-01, 5.61822786e-01, 2.51656645e-02, 7.37887282e-02,
      