In [1]:
import networkx as nx
import pandas as pd
import numpy as np
import pickle
print(nx.__version__)

1.11


---

## Company Emails

For the second part of this assignment you will be workking with a company's email network where each node corresponds to a person at the company, and each edge indicates that at least one email has been sent between two people.

The network also contains the node attributes `Department` and `ManagementSalary`.

`Department` indicates the department in the company which the person belongs to, and `ManagementSalary` indicates whether that person is receiving a management position salary.

In [2]:
G = nx.read_gpickle('email_prediction.txt')

print(nx.info(G))

Name: 
Type: Graph
Number of nodes: 1005
Number of edges: 16706
Average degree:  33.2458


### Salary Prediction

Using network `G`, identify the people in the network with missing values for the node attribute `ManagementSalary` and predict whether or not these individuals are receiving a management position salary.

To accomplish this, you will need to create a matrix of node features using networkx, train a sklearn classifier on nodes that have `ManagementSalary` data, and predict a probability of the node receiving a management salary for nodes where `ManagementSalary` is missing.



Your predictions will need to be given as the probability that the corresponding employee is receiving a management position salary.

The evaluation metric for this assignment is the Area Under the ROC Curve (AUC).

Your grade will be based on the AUC score computed for your classifier. A model which with an AUC of 0.88 or higher will receive full points, and with an AUC of 0.82 or higher will pass (get 80% of the full points).

Using your trained classifier, return a series of length 252 with the data being the probability of receiving management salary, and the index being the node id.

    Example:
    
        1       1.0
        2       0.0
        5       0.8
        8       1.0
            ...
        996     0.7
        1000    0.5
        1001    0.0
        Length: 252, dtype: float64

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn import linear_model, preprocessing
from sklearn.preprocessing import OneHotEncoder

def salary_predictionsDummy():
    ########################################################
    # Dummy function
    nan_mgr_sal = [i[0] for i in nx.get_node_attributes(G, 'ManagementSalary').items() if np.isnan(i[1])]
    import random
    random.seed(0)
    predictions = [random.uniform(0,1) for m in nan_mgr_sal]
    return pd.Series(predictions, index=nan_mgr_sal)

def salary_predictions():
    # Setup DB
    df = pd.DataFrame(G.nodes(data=True), columns=['node', 'data'])
    df['Department'] = df['data'].map(lambda x: x['Department'])
    df['ManagementSalary'] = df['data'].map(lambda x: x['ManagementSalary'])
    df = df[['Department', 'ManagementSalary']]
   
    ###########################################################
    # Setup the Department column as categorical variables
    inputter = np.array(df['Department'].values, dtype=int).reshape([len(df),1])
    enc = OneHotEncoder()
    departmenter = enc.fit_transform(inputter).toarray()
    ############################################################
    # Create the features and put into DB
    df['clustering'] = pd.Series(nx.clustering(G))
    df['degree'] = pd.Series(G.degree())
    df['closeness'] = pd.Series(nx.closeness_centrality(G))
    dicc = nx.betweenness_centrality(G)
    df['betweenness'] = df.apply(lambda row: dicc[row.name], axis=1)
  
    # The graph is mostly connected
    # [setter for setter in nx.connected_components(G)]
    # the above line shows that all of the nodes except for 19 are in the same connected component
    # the remaining 19 have no edges connecting to any other nodes
    # these will be considered by the degrees of each node
    df['isolated'] = df['degree'].map(lambda x: 1 if x==0 else 0)
  
    # We check whether the node is in the center of the largest connected subgraph
    setter = next(nx.connected_components(G))
    H = G.subgraph(setter)
    centerStage = set(nx.center(H))
    df['center'] = pd.Series([node in centerStage for node in G.nodes()])
  
    ecc = nx.eccentricity(H)
    df['eccentricity'] = df.apply(lambda row: ecc[row.name] if row.name in ecc else 10, axis=1)
    ############################################################
  
    # Determine list of important indices
    knownLocations = [df.index[i] for i in range(len(df)) if ~np.isnan(df['ManagementSalary'].iloc[i])]
    unknownLocations = [df.index[i] for i in range(len(df)) if np.isnan(df['ManagementSalary'].iloc[i])]
  
    # Create training and testing datasets with y values of 0 or 1 depending on the ManagementSalary
    # We do not include the Department or ManagementSalary in the training data
    initial = np.array(df[['clustering', 'degree',
                     'closeness', 'betweenness', 'isolated', 'center', 'eccentricity']].iloc[knownLocations].values)
    #print('X has shape {} but departmenter has shape {}'.format(X.shape, departmenter.shape))
    # add in the categorical variables from the one hot vector encoder
    #initial, X, departmenter, isknown
    X = np.concatenate((departmenter[knownLocations,:], initial), axis=1)
    ###########################################################

    y = df['ManagementSalary'].iloc[knownLocations].values
    # The autograder does not accept the below line
    #y = df['ManagementSalary'][isKnown].values
    ###########################################################
    # Setup the output for the autograder
    # We train the classifier on all of X and y
    logreg = linear_model.LogisticRegression()
    X_scaled = preprocessing.scale(X)
    logreg.fit(X_scaled, y)
   
    # Create the appropriate unkown X values to test on
    X_tester = np.array(df[['clustering', 'degree',
                     'closeness', 'betweenness', 'isolated', 'center', 'eccentricity']].iloc[unknownLocations].values)
    # add in the categorical variables from the one hot vector encoder
    X_tester = np.concatenate((departmenter[unknownLocations,:], X_tester), axis=1)

    X_test_scaled = preprocessing.scale(X_tester)
    y_scores = logreg.predict_proba(X_test_scaled)
    predictions = [number for number in y_scores[:,1]]
    return pd.Series(predictions, index=unknownLocations)
    #return initial, X, departmenter, isKnown, df
#return pd.Series(y_scores[:,1], index = df.index.values[unKnown])
#initial, X, departmenter, isKnown, df = salary_predictionsInitial()

In [4]:
predictions = salary_predictions()
predictions



1       0.018192
2       0.981830
5       1.000000
8       0.051306
14      0.374254
18      0.176401
27      0.509803
30      0.508774
31      0.142980
34      0.113386
37      0.039810
40      0.114883
45      0.098810
54      0.285742
55      0.716885
60      0.042468
62      1.000000
65      0.998829
77      0.010766
79      0.016487
97      0.001344
101     0.000734
103     0.156416
108     0.038429
113     0.965644
122     0.001735
141     0.617988
142     0.955746
144     0.007743
145     0.486072
          ...   
913     0.006885
914     0.064384
915     0.000088
918     0.018793
923     0.001921
926     0.021571
931     0.010440
934     0.001231
939     0.005447
944     0.000142
945     0.003796
947     0.095528
950     0.078005
951     0.002940
953     0.013593
959     0.000173
962     0.000032
963     0.063692
968     0.044874
969     0.049247
974     0.013091
984     0.000994
987     0.120678
989     0.011280
991     0.054031
992     0.000344
994     0.000441
996     0.0000

In [8]:
#################################################
def checkAUC():
    X_scaled, y = new_connections_predictions()
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.33, random_state=42)
    logreg = linear_model.LogisticRegression()
    logreg.fit(X_train, y_train)
    y_scores = logreg.predict_proba(X_test)
    return roc_auc_score(y_test, y_scores[:,1])
#################################################
checkAUC()
#0.90767685135746279

ValueError: too many values to unpack (expected 2)