# Part 2B - New Connections Prediction

In [1]:
import networkx as nx
import pandas as pd
import numpy as np
import pickle

In [2]:
future_connections = pd.read_csv('Future_Connections.csv', index_col=0, converters={0: eval})
future_connections.head(10)

Unnamed: 0,Future Connection
"(6, 840)",0.0
"(4, 197)",0.0
"(620, 979)",0.0
"(519, 872)",0.0
"(382, 423)",0.0
"(97, 226)",1.0
"(349, 905)",0.0
"(429, 860)",0.0
"(309, 989)",0.0
"(468, 880)",0.0


## Generate graph form the given dataframe

In [3]:
indice = list(future_connections.index)

src = [index[0] for index in indice]
dst = [index[1] for index in indice]

future_connections['src'] = pd.Series(src, index=future_connections.index)
future_connections['dst'] = pd.Series(dst, index=future_connections.index)

future_connections.head()
# G = nx.from_pandas_dataframe(future_connections)

Unnamed: 0,Future Connection,src,dst
"(6, 840)",0.0,6,840
"(4, 197)",0.0,4,197
"(620, 979)",0.0,620,979
"(519, 872)",0.0,519,872
"(382, 423)",0.0,382,423


In [4]:
G = nx.from_pandas_dataframe(future_connections, 'src', 'dst', create_using=nx.DiGraph())

print(nx.info(G))

Name: 
Type: DiGraph
Number of nodes: 1005
Number of edges: 488446
Average in degree: 486.0159
Average out degree: 486.0159


In [5]:
# graph checking
print("Weakly Connected: {}".format(nx.is_weakly_connected(G)))
print("Strongly Connected: {}".format(nx.is_weakly_connected(G)))
print("Directed: {}".format(nx.is_directed(G)))

Weakly Connected: True
Strongly Connected: True
Directed: True


In [6]:
future_connections.info()

<class 'pandas.core.frame.DataFrame'>
Index: 488446 entries, (6, 840) to (75, 101)
Data columns (total 3 columns):
Future Connection    366334 non-null float64
src                  488446 non-null int64
dst                  488446 non-null int64
dtypes: float64(1), int64(2)
memory usage: 14.9+ MB


In [69]:
future_edges = pd.DataFrame()

future_edges.info()

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Empty DataFrame

## Th graph can use the one from Part 2A

See the post [Construction of Graph for part 2B](https://www.coursera.org/learn/python-social-network-analysis/discussions/weeks/4/threads/38KCVKcJEee6bw62IA80dA) for reference

In [7]:
import networkx as nx
import pandas as pd
import numpy as np
import pickle

In [8]:
G = nx.read_gpickle('email_prediction.txt')

In [32]:
# Link prediction methods for {6, 480}
print("Common Neighbors: {}".format(list(nx.common_neighbors(G, 6, 480))))
print("Jaccard coefficient: {}".format(list(nx.jaccard_coefficient(G, [future_connections.index[0]]))))
print("Adamic Adar Index: {}".format(list(nx.adamic_adar_index(G, [future_connections.index[0]]))))
print("Resource Allocation: {}".format(list(nx.resource_allocation_index(G, [future_connections.index[0]]))))
print("Preferential Attachement: {}".format(list(nx.preferential_attachment(G, [future_connections.index[0]]))))

Common Neighbors: [667, 375, 184, 362, 183, 549, 418, 21, 252, 211]
Jaccard coefficient: [(6, 840, 0.07377049180327869)]
Adamic Adar Index: [(6, 840, 2.110314079181727)]
Resource Allocation: [(6, 840, 0.13672123667645245)]
Preferential Attachement: [(6, 840, 2070)]


In [46]:
future_connections['comm_neigh'] = [len(list(nx.common_neighbors(G, edge[0], edge[1]))) for edge in future_connections.index]
future_connections['jaccard'] = [list(nx.jaccard_coefficient(G, [edge]))[0][2] for edge in future_connections.index]
future_connections['adamic_adar'] = [list(nx.adamic_adar_index(G, [edge]))[0][2] for edge in future_connections.index]
future_connections['res_alloc'] = [list(nx.resource_allocation_index(G, [edge]))[0][2] for edge in future_connections.index]
future_connections['pref_attach'] = [list(nx.preferential_attachment(G, [edge]))[0][2] for edge in future_connections.index]

future_connections.head()

Unnamed: 0,Future Connection,src,dst,comm_neigh,jaccard,adamic,adamic_adar,res_alloc,pref_attach
"(6, 840)",0.0,6,840,9,0.07377,2.110314,2.110314,0.136721,2070
"(4, 197)",0.0,4,197,2,0.015504,0.363528,0.363528,0.008437,3552
"(620, 979)",0.0,620,979,0,0.0,0.0,0.0,0.0,28
"(519, 872)",0.0,519,872,2,0.060606,0.507553,0.507553,0.039726,299
"(382, 423)",0.0,382,423,0,0.0,0.0,0.0,0.0,205


## Prepare for training and predict datasets

In [47]:
df_train = future_connections.dropna()
df_pred = future_connections[np.isnan(future_connections['Future Connection'])]

In [48]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 366334 entries, (6, 840) to (771, 911)
Data columns (total 9 columns):
Future Connection    366334 non-null float64
src                  366334 non-null int64
dst                  366334 non-null int64
comm_neigh           366334 non-null int64
jaccard              366334 non-null float64
adamic               366334 non-null float64
adamic_adar          366334 non-null float64
res_alloc            366334 non-null float64
pref_attach          366334 non-null int64
dtypes: float64(5), int64(4)
memory usage: 27.9+ MB


In [49]:
df_pred.info()

<class 'pandas.core.frame.DataFrame'>
Index: 122112 entries, (107, 348) to (75, 101)
Data columns (total 9 columns):
Future Connection    0 non-null float64
src                  122112 non-null int64
dst                  122112 non-null int64
comm_neigh           122112 non-null int64
jaccard              122112 non-null float64
adamic               122112 non-null float64
adamic_adar          122112 non-null float64
res_alloc            122112 non-null float64
pref_attach          122112 non-null int64
dtypes: float64(5), int64(4)
memory usage: 9.3+ MB


## Preprocessing for Model training with Logistic Regression

In [50]:
# selected_attrs = ['comm_neigh', 'jaccard', 'adamic_adar', 'res_alloc', 'pref_attach']
selected_attrs = ['comm_neigh', 'adamic_adar', 'pref_attach']

In [51]:
X = df_train[selected_attrs]
y = df_train['Future Connection']

In [52]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 366334 entries, (6, 840) to (771, 911)
Data columns (total 3 columns):
comm_neigh     366334 non-null int64
adamic_adar    366334 non-null float64
pref_attach    366334 non-null int64
dtypes: float64(1), int64(2)
memory usage: 11.2+ MB


In [56]:
y.describe()

count    366334.000000
mean          0.080069
std           0.271400
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000
Name: Future Connection, dtype: float64

In [58]:
from sklearn import preprocessing
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [59]:
# split into training and test data w/ given training dataframe
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [60]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 293067 entries, (398, 747) to (707, 905)
Data columns (total 3 columns):
comm_neigh     293067 non-null int64
adamic_adar    293067 non-null float64
pref_attach    293067 non-null int64
dtypes: float64(1), int64(2)
memory usage: 8.9+ MB


In [61]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 73267 entries, (225, 382) to (523, 735)
Data columns (total 3 columns):
comm_neigh     73267 non-null int64
adamic_adar    73267 non-null float64
pref_attach    73267 non-null int64
dtypes: float64(1), int64(2)
memory usage: 2.2+ MB


In [62]:
# Traing data with Logistic Regression Model
clf = LogisticRegression()

clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [64]:
# predict the X_test
pred1 = clf.predict(X_test)
pred2 = clf.predict_proba(X_test)

print("\nAccuracy: {}".format(metrics.accuracy_score(y_test, pred1)))
print("\nROC_AUC: {}".format(metrics.roc_auc_score(y_test, pred2[:, 1])))
print("\nConfusiion matrix: \n{}".format(metrics.confusion_matrix(y_test, pred1)))
print("\nCalssification report: \n{}".format(metrics.classification_report(y_test, pred1)))


Accuracy: 0.9577162979240312

ROC_AUC: 0.9071000899915729

Confusiion matrix: 
[[66752   619]
 [ 2479  3417]]

Calssification report: 
             precision    recall  f1-score   support

        0.0       0.96      0.99      0.98     67371
        1.0       0.85      0.58      0.69      5896

avg / total       0.95      0.96      0.95     73267



In [66]:
X_pred = df_pred[selected_attrs]

In [67]:
# Generte result for autograder
pred = clf.predict_proba(X_pred)
    
rlt = pd.Series(data=pred[:, 1], index=df_pred.index)

In [68]:
rlt

(107, 348)    0.037508
(542, 751)    0.013362
(20, 426)     0.585797
(50, 989)     0.013621
(942, 986)    0.013904
(324, 857)    0.013585
(13, 710)     0.208149
(19, 271)     0.095342
(319, 878)    0.013712
(659, 707)    0.013389
(49, 843)     0.013794
(208, 893)    0.013451
(377, 469)    0.005789
(405, 999)    0.025663
(129, 740)    0.019818
(292, 618)    0.026340
(239, 689)    0.013730
(359, 373)    0.008289
(53, 523)     0.037660
(276, 984)    0.013780
(202, 997)    0.013835
(604, 619)    0.047888
(270, 911)    0.013725
(261, 481)    0.067560
(200, 450)    0.907439
(213, 634)    0.013283
(644, 735)    0.050825
(346, 553)    0.012656
(521, 738)    0.011511
(422, 953)    0.020866
                ...   
(672, 848)    0.013725
(28, 127)     0.967821
(202, 661)    0.012551
(54, 195)     0.999974
(295, 864)    0.013545
(814, 936)    0.013322
(839, 874)    0.013904
(139, 843)    0.013567
(461, 544)    0.010757
(68, 487)     0.010629
(622, 932)    0.013482
(504, 936)    0.016694
(479, 528) 

In [72]:
def new_connections_predictions(debug=False):
    
    # Your Code Here
    from sklearn import preprocessing
    from sklearn import metrics
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split

    # Preprocessing to add various Link Prediction measures
    # Not all of them used due to exution time limitation
    future_connections['comm_neigh'] = [len(list(
        nx.common_neighbors(G, edge[0], edge[1]))) for edge in future_connections.index]
    future_connections['jaccard'] = [list(
        nx.jaccard_coefficient(G, [edge]))[0][2] for edge in future_connections.index]
    future_connections['adamic_adar'] = [list(
        nx.adamic_adar_index(G, [edge]))[0][2] for edge in future_connections.index]
    future_connections['res_alloc'] = [list(
        nx.resource_allocation_index(G, [edge]))[0][2] for edge in future_connections.index]
    future_connections['pref_attach'] = [list(
        nx.preferential_attachment(G, [edge]))[0][2] for edge in future_connections.index]
    
    # split whole dataset into training and predict datasets via NaN in Future Connection column
    df_train = future_connections.dropna()
    df_pred = future_connections[np.isnan(future_connections['Future Connection'])]
    
    # Feature selection
    selected_attrs = ['comm_neigh', 'jaccard', 'adamic_adar', 'res_alloc', 'pref_attach']
#     selected_attrs = ['comm_neigh', 'adamic_adar', 'pref_attach']
    
    # Processing training dataset into X and y datasets
    X = df_train[selected_attrs]
    y = df_train['Future Connection']
    
    # Geneerate model model training and test data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Traing data with Logistic Regression Model
    clf = LogisticRegression()

    clf.fit(X_train, y_train)
    
    if debug: 
        # predict the X_test
        pred1 = clf.predict(X_test)
        pred2 = clf.predict_proba(X_test)

        print("\nAccuracy: {}".format(metrics.accuracy_score(y_test, pred1)))
        print("\nROC_AUC: {}".format(metrics.roc_auc_score(y_test, pred2[:, 1])))
        print("\nConfusiion matrix: \n{}".format(metrics.confusion_matrix(y_test, pred1)))
        print("\nCalssification report: \n{}".format(metrics.classification_report(y_test, pred1)))
    
    # Generate prediction data
    X_pred = df_pred[selected_attrs]
    
    # Generte result for autograder
    pred = clf.predict_proba(X_pred)
    
    rlt = pd.Series(data=pred[:, 1], index=df_pred.index)
    
    return rlt # Your Answer Here

new_connections_predictions(True)


Accuracy: 0.9577162979240312

ROC_AUC: 0.9071000899915729

Confusiion matrix: 
[[66752   619]
 [ 2479  3417]]

Calssification report: 
             precision    recall  f1-score   support

        0.0       0.96      0.99      0.98     67371
        1.0       0.85      0.58      0.69      5896

avg / total       0.95      0.96      0.95     73267



(107, 348)    0.037508
(542, 751)    0.013362
(20, 426)     0.585797
(50, 989)     0.013621
(942, 986)    0.013904
(324, 857)    0.013585
(13, 710)     0.208149
(19, 271)     0.095342
(319, 878)    0.013712
(659, 707)    0.013389
(49, 843)     0.013794
(208, 893)    0.013451
(377, 469)    0.005789
(405, 999)    0.025663
(129, 740)    0.019818
(292, 618)    0.026340
(239, 689)    0.013730
(359, 373)    0.008289
(53, 523)     0.037660
(276, 984)    0.013780
(202, 997)    0.013835
(604, 619)    0.047888
(270, 911)    0.013725
(261, 481)    0.067560
(200, 450)    0.907439
(213, 634)    0.013283
(644, 735)    0.050825
(346, 553)    0.012656
(521, 738)    0.011511
(422, 953)    0.020866
                ...   
(672, 848)    0.013725
(28, 127)     0.967821
(202, 661)    0.012551
(54, 195)     0.999974
(295, 864)    0.013545
(814, 936)    0.013322
(839, 874)    0.013904
(139, 843)    0.013567
(461, 544)    0.010757
(68, 487)     0.010629
(622, 932)    0.013482
(504, 936)    0.016694
(479, 528) 