# Machine Learning in Network Science
Group Challenge

***
by: Leonardo Basili, Paul Bédier, Lasse Schmidt

within: MS Data Sciences & Business Analytics

at: CentraleSupélec & ESSEC Business School
***

### 1. Import Packages

In [28]:
from importlib import reload
reload(analyseData)
reload(prepData)

<module 'util.preprocess_Data' from 'D:\\Dokumente\\2_Bildung\\2_MSc\\1_Classes\\Y2T2_Machine Learning in Network Science\\3_challenge\\Network-Science_Challenge\\util\\preprocess_Data.py'>

In [1]:
# import own scripts
import util.analyse_Data as analyseData
import util.preprocess_Data as prepData

In [2]:
# parse & handle data
import csv
import numpy as np
import pandas as pd
import networkx as nx # graph data

# modeling
from xgboost import XGBClassifier

# evaluation
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay, roc_auc_score

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# set matplotlib and seaborn settings for nicer plots
%matplotlib inline

SMALL_SIZE = 6
MEDIUM_SIZE = 8
BIGGER_SIZE = 10

plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=BIGGER_SIZE)    # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=MEDIUM_SIZE)   # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

### 2. Load Data for Modeling

In [4]:
# might take up to a minute
G, G_train, train_tf, val_tf, test, test_tf, X_train, y_train, X_val, y_val, X_test = prepData.load_prep_data()

Number of positive edges for training: 4696
Number of positive edges for validation: 521
Number of edges in original graph: 5217
Number of edges in training graph: 4696
The graph is connected


### 3. Modeling

In [5]:
# basic XGBoost (best to use for RFECV as very short runtime)
clf = XGBClassifier(n_estimators = 6, max_depth = 10, eta = 0.3)
clf.fit(X_train, y_train)

# predict train
y_train_hat = clf.predict(X_train)
# compute R^2 metric
acc_train = accuracy_score(y_train, y_train_hat)

# predict val
y_val_hat = clf.predict(X_val)
# compute R^2 metric
acc_val = accuracy_score(y_val, y_val_hat)

# predict test
y_test_hat = clf.predict(X_test)

print(f"Acc train: {acc_train}")
print(f"Acc val : {acc_val}")

Acc train: 0.7734366705595074
Acc val : 0.7734225621414914


In [6]:
print('Train performance')
print('-------------------------------------------------------')
print(classification_report(y_train, y_train_hat))

print('Validation performance')
print('-------------------------------------------------------')
print(classification_report(y_val, y_val_hat))

print('Roc_auc score')
print('-------------------------------------------------------')
print(roc_auc_score(y_val, y_val_hat))
print('')

print('Confusion matrix')
print('-------------------------------------------------------')
print(confusion_matrix(y_val, y_val_hat))

Train performance
-------------------------------------------------------
              precision    recall  f1-score   support

           0       0.76      0.80      0.78      4723
           1       0.79      0.75      0.77      4696

    accuracy                           0.77      9419
   macro avg       0.77      0.77      0.77      9419
weighted avg       0.77      0.77      0.77      9419

Validation performance
-------------------------------------------------------
              precision    recall  f1-score   support

           0       0.78      0.76      0.77       525
           1       0.76      0.79      0.78       521

    accuracy                           0.77      1046
   macro avg       0.77      0.77      0.77      1046
weighted avg       0.77      0.77      0.77      1046

Roc_auc score
-------------------------------------------------------
0.7734814002376382

Confusion matrix
-------------------------------------------------------
[[398 127]
 [110 411]]


In [45]:
save_test = (test
    .join(test_tf.assign(Predicted = y_test_hat).Predicted)
    # missing values are entries where target == source node
    .assign(Predicted = lambda df_: df_.Predicted.mask(df_.Predicted.isna(), 1))
    # convert to int
    .assign(Predicted = lambda df_: df_.Predicted.astype(int))
    # remove useless columns
    .drop(["node1", "node2"], axis = 1, inplace = False)
)

In [46]:
save_test

Unnamed: 0,Predicted
0,0
1,0
2,0
3,0
4,0
...,...
3493,1
3494,0
3495,1
3496,0


In [47]:
# save predictions
save_test.to_csv('data/test_preds.csv', index_label = "ID")