# Machine Learning in Network Science
Group Challenge

***
by: Leonardo Basili, Paul Bédier, Lasse Schmidt

within: MS Data Sciences & Business Analytics

at: CentraleSupélec & ESSEC Business School
***

### 1. Import Packages

In [28]:
from importlib import reload
reload(analyseData)
reload(prepData)

<module 'util.preprocess_Data' from 'D:\\Dokumente\\2_Bildung\\2_MSc\\1_Classes\\Y2T2_Machine Learning in Network Science\\3_challenge\\Network-Science_Challenge\\util\\preprocess_Data.py'>

In [1]:
# import own scripts
import util.analyse_Data as analyseData
import util.preprocess_Data as prepData

In [2]:
# parse & handle data
import csv
import numpy as np
import pandas as pd
import networkx as nx # graph data

# modeling
from xgboost import XGBClassifier

# evaluation
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay, roc_auc_score

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# set matplotlib and seaborn settings for nicer plots
%matplotlib inline

SMALL_SIZE = 6
MEDIUM_SIZE = 8
BIGGER_SIZE = 10

plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=BIGGER_SIZE)    # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=MEDIUM_SIZE)   # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

### 2. Load Data for Modeling

In [4]:
# might take up to a minute
G, G_train, train_tf, val_tf, test, test_tf, X_train, y_train, X_val, y_val, X_trainval, y_trainval, X_test = prepData.load_prep_data()

Number of positive edges for training: 4696
Number of positive edges for validation: 521
Number of edges in original graph: 5217
Number of edges in training graph: 4696
The graph is connected


### 3. Modeling

In [9]:
# basic XGBoost (best to use for RFECV as very short runtime)
clf = XGBClassifier(n_estimators = 100, max_depth = 5, eta = 0.05)
clf.fit(X_train, y_train)

# predict train
y_train_hat = clf.predict(X_train)
# compute R^2 metric
acc_train = accuracy_score(y_train, y_train_hat)

# predict val
y_val_hat = clf.predict(X_val)
# compute R^2 metric
acc_val = accuracy_score(y_val, y_val_hat)

# print performance
print(f"Acc train: {acc_train}")
print(f"Acc val : {acc_val}")

Acc train: 0.8944686272428071
Acc val : 0.7619502868068834


In [10]:
# detailed performance analysis
print('Train performance')
print('-------------------------------------------------------')
print(classification_report(y_train, y_train_hat))

print('Validation performance')
print('-------------------------------------------------------')
print(classification_report(y_val, y_val_hat))

print('Roc_auc score')
print('-------------------------------------------------------')
print(roc_auc_score(y_val, y_val_hat))
print('')

print('Confusion matrix')
print('-------------------------------------------------------')
print(confusion_matrix(y_val, y_val_hat))

Train performance
-------------------------------------------------------
              precision    recall  f1-score   support

           0       0.92      0.86      0.89      4723
           1       0.87      0.92      0.90      4696

    accuracy                           0.89      9419
   macro avg       0.90      0.89      0.89      9419
weighted avg       0.90      0.89      0.89      9419

Validation performance
-------------------------------------------------------
              precision    recall  f1-score   support

           0       0.72      0.86      0.78       525
           1       0.83      0.66      0.73       521

    accuracy                           0.76      1046
   macro avg       0.77      0.76      0.76      1046
weighted avg       0.77      0.76      0.76      1046

Roc_auc score
-------------------------------------------------------
0.7615702403802211

Confusion matrix
-------------------------------------------------------
[[452  73]
 [176 345]]


In [11]:
# rerun model on whole training data
clf = XGBClassifier(n_estimators = 6, max_depth = 10, eta = 0.3)
clf.fit(X_trainval, y_trainval)

# predict test
y_test_hat = clf.predict(X_test)

In [12]:
# save test predictions
save_test = prepData.save_preds(test, test_tf, y_test_hat)