In [None]:
!pip install karateclub

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import json
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.model_selection import train_test_split

#https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics
#https://stackoverflow.com/questions/38015181/accuracy-score-valueerror-cant-handle-mix-of-binary-and-continuous-target
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

#https://karateclub.readthedocs.io/en/latest/modules/root.html
from karateclub import WaveletCharacteristic
from karateclub import LDP
from karateclub import FeatherGraph
from karateclub import GeoScattering
from karateclub import IGE
from karateclub import GL2Vec
from karateclub import NetLSD
from karateclub import SF
from karateclub import FGSD
from karateclub import Graph2Vec

In [None]:
# Number of graphs/labels used for every experiment in this notebook.
N_GRAPHS = 1000

# Read the edge lists; the context manager closes the file handle as soon as
# the JSON has been parsed (the original left it open for the kernel's lifetime).
with open('reddit_edges.json') as f:
    graph_hash = json.load(f)
df = pd.read_csv('reddit_target.csv')

#extract first N_GRAPHS X values
graph_array = list(graph_hash.values())
G = [nx.Graph(i) for i in graph_array[0:N_GRAPHS]]

#extract first N_GRAPHS y values
# NOTE(review): assumes the k-th entry of reddit_edges.json belongs to the
# k-th row of reddit_target.csv -- confirm against the dataset description.
y = list(df.target)[0:N_GRAPHS]

# Fail early (before the expensive embedding cells) if features and labels
# cannot be paired one-to-one.
assert len(G) == len(y), "graph count and label count differ"


In [None]:
# Fit GL2Vec (wl_iterations raised to 10) on the graph list G and keep the
# resulting embedding as GL_X; it is the feature matrix for the GL2Vec
# classifiers further down.
model = GL2Vec(wl_iterations=10)
model.fit(G)
GL_X = model.get_embedding()

#WaveletCharacteristic(order: int = 5, eval_points: int = 25, theta_max: float = 2.5, tau: float = 1.0, pooling: str = 'mean')
#LDP(bins: int = 32)
#FeatherGraph(order: int = 5, eval_points: int = 25, theta_max: float = 2.5, seed: int = 42, pooling: str = 'mean')
#IGE(feature_embedding_dimensions: List[int] = [3, 5], spectral_embedding_dimensions: List[int] = [10, 20], histogram_bins: List[int] = [10, 20], seed: int = 42)
#GeoScattering(order: int = 4, moments: int = 4, seed: int = 42)
#GL2Vec(wl_iterations: int = 2, dimensions: int = 128, workers: int = 4, down_sampling: float = 0.0001, epochs: int = 10, learning_rate: float = 0.025, min_count: int = 5, seed: int = 42, erase_base_features: bool = False)
#NetLSD(scale_min: float = -2.0, scale_max: float = 2.0, scale_steps: int = 250, approximations: int = 200, seed: int = 42)
#SF(dimensions: int = 128, seed: int = 42)
#FGSD(hist_bins: int = 200, hist_range: int = 20, seed: int = 42)
#Graph2Vec(wl_iterations: int = 2, attributed: bool = False, dimensions: int = 128, workers: int = 4, down_sampling: float = 0.0001, epochs: int = 10, learning_rate: float = 0.025, min_count: int = 5, seed: int = 42, erase_base_features: bool = False)


In [None]:
# Fit WaveletCharacteristic with its default parameters on G; W_X holds the
# embedding used as classifier features later in the notebook.
model = WaveletCharacteristic()
model.fit(G)
W_X = model.get_embedding()

In [None]:
# Fit LDP with its default parameters on G; LDP_X holds the embedding used as
# classifier features later in the notebook.
model = LDP()
model.fit(G)
LDP_X = model.get_embedding()

In [None]:
# Fit FeatherGraph with its default parameters on G; F_X holds the embedding
# used as classifier features later in the notebook.
model = FeatherGraph()
model.fit(G)
F_X = model.get_embedding()

In [None]:
# Fit IGE with its default parameters on G; IGE_X holds the embedding used as
# classifier features later in the notebook.
model = IGE()
model.fit(G)
IGE_X = model.get_embedding()

In [None]:
# Fit GeoScattering with its default parameters on G. The embedding is stored
# as G_X -- not to be confused with G, the list of input graphs.
model = GeoScattering()
model.fit(G)
G_X = model.get_embedding()

In [None]:
# Fit NetLSD with its default parameters on G; NetLSD_X holds the embedding
# used as classifier features later in the notebook.
model = NetLSD()
model.fit(G)
NetLSD_X = model.get_embedding()

In [None]:
# Fit SF with its default parameters on G; SF_X holds the embedding used as
# classifier features later in the notebook.
model = SF()
model.fit(G)
SF_X = model.get_embedding()

In [None]:
# Fit FGSD with its default parameters on G; FGSD_X holds the embedding used
# as classifier features later in the notebook.
model = FGSD()
model.fit(G)
FGSD_X = model.get_embedding()

In [None]:
# Fit Graph2Vec (wl_iterations raised to 10) on G; Graph_X holds the embedding
# used as classifier features later in the notebook.
model = Graph2Vec(wl_iterations=10)
model.fit(G)
Graph_X = model.get_embedding()

In [None]:
# 80/20 hold-out split of the GL2Vec embeddings (fixed seed), then a default
# logistic regression fitted on the training part.
X_train, X_test, y_train, y_test = train_test_split(
    GL_X,
    y,
    random_state=42,
    test_size=0.2,
)
log_model = LogisticRegression()
log_model.fit(X_train, y_train)

# Column 1 of predict_proba is used as the score; round() turns it into a hard
# label (assumes 0/1 targets).
y_pred = log_model.predict_proba(X_test)[:, 1]
# normalize=False -> number of correctly classified test graphs, not a fraction.
GL_acc = accuracy_score(y_test, y_pred.round(), normalize=False)
GL_auc = roc_auc_score(y_test, y_pred)

print(
    'GL2Vec',
    'GL_Accuracy: {:f}'.format(GL_acc),
    'GL_AUC: {:f}'.format(GL_auc),
    sep='\n',
)
#wl 100 AUC: 0.7445
#https://stackoverflow.com/questions/62658215/convergencewarning-lbfgs-failed-to-converge-status-1-stop-total-no-of-iter
#increase to max_iter=200 from max_iter=100

GL2Vec
GL_Accuracy: 137.000000
GL_AUC: 0.708738


In [None]:
#Winner!
# 80/20 hold-out split of the WaveletCharacteristic embeddings (fixed seed),
# then logistic regression with the iteration cap raised to 500.
X_train, X_test, y_train, y_test = train_test_split(
    W_X,
    y,
    random_state=42,
    test_size=0.2,
)
log_model = LogisticRegression(max_iter=500)
log_model.fit(X_train, y_train)

# Column 1 of predict_proba is used as the score; round() turns it into a hard
# label (assumes 0/1 targets).
y_pred = log_model.predict_proba(X_test)[:, 1]
# normalize=False -> number of correctly classified test graphs, not a fraction.
W_acc = accuracy_score(y_test, y_pred.round(), normalize=False)
W_auc = roc_auc_score(y_test, y_pred)

print(
    'WaveletCharacteristic',
    'W_Accuracy: {:f}'.format(W_acc),
    'W_AUC: {:f}'.format(W_auc),
    sep='\n',
)

WaveletCharacteristic
W_Accuracy: 154.000000
W_AUC: 0.830898


In [None]:
# 80/20 hold-out split of the LDP embeddings (fixed seed), then logistic
# regression with the iteration cap raised to 500.
X_train, X_test, y_train, y_test = train_test_split(
    LDP_X,
    y,
    random_state=42,
    test_size=0.2,
)
log_model = LogisticRegression(max_iter=500)
log_model.fit(X_train, y_train)

# Column 1 of predict_proba is used as the score; round() turns it into a hard
# label (assumes 0/1 targets).
y_pred = log_model.predict_proba(X_test)[:, 1]
# normalize=False -> number of correctly classified test graphs, not a fraction.
LDP_acc = accuracy_score(y_test, y_pred.round(), normalize=False)
LDP_auc = roc_auc_score(y_test, y_pred)

print(
    'LDP',
    'LDP_Accuracy: {:f}'.format(LDP_acc),
    'LDP_AUC: {:f}'.format(LDP_auc),
    sep='\n',
)

LDP
LDP_Accuracy: 144.000000
LDP_AUC: 0.795366


In [None]:
# 80/20 hold-out split of the FeatherGraph embeddings (fixed seed), then a
# default logistic regression fitted on the training part.
X_train, X_test, y_train, y_test = train_test_split(
    F_X,
    y,
    random_state=42,
    test_size=0.2,
)
log_model = LogisticRegression()
log_model.fit(X_train, y_train)

# Column 1 of predict_proba is used as the score; round() turns it into a hard
# label (assumes 0/1 targets).
y_pred = log_model.predict_proba(X_test)[:, 1]
# normalize=False -> number of correctly classified test graphs, not a fraction.
F_acc = accuracy_score(y_test, y_pred.round(), normalize=False)
F_auc = roc_auc_score(y_test, y_pred)

print(
    'FeatherGraph',
    'F_Accuracy: {:f}'.format(F_acc),
    'F_AUC: {:f}'.format(F_auc),
    sep='\n',
)

FeatherGraph
F_Accuracy: 147.000000
F_AUC: 0.809979


In [None]:
#even at max_iter=1000 it does not converge
# 80/20 hold-out split of the IGE embeddings (fixed seed), then a default
# logistic regression. The recorded run ends with an lbfgs "iterations reached
# limit" warning whose text suggests scaling the features.
X_train, X_test, y_train, y_test = train_test_split(
    IGE_X,
    y,
    random_state=42,
    test_size=0.2,
)
log_model = LogisticRegression()
log_model.fit(X_train, y_train)

# Column 1 of predict_proba is used as the score; round() turns it into a hard
# label (assumes 0/1 targets).
y_pred = log_model.predict_proba(X_test)[:, 1]
# normalize=False -> number of correctly classified test graphs, not a fraction.
IGE_acc = accuracy_score(y_test, y_pred.round(), normalize=False)
IGE_auc = roc_auc_score(y_test, y_pred)

print(
    'IGE',
    'IGE_Accuracy: {:f}'.format(IGE_acc),
    'IGE_AUC: {:f}'.format(IGE_auc),
    sep='\n',
)

IGE
IGE_Accuracy: 143.000000
IGE_AUC: 0.784506


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# 80/20 hold-out split of the GeoScattering embeddings (fixed seed), then
# logistic regression with the iteration cap raised to 500.
X_train, X_test, y_train, y_test = train_test_split(
    G_X,
    y,
    random_state=42,
    test_size=0.2,
)
log_model = LogisticRegression(max_iter=500)
log_model.fit(X_train, y_train)

# Column 1 of predict_proba is used as the score; round() turns it into a hard
# label (assumes 0/1 targets).
y_pred = log_model.predict_proba(X_test)[:, 1]
# normalize=False -> number of correctly classified test graphs, not a fraction.
G_acc = accuracy_score(y_test, y_pred.round(), normalize=False)
G_auc = roc_auc_score(y_test, y_pred)

print(
    'GeoScattering',
    'G_Accuracy: {:f}'.format(G_acc),
    'G_AUC: {:f}'.format(G_auc),
    sep='\n',
)

GeoScattering
G_Accuracy: 155.000000
G_AUC: 0.817386


In [None]:
# 80/20 hold-out split of the NetLSD embeddings (fixed seed), then a default
# logistic regression fitted on the training part.
X_train, X_test, y_train, y_test = train_test_split(
    NetLSD_X,
    y,
    random_state=42,
    test_size=0.2,
)
log_model = LogisticRegression()
log_model.fit(X_train, y_train)

# Column 1 of predict_proba is used as the score; round() turns it into a hard
# label (assumes 0/1 targets).
y_pred = log_model.predict_proba(X_test)[:, 1]
# normalize=False -> number of correctly classified test graphs, not a fraction.
NetLSD_acc = accuracy_score(y_test, y_pred.round(), normalize=False)
NetLSD_auc = roc_auc_score(y_test, y_pred)

print(
    'NetLSD',
    'NetLSD_Accuracy: {:f}'.format(NetLSD_acc),
    'NetLSD_AUC: {:f}'.format(NetLSD_auc),
    sep='\n',
)

NetLSD
NetLSD_Accuracy: 151.000000
NetLSD_AUC: 0.810580


In [None]:
# 80/20 hold-out split of the SF embeddings (fixed seed), then a default
# logistic regression fitted on the training part.
X_train, X_test, y_train, y_test = train_test_split(
    SF_X,
    y,
    random_state=42,
    test_size=0.2,
)
log_model = LogisticRegression()
log_model.fit(X_train, y_train)

# Column 1 of predict_proba is used as the score; round() turns it into a hard
# label (assumes 0/1 targets).
y_pred = log_model.predict_proba(X_test)[:, 1]
# normalize=False -> number of correctly classified test graphs, not a fraction.
SF_acc = accuracy_score(y_test, y_pred.round(), normalize=False)
SF_auc = roc_auc_score(y_test, y_pred)

print(
    'SF',
    'SF_Accuracy: {:f}'.format(SF_acc),
    'SF_AUC: {:f}'.format(SF_auc),
    sep='\n',
)

SF
SF_Accuracy: 148.000000
SF_AUC: 0.787208


In [None]:
#note even at max_iter=500 it does not converge
# 80/20 hold-out split of the FGSD embeddings (fixed seed), then a default
# logistic regression. The recorded run ends with an lbfgs "iterations reached
# limit" warning whose text suggests scaling the features.
X_train, X_test, y_train, y_test = train_test_split(
    FGSD_X,
    y,
    random_state=42,
    test_size=0.2,
)
log_model = LogisticRegression()
log_model.fit(X_train, y_train)

# Column 1 of predict_proba is used as the score; round() turns it into a hard
# label (assumes 0/1 targets).
y_pred = log_model.predict_proba(X_test)[:, 1]
# normalize=False -> number of correctly classified test graphs, not a fraction.
FGSD_acc = accuracy_score(y_test, y_pred.round(), normalize=False)
FGSD_auc = roc_auc_score(y_test, y_pred)

print(
    'FGSD',
    'FGSD_Accuracy: {:f}'.format(FGSD_acc),
    'FGSD_AUC: {:f}'.format(FGSD_auc),
    sep='\n',
)

FGSD
FGSD_Accuracy: 146.000000
FGSD_AUC: 0.787409


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# 80/20 hold-out split of the Graph2Vec embeddings (fixed seed), then a
# default logistic regression fitted on the training part.
X_train, X_test, y_train, y_test = train_test_split(
    Graph_X,
    y,
    random_state=42,
    test_size=0.2,
)
log_model = LogisticRegression()
log_model.fit(X_train, y_train)

# Column 1 of predict_proba is used as the score; round() turns it into a hard
# label (assumes 0/1 targets).
y_pred = log_model.predict_proba(X_test)[:, 1]
# normalize=False -> number of correctly classified test graphs, not a fraction.
Graph_acc = accuracy_score(y_test, y_pred.round(), normalize=False)
Graph_auc = roc_auc_score(y_test, y_pred)

print(
    'Graph2Vec',
    'Graph_Accuracy: {:f}'.format(Graph_acc),
    'Graph_AUC: {:f}'.format(Graph_auc),
    sep='\n',
)

Graph2Vec
Graph_Accuracy: 124.000000
Graph_AUC: 0.637774


Winner seems to be:
**WaveletCharacteristic**
W_Accuracy: 154.000000
W_AUC: 0.830898

In [None]:
#comparing logistic to SVM. Using manual rounding to see better AUC score

In [None]:
#refresh GL2Vec
# Re-fit GL2Vec with the same settings as the first GL2Vec cell and overwrite
# GL_X. NOTE(review): the recorded logistic AUC for GL2Vec differs before and
# after this refit (0.708738 vs 0.719848), so the embedding is presumably not
# bit-for-bit reproducible between fits -- confirm before comparing runs.
model = GL2Vec(wl_iterations=10)
model.fit(G)
GL_X = model.get_embedding()

In [None]:
# Logistic
# Same split and model as the probability-based cell, but both metrics are
# computed from the hard labels returned by predict().
X_train, X_test, y_train, y_test = train_test_split(
    GL_X,
    y,
    random_state=42,
    test_size=0.2,
)
log_model = LogisticRegression()
log_model.fit(X_train, y_train)

y_pred = log_model.predict(X_test)
# normalize=False -> number of correctly classified test graphs, not a fraction.
GL_acc = accuracy_score(y_test, y_pred, normalize=False)
# AUC here is fed thresholded labels rather than scores.
GL_auc = roc_auc_score(y_test, y_pred)

print(
    'GL2Vec',
    'GL_Accuracy: {:f}'.format(GL_acc),
    'GL_AUC: {:f}'.format(GL_auc),
    sep='\n',
)

GL2Vec
GL_Accuracy: 137.000000
GL_AUC: 0.684866


In [None]:
# Logistic manual round
# Same split and model again; here AUC is computed from the probabilities and
# only the accuracy uses the rounded (hard) predictions.
X_train, X_test, y_train, y_test = train_test_split(
    GL_X,
    y,
    random_state=42,
    test_size=0.2,
)
log_model = LogisticRegression()
log_model.fit(X_train, y_train)

# Column 1 of predict_proba is used as the score (assumes 0/1 targets).
y_pred = log_model.predict_proba(X_test)[:, 1]
# normalize=False -> number of correctly classified test graphs, not a fraction.
GL_acc = accuracy_score(y_test, y_pred.round(), normalize=False)
GL_auc = roc_auc_score(y_test, y_pred)

print(
    'GL2Vec',
    'GL_Accuracy: {:f}'.format(GL_acc),
    'GL_AUC: {:f}'.format(GL_auc),
    sep='\n',
)

GL2Vec
GL_Accuracy: 137.000000
GL_AUC: 0.719848


In [None]:
# SVM
# Default-parameter SVC on the same 80/20 split of the GL2Vec embeddings.
X_train, X_test, y_train, y_test = train_test_split(
    GL_X,
    y,
    random_state=42,
    test_size=0.2,
)
SVM_model = svm.SVC()
SVM_model.fit(X_train, y_train)

# predict() returns hard labels, so the AUC below is computed from thresholded
# predictions rather than decision scores.
y_pred = SVM_model.predict(X_test)
# normalize=False -> number of correctly classified test graphs, not a fraction.
GL_acc = accuracy_score(y_test, y_pred, normalize=False)
GL_auc = roc_auc_score(y_test, y_pred)

print(
    'GL2Vec SVM',
    'GL_Accuracy: {:f}'.format(GL_acc),
    'GL_AUC: {:f}'.format(GL_auc),
    sep='\n',
)

GL2Vec SVM
GL_Accuracy: 137.000000
GL_AUC: 0.684566


In [None]:
#SVM ALL
# Evaluate a default SVC on every embedding with the same seeded 80/20 split.
# One data-driven loop replaces ten copy-pasted stanzas; the printed text and
# the per-embedding result names are unchanged.
# Each entry: (variable/print prefix, display name, embedding matrix).
svm_runs = [
    ('GL', 'GL2Vec', GL_X),
    ('W', 'WaveletCharacteristic', W_X),
    ('LDP', 'LDP', LDP_X),
    ('F', 'FeatherGraph', F_X),
    ('IGE', 'IGE', IGE_X),
    ('G', 'GeoScattering', G_X),
    ('NetLSD', 'NetLSD', NetLSD_X),
    ('SF', 'SF', SF_X),
    ('FGSD', 'FGSD', FGSD_X),
    ('Graph', 'Graph2Vec', Graph_X),
]

svm_scores = {}
for run_prefix, run_name, run_features in svm_runs:
    X_train, X_test, y_train, y_test = train_test_split(run_features, y, test_size=0.2, random_state=42)
    SVM_model = svm.SVC().fit(X_train, y_train)
    y_pred = SVM_model.predict(X_test)
    # normalize=False -> number of correctly classified test graphs.
    run_acc = accuracy_score(y_test, y_pred, normalize=False)
    # NOTE(review): AUC is computed from hard labels here, whereas the logistic
    # cells above use probabilities, so the two AUC columns are not directly
    # comparable. SVM_model.decision_function(X_test) would give real scores.
    run_auc = roc_auc_score(y_test, y_pred)
    print('{} SVM'.format(run_name))
    print('{}_Accuracy: {:f}'.format(run_prefix, run_acc))
    print('{}_AUC: {:f}'.format(run_prefix, run_auc))
    svm_scores[run_prefix] = (run_acc, run_auc)

# Keep the individual result names the original cell defined.
GL_acc, GL_auc = svm_scores['GL']
W_acc, W_auc = svm_scores['W']
LDP_acc, LDP_auc = svm_scores['LDP']
F_acc, F_auc = svm_scores['F']
IGE_acc, IGE_auc = svm_scores['IGE']
G_acc, G_auc = svm_scores['G']
NetLSD_acc, NetLSD_auc = svm_scores['NetLSD']
SF_acc, SF_auc = svm_scores['SF']
FGSD_acc, FGSD_auc = svm_scores['FGSD']
Graph_acc, Graph_auc = svm_scores['Graph']

GL2Vec SVM
GL_Accuracy: 137.000000
GL_AUC: 0.684566
WaveletCharacteristic SVM
W_Accuracy: 151.000000
W_AUC: 0.754029
LDP SVM
LDP_Accuracy: 154.000000
LDP_AUC: 0.767691
FeatherGraph SVM
F_Accuracy: 149.000000
F_AUC: 0.743719
IGE SVM
IGE_Accuracy: 125.000000
IGE_AUC: 0.619107
GeoScattering SVM
G_Accuracy: 129.000000
G_AUC: 0.634621
NetLSD SVM
NetLSD_Accuracy: 142.000000
NetLSD_AUC: 0.703433
SF SVM
SF_Accuracy: 146.000000
SF_AUC: 0.728856
FGSD SVM
FGSD_Accuracy: 145.000000
FGSD_AUC: 0.724002
Graph2Vec SVM
Graph_Accuracy: 126.000000
Graph_AUC: 0.633570


In [None]:
#SVM ALL Normalized
# Same SVC sweep as the previous cell, but accuracy is reported as a fraction
# (normalize=True). One data-driven loop replaces ten copy-pasted stanzas; the
# printed text and the per-embedding result names are unchanged.
# Each entry: (variable/print prefix, display name, embedding matrix).
svm_runs = [
    ('GL', 'GL2Vec', GL_X),
    ('W', 'WaveletCharacteristic', W_X),
    ('LDP', 'LDP', LDP_X),
    ('F', 'FeatherGraph', F_X),
    ('IGE', 'IGE', IGE_X),
    ('G', 'GeoScattering', G_X),
    ('NetLSD', 'NetLSD', NetLSD_X),
    ('SF', 'SF', SF_X),
    ('FGSD', 'FGSD', FGSD_X),
    ('Graph', 'Graph2Vec', Graph_X),
]

svm_scores = {}
for run_prefix, run_name, run_features in svm_runs:
    X_train, X_test, y_train, y_test = train_test_split(run_features, y, test_size=0.2, random_state=42)
    SVM_model = svm.SVC().fit(X_train, y_train)
    y_pred = SVM_model.predict(X_test)
    run_acc = accuracy_score(y_test, y_pred, normalize=True)
    # NOTE(review): AUC is computed from hard labels here, whereas the logistic
    # cells above use probabilities, so the two AUC columns are not directly
    # comparable. SVM_model.decision_function(X_test) would give real scores.
    run_auc = roc_auc_score(y_test, y_pred)
    print('{} SVM'.format(run_name))
    print('{}_Accuracy: {:f}'.format(run_prefix, run_acc))
    print('{}_AUC: {:f}'.format(run_prefix, run_auc))
    svm_scores[run_prefix] = (run_acc, run_auc)

# Keep the individual result names the original cell defined.
GL_acc, GL_auc = svm_scores['GL']
W_acc, W_auc = svm_scores['W']
LDP_acc, LDP_auc = svm_scores['LDP']
F_acc, F_auc = svm_scores['F']
IGE_acc, IGE_auc = svm_scores['IGE']
G_acc, G_auc = svm_scores['G']
NetLSD_acc, NetLSD_auc = svm_scores['NetLSD']
SF_acc, SF_auc = svm_scores['SF']
FGSD_acc, FGSD_auc = svm_scores['FGSD']
Graph_acc, Graph_auc = svm_scores['Graph']

GL2Vec SVM
GL_Accuracy: 0.685000
GL_AUC: 0.684566
WaveletCharacteristic SVM
W_Accuracy: 0.755000
W_AUC: 0.754029
LDP SVM
LDP_Accuracy: 0.770000
LDP_AUC: 0.767691
FeatherGraph SVM
F_Accuracy: 0.745000
F_AUC: 0.743719
IGE SVM
IGE_Accuracy: 0.625000
IGE_AUC: 0.619107
GeoScattering SVM
G_Accuracy: 0.645000
G_AUC: 0.634621
NetLSD SVM
NetLSD_Accuracy: 0.710000
NetLSD_AUC: 0.703433
SF SVM
SF_Accuracy: 0.730000
SF_AUC: 0.728856
FGSD SVM
FGSD_Accuracy: 0.725000
FGSD_AUC: 0.724002
Graph2Vec SVM
Graph_Accuracy: 0.630000
Graph_AUC: 0.633570


The winner for SVM seems to be LDP, with the winner from logistic regression (WaveletCharacteristic) a close second. Because logistic regression did not converge for some techniques, I am going to go with SVM. LDP seems to win only for tuning reasons: it has just one tunable parameter. If I tune WaveletCharacteristic, I can probably get a higher score.

In [None]:
#WaveletCharacteristic(order: int = 5, eval_points: int = 25, theta_max: float = 2.5, tau: float = 1.0, pooling: str = 'mean')
#LDP(bins: int = 32)
#FeatherGraph(order: int = 5, eval_points: int = 25, theta_max: float = 2.5, seed: int = 42, pooling: str = 'mean')
#IGE(feature_embedding_dimensions: List[int] = [3, 5], spectral_embedding_dimensions: List[int] = [10, 20], histogram_bins: List[int] = [10, 20], seed: int = 42)
#GeoScattering(order: int = 4, moments: int = 4, seed: int = 42)
#GL2Vec(wl_iterations: int = 2, dimensions: int = 128, workers: int = 4, down_sampling: float = 0.0001, epochs: int = 10, learning_rate: float = 0.025, min_count: int = 5, seed: int = 42, erase_base_features: bool = False)
#NetLSD(scale_min: float = -2.0, scale_max: float = 2.0, scale_steps: int = 250, approximations: int = 200, seed: int = 42)
#SF(dimensions: int = 128, seed: int = 42)
#FGSD(hist_bins: int = 200, hist_range: int = 20, seed: int = 42)
#Graph2Vec(wl_iterations: int = 2, attributed: bool = False, dimensions: int = 128, workers: int = 4, down_sampling: float = 0.0001, epochs: int = 10, learning_rate: float = 0.025, min_count: int = 5, seed: int = 42, erase_base_features: bool = False)

I have tuned Graph2Vec and GL2Vec with wl_iterations = 10. The other parameters seem reasonable, yet Graph2Vec and GL2Vec don't perform well. LDP performed well but seems to be maxed out, as there are few parameters to tune. SF, FGSD, and GeoScattering also all seem to be maxed out, as there are few parameters to tune.
This leaves the final candidates as: WaveletCharacteristic, FeatherGraph, IGE and NetLSD. IGE and NetLSD seem way off compared to WaveletCharacteristic and FeatherGraph, so I will focus on tuning these two. Coincidentally, WaveletCharacteristic and FeatherGraph are the two newest algorithms. This makes sense, as algorithms improve and become more expressive over time. Note that NN models do not perform well on tabular data unless there is a ton of rich metadata, which we do not have. This is why we omitted an NN model.

Future work: investigate LDP, WaveletCharacteristic and FeatherGraph.

In [None]:
# Concatenate the three embeddings singled out above (LDP, WaveletCharacteristic,
# FeatherGraph) column-wise into one feature matrix and evaluate a default SVC.
LDP_X_df = pd.DataFrame(LDP_X)
# The x10 factors are hand-picked rescalings; proper normalisation is still a TODO.
W_X_df = pd.DataFrame(10*W_X) #need to normalize
F_X_df = pd.DataFrame(10*F_X) #need to normalize
ensemble_df = pd.concat([LDP_X_df, W_X_df, F_X_df], axis=1, join="inner")
Ensemble_X = ensemble_df.to_numpy()

X_train, X_test, y_train, y_test = train_test_split(Ensemble_X, y, test_size=0.2, random_state=42)
SVM_model = svm.SVC().fit(X_train, y_train)
y_pred = SVM_model.predict(X_test)
# Stored under their own names: the original wrote these into SF_acc/SF_auc and
# thereby overwrote the SF embedding's results.
Ensemble_acc = accuracy_score(y_test, y_pred, normalize=True)
# NOTE(review): AUC is computed from hard labels, as in the other SVM cells.
Ensemble_auc = roc_auc_score(y_test, y_pred)
print('Ensemble SVM')
print('Ensemble_Accuracy: {:f}'.format(Ensemble_acc))
print('Ensemble_AUC: {:f}'.format(Ensemble_auc))

Ensemble SVM
Ensemble_Accuracy: 0.780000
Ensemble_AUC: 0.776799


In [None]:
# Sanity check of the combined feature frame: (rows, total embedding columns).
ensemble_df.shape

(1000, 1660)

In [None]:

# Concatenate eight embeddings column-wise and evaluate a default SVC.
# NOTE(review): G_X (GeoScattering) and NetLSD_X were listed in the original
# cell but never added to the concat, so this "full" ensemble excludes them --
# confirm whether that is intentional.
LDP_X_df = pd.DataFrame(LDP_X)
# The x10 and /10 factors are hand-picked rescalings; proper normalisation is still a TODO.
W_X_df = pd.DataFrame(10*W_X) #need to normalize
F_X_df = pd.DataFrame(10*F_X) #need to normalize
GL_X_df = pd.DataFrame(10*GL_X) #need to normalize
IGE_X_df = pd.DataFrame(10*IGE_X) #need to normalize
SF_X_df = pd.DataFrame(10*SF_X) #need to normalize
FGSD_X_df = pd.DataFrame(FGSD_X/10) #need to normalize
Graph_X_df = pd.DataFrame(10*Graph_X) #need to normalize

ensemble_df = pd.concat([LDP_X_df, W_X_df, F_X_df, GL_X_df, IGE_X_df, SF_X_df, FGSD_X_df, Graph_X_df], axis=1, join="inner")
Ensemble_X = ensemble_df.to_numpy()

# X_train / y_train from this split are reused by the neural-network cells below.
X_train, X_test, y_train, y_test = train_test_split(Ensemble_X, y, test_size=0.2, random_state=42)
SVM_model = svm.SVC().fit(X_train, y_train)
y_pred = SVM_model.predict(X_test)
# Stored under their own names: the original wrote these into SF_acc/SF_auc and
# thereby overwrote the SF embedding's results.
FullEnsemble_acc = accuracy_score(y_test, y_pred, normalize=True)
# NOTE(review): AUC is computed from hard labels, as in the other SVM cells.
FullEnsemble_auc = roc_auc_score(y_test, y_pred)
print('Full Ensemble SVM')
print('Full Ensemble_Accuracy: {:f}'.format(FullEnsemble_acc))
print('Full Ensemble_AUC: {:f}'.format(FullEnsemble_auc))

Full Ensemble SVM
Full Ensemble_Accuracy: 0.625000
Full Ensemble_AUC: 0.619107


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# Turn the training split of the full ensemble into a float32 tensor.
# from_numpy() shares memory with X_train, so clone() first to get an
# independent copy before casting.
ensemble_tensor = torch.from_numpy(X_train).clone().to(torch.float32)


In [None]:
# Preview the first mini-batch of features.
# The original sliced with `i` and `batch_size` left over from the training
# loop further down: on a fresh kernel that is a NameError, and after training
# it yields the empty batch shown in the recorded output.
batch_size = 5  # same batch size the training cell uses
Xbatch = ensemble_tensor[0:batch_size]
Xbatch.size()

torch.Size([0, 2976])

In [None]:
class BiClassifier(nn.Module):
    """Small MLP for binary classification: input dropout, two ReLU hidden layers, sigmoid output.

    Args:
        in_features: width of the input vectors. When omitted, the column count
            of the notebook-global ``ensemble_tensor`` is used, which is what
            the original hard-coded.
        hidden_features: width of both hidden layers (default 256).
        dropout: drop probability applied to the input features (default 0.2).
    """

    def __init__(self, in_features=None, hidden_features=256, dropout=0.2):
        super().__init__()
        if in_features is None:
            # Backward-compatible default: size the first layer from the training tensor.
            in_features = ensemble_tensor.shape[1]
        self.dropout = nn.Dropout(dropout)
        self.hidden1 = nn.Linear(in_features, hidden_features)
        self.act1 = nn.ReLU()
        self.hidden2 = nn.Linear(hidden_features, hidden_features)
        self.act2 = nn.ReLU()
        self.output = nn.Linear(hidden_features, 1)
        self.act_output = nn.Sigmoid()

    def forward(self, x):
        """Map a (batch, in_features) tensor to sigmoid outputs of shape (batch, 1)."""
        x = self.dropout(x)  # dropout is applied to the raw inputs only
        x = self.act1(self.hidden1(x))
        x = self.act2(self.hidden2(x))
        x = self.act_output(self.output(x))
        return x


# Seed PyTorch before the weights are initialised so a re-run starts from the
# same model (42 matches the seed used for the train/test splits).
torch.manual_seed(42)
model = BiClassifier()

# train the model
loss_fn   = nn.BCELoss()  # binary cross entropy; pairs with the sigmoid output layer
optimizer = optim.Adam(model.parameters(), lr=0.0001)

n_epochs = 200
batch_size = 5

print(model)

BiClassifier(
  (dropout): Dropout(p=0.2, inplace=False)
  (hidden1): Linear(in_features=2976, out_features=256, bias=True)
  (act1): ReLU()
  (hidden2): Linear(in_features=256, out_features=256, bias=True)
  (act2): ReLU()
  (output): Linear(in_features=256, out_features=1, bias=True)
  (act_output): Sigmoid()
)


In [None]:
# Report the size and element type of the training tensor, one per line.
print(ensemble_tensor.shape, ensemble_tensor.dtype, sep='\n')

torch.Size([800, 2976])
torch.float32


In [None]:
# Report the size and element type of the previewed feature batch, one per line.
print(Xbatch.shape, Xbatch.dtype, sep='\n')

torch.Size([0, 2976])
torch.float32


In [None]:
# Training labels as a float32 column vector, the target layout the training
# cell passes to loss_fn alongside the model output.
y_t = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)
y_t

In [None]:
# Preview the first mini-batch of labels (batch_size comes from the model setup cell).
# The original sliced with the loop variable `i` left over from the training
# loop further down: on a fresh kernel that is a NameError, and after training
# it yields the empty tensor shown in the recorded output.
ybatch = y_t[0:batch_size]
ybatch


tensor([], size=(0, 1))

In [None]:
# Inspect y_pred. When the notebook is run top to bottom this is still the array
# of hard 0/1 SVM predictions from the full-ensemble cell; the training cell
# further down rebinds y_pred to the network's output tensor.
y_pred

array([1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0,
       1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0,
       1, 1])

In [None]:
# Confirm the label tensor is a column vector with one row per training example.
y_t.shape

torch.Size([800, 1])

In [None]:
# Number of training labels; should equal the number of rows in ensemble_tensor.
len(y_train)

800

In [None]:
 
n_epochs = 200
batch_size = 5

# Mini-batch training over the rows in their stored order (no shuffling).
model.train()  # dropout must be active while fitting, also when this cell is re-run
for epoch in range(n_epochs):
    for i in range(0, len(ensemble_tensor), batch_size):
        Xbatch = ensemble_tensor[i:i+batch_size]
        ybatch = y_t[i:i+batch_size]
        y_pred = model(Xbatch)
        loss = loss_fn(y_pred, ybatch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

# Predict on the training tensor for the metric cells below.
# eval() switches dropout off so the predictions are deterministic, and
# no_grad() avoids building an autograd graph for a pure inference pass.
# NOTE(review): ensemble_tensor is the *training* split, so anything computed
# from this y_pred is a training-set metric, not a held-out score.
model.eval()
with torch.no_grad():
    y_pred = model(ensemble_tensor)
# accuracy = (y_pred.round() == y_test).float().mean()
# print(f"Accuracy {accuracy}")
 
# # make class predictions with the model
# predictions = (model(X_test) > 0.5).int()
# for i in range(5):
#     print('%s => %d (expected %d)' % (X_test[i].tolist(), predictions[i], y_test[i]))

In [None]:
# y_pred = model(ensemble_tensor)
# accuracy = (y_pred.round() == y_t).float().mean()
# print(f"Accuracy {accuracy}")

Accuracy 0.8287500143051147


In [None]:
# detach() drops autograd tracking so the network output can be converted to NumPy.
y_numpy = y_pred.detach().numpy()
# Round the sigmoid outputs to hard 0/1 labels for the accuracy computation.
y_round = y_numpy.round()

In [None]:
# Metrics for the neural network. y_t is built from y_train and the predictions
# come from the training tensor, so these are training-set scores.
# NOTE(review): the SF_* names are reused here only because the print cell
# below reads them; they no longer describe the SF embedding.
SF_acc = accuracy_score(y_t, y_round, normalize=True)
# AUC needs the continuous scores: feeding it the rounded labels (as before)
# collapses the ROC curve to a single threshold. Both inputs are flattened to 1-D.
SF_auc = roc_auc_score(y_t.numpy().ravel(), y_numpy.ravel())

In [None]:
# Report the neural-network scores held in SF_acc / SF_auc.
# NOTE(review): those values are computed against y_t (the training labels),
# so despite the label they are training-set metrics.
print(
    'Full Ensemble NN',
    'Full Ensemble_Accuracy: {:f}'.format(SF_acc),
    'Full Ensemble_AUC: {:f}'.format(SF_auc),
    sep='\n',
)

Full Ensemble NN
Full Ensemble_Accuracy: 0.857500
Full Ensemble_AUC: 0.856986


In [None]:
# !pip install torch
# !pip install torch_geometric
# import torch
# torchversion = torch.__version__

# # Install PyTorch Scatter, PyTorch Sparse, and PyTorch Geometric
# !pip install -q torch-scatter -f https://data.pyg.org/whl/torch-{torchversion}.html
# !pip install -q torch-sparse -f https://data.pyg.org/whl/torch-{torchversion}.html
# !pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

# # Numpy for matrices
# import numpy as np
# np.random.seed(0)

# # Visualization
# import networkx as nx
# from sklearn.manifold import TSNE
# import matplotlib.pyplot as plt

In [None]:
# import torch.nn.functional as F
# from torch.nn import Linear, Dropout
# from torch_geometric.nn import GCNConv, GATv2Conv


# class GCN(torch.nn.Module):
#   """Graph Convolutional Network"""
#   def __init__(self, dim_in, dim_h, dim_out):
#     super().__init__()
#     self.gcn1 = GCNConv(dim_in, dim_h)
#     self.gcn2 = GCNConv(dim_h, dim_out)
#     self.optimizer = torch.optim.Adam(self.parameters(),
#                                       lr=0.01,
#                                       weight_decay=5e-4)

#   def forward(self, x, edge_index):
#     h = F.dropout(x, p=0.5, training=self.training)
#     h = self.gcn1(h, edge_index)
#     h = torch.relu(h)
#     h = F.dropout(h, p=0.5, training=self.training)
#     h = self.gcn2(h, edge_index)
#     return h, F.log_softmax(h, dim=1)


# class GAT(torch.nn.Module):
#   """Graph Attention Network"""
#   def __init__(self, dim_in, dim_h, dim_out, heads=8):
#     super().__init__()
#     self.gat1 = GATv2Conv(dim_in, dim_h, heads=heads)
#     self.gat2 = GATv2Conv(dim_h*heads, dim_out, heads=1)
#     self.optimizer = torch.optim.Adam(self.parameters(),
#                                       lr=0.005,
#                                       weight_decay=5e-4)

#   def forward(self, x, edge_index):
#     h = F.dropout(x, p=0.6, training=self.training)
#     h = self.gat1(x, edge_index)
#     h = F.elu(h)
#     h = F.dropout(h, p=0.6, training=self.training)
#     h = self.gat2(h, edge_index)
#     return h, F.log_softmax(h, dim=1)

# def accuracy(pred_y, y):
#     """Calculate accuracy."""
#     return ((pred_y == y).sum() / len(y)).item()

# def train(model, data):
#     """Train a GNN model and return the trained model."""
#     criterion = torch.nn.CrossEntropyLoss()
#     optimizer = model.optimizer
#     epochs = 200

#     model.train()
#     for epoch in range(epochs+1):
#         # Training
#         optimizer.zero_grad()
#         _, out = model(data.x, data.edge_index)
#         loss = criterion(out[data.train_mask], data.y[data.train_mask])
#         acc = accuracy(out[data.train_mask].argmax(dim=1), data.y[data.train_mask])
#         loss.backward()
#         optimizer.step()

#         # Validation
#         val_loss = criterion(out[data.val_mask], data.y[data.val_mask])
#         val_acc = accuracy(out[data.val_mask].argmax(dim=1), data.y[data.val_mask])

#         # Print metrics every 10 epochs
#         if(epoch % 10 == 0):
#             print(f'Epoch {epoch:>3} | Train Loss: {loss:.3f} | Train Acc: '
#                   f'{acc*100:>6.2f}% | Val Loss: {val_loss:.2f} | '
#                   f'Val Acc: {val_acc*100:.2f}%')
          
#     return model

# def test(model, data):
#     """Evaluate the model on test set and print the accuracy score."""
#     model.eval()
#     _, out = model(data.x, data.edge_index)
#     acc = accuracy(out.argmax(dim=1)[data.test_mask], data.y[data.test_mask])
#     return acc

In [None]:
# %%time

# # Create GAT model
# gat = GAT(dataset.num_features, 8, dataset.num_classes)
# print(gat)

# # Train
# train(gat, data)

# # Test
# acc = test(gat, data)
# print(f'\nGAT test accuracy: {acc*100:.2f}%\n')