In [None]:
# --- Standard library ---
import os
import pickle
import sys

# --- Network science ---
import networkx as nx

# --- Scientific computing ---
import numpy as np

# --- Data handling and visualization ---
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.ticker import AutoMinorLocator
%matplotlib inline

import seaborn as sns

# --- Miscellaneous ---
from tqdm.auto import tqdm

# --- Project source code ---
## PATH adjustments
# Put the directory two levels above the working directory, and its `src`
# subdirectory, on the import path so the project imports below resolve.
# The trailing "" component makes each joined path end with a separator.
FP_ROOT = os.path.join("..", "..", "")
FP_SRC = os.path.join(FP_ROOT, "src", "")
sys.path.extend((FP_ROOT, FP_SRC))

## Data
from src.data import caches

## Classifiers
from src.classifiers.features import *  # degree feature calculations, label wrapper
from src.classifiers.logreg import *  # wrappers for scikit-learn logistic regression model functions

In [None]:
"""
1. Load data
2. Form feature matrix
3. Train classifier
4. Test classifier
5. Save performance data to disk
"""

In [None]:
# Load data
# NOTE(review): FILEHANDLE (the cache file name) is not assigned anywhere in the
# visible notebook -- define it in a configuration cell before running this cell.
# The path is built from FP_ROOT so the project root is specified in one place.
cache_path = os.path.join(FP_ROOT, "data", "input", "SYSLFR", "caches", FILEHANDLE)

# CAUTION: pickle.load can execute arbitrary code while deserializing; only
# load cache files that were produced by this project.
with open(cache_path, "rb") as _fh:
    cache = pickle.load(_fh)

In [None]:
# Form feature matrix
# NOTE(review): this cell is an unfinished scaffold. `rem_G`, `rem_H`, `emb_G`
# and `emb_H` are not assigned anywhere in the visible notebook -- presumably
# they are meant to be unpacked from `cache`; confirm and add that step.
# TODO: port the remaining pre-processing steps (they currently live in
# unpushed local code).

# Get training and test edges
# TODO: populate these; both are empty placeholders for now.
training_edges = {}
test_edges = {}

# Get degree feature
# TODO
# `get_degrees` and `as_configuration` are presumably provided by the star
# import from src.classifiers.features -- verify.
# Training edges: per-layer product of the two values returned by get_degrees.
src_G, tgt_G = get_degrees(rem_G, training_edges)
src_H, tgt_H = get_degrees(rem_H, training_edges)
deg_prod_G_train = src_G * tgt_G
deg_prod_H_train = src_H * tgt_H
feature_degs_train = as_configuration(deg_prod_G_train, deg_prod_H_train)

# Test edges: same computation. Note that src_G/tgt_G/src_H/tgt_H are rebound
# here, so after this point they hold the test-edge values.
src_G, tgt_G = get_degrees(rem_G, test_edges)
src_H, tgt_H = get_degrees(rem_H, test_edges)
deg_prod_G_test = src_G * tgt_G
deg_prod_H_test = src_H * tgt_H
feature_degs_test = as_configuration(deg_prod_G_test, deg_prod_H_test)

# Get distances feature
# TODO
# `get_distances` is not visibly imported by name; presumably it also comes
# from one of the star imports -- verify.
dists_G_train = get_distances(emb_G, training_edges)
dists_H_train = get_distances(emb_H, training_edges)
dists_G_test = get_distances(emb_G, test_edges)
dists_H_test = get_distances(emb_H, test_edges)
feature_dists_train = as_configuration(dists_G_train, dists_H_train)
feature_dists_test = as_configuration(dists_G_test, dists_H_test)


# Form feature matrix
# Feature order is (degree, distance) for both the training and test matrices.
X_train = format_feature_matrix((feature_degs_train, feature_dists_train))
X_test = format_feature_matrix((feature_degs_test, feature_dists_test))

# Get labels
Y_train = get_labels(training_edges)
Y_test = get_labels(test_edges)

In [None]:
# Train classifier
# Settings handed to LogReg as its final positional argument.
logreg_params = dict(
    fit_intercept=True,  # keep an intercept term to account for class imbalance
    solver="newton-cholesky",  # Newton-type solver; solves the Hessian system via Cholesky decomposition
    penalty=None,  # no regularization on the coefficients
)

# NOTE(review): the first three arguments look like a model name, the feature
# names ("deg", "emb") and an empty options mapping -- confirm against LogReg.
model = LogReg(
    "LogReg",
    ("deg", "emb"),
    {},
    X_train,
    Y_train,
    logreg_params,
)

In [None]:
# Test classifier
# Score and reconstruct (predict) labels for the held-out feature matrix.
scored_labels = model.get_scores(X_test)
predicted_labels = model.get_reconstruction(X_test)

# Ground truth for the test edges. The labels built alongside X_test are
# Y_test; the previous reference to `Y_true` was never defined.
true_labels = Y_test

# Evaluate the same predictions under each performance measure.
accuracy = performance(scored_labels, predicted_labels, true_labels, "accuracy")
auroc = performance(scored_labels, predicted_labels, true_labels, "auroc")
pr = performance(scored_labels, predicted_labels, true_labels, "pr")