In [2]:
import pickle
import os
import sys
import pandas as pd

from tqdm.notebook import tqdm

sys.path.append("../../")
sys.path.append("../../src/")
from src.data.caches import CachedEmbeddings
from src.embed import helpers
from src.classifiers import logreg as lr
from src.classifiers import features

In [3]:
# Settings passed as the final argument to lr.LogReg wherever a model is built.
# NOTE(review): presumably forwarded to the underlying logistic-regression
# estimator (no regularization, Newton-CG solver) — confirm in src.classifiers.logreg.
LOGREG = dict(penalty=None, solver="newton-cg")

# Once-through: single cache

In [4]:
# Load one embedding cache and the original edgelists it was derived from.
# NOTE: pickle.load can execute arbitrary code; only point this at trusted files.
with open(
    "../../data/input/SYSLFR/caches/"
    "method-N2V_percomponent-False_dim-128_embrep-0_remnants_theta-0.05_strategy-RANDOM_remrep-1_"
    "edgelists_name-LFR_N-10000_T1-2.1_T2-1.0_kavg-20.0-kmax-100_mu-0.1_prob-1.0_rep-1.pkl",
    "rb",
) as _fh:
    cache = pickle.load(_fh)

with open(
    "../../data/input/SYSLFR/edgelists/"
    "edgelists_name-LFR_N-10000_T1-2.1_T2-1.0_kavg-20.0-kmax-100_mu-0.1_prob-1.0_rep-1.pkl",
    "rb",
) as _fh:
    mplx = pickle.load(_fh)

In [5]:
# Relabel every cached edge against layer 0 of the original edgelists:
# 1 when that layer contains the edge, 0 otherwise.
fixed_observed_edges = {
    edge: (1 if mplx[0].has_edge(*edge) else 0)
    for edge in cache.observed_edges
}

# Same relabelling for the held-out (unobserved) edges.
fixed_unobserved_edges = {
    edge: (1 if mplx[0].has_edge(*edge) else 0)
    for edge in cache.unobserved_edges
}

In [6]:
def _edge_features(cache, edges):
    """Build the feature matrix and label vector for a set of edges.

    Shared implementation behind ``get_training_features`` and
    ``get_testing_features``; the two only differ in which edge set they
    are given.

    Parameters
    ----------
    cache
        Object exposing ``remnants[0]``/``remnants[1]`` (each with a
        ``.remnant`` attribute) and ``embeddings[0]``/``embeddings[1]``
        (each with a ``.vectors`` attribute).
    edges
        Edge collection handed unchanged to the ``features`` helpers.
        # NOTE(review): presumably a dict mapping edge -> 0/1 label — confirm.

    Returns
    -------
    tuple
        ``(X, Y)`` where ``X`` comes from ``features.format_feature_matrix``
        and ``Y`` from ``features.get_labels``.
    """
    # Labels
    Y = features.get_labels(edges)

    # Degree feature: product of the two values returned per edge by
    # features.get_degrees (presumably source/target degrees), for each layer.
    src_G, tgt_G = features.get_degrees(cache.remnants[0].remnant, edges)
    src_H, tgt_H = features.get_degrees(cache.remnants[1].remnant, edges)
    degree_products_G = src_G * tgt_G
    degree_products_H = src_H * tgt_H
    X_degrees = features.as_configuration(degree_products_G, degree_products_H)

    # Distances feature: one per layer, computed from that layer's embedding vectors.
    distances_G = features.get_distances(cache.embeddings[0].vectors, edges)
    distances_H = features.get_distances(cache.embeddings[1].vectors, edges)
    X_distances = features.as_configuration(distances_G, distances_H)

    # Feature matrix: degree block first, distance block second.
    X = features.format_feature_matrix((X_degrees, X_distances))

    return X, Y


def get_training_features(cache, observed_edges):
    """Return ``(X, Y)`` for the observed (training) edges of ``cache``."""
    return _edge_features(cache, observed_edges)


def get_testing_features(cache, unobserved_edges):
    """Return ``(X, Y)`` for the unobserved (testing) edges of ``cache``."""
    return _edge_features(cache, unobserved_edges)

In [7]:
# Build (features, labels) for the relabelled observed (train) and unobserved (test) edges.
# NOTE(review): unlike evaluate() in the sweep section, the embeddings are not
# normalized here before distances are computed — confirm this is intended.
X_train, Y_train = get_training_features(cache, fixed_observed_edges)
X_test, Y_test = get_testing_features(cache, fixed_unobserved_edges)

In [8]:
# Construct (and presumably fit) the classifier from the training split, with LOGREG as settings.
# NOTE(review): ("deg", "emb") presumably names the two feature groups and dict() is an
# empty metadata/options mapping — confirm against src.classifiers.logreg.
model = lr.LogReg("LogReg", ("deg", "emb"), dict(), X_train, Y_train, LOGREG)

In [9]:
# Score the held-out edges with the "AUROC" metric and print the result.
print(model.testing_performance(X_test, Y_test, "AUROC"))

0.719195472186454


# Sweep

In [10]:
def get_data(cache_fp):
    """Load a pickled cache and relabel its edges against the original edgelists.

    The matching edgelists file is located by taking everything after the
    first ``'edgelists'`` in ``cache_fp`` and appending it to the fixed
    edgelists directory prefix. Both ``cache.observed_edges`` and
    ``cache.unobserved_edges`` are replaced with dicts mapping each edge to
    1 if layer 0 of the edgelists contains it and 0 otherwise.

    Parameters
    ----------
    cache_fp : str
        Path to the pickled cache; must contain the substring ``'edgelists'``.

    Returns
    -------
    The unpickled cache object with its two edge attributes overwritten.
    """
    def _relabel(edges):
        # 1 when the edge is present in layer 0 of the original edgelists, else 0
        return {
            edge: (1 if edgelists[0].has_edge(*edge) else 0)
            for edge in edges
        }

    # NOTE: pickle.load can execute arbitrary code; only use trusted files.
    with open(cache_fp, 'rb') as cache_fh:
        cache = pickle.load(cache_fh)

    # Derive the original edgelists path from the cache filename suffix
    suffix = cache_fp.split('edgelists')[1]
    edgelists_fp = f"../../data/input/SYSLFR/edgelists/edgelists{suffix}"
    with open(edgelists_fp, 'rb') as edgelists_fh:
        edgelists = pickle.load(edgelists_fh)

    # Training edges first, then testing edges
    cache.observed_edges = _relabel(cache.observed_edges)
    cache.unobserved_edges = _relabel(cache.unobserved_edges)

    return cache

In [11]:
def get_training_features(cache, observed_edges):
    """Assemble the training feature matrix and labels for ``observed_edges``.

    NOTE(review): this redefines (and shadows) the function of the same name
    from the "Once-thru" section; the logic is the same.

    Returns
    -------
    tuple
        ``(X, Y)``: the output of ``features.format_feature_matrix`` over the
        degree and distance blocks, and the output of ``features.get_labels``.
    """
    labels = features.get_labels(observed_edges)

    # Degree block: per-layer product of the two arrays from get_degrees
    per_layer_degrees = [
        features.get_degrees(cache.remnants[layer].remnant, observed_edges)
        for layer in (0, 1)
    ]
    degree_block = features.as_configuration(
        *(src * tgt for src, tgt in per_layer_degrees)
    )

    # Distance block: per-layer distances from the embedding vectors
    per_layer_distances = [
        features.get_distances(cache.embeddings[layer].vectors, observed_edges)
        for layer in (0, 1)
    ]
    distance_block = features.as_configuration(*per_layer_distances)

    return features.format_feature_matrix((degree_block, distance_block)), labels

def get_testing_features(cache, unobserved_edges):
    """Assemble the testing feature matrix and labels for ``unobserved_edges``.

    NOTE(review): this redefines (and shadows) the function of the same name
    from the "Once-thru" section; the logic is the same.

    Returns
    -------
    tuple
        ``(X, Y)``: the output of ``features.format_feature_matrix`` over the
        degree and distance blocks, and the output of ``features.get_labels``.
    """
    labels = features.get_labels(unobserved_edges)

    # Degree block: per-layer product of the two arrays from get_degrees
    per_layer_degrees = [
        features.get_degrees(cache.remnants[layer].remnant, unobserved_edges)
        for layer in (0, 1)
    ]
    degree_block = features.as_configuration(
        *(src * tgt for src, tgt in per_layer_degrees)
    )

    # Distance block: per-layer distances from the embedding vectors
    per_layer_distances = [
        features.get_distances(cache.embeddings[layer].vectors, unobserved_edges)
        for layer in (0, 1)
    ]
    distance_block = features.as_configuration(*per_layer_distances)

    return features.format_feature_matrix((degree_block, distance_block)), labels

In [15]:
def evaluate(cache):
    """Train a LogReg on one cache and score it on the held-out edges.

    Both embeddings are normalized first (``cache.embeddings[i].normalize`` is
    called with the components of the matching remnant — presumably an
    in-place update of ``cache``; confirm in the embedding class), then
    features are built for the observed and unobserved edges, a model is
    constructed with ``LOGREG``, and three metrics are computed.

    Parameters
    ----------
    cache
        Cache as returned by ``get_data``: exposes ``embeddings``,
        ``remnants``, ``observed_edges`` and ``unobserved_edges``.

    Returns
    -------
    dict
        ``{"accuracy", "auroc", "pr"}``. All three values are ``None`` when a
        ``KeyError`` occurred anywhere in the pipeline (a message is printed).

    Notes
    -----
    The previous version returned from a ``finally`` clause, which discards
    any in-flight exception: the ``raise`` in the handler never took effect
    and every other failure (including ``KeyboardInterrupt``) was silently
    turned into an all-``None`` record. KeyError is still reported and mapped
    to the empty record so a sweep keeps going; anything else now propagates.
    """
    record = {
        "accuracy": None,
        "auroc": None,
        "pr": None
    }

    try:
        # Set up features
        cache.embeddings[0].normalize(helpers.get_components(cache.remnants[0].remnant))
        cache.embeddings[1].normalize(helpers.get_components(cache.remnants[1].remnant))
        X_train, Y_train = get_training_features(cache, cache.observed_edges)
        X_test, Y_test = get_testing_features(cache, cache.unobserved_edges)

        # Train model
        model = lr.LogReg("LogReg", ("deg", "emb"), dict(), X_train, Y_train, LOGREG)

        # Evaluate reconstruction(s)
        acc = model.testing_performance(X_test, Y_test, "ACC")
        auroc = model.testing_performance(X_test, Y_test, "AUROC")
        pr = model.testing_performance(X_test, Y_test, "PR")
    except KeyError as err:
        # Known failure mode during the sweep: report it and hand back the
        # all-None record so the caller can still log this cache's parameters.
        print(f"KeyError!!! {err}")
        return record

    # Form output record (only reached when every step succeeded)
    record.update({
        "accuracy": acc,
        "auroc": auroc,
        "pr": pr
    })
    return record

In [16]:
def _filename_param(cache_name, key):
    """Extract the value following ``key`` in an underscore-delimited cache filename.

    Takes the text after the first occurrence of ``key``, cuts it at the next
    underscore, and drops the leading separator character
    (e.g. ``"..._mu-0.1_prob-..."`` with key ``"mu"`` gives ``"0.1"``).
    """
    return cache_name.split(key)[1].split("_")[0][1:]


def analysis():
    """Evaluate every cache in the SYSLFR cache directory and save one CSV of results.

    For each file, the cache is loaded via ``get_data``, scored via
    ``evaluate``, and the resulting record is extended with the ``method``,
    ``theta`` and ``mu`` values parsed from the filename. All records are
    written to ``dataframe_EMB_exSYSLFR.csv``.

    Returns
    -------
    None
        The CSV file is the only output.
    """
    cache_dir = "../../data/input/SYSLFR/caches/"  # where to find data
    # os.listdir returns entries in arbitrary order; sort so the processing
    # order and the CSV row order are reproducible across runs and machines.
    cache_names = sorted(os.listdir(cache_dir))
    idx_total = len(cache_names)  # how many reconstructions to do

    records = []  # dicts of records
    for idx, cache_name in enumerate(cache_names):
        # Print status
        print(f"{idx} / {idx_total}")

        # Bring cache into scope
        cache_fp = cache_dir + cache_name
        cache = get_data(cache_fp)

        # Evaluate cache
        record = evaluate(cache)

        # Append parameter information; parse the bare filename so that
        # directory names can never be mistaken for a parameter key.
        record.update({
            "method": _filename_param(cache_name, "method"),
            "theta": _filename_param(cache_name, "theta"),
            "mu": _filename_param(cache_name, "mu"),
        })

        records.append(record)

    # Save to disk
    df = pd.DataFrame.from_records(records)
    df.to_csv("../../data/output/dataframes/dataframe_EMB_exSYSLFR.csv", index=False)

In [17]:
# Run the full sweep over the cache directory; writes the results CSV as a side effect.
analysis()

0 / 537
KeyError!!! 0
1 / 537
KeyError!!! 0
2 / 537
KeyError!!! 0
3 / 537
4 / 537
5 / 537
KeyError!!! 0
6 / 537
KeyError!!! 0
7 / 537
KeyError!!! 0
8 / 537
KeyError!!! 0
9 / 537
KeyError!!! 0
10 / 537
KeyError!!! 0
11 / 537
12 / 537
KeyError!!! 0
13 / 537
14 / 537
KeyError!!! 0
15 / 537
16 / 537
KeyError!!! 0
17 / 537
18 / 537
19 / 537
20 / 537
KeyError!!! 0
21 / 537
22 / 537
KeyError!!! 0
23 / 537
KeyError!!! 0
24 / 537
KeyError!!! 0
25 / 537
26 / 537
27 / 537
28 / 537
KeyError!!! 0
29 / 537
KeyError!!! 0
30 / 537
31 / 537
32 / 537
33 / 537
34 / 537
KeyError!!! 0
35 / 537
KeyError!!! 0
36 / 537
KeyError!!! 0
37 / 537
KeyError!!! 0
38 / 537
KeyError!!! 0
39 / 537
KeyError!!! 0
40 / 537
KeyError!!! 0
41 / 537
42 / 537
KeyError!!! 0
43 / 537
KeyError!!! 0
44 / 537
KeyError!!! 0
45 / 537
46 / 537
KeyError!!! 0
47 / 537
48 / 537
49 / 537
KeyError!!! 0
50 / 537
51 / 537
KeyError!!! 0
52 / 537
53 / 537
KeyError!!! 0
54 / 537
55 / 537
56 / 537
57 / 537
58 / 537
59 / 537
60 / 537
61 / 537
KeyE

# Recreate prior manuscript