In [1]:
import pickle
import os
import sys
import pandas as pd

from tqdm.notebook import tqdm

sys.path.append("../../")
sys.path.append("../../src/")
from src.data.caches import CachedEmbeddings
from src.embed import helpers
from src.classifiers import logreg as lr
from src.classifiers import features

In [2]:
# Settings passed as the final argument to lr.LogReg wherever a model is built.
# NOTE(review): if these are forwarded to scikit-learn's LogisticRegression,
# the string "none" was deprecated in 1.2 and removed in 1.4 in favour of
# None — confirm against the scikit-learn version this project pins.
LOGREG = dict(penalty="none", solver="newton-cg")

# Once-through

In [3]:
# One cached run and the edgelists file whose name matches the cache's
# "edgelists_..." suffix.
_cache_path = "../../data/input/caches/method-LE_percomponent-False_dim-128_embrep-0_remnants_theta-0.05_strategy-RANDOM_remrep-1_edgelists_name-LFR_N-10000_T1-2.1_T2-1.0_kavg-20.0-kmax-100_mu-0.1_prob-1.0_rep-1.pkl"
_edgelists_path = "../../data/input/data_input_manuscript_initial/edgelists_name-LFR_N-10000_T1-2.1_T2-1.0_kavg-20.0-kmax-100_mu-0.1_prob-1.0_rep-1.pkl"

# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(_cache_path, "rb") as _fh:
    cache = pickle.load(_fh)
with open(_edgelists_path, "rb") as _fh:
    mplx = pickle.load(_fh)

In [3]:
# Re-derive every edge label from the original edgelists: an edge is labelled
# 1 when mplx[0] contains it and 0 otherwise, for both edge sets in the cache.
fixed_observed_edges = {
    edge: (1 if mplx[0].has_edge(*edge) else 0)
    for edge in cache.observed_edges
}

fixed_unobserved_edges = {
    edge: (1 if mplx[0].has_edge(*edge) else 0)
    for edge in cache.unobserved_edges
}

In [4]:
def _edge_features(cache, edges):
    """Build the feature matrix and labels for one collection of edges.

    Shared implementation behind get_training_features and
    get_testing_features, which apply the same steps to different edge sets.

    Parameters
    ----------
    cache
        Object exposing ``remnants[0..1].remnant`` and
        ``embeddings[0..1].vectors``.
    edges
        Edge collection handed unchanged to the ``features`` helpers.

    Returns
    -------
    tuple
        ``(X, Y)``: the output of ``features.format_feature_matrix`` and the
        output of ``features.get_labels``.
    """
    Y = features.get_labels(edges)

    # Degree feature: per layer, the product of the two values returned by
    # features.get_degrees (presumably source and target degree — confirm).
    src_G, tgt_G = features.get_degrees(cache.remnants[0].remnant, edges)
    src_H, tgt_H = features.get_degrees(cache.remnants[1].remnant, edges)
    X_degrees = features.as_configuration(src_G * tgt_G, src_H * tgt_H)

    # Distance feature: computed from each layer's embedding vectors.
    distances_G = features.get_distances(cache.embeddings[0].vectors, edges)
    distances_H = features.get_distances(cache.embeddings[1].vectors, edges)
    X_distances = features.as_configuration(distances_G, distances_H)

    # Feature matrix: degree feature first, distance feature second.
    X = features.format_feature_matrix((X_degrees, X_distances))

    return X, Y


def get_training_features(cache, observed_edges):
    """Return ``(X, Y)`` for the observed (training) edges of ``cache``."""
    return _edge_features(cache, observed_edges)


def get_testing_features(cache, unobserved_edges):
    """Return ``(X, Y)`` for the unobserved (testing) edges of ``cache``."""
    return _edge_features(cache, unobserved_edges)

In [6]:
# Featurise the relabelled edges: observed edges feed training, unobserved
# edges feed testing.
(X_train, Y_train), (X_test, Y_test) = (
    get_training_features(cache, fixed_observed_edges),
    get_testing_features(cache, fixed_unobserved_edges),
)

In [7]:
# Build the logistic-regression wrapper from the training features; LOGREG
# carries the penalty/solver settings defined at the top of the notebook.
model = lr.LogReg(
    "LogReg",
    ("deg", "emb"),
    {},
    X_train,
    Y_train,
    LOGREG,
)

In [8]:
# Score the model on the features built from the unobserved edges; the
# "AUROC" argument selects which metric testing_performance reports.
print(model.testing_performance(X_test, Y_test, "AUROC"))

0.6895269156463264


# Sweep

In [3]:
def _label_edges_by_presence(edges, graph):
    """Map each edge to 1 if ``graph.has_edge(*edge)`` is truthy, else 0."""
    return {edge: (1 if graph.has_edge(*edge) else 0) for edge in edges}


def get_data(cache_fp, edgelists_dir="../../data/input/data_input_manuscript_initial/"):
    """Load a cache and relabel its edges against the original edgelists.

    Parameters
    ----------
    cache_fp : str
        Path to a pickled cache whose file name contains ``edgelists``;
        everything after that marker names the matching edgelists pickle.
    edgelists_dir : str, optional
        Directory holding the original edgelists pickles. Defaults to the
        location this notebook has always read from.

    Returns
    -------
    The unpickled cache, with ``observed_edges`` and ``unobserved_edges``
    replaced by ``{edge: 0 or 1}`` dicts recording whether each edge is
    present in ``edgelists[0]``.

    Raises
    ------
    ValueError
        If the file name of ``cache_fp`` does not contain ``edgelists``.
    """
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(cache_fp, 'rb') as _fh:
        cache = pickle.load(_fh)

    # Search only the file name, so a directory that happens to contain
    # "edgelists" cannot corrupt the derived suffix.
    _, marker, suffix = os.path.basename(cache_fp).partition("edgelists")
    if not marker:
        raise ValueError(f"Cache file name has no 'edgelists' marker: {cache_fp!r}")
    edgelists_fp = os.path.join(edgelists_dir, f"edgelists{suffix}")
    with open(edgelists_fp, 'rb') as _fh:
        edgelists = pickle.load(_fh)

    # Fix edge labels against the first entry of the original edgelists.
    reference = edgelists[0]
    cache.observed_edges = _label_edges_by_presence(cache.observed_edges, reference)
    cache.unobserved_edges = _label_edges_by_presence(cache.unobserved_edges, reference)

    return cache

In [4]:
def _edge_features(cache, edges):
    """Build the feature matrix and labels for one collection of edges.

    Shared implementation behind get_training_features and
    get_testing_features, which apply the same steps to different edge sets.

    Parameters
    ----------
    cache
        Object exposing ``remnants[0..1].remnant`` and
        ``embeddings[0..1].vectors``.
    edges
        Edge collection handed unchanged to the ``features`` helpers.

    Returns
    -------
    tuple
        ``(X, Y)``: the output of ``features.format_feature_matrix`` and the
        output of ``features.get_labels``.
    """
    Y = features.get_labels(edges)

    # Degree feature: per layer, the product of the two values returned by
    # features.get_degrees (presumably source and target degree — confirm).
    src_G, tgt_G = features.get_degrees(cache.remnants[0].remnant, edges)
    src_H, tgt_H = features.get_degrees(cache.remnants[1].remnant, edges)
    X_degrees = features.as_configuration(src_G * tgt_G, src_H * tgt_H)

    # Distance feature: computed from each layer's embedding vectors.
    distances_G = features.get_distances(cache.embeddings[0].vectors, edges)
    distances_H = features.get_distances(cache.embeddings[1].vectors, edges)
    X_distances = features.as_configuration(distances_G, distances_H)

    # Feature matrix: degree feature first, distance feature second.
    X = features.format_feature_matrix((X_degrees, X_distances))

    return X, Y


def get_training_features(cache, observed_edges):
    """Return ``(X, Y)`` for the observed (training) edges of ``cache``."""
    return _edge_features(cache, observed_edges)


def get_testing_features(cache, unobserved_edges):
    """Return ``(X, Y)`` for the unobserved (testing) edges of ``cache``."""
    return _edge_features(cache, unobserved_edges)

In [5]:
def evaluate(cache):
    """Train a LogReg model on one cache and score it on the testing edges.

    Both embeddings are normalized first, using the components of the
    matching remnant. The normalize result is not captured, so it presumably
    modifies the embedding in place — confirm.

    Returns
    -------
    dict
        ``{"accuracy": ..., "auroc": ..., "pr": ...}`` as reported by
        ``model.testing_performance`` for "ACC", "AUROC" and "PR".
    """
    # Normalize each layer's embedding against its own remnant
    for layer in (0, 1):
        components = helpers.get_components(cache.remnants[layer].remnant)
        cache.embeddings[layer].normalize(components)

    # Features for the observed (train) and unobserved (test) edges
    train_X, train_Y = get_training_features(cache, cache.observed_edges)
    test_X, test_Y = get_testing_features(cache, cache.unobserved_edges)

    # Train model
    classifier = lr.LogReg("LogReg", ("deg", "emb"), dict(), train_X, train_Y, LOGREG)

    # Score each metric in turn; record keys map to testing_performance codes
    metric_codes = (("accuracy", "ACC"), ("auroc", "AUROC"), ("pr", "PR"))
    return {
        key: classifier.testing_performance(test_X, test_Y, code)
        for key, code in metric_codes
    }

In [6]:
def _filename_param(filename, key):
    """Return the value of the ``<key>-<value>`` token in a cache file name.

    Cache file names are underscore-separated ``key-value`` tokens. Matching
    whole tokens avoids mis-parsing when ``key`` also occurs inside another
    token or a directory name.
    """
    stem = filename[:-len(".pkl")] if filename.endswith(".pkl") else filename
    prefix = f"{key}-"
    for token in stem.split("_"):
        if token.startswith(prefix):
            return token[len(prefix):]
    raise ValueError(f"No '{key}-<value>' token in file name: {filename!r}")


def analysis(
    cache_dir="../../data/input/caches/",
    output_fp="../../data/output/dataframes/dataframe_EMB_exSYSLFR_normalized.csv",
):
    """Evaluate every cache in ``cache_dir`` and write one CSV row per cache.

    Parameters
    ----------
    cache_dir : str, optional
        Directory of pickled caches. Hidden entries and sub-directories
        (e.g. ``.ipynb_checkpoints``) are skipped.
    output_fp : str, optional
        Destination CSV path.

    Each row holds the metrics returned by ``evaluate`` plus the ``method``,
    ``theta`` and ``mu`` values parsed from the cache file name. Nothing is
    returned; the result is only written to disk.

    Raises
    ------
    ValueError
        If a cache file name lacks a ``method-``, ``theta-`` or ``mu-`` token.
    """
    # os.listdir returns entries in arbitrary order; sort so that the CSV row
    # order is reproducible across runs and machines.
    filenames = sorted(
        name
        for name in os.listdir(cache_dir)
        if not name.startswith(".") and os.path.isfile(os.path.join(cache_dir, name))
    )

    records = []
    for filename in tqdm(filenames):
        # Bring cache into scope
        cache = get_data(os.path.join(cache_dir, filename))

        # Evaluate cache
        record = evaluate(cache)

        # Append parameter information
        record.update({
            "method": _filename_param(filename, "method"),
            "theta": _filename_param(filename, "theta"),
            "mu": _filename_param(filename, "mu"),
        })

        records.append(record)

    # Save to disk
    df = pd.DataFrame.from_records(records)
    df.to_csv(output_fp, index=False)

In [7]:
# Run the sweep over every cache; analysis() writes its results to CSV rather
# than returning them, so this cell shows only the progress bar.
analysis()

  0%|          | 0/57 [00:00<?, ?it/s]

# Recreate prior manuscript