# Prereqs and config
---

In [18]:
# Standard library
import sys
import os
from enum import Enum

# Scientific computing
import numpy as np
import powerlaw
from sklearn.metrics import normalized_mutual_info_score as nmiscore

# Network science
import networkx as nx
import community as louvain

# Data management
import pandas as pd
from tabulate import tabulate

# Data viz
from IPython.display import Image
import seaborn as sns
import altair as alt
import matplotlib.pyplot as plt
%matplotlib inline

# Miscellaneous
## Path adjustment
sys.path.append("..")

## Imports
from structs import core
from structs import multiplex
from utils import performance

## Aliases
read_edgelist = core.read_edgelist
Multiplex = multiplex.Multiplex
MultiplexCorpus = multiplex.MultiplexCorpus
auc = performance.get_auc

In [19]:
# Config
## Layer ids retained for each multiplex, keyed by dataset name.
## Any layer id not listed for a dataset is dropped when the corpus is built.
accepted_layers = dict(
    aireu=(1, 2, 3),
    arxiv=(2, 6, 7),
    celegans=(1, 2, 3),
    drosophila=(1, 2, 3, 4),
    london=(1, 2, 3),
)

# Parametrized analysis over corpus
---

## Gathering corpus

In [20]:
# Create multiplex corpus
## Individual Multiplex objects are collected here, then wrapped once at the end
multiplexes = []

## Initializing each multiplex
## * Note: Some layers are statistically unsound (too small) and removed
## * Note: os.listdir returns entries in arbitrary order, so the listing is sorted
##   to keep the corpus (and every table derived from it) in a reproducible order.
##   Hidden files (e.g. .DS_Store) are skipped since they are not edgelists.
for filename in sorted(os.listdir("../data/multiplexes")):
    if filename.startswith("."):
        continue
    filepath = f"../data/multiplexes/{filename}"

    ### Read from edgelist
    layers_ = read_edgelist(
        filepath=filepath, network_type="MULTIPLEX", output_type="STANDARD"
    )

    ### Dataset name is the text between the first "-" and the next "." of the filename
    name_ = filename.split("-")[1].split(".")[0]  # Particular for this naming convention!

    ### Restrict layers, removing small layers or those without precedence
    ### (a dataset missing from accepted_layers raises KeyError here, on purpose)
    layers_ = {
        key: value for key, value in layers_.items()
        if key in accepted_layers[name_]
    }

    ### Instantiate as class and append to corpus
    multiplex_ = Multiplex(multiplex_dict=layers_)
    multiplex_.name = name_
    multiplexes.append(multiplex_)

## Initialize MultiplexCorpus object
corpus = MultiplexCorpus(multiplexes)

## Applying workflow

### Gathering summary tables

In [21]:
# Gather summary dataframes
## Collect each multiplex's contribution, then concatenate once.
## * A single pd.concat avoids re-copying the accumulated frame on every iteration
##   and avoids seeding the result with an empty DataFrame.
## * .assign returns a new frame, so the frames produced by the summarize_* methods
##   are not modified in place.
## * Row indices of the per-multiplex frames are kept as-is (not reset).
layer_frames = []
pair_frames = []
for multiplex_ in corpus.corpus:
    ### Gather layer-wise information, tagged with the multiplex name
    layer_frames.append(
        multiplex_.summarize_multiplex_layers().assign(Name=multiplex_.name)
    )

    ### Gather layer pair information, tagged with the multiplex name
    pair_frames.append(
        multiplex_.summarize_multiplex_pairs().assign(Name=multiplex_.name)
    )

df_layers = pd.concat(layer_frames)
df_pairs = pd.concat(pair_frames)

## Reorder for convenience
df_layers = df_layers[["Name", "Layer", "ActiveNodeCount", "ActiveEdgesCount", "Modularity", "ComponentsCount"]]
df_pairs = df_pairs[["Name", "LayerLeft", "LayerRight", "NodeOverlap", "EdgeOverlap", "AverageDegreeRatio", "DegreeSequenceCorrelation", "NMI"]]

### Appending reconstruction performances

In [22]:
# Add performance summary to pairs dataframe
# * performance summary here is AUC of reconstruction accuracy as a function of training set size
## Read in performance dataframe
perfs = pd.read_csv("../data/accuracy_networks_concat.csv")
perfs = perfs.rename(columns={"Multiplex": "Name"})

## Prepare the x_coords of performance curve (distinct PFI values, ascending)
pfis = sorted(set(perfs["PFI"]))

## Calculate AUCs and shift layer indices -- only once.
## * The presence of the "AUC" column marks that this cell has already been applied to
##   df_pairs. Without the guard, re-running the cell would add 1 to the layer columns
##   a second time and look up the wrong (or no) rows in `perfs`.
## * NOTE: kept at cell (module) level on purpose; the `@series_` / `@pfi` references
##   inside .query() are resolved from the surrounding scope.
if "AUC" not in df_pairs.columns:
    aucs = []
    for _, pair_row in df_pairs.iterrows():  # Yeah yeah, I know iterrows is not preferred.
        series_ = dict(pair_row)
        ### Mean accuracy at each PFI for this (Name, LayerLeft, LayerRight) triple.
        ### The "+ 1" matches df_pairs' layer indices to the ones used in `perfs`.
        y_coords = [
            np.mean(
                perfs  # Yeah yeah pt. 2, I know its inefficient queries. It's _legible_ though
                .query("Name == @series_['Name']")
                .query("LayerLeft == @series_['LayerLeft'] + 1")
                .query("LayerRight == @series_['LayerRight'] + 1")
                .query("PFI == @pfi")
                ["Accuracy"]
            )
            for pfi in pfis
        ]
        aucs.append(auc(pfis, y_coords))

    ## Add resultant AUCs to pairs dataframe
    df_pairs["AUC"] = aucs

    ## Fix an indexing issue: store layer indices shifted by one, as queried above
    df_pairs["LayerLeft"] = df_pairs["LayerLeft"] + 1
    df_pairs["LayerRight"] = df_pairs["LayerRight"] + 1

In [23]:
# Rename layers
## * NOTE: Done at this stage to avoid issues with calculations above
## Human-readable layer labels: multiplex name -> {layer number -> label}
_LAYER_LABELS = {
    "aireu": {1: "Lufthansa", 2: "Ryanair", 3: "EasyJet"},
    "arxiv": {1: "physics.data-an", 2: "cond-mat.dis-nn", 3: "cond-mat.stat-mech"},
    "celegans": {1: "Electric", 2: "Chemical Monadic", 3: "Chemical Polyadic"},
    "drosophila": {1: "Direct", 2: "Suppressive", 3: "Additive", 4: "Physical association"},
    "london": {1: "Underground", 2: "Overground", 3: "DLR"},
}


def _lookup_layer_label(df_line, column):
    """Return the label for the layer number stored in ``df_line[column]``.

    A plain dictionary lookup replaces the previous ``eval`` of a string built
    from the row contents, so row values are never executed as code.

    Parameters
    ----------
    df_line : mapping or pandas.Series
        A row providing ``"Name"`` (multiplex name) and ``column``.
    column : str
        Name of the field holding the layer number.

    Raises
    ------
    KeyError
        If the multiplex name or the layer number has no label.
    """
    name = df_line["Name"]
    layer = df_line[column]
    try:
        return _LAYER_LABELS[name][layer]
    except KeyError:
        raise KeyError(
            f"No label known for layer {layer!r} of multiplex {name!r}"
        ) from None


def adjust_layer_names(df_line):
    """Label for the layer in the ``"Layer"`` field of a layer-wise summary row."""
    return _lookup_layer_label(df_line, "Layer")


def adjust_layerpairs_left(df_line):
    """Label for the layer in the ``"LayerLeft"`` field of a layer-pair summary row."""
    return _lookup_layer_label(df_line, "LayerLeft")


def adjust_layerpairs_right(df_line):
    """Label for the layer in the ``"LayerRight"`` field of a layer-pair summary row."""
    return _lookup_layer_label(df_line, "LayerRight")

## Adjust layerwise summary dataframe layer names
## * The numeric layer ids are backed up in "_"-prefixed columns the first time this
##   runs. Labels are always derived from those backups, so re-running the cell does
##   not overwrite the backups with labels or try to translate labels a second time.
if "_Layer" not in df_layers.columns:
    df_layers["_Layer"] = df_layers["Layer"].copy()
df_layers["Layer"] = (
    df_layers
    .assign(Layer=df_layers["_Layer"])  # present the numeric ids to the lookup
    .apply(adjust_layer_names, axis=1)
)

## Adjust layerpairs summary dataframe layer names
if "_LayerLeft" not in df_pairs.columns:
    df_pairs["_LayerLeft"] = df_pairs["LayerLeft"].copy()
if "_LayerRight" not in df_pairs.columns:
    df_pairs["_LayerRight"] = df_pairs["LayerRight"].copy()
pairs_numeric_ = df_pairs.assign(
    LayerLeft=df_pairs["_LayerLeft"],
    LayerRight=df_pairs["_LayerRight"],
)
df_pairs["LayerLeft"] = pairs_numeric_.apply(adjust_layerpairs_left, axis=1)
df_pairs["LayerRight"] = pairs_numeric_.apply(adjust_layerpairs_right, axis=1)

# Saving to file
df_layers.to_csv("structural-analysis_layerwise-measures.csv")
df_pairs.to_csv("structural-analysis_layerpairs-measures.csv")

## Visualizing potential relationships

In [24]:
# Exploring structural indicators of performance
## Sourcing data
source = df_pairs
range_ = np.linspace(0.5, 1.0, 10)  # not referenced by the charts in this cell
name_filter = alt.selection_multi(encodings=["color"])

## Shared chart settings
AUC_DOMAIN = [0.5, 1.0]  # y-axis range used for every AUC axis below
CHART_SIZE = 400         # width and height of every panel


def _auc_scatter(field):
    """Scatter plot of AUC against one quantitative column of ``source``.

    Points are colored by multiplex name and show the layer pair on hover.
    ``field`` is the column name to place on the x-axis.
    """
    return alt.Chart(source).mark_point(filled=True).encode(
        alt.X(f"{field}:Q"),
        alt.Y("AUC:Q", scale=alt.Scale(domain=AUC_DOMAIN)),
        alt.Color("Name:N"),
        tooltip=["LayerLeft", "LayerRight"],
    ).properties(
        width=CHART_SIZE,
        height=CHART_SIZE,
    )


## Defining charts
### Mean AUC per multiplex; clicking a bar's color drives `name_filter`
average_performance = alt.Chart(source).mark_bar().encode(
    alt.X("Name:N"),
    alt.Y("mean(AUC):Q", scale=alt.Scale(domain=AUC_DOMAIN)),
    color=alt.Color("Name:N"),
).properties(
    width=CHART_SIZE,
    height=CHART_SIZE,
).add_selection(
    name_filter
)

### AUC against each pairwise structural measure
performance_v_nodeoverlap = _auc_scatter("NodeOverlap")
performance_v_edgeoverlap = _auc_scatter("EdgeOverlap")
performance_v_ratio = _auc_scatter("AverageDegreeRatio")
performance_v_nmi = _auc_scatter("NMI")


## Concatenating and plotting charts
average_performance | alt.vconcat(
    performance_v_nodeoverlap.transform_filter(name_filter) | performance_v_edgeoverlap.transform_filter(name_filter),
    performance_v_ratio.transform_filter(name_filter) | performance_v_nmi.transform_filter(name_filter)
).resolve_axis(
    y="independent"
).resolve_scale(
    y="independent"
)

In [25]:
# Unfiltered 2x2 grid of the AUC scatter plots: `|` places charts side by side,
# `&` stacks the two rows; each panel keeps its own y axis and scale.
(
    (performance_v_nodeoverlap | performance_v_edgeoverlap)
    & (performance_v_ratio | performance_v_nmi)
).resolve_axis(y="independent").resolve_scale(y="independent")

In [26]:
# Edge-overlap and NMI scatter plots side by side
alt.hconcat(performance_v_edgeoverlap, performance_v_nmi)