In [1]:
import os
import sys

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib


# Resolve the repository root as the parent of the current working directory
# and make it importable, so the `src.*` imports below can be resolved.
# NOTE(review): assumes the kernel's working directory is a direct
# sub-directory of the repository root -- confirm for your launch setup.
repo_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
# Guard the append so re-running this cell does not pile up duplicate entries.
if repo_root not in sys.path:
    sys.path.append(repo_root)

from src.utils.operation_helpers import  run_models, run_models_synthetic
from src.utils.file_handlers import group_dataset_files, read_dataset_files
from sklearn.model_selection import train_test_split

In [2]:
# Number of times each experiment below is repeated. Every repetition draws a
# fresh random train/test split, and the per-model metrics are averaged over
# all repetitions (these are not training epochs of a single model).
EPOCHS = 25

### Synthetic Data

In [7]:
# Synthetic benchmark: repeat an 80/20 train/test split EPOCHS times, run every
# model on each split, and average the per-model metrics over the repetitions.
data_dir = os.path.join(repo_root, 'datasets', 'Synthetic_Data')

filein_idx = 'N-1000_M-1000_K-4_L-0_nodes.txt'
filein_data = 'N-1000_M-1000_K-4_L-0_edges.txt'

dataset_files = {
    'nodes': filein_idx,
    'edges': filein_data,
}

# Load once, outside the loop: the inputs are the same two files on every
# repetition, so re-reading them per iteration only adds I/O. This also matches
# how the real-data experiment loads its dataset.
# NOTE(review): `pi_values` is presumably the ground-truth scores used by the
# rms/rho/tau metrics -- confirm against `read_dataset_files`.
data, pi_values = read_dataset_files(dataset_files, data_dir, is_synthetic=True)

results = []
for epoch in range(EPOCHS):
    # Seed the split with the repetition index: every repetition still gets a
    # different split, but the set of splits is the same on every run.
    train, test = train_test_split(data, train_size=.8, random_state=epoch)

    df = run_models_synthetic(train, test, pi_values)
    results.append(df)

# One row per (repetition, model) -> mean of each metric per model.
combined_results = pd.concat(results, ignore_index=True)
synthetic_results = combined_results.groupby('model').mean()



The best-performing model for each metric is highlighted below: the highest value for every metric except `rms`, where the lowest value is highlighted.

In [8]:
# Grey-highlight the best model per column: the maximum for every metric
# except `rms`, for which the minimum is highlighted instead.
(
    synthetic_results.style
    .highlight_max(
        subset=synthetic_results.columns.difference(['model', 'rms']),
        color='grey',
        axis=0,
    )
    .highlight_min(subset=['rms'], color='grey', axis=0)
)

Unnamed: 0_level_0,log-likelihood,leadership-log-likelihood,rms,rho,tau
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BIN,-2.582649,-1.046121,1.154792,0.806956,0.612604
BINL,-2.815593,-1.147568,1.413777,0.648865,0.462995
HOL_BT,-2.77216,-1.136661,1.445933,0.649542,0.463216
HO_BT,-2.546598,-1.03726,1.178012,0.81112,0.616532
Page_Rank,-5.79636,-2.592346,7.40283,-0.617025,-0.444692
Point_Wise,-5.334852,-1.950544,4.787009,0.507674,0.375037
Spring_Rank,-2.894552,-1.248331,1.750876,0.781006,0.577374


### Real Data

Example of running all models on a real dataset. To use a different dataset, change the key used to index `grouped` (here `'00103'`).

In [9]:
# Real-data benchmark: repeat an 80/20 train/test split EPOCHS times on one
# dataset, run every model on each split, and average the per-model metrics.
data_dir = os.path.join(repo_root, 'datasets', 'Real_Data')
grouped = group_dataset_files(data_dir)

# '00103' selects which dataset group to load; change the key to switch dataset.
data, pi_values = read_dataset_files(grouped['00103'], data_dir, is_synthetic=False)

results = []
for epoch in range(EPOCHS):
    # Seed the split with the repetition index: every repetition still gets a
    # different split, but the set of splits is the same on every run.
    train, test = train_test_split(data, train_size=.8, random_state=epoch)

    df = run_models(train, test, pi_values)
    results.append(df)

# One row per (repetition, model) -> mean of each metric per model.
combined_results = pd.concat(results, ignore_index=True)
real_results = combined_results.groupby('model').mean()


The best-performing model for each metric (the highest value) is highlighted below.

In [10]:
# Grey-highlight the maximum of every metric column, i.e. the best model.
(
    real_results.style
    .highlight_max(
        subset=real_results.columns.difference(['model']),
        color='grey',
        axis=0,
    )
)

Unnamed: 0_level_0,log-likelihood,leadership-log-likelihood
model,Unnamed: 1_level_1,Unnamed: 2_level_1
BIN,-1.459292,-0.827206
BINL,-1.490292,-0.844197
HOL_BT,-1.462481,-0.837356
HO_BT,-1.441787,-0.821725
Page_Rank,-2.060026,-1.087361
Point_Wise,-1.847701,-0.924831
Spring_Rank,-1.755317,-0.935991
