In [23]:
import os
import sys

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib


# The notebook's working directory is assumed to sit one level below the
# repository root, so the parent of the CWD is put on the import path to make
# the `src` and `datasets` packages importable.
repo_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
# Guard the append so that re-running this cell does not keep stacking
# duplicate entries onto sys.path.
if repo_root not in sys.path:
    sys.path.append(repo_root)

from src.utils.operation_helpers import  run_models, run_models_synthetic
from src.utils.file_handlers import group_dataset_files, read_dataset_files
from sklearn.model_selection import train_test_split

from datasets.utils.dataset_info import DATASET_NAMES

In [17]:
# Number of repeated random train/test splits per experiment; the metrics of
# all repetitions are averaged per model. (These are repetitions of the whole
# experiment, not training epochs of a single model.)
EPOCHS = 25

### Synthetic Data

In [18]:
# Location of the synthetic dataset: one node file and one edge file.
data_dir = os.path.join(repo_root, 'datasets', 'Synthetic_Data')

filein_idx = 'N-1000_M-1000_K-4_L-0_nodes.txt'
filein_data = 'N-1000_M-1000_K-4_L-0_edges.txt'

dataset_files = {
    'nodes': filein_idx,
    'edges': filein_data,
}

# The input files do not change between repetitions, so they are read once up
# front (as the real-data cell already does) instead of EPOCHS times.
# NOTE(review): this assumes run_models_synthetic does not mutate `data` or
# `pi_values` in place -- confirm against its implementation.
data, pi_values = read_dataset_files(dataset_files, data_dir, is_synthetic=True)

results = []

for epoch in range(EPOCHS):
    # Seeding with the repetition index gives every repetition a different
    # 80/20 partition while making the sequence of splits repeatable across
    # kernel restarts.
    train, test = train_test_split(data, train_size=.8, random_state=epoch)

    df = run_models_synthetic(train, test, pi_values)
    results.append(df)

# Stack the per-repetition result frames and average every metric per model.
combined_results = pd.concat(results, ignore_index=True)
synthetic_results = combined_results.groupby('model').mean()



For each metric, the best-performing model is highlighted below.

In [19]:
# 'rms' is the only column whose minimum is highlighted; every other metric
# column has its maximum highlighted.
max_highlight_cols = synthetic_results.columns.difference(['model', 'rms'])

(
    synthetic_results.style
    .highlight_max(subset=max_highlight_cols, color='grey', axis=0)
    .highlight_min(subset=['rms'], color='grey', axis=0)
)

Unnamed: 0_level_0,log-likelihood,leadership-log-likelihood,rms,rho,tau
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BIN,-2.573576,-1.038529,1.15645,0.807381,0.612562
BINL,-2.821392,-1.151625,1.417587,0.647909,0.461616
HOL_BT,-2.780963,-1.142081,1.449187,0.648679,0.461936
HO_BT,-2.539457,-1.029936,1.179479,0.811903,0.616959
Page_Rank,-5.697758,-2.549832,7.399124,-0.613793,-0.441951
Point_Wise,-5.432165,-2.071752,4.761177,0.50654,0.374498
Spring_Rank,-2.897395,-1.245382,1.754845,0.782889,0.578831


### Real Data

Example of running all models on a real dataset. To use a different dataset, change `dataset_id` to another key of the dataset grouping returned by `group_dataset_files`.

In [27]:
# Discover the real datasets on disk; `grouped` is indexed by dataset id.
data_dir = os.path.join(repo_root, 'datasets', 'Real_Data')
grouped = group_dataset_files(data_dir)

# Change this id to run the experiment on a different real dataset.
dataset_id = '00006'
data, pi_values = read_dataset_files(grouped[dataset_id], data_dir, is_synthetic=False)

print(DATASET_NAMES[dataset_id])
results = []
for epoch in range(EPOCHS):
    # Seeding with the repetition index gives every repetition a different
    # 80/20 partition while making the sequence of splits repeatable across
    # kernel restarts.
    train, test = train_test_split(data, train_size=.8, random_state=epoch)

    df = run_models(train, test, pi_values)
    results.append(df)

# Stack the per-repetition result frames and average every metric per model.
combined_results = pd.concat(results, ignore_index=True)
real_results = combined_results.groupby('model').mean()


APA_Election_2009


For each metric, the best-performing model is highlighted below.

In [28]:
# Highlight the per-column maximum across models for every metric column.
(
    real_results.style
    .highlight_max(
        subset=real_results.columns.difference(['model']),
        color='grey',
        axis=0,
    )
)

Unnamed: 0_level_0,log-likelihood,leadership-log-likelihood
model,Unnamed: 1_level_1,Unnamed: 2_level_1
BIN,-3.118905,-1.18527
BINL,-3.142206,-1.178098
HOL_BT,-3.150904,-1.177802
HO_BT,-3.118313,-1.18701
Page_Rank,-3.168556,-1.190333
Point_Wise,-3.13494,-1.1838
Spring_Rank,-11.784928,-3.877275
