In [1]:
import os
import sys

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib


repo_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(repo_root)

from src.utils.operation_helpers import  run_models, run_models_synthetic
from src.utils.file_handlers import group_dataset_files, read_dataset_files
from sklearn.model_selection import train_test_split

from datasets.utils.dataset_info import DATASET_NAMES

In [2]:
# Number of repeated random train/test splits run per experiment; the metrics
# from all repetitions are averaged per model afterwards.
EPOCHS = 25

### Synthetic Data

In [3]:
data_dir = os.path.join(repo_root, 'datasets', 'Synthetic_Data')

# Node and edge files of the synthetic dataset to evaluate.
filein_idx = 'N-1000_M-1000_K-4_L-0_nodes.txt'
filein_data = 'N-1000_M-1000_K-4_L-0_edges.txt'

dataset_files = {
    'nodes': filein_idx,
    'edges': filein_data,
}

# Load once, outside the loop: the same files are requested on every
# repetition, so re-reading them each time only repeats identical I/O.
# NOTE(review): assumes read_dataset_files is deterministic and that
# run_models_synthetic does not modify pi_values in place -- confirm.
data, pi_values = read_dataset_files(dataset_files, data_dir, is_synthetic=True)

results = []

for epoch in range(EPOCHS):
    # Seed each split with the repetition index: every repetition gets a
    # different 80/20 partition, but the partitions are the same on every
    # Restart & Run All.
    train, test = train_test_split(data, train_size=.8, random_state=epoch)

    df = run_models_synthetic(train, test, pi_values)
    results.append(df)


# Stack the per-repetition frames and average each column per model.
combined_results = pd.concat(results, ignore_index=True)
synthetic_results = combined_results.groupby('model').mean()



The best-performing model for each metric is highlighted below: the highest value for every metric except `rms`, where the lowest value is best.

In [4]:
# Grey marks the highlighted model per metric: the largest value in every
# column except 'rms', where the smallest value is highlighted instead.
(
    synthetic_results.style
    .highlight_max(subset=synthetic_results.columns.difference(['model', 'rms']), color='grey', axis=0)
    .highlight_min(subset=['rms'], color='grey', axis=0)
)

Unnamed: 0_level_0,log-likelihood,leadership-log-likelihood,rms,rho,tau
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BIN,-2.593485,-1.047179,1.154609,0.807836,0.613058
BINL,-2.826422,-1.140048,1.412159,0.652121,0.465089
HOL_BT,-2.78374,-1.131478,1.445461,0.652545,0.465065
HO_BT,-2.561068,-1.040643,1.178892,0.811518,0.616666
Page_Rank,-5.723532,-2.535399,7.410955,-0.617337,-0.444531
Point_Wise,-5.060018,-1.920569,4.683257,0.507011,0.374851
Spring_Rank,-2.909683,-1.250679,1.755217,0.781013,0.57704


### Real Data

Example of running all models on a real dataset. To use a different dataset, set `dataset_id` to another key of the grouping returned by `group_dataset_files`.

In [5]:
data_dir = os.path.join(repo_root, 'datasets', 'Real_Data')
grouped = group_dataset_files(data_dir)

# Key into `grouped` that selects which real dataset to run.
dataset_id = '00010'
data, pi_values = read_dataset_files(grouped[dataset_id], data_dir, is_synthetic=False)

print(DATASET_NAMES[dataset_id])
results = []
for epoch in range(EPOCHS):
    # Seed each split with the repetition index: every repetition gets a
    # different 80/20 partition, but the partitions are the same on every
    # Restart & Run All.
    train, test = train_test_split(data, train_size=.8, random_state=epoch)

    df = run_models(train, test, pi_values)
    results.append(df)


# Stack the per-repetition frames and average each column per model.
combined_results = pd.concat(results, ignore_index=True)
real_results = combined_results.groupby('model').mean()


spotify_daily


The best-performing model for each metric (the highest value) is highlighted below.

In [6]:
# Grey marks the largest value in each column.
(
    real_results.style
    .highlight_max(subset=real_results.columns.difference(['model']), color='grey', axis=0)
)

Unnamed: 0_level_0,log-likelihood,leadership-log-likelihood
model,Unnamed: 1_level_1,Unnamed: 2_level_1
BIN,-38.020891,-1.972046
BINL,-51.147078,-1.603614
HOL_BT,-45.52199,-1.702428
HO_BT,-36.978694,-2.164199
Page_Rank,-47.166024,-3.359481
Point_Wise,-40.637925,-2.593196
Spring_Rank,-48.623119,-2.458852
