In [2]:
import matplotlib.pyplot as plt
%matplotlib inline
import subprocess
import shlex
import numpy as np
import pandas as pd

import os
import sys

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib

# Resolve the repository root as two directories above the current working
# directory and put it on sys.path so the `src.*` imports below can be found.
# NOTE(review): this depends on where the kernel was started — confirm the
# notebook is always launched from a folder two levels below the repo root.
repo_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
sys.path.append(repo_root)

from sklearn.model_selection import train_test_split
from src.utils.file_handlers import group_dataset_files, read_dataset_files
from src.utils.operation_helpers import run_models, split_games
from src.utils.c_operation_helpers import  run_simulation
from src.utils.file_handlers import process_directory_real_data
from src.models.BradleyTerry import compute_predicted_ratings_HOL_BT

### Example

Python implementation

In [3]:
# Fit every Python model on a single real dataset ('00102') and print the
# per-model scores returned by run_models.
data_dir = os.path.join(repo_root, 'datasets', 'Real_Data')
grouped = group_dataset_files(data_dir)

data, pi_values = read_dataset_files(grouped['00102'], data_dir, is_synthetic=False)

# Fixed seed: without random_state the 80/20 split (and therefore the table
# printed below) changes on every execution, so the cell is not reproducible
# under Restart & Run All.
SPLIT_SEED = 42
train, test = train_test_split(data, train_size=.8, random_state=SPLIT_SEED)
# Alternative project-specific splitter, kept for reference:
# train, test = split_games(data, .8)

df = run_models(train, test, pi_values)
print(df)



         model  log-likelihoods  leadership-log-likelihood
0        HO_BT        -1.338036                  -0.772709
1       HOL_BT        -1.349191                  -0.796616
2          BIN        -1.360202                  -0.784017
3         BINL        -1.373624                  -0.810707
4  Spring_Rank        -1.379134                  -0.821936
5    Page_Rank        -2.305316                  -1.344802
6   Point_Wise        -1.802803                  -0.972088


C implementation

In [4]:
# Same dataset ('00102') as the previous example, this time passed as file
# paths to run_simulation (imported from src.utils.c_operation_helpers).
data_dir = os.path.join(repo_root, 'datasets', 'Real_Data')
grouped = group_dataset_files(data_dir)

# File names recorded for the dataset, then their full paths inside data_dir.
node_file, edge_file = (grouped['00102'][kind] for kind in ('nodes', 'edges'))
node_path, edge_path = (os.path.join(data_dir, name) for name in (node_file, edge_file))

results = run_simulation(node_path, edge_path, ratio=.8, is_synthetic=0)
print(results)

  model  av_error    spearman     kendall     prior   HO_Like   HOL_Like  \
0    HO  0.336678   0.0612518   0.0440919  -1.55447  -1.33392  -0.747891   
1   HOL   0.32949  0.00292864  0.00416723  -1.50343  -1.37821  -0.767421   
2   BIN  0.346041   0.0495149   0.0415378  -1.58948  -1.32607  -0.738008   
3  BINL  0.345981  -0.0245976  -0.0180132  -1.55388  -1.37372  -0.762296   

  iterations  
0         12  
1         15  
2         12  
3         13  


### Full Results

In [5]:
# Directory of this experiment (<repo_root>/exp/ex04); its `results`
# sub-folder holds the summary CSVs read further down.
EXPERIMENT_PATH = os.path.join(repo_root, 'exp', 'ex04')
# chdir is process-wide: every relative path or os.getcwd() call evaluated
# after this line resolves against EXPERIMENT_PATH.
os.chdir(EXPERIMENT_PATH)

In [6]:
# Sanity check: show the working directory after the chdir above.
print(os.getcwd())

/home/jackyeung99/senior_thesis/higher_order_ranking/exp/ex04


In [7]:
# NOTE(review): presumably runs the models over the real datasets and writes
# the summary CSVs under EXPERIMENT_PATH/results that the cells below read —
# confirm against src.utils.file_handlers.process_directory_real_data.
process_directory_real_data(EXPERIMENT_PATH)

In [8]:


def process_results():
    """Load both summary CSVs and express every score relative to a baseline.

    Each file under ``EXPERIMENT_PATH/results`` is averaged per ``dataset``,
    its ``epoch`` column is dropped, and the remaining columns are shifted by
    the baseline column via ``subtract_columns``: ``HO_BT`` for the
    log-likelihood summary and ``HOL_BT`` for the leadership summary.

    Returns:
        tuple: ``(log_like, leadership_log)`` DataFrames, one row per dataset.
    """
    results_dir = os.path.join(EXPERIMENT_PATH, 'results')
    # summary file -> column used as the zero reference
    baselines = {
        'log_likelihood_summary.csv': 'HO_BT',
        'leadership_log_likelihood_summary.csv': 'HOL_BT',
    }

    relative_frames = []
    for filename, baseline in baselines.items():
        per_dataset = (
            pd.read_csv(os.path.join(results_dir, filename))
            .groupby(by=['dataset'])
            .mean()
            .reset_index()
            .drop(columns=['epoch'])
        )
        relative_frames.append(subtract_columns(per_dataset, baseline))

    log_like, leadership_log = relative_frames
    return log_like, leadership_log

def subtract_columns(df, compared_col):
    """Return a copy of ``df`` with scores expressed relative to ``compared_col``.

    Every column except the first one (treated as the identifier column) and
    ``compared_col`` itself has ``compared_col`` subtracted from it row-wise;
    ``compared_col`` is then set to 0.0.

    Unlike the previous version, the input frame is left untouched, so calling
    this twice on the same frame gives the same answer.

    Args:
        df: DataFrame whose first column is an identifier and whose remaining
            columns are numeric scores.
        compared_col: name of the baseline column.

    Returns:
        A new DataFrame with the same columns and row order as ``df``.

    Raises:
        KeyError: if ``compared_col`` is not a column of ``df``.
    """
    result = df.copy()
    # Snapshot the baseline so it cannot change while columns are rewritten.
    baseline = result[compared_col].copy()
    for col in result.columns.tolist()[1:]:
        if col != compared_col:
            result[col] = result[col] - baseline
    result[[compared_col]] = 0.00
    return result


def visualize_df(df, title):
    """Print ``title`` and render ``df`` as a styled table in the cell output.

    Args:
        df: DataFrame to show.
        title: heading printed (surrounded by blank lines) above the table.
    """
    # CSS applied uniformly to every cell of the table.
    cell_css = {
        'background-color': 'LightGray',
        'color': 'black',
        'border-color': 'black',
        'border-style': 'solid',
        'border-width': '1px',
        'text-align': 'left',
    }

    print(f"\n{title}\n")
    styler = df.style.set_table_styles()
    # For a LaTeX export instead: print(df.to_latex(escape=True))
    display(styler.set_properties(**cell_css))
   

In [9]:


# Restrict the dataset catalogue to the datasets that appear in this
# experiment's results and list them ordered by K1, K2, R and name.
# The summary is located through EXPERIMENT_PATH (as the other result readers
# do) rather than os.getcwd(), so the cell no longer depends on an earlier
# os.chdir having been executed.
unique_datasets = pd.read_csv(
    os.path.join(EXPERIMENT_PATH, 'results', 'log_likelihood_summary.csv')
)['dataset'].unique()
dataset_info = pd.read_csv(os.path.join(repo_root, 'datasets', 'dataset_info.csv'))

filtered_dataset_info = (
    dataset_info[dataset_info['Dataset_ID'].isin(unique_datasets)]
    .set_index('Dataset_ID')
    .sort_values(by=['K1', 'K2', 'R', 'Name'])
)
visualize_df(filtered_dataset_info, 'Datasets Tested')


Datasets Tested



Unnamed: 0_level_0,Name,N,M,R,K1,K2,K_avg
Dataset_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
106,letor,118,15592,132.136,2,83,9.156
9,AGH_course_selection,9,299,33.222,7,9,7.977


In [10]:
# Per-dataset scores relative to a baseline column: HO_BT for log_like,
# HOL_BT for leadership_log (see process_results).
log_like, leadership_log = process_results()

def rename_df(df, info=None):
    """Replace the numeric ``dataset`` id with the dataset name and order rows.

    ``df`` is joined to the dataset catalogue on ``dataset`` == ``Dataset_ID``,
    rows are sorted by ``K1``, ``K2`` and ``Name``, and only ``Name`` plus the
    model score columns are kept. ``df`` itself is not modified.

    Args:
        df: per-dataset scores with a ``dataset`` column and one column per
            model (HO_BT, HOL_BT, BIN, BINL, Spring_Rank, Page_Rank,
            Point_Wise).
        info: dataset catalogue with at least ``Dataset_ID``, ``Name``, ``K1``
            and ``K2`` columns. Defaults to the notebook-level ``dataset_info``
            frame, which keeps existing ``rename_df(df)`` calls working.

    Returns:
        DataFrame with columns ``Name`` followed by the model columns.
    """
    if info is None:
        info = dataset_info  # notebook-level catalogue read from dataset_info.csv

    model_columns = ['HO_BT', 'HOL_BT', 'BIN', 'BINL', 'Spring_Rank', 'Page_Rank', 'Point_Wise']
    merged_df = (
        df.merge(info, left_on='dataset', right_on='Dataset_ID')
        .drop(columns=['Dataset_ID', 'dataset'])
        .sort_values(by=['K1', 'K2', 'Name'])
    )
    return merged_df[['Name'] + model_columns]

In [11]:
# Plain-text peek at the relative log-likelihood frame while it is still keyed
# by the numeric `dataset` id.
print(log_like)

   dataset  HO_BT      HOL_BT       BIN        BINL  Spring_Rank  Page_Rank  \
0        9    0.0  -37.318206  0.172597  -37.579035   -38.727814 -18.441606   
1      106    0.0 -174.314853  0.203217 -182.318222    -1.954173  -6.148737   

   Point_Wise  
0   -6.905505  
1   -2.573224  


In [12]:

# NOTE(review): rebinding `log_like` makes this cell non-idempotent —
# rename_df drops the `dataset` column it merges on, so running the cell a
# second time without re-running process_results() raises a KeyError.
log_like = rename_df(log_like)
visualize_df(log_like, 'log likelihood')


log likelihood



Unnamed: 0,Name,HO_BT,HOL_BT,BIN,BINL,Spring_Rank,Page_Rank,Point_Wise
1,letor,0.0,-174.314853,0.203217,-182.318222,-1.954173,-6.148737,-2.573224
0,AGH_course_selection,0.0,-37.318206,0.172597,-37.579035,-38.727814,-18.441606,-6.905505


In [13]:
# NOTE(review): rebinding `leadership_log` makes this cell non-idempotent —
# rename_df drops the `dataset` column it merges on, so running the cell a
# second time without re-running process_results() raises a KeyError.
leadership_log = rename_df(leadership_log)
visualize_df(leadership_log, 'Leadership log likelihood')


Leadership log likelihood



Unnamed: 0,Name,HO_BT,HOL_BT,BIN,BINL,Spring_Rank,Page_Rank,Point_Wise
1,letor,-0.773957,0.0,-0.65966,8.9e-05,-0.886226,-1.542278,-1.174041
0,AGH_course_selection,-0.839617,0.0,-0.763094,0.000229,-1.327989,-2.998331,-1.662018


In [14]:
# Raw (not baseline-relative) log-likelihood, averaged per dataset, with the
# best-scoring model of each row highlighted.
log_likelihood_df = (
    pd.read_csv(os.path.join(EXPERIMENT_PATH, 'results', 'log_likelihood_summary.csv'))
    .groupby('dataset')
    .mean()
    .drop(columns='epoch')
    .reset_index()
)

log_likelihood_df.style.highlight_max(
    subset=log_likelihood_df.columns.difference(['dataset']),
    color='grey',
    axis=1,
)

Unnamed: 0,dataset,HO_BT,HOL_BT,BIN,BINL,Spring_Rank,Page_Rank,Point_Wise
0,9,-39.17738,-76.495586,-39.004783,-76.756415,-77.905194,-57.618986,-46.082885
1,106,-248.103393,-422.418245,-247.900176,-430.421615,-250.057565,-254.25213,-250.676617


In [15]:
# Raw (not baseline-relative) leadership log-likelihood, averaged per dataset,
# with the best-scoring model of each row highlighted.
# NOTE(review): this rebinds `log_likelihood_df`, replacing the plain
# log-likelihood frame of the same name with leadership data — consider a
# distinct name once it is confirmed that nothing else relies on this one.
log_likelihood_df = (
    pd.read_csv(os.path.join(EXPERIMENT_PATH, 'results', 'leadership_log_likelihood_summary.csv'))
    .groupby('dataset')
    .mean()
    .drop(columns='epoch')
    .reset_index()
)

log_likelihood_df.style.highlight_max(
    subset=log_likelihood_df.columns.difference(['dataset']),
    color='grey',
    axis=1,
)

Unnamed: 0,dataset,HO_BT,HOL_BT,BIN,BINL,Spring_Rank,Page_Rank,Point_Wise
0,9,-0.854252,-0.014635,-0.777729,-0.014406,-1.342624,-3.012966,-1.676653
1,106,-1.131794,-0.357837,-1.017497,-0.357747,-1.244063,-1.900115,-1.531877
