In [1]:
import os
import sys
import csv
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from pandas.plotting import table

repo_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
sys.path.append(repo_root)

from src import *

In [2]:
process_directory_real_data(os.getcwd())

In [38]:


def process_results(base_path):
    # Read the summary CSV files
    log_likelihood_df = pd.read_csv(os.path.join(base_path, 'log_likelihood_summary.csv')).groupby(by=['dataset']).mean().reset_index()
    leadership_log_likelihood_df = pd.read_csv(os.path.join(base_path, 'leadership_log_likelihood_summary.csv')).groupby(by=['dataset']).mean().reset_index()

    filtered_log_like = log_likelihood_df.drop(columns=['rep'])
    filtered_leader_like = leadership_log_likelihood_df.drop(columns=['rep'])
    
    log_like = subtract_columns(filtered_log_like, 3)
    leadership_log = subtract_columns(filtered_leader_like, 4)

    return log_like, leadership_log

def subtract_columns(df, compared_col):
    columns = df.columns.tolist()
    base_column = columns[compared_col] 
    print(base_column)
    for col in columns[1:]:
        
        df[col] = df[col] - df[base_column] 
    
    return df


def visualize_df(df, title):
    print(f"\n{title}\n")
    styled_df = df.style.set_table_styles(
    ).set_properties(**{
        'background-color': 'LightGray',
        'color': 'black',
        'border-color': 'black',
        'border-style': 'solid',
        'border-width': '1px',
        'text-align': 'left'
    })
    

    # print(df.to_latex(escape=True))
    display(styled_df)
   

In [39]:
unique_datasets = pd.read_csv(os.path.join(os.path.join(os.getcwd(), 'results'), 'log_likelihood_summary.csv'))['dataset'].unique()
dataset_info = pd.read_csv(os.path.join(repo_root, 'datasets', 'dataset_info.csv'))
filtered_dataset_info = dataset_info[dataset_info['dataset_id'].isin(unique_datasets)].set_index('dataset_id')

domain_order = ["Election", "Sport", "Preferences", "Other"]
filtered_dataset_info['Domain'] = pd.Categorical(filtered_dataset_info['Domain'], categories=domain_order, ordered=True)

filtered_dataset_info = filtered_dataset_info.sort_values(by=['Domain', 'K1', 'K2', 'Name'])

visualize_df(filtered_dataset_info, 'Datasets Tested')



Datasets Tested



Unnamed: 0_level_0,Domain,Name,N,M,K1,K2
dataset_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
18,Election,minneapolis_election,778,137482,1,3
28,Election,APA_election,6,203896,1,5
30,Election,UK_labor_party_vote,6,266,1,5
2,Election,debian,36,3352,1,9
8,Election,glasgow_city_council,209,188376,1,13
1,Election,irish_election,36,138011,1,14
7,Election,electoral_reform_society,30,35610,1,29
42,Sport,boxing,466,5594,1,17
53,Sport,formula_1_races,133,48909,2,24
52,Sport,formula_1_seasons,851,1814,5,42


In [40]:

log_like, leadership_log = process_results(os.path.join(os.getcwd(), 'results'))

def rename_df(df):
    merged_df = df.merge(dataset_info, left_on='dataset', right_on='dataset_id').drop(columns=['dataset_id','dataset'])
    # merged_df = merged_df.rename(columns={'Name': 'dataset_name'}).set_index('dataset_name')

    domain_order = ["Election", "Sport", "Preferences", "Other"]
    merged_df['Domain'] = pd.Categorical(merged_df['Domain'], categories=domain_order, ordered=True)
    merged_df = merged_df.sort_values(by=['Domain', 'K1', 'K2', 'Name'])
    merged_df = merged_df[['Domain', 'Name', 'BT','BT_leadership', 'HO_BT', 'HOL_BT', 'Spring_Rank', 'Spring_Rank_Leadership', 'Page_Rank','Page_Rank_Leadership', 'Point_Wise']]

    return merged_df




HO_BT
HOL_BT


In [41]:

log_like = rename_df(log_like)
visualize_df(log_like, 'log likelihood')


log likelihood



Unnamed: 0,Domain,Name,BT,BT_leadership,HO_BT,HOL_BT,Spring_Rank,Spring_Rank_Leadership,Page_Rank,Page_Rank_Leadership,Point_Wise
9,Election,minneapolis_election,-0.000136,-0.003551,0.0,-1.431002,-2.724622,-3.009524,-1.504351,-1.489426,-1.472443
12,Election,APA_election,-0.000615,-0.010187,0.0,-3.696233,-15.158253,-16.318663,-3.694714,-3.691636,-3.688787
13,Election,UK_labor_party_vote,0.0001,-0.034704,0.0,-2.522971,-4.183342,-4.162102,-3.213421,-2.843707,-2.660403
1,Election,debian,-0.008915,-0.408689,0.0,-5.00634,-5.472282,-5.48438,-6.966629,-6.064407,-5.010912
5,Election,glasgow_city_council,-0.003652,-0.109396,0.0,-3.264148,-3.279654,-3.505544,-3.271568,-3.204819,-3.198903
0,Election,irish_election,-0.008041,-0.133923,0.0,-5.134975,-5.621829,-5.714833,-5.188384,-5.068554,-5.074011
4,Election,electoral_reform_society,-0.002058,-0.027856,0.0,-6.204386,-6.326116,-6.405955,-6.245926,-6.283521,-6.194234
18,Sport,boxing,-0.776388,-15.958274,0.0,-32.96455,-23.25292,-29.293757,-31.779291,-26.343215,-23.154032
21,Sport,formula_1_races,-0.231963,-3.725668,0.0,-37.013182,-33.811274,-34.921095,-35.57807,-35.004869,-33.883398
20,Sport,formula_1_seasons,-0.445682,-21.698527,0.0,-55.38079,-46.893995,-52.937809,-52.616277,-51.385117,-47.728179


In [42]:
leadership_log = rename_df(leadership_log)
visualize_df(leadership_log, 'Leadership log likelihood')


Leadership log likelihood



Unnamed: 0,Domain,Name,BT,BT_leadership,HO_BT,HOL_BT,Spring_Rank,Spring_Rank_Leadership,Page_Rank,Page_Rank_Leadership,Point_Wise
9,Election,minneapolis_election,-0.00162,-4.2e-05,-0.002381,0.0,-1.506308,-1.61334,-0.988904,-0.975362,-0.967787
12,Election,APA_election,-0.002081,-3.7e-05,-0.003257,0.0,-5.352857,-4.91671,-1.389462,-1.389348,-1.389118
13,Election,UK_labor_party_vote,-0.006064,-0.00256,-0.005793,0.0,-1.692973,-1.688167,-1.309144,-1.215953,-1.182755
1,Election,debian,-0.026573,-0.001732,-0.036802,0.0,-1.381645,-1.290442,-1.999228,-1.713059,-1.405247
5,Election,glasgow_city_council,-0.031092,-0.00175,-0.032355,0.0,-1.159652,-1.144617,-1.229169,-1.195559,-1.175134
0,Election,irish_election,-0.018463,-0.001567,-0.020148,0.0,-1.432779,-1.418821,-1.470794,-1.439461,-1.431625
4,Election,electoral_reform_society,-0.001894,-0.001012,-0.003101,0.0,-1.542364,-1.543564,-1.537421,-1.539804,-1.527696
18,Sport,boxing,-0.544847,0.02579,-0.711864,0.0,-2.108178,-1.420375,-3.486371,-2.672779,-2.20996
21,Sport,formula_1_races,-0.109925,-0.002609,-0.174792,0.0,-2.614063,-2.480185,-2.823321,-2.627125,-2.647196
20,Sport,formula_1_seasons,-0.376847,0.009267,-0.503928,0.0,-2.611404,-2.355405,-3.177009,-3.087855,-2.75478
