# Statistical Analysis

## Imports

In [51]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from scipy.stats import f_oneway
from scipy.stats import ttest_rel

## Load in Files

In [52]:
# Directory that holds every CSV read in this cell.
dir_path = "files/PO_FirstTest"


def _read_indexed(csv_path):
    """Read one CSV, using its first column as the DataFrame index."""
    return pd.read_csv(csv_path, index_col=[0])


# Test results (NEW_PHASEORDER_RIO runs), one frame per benchmark.
cacheprof = _read_indexed(f"{dir_path}/cacheprof-PHASEORDER-Iterative-NEW_PHASEORDER_RIO-0.csv")
sorting = _read_indexed(f"{dir_path}/sorting-PHASEORDER-Iterative-NEW_PHASEORDER_RIO-0.csv")
maillist = _read_indexed(f"{dir_path}/maillist-PHASEORDER-Iterative-NEW_PHASEORDER_RIO-0.csv")
hidden = _read_indexed(f"{dir_path}/hidden-PHASEORDER-Iterative-NEW_PHASEORDER_RIO-0.csv")

# Control group (O2 in default order), one frame per benchmark.
# NOTE(review): the cacheprof control file is tagged "BIGRIO" while the other
# three are tagged "PHASEORDERRIO" -- confirm this is the intended file.
cacheprof_default = _read_indexed(f"{dir_path}/cacheprof-PHASEORDER-Iterative-BIGRIO-0.csv")
hidden_default = _read_indexed(f"{dir_path}/hidden-PHASEORDER-Iterative-PHASEORDERRIO-0.csv")
maillist_default = _read_indexed(f"{dir_path}/maillist-PHASEORDER-Iterative-PHASEORDERRIO-0.csv")
sorting_default = _read_indexed(f"{dir_path}/sorting-PHASEORDER-Iterative-PHASEORDERRIO-0.csv")

## Statistics

In [53]:
# NOTE(review): presumably the significance threshold for the tests in this
# section; no visible cell reads it yet -- confirm where it is meant to be used.
alpha = 0.05

### ANOVA

### Mann-Whitney U Test

### t-Test

#### Build Per-Benchmark Tables for the t-Test

In [54]:
def _phase_label(x):
    """Format one index value as its 'Phase-<value>' label."""
    return f'Phase-{x}'


def _strip_and_label(results):
    """Return a new frame without the ID, Mode and Elapsed_Time columns.

    A 'Phase' column is appended whose entries are built from the frame's
    index values. The input frame is left untouched because ``drop`` returns
    a new object.
    """
    trimmed = results.drop(columns=["ID", "Mode", "Elapsed_Time"])
    trimmed["Phase"] = trimmed.index.map(_phase_label)
    return trimmed


# One trimmed, phase-labelled frame per benchmark.
cacheprof_new = _strip_and_label(cacheprof)
maillist_new = _strip_and_label(maillist)
hidden_new = _strip_and_label(hidden)
sorting_new = _strip_and_label(sorting)

# Lookup by benchmark name: prepared test frames and the matching control frames.
tables = dict(
    cacheprof=cacheprof_new,
    maillist=maillist_new,
    hidden=hidden_new,
    sorting=sorting_new,
)
O2_table = dict(
    cacheprof=cacheprof_default,
    maillist=maillist_default,
    hidden=hidden_default,
    sorting=sorting_default,
)

In [55]:
# Paired t-test per benchmark: control "Runtime" values against the tested
# "Runtime" values. Each entry of result_list is (name, t statistic, p-value).
result_list = []
for t_name, tested in tables.items():
    control_runtime = O2_table[t_name]["Runtime"]
    tested_runtime = tested["Runtime"]
    # ttest_rel pairs the samples element by element, so both columns must
    # have the same length. The control sample is passed first, so a positive
    # statistic means the control runtimes are larger on average.
    t_statistic, p_value = ttest_rel(control_runtime, tested_runtime)
    result_list.append((t_name, t_statistic, p_value))

print("Results:")
for t in result_list:
    print(f'{t[0]}:\n    t-stat: {t[1]}\n    p_value: {t[2]}\n')

    

Results:
cacheprof:
    t-stat: 21.01272885185273
    p_value: 7.777800637539709e-61

maillist:
    t-stat: -4.250380448576844
    p_value: 2.8552522243960524e-05

hidden:
    t-stat: 3.6466385037886795
    p_value: 0.0003133119865201059

sorting:
    t-stat: 18.33846741235785
    p_value: 7.381768243770834e-51

