# Statistical Analysis

## Imports

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from scipy.stats import f_oneway
from scipy.stats import ttest_rel

## Load in Files

In [3]:
dir_path = "files/PO_FirstTest"


def _read_results(benchmark, variant):
    """Load one result CSV for ``benchmark`` from ``dir_path``.

    The file name follows the pattern
    ``<benchmark>-PHASEORDER-Iterative-<variant>-0.csv``; the first CSV column
    becomes the index of the returned DataFrame.
    """
    return pd.read_csv(f"{dir_path}/{benchmark}-PHASEORDER-Iterative-{variant}-0.csv", index_col=[0])


# Test Results
cacheprof = _read_results("cacheprof", "NEW_PHASEORDER_RIO")
sorting = _read_results("sorting", "NEW_PHASEORDER_RIO")
maillist = _read_results("maillist", "NEW_PHASEORDER_RIO")
hidden = _read_results("hidden", "NEW_PHASEORDER_RIO")

# Control Group (O2 In Default Order)
# NOTE(review): the cacheprof control file is the "BIGRIO" variant while the
# other three benchmarks use "PHASEORDERRIO" - confirm this is intentional.
cacheprof_default = _read_results("cacheprof", "BIGRIO")
hidden_default = _read_results("hidden", "PHASEORDERRIO")
maillist_default = _read_results("maillist", "PHASEORDERRIO")
sorting_default = _read_results("sorting", "PHASEORDERRIO")



## Statistics

In [4]:
# Presumably the significance level for the tests in this section.
# NOTE(review): no cell shown in this part of the notebook reads it yet.
alpha = 0.05

### ANOVA

### Mann-Whitney U Test

### t-Test

#### Transform the Data into per-Benchmark Tables

In [5]:
def _runtime_by_phase(results):
    """Return a reduced copy of ``results`` for the t-test.

    Drops the ``ID``, ``Mode`` and ``Elapsed_Time`` columns and overwrites the
    ``Phase`` column with a ``Phase-<index label>`` string for every row.
    ``DataFrame.drop`` returns a new frame, so ``results`` itself is left
    untouched.
    """
    reduced = results.drop(["ID", "Mode", "Elapsed_Time"], axis=1)
    reduced['Phase'] = reduced.index.map(lambda x: f'Phase-{x}')
    return reduced


# One reduced table per benchmark (test runs only).
cacheprof_new = _runtime_by_phase(cacheprof)
maillist_new = _runtime_by_phase(maillist)
hidden_new = _runtime_by_phase(hidden)
sorting_new = _runtime_by_phase(sorting)

# Both lookups share the same benchmark keys so they can be walked together.
tables = {
    "cacheprof": cacheprof_new,
    "maillist": maillist_new,
    "hidden": hidden_new,
    "sorting": sorting_new,
}
O2_table = {
    "cacheprof": cacheprof_default,
    "maillist": maillist_default,
    "hidden": hidden_default,
    "sorting": sorting_default,
}

In [6]:
# Paired t-test of the control runtimes against the test runtimes, once per
# benchmark. The control sample is passed first, so the sign of the statistic
# follows (control - test).
# NOTE(review): ttest_rel pairs the two samples by position, i.e. row i of the
# control table is treated as the partner of row i of the test table - confirm
# the runs really are paired this way; otherwise an independent-samples test
# may be the appropriate choice.
result_list = []
for t_name, table in tables.items():
    # A Series is one-dimensional, so no transpose is needed before the test.
    t_statistic, p_value = ttest_rel(O2_table[t_name]["Runtime"], table["Runtime"])
    result_list.append((t_name, t_statistic, p_value))

print("Results:")
for name, stat, p in result_list:
    print(f'{name}:\n    t-stat: {stat}\n    p_value: {p}\n')

    

Results:
cacheprof:
    t-stat: 21.01272885185273
    p_value: 7.777800637539709e-61

maillist:
    t-stat: -4.250380448576844
    p_value: 2.8552522243960524e-05

hidden:
    t-stat: 3.6466385037886795
    p_value: 0.0003133119865201059

sorting:
    t-stat: 18.33846741235785
    p_value: 7.381768243770834e-51



In [7]:
# Five cacheprof test rows with the lowest Runtime (ascending sort, then head).
cacheprof_new.sort_values("Runtime").head()

Unnamed: 0,Phase,Runtime
62,Phase-62,0.115
227,Phase-227,0.115
0,Phase-0,0.116
225,Phase-225,0.116
61,Phase-61,0.116


In [8]:
# Five cacheprof control rows with the lowest Runtime (ascending sort, then head).
cacheprof_default.sort_values("Runtime").head()

Unnamed: 0,ID,Phase,Mode,Runtime,Elapsed_Time
107,c777c447-5672-4f2b-9859-92ef0ad3da7e,0|1|2|3|4|5|6|7|8|9|10|11|12|13|20|17|23|14|19...,fast,0.117,0.117
270,58f578fd-b134-41b2-aad2-3c525db93eb4,0|1|2|3|4|5|6|7|8|9|10|11|12|13|19|15|16|22|20...,fast,0.117,0.117
90,ff5c80c4-cb89-45b8-8c76-1d5fba142275,0|1|2|3|4|5|6|7|8|9|10|11|12|13|15|22|18|21|14...,fast,0.117,0.117
262,6e7e4f7a-45db-4eb9-965a-5ea23fe38996,0|1|2|3|4|5|6|7|8|9|10|11|12|13|18|21|20|16|14...,fast,0.117,0.117
260,faa9ab19-2a2b-4b28-a2d5-d4ad73438893,0|1|2|3|4|5|6|7|8|9|10|11|12|13|17|15|18|14|19...,fast,0.117,0.117


In [9]:
# First five rows of the cacheprof test results as loaded, with all columns.
cacheprof.head()

Unnamed: 0,ID,Phase,Mode,Runtime,Elapsed_Time
0,37d2dd82-8f6a-45b4-8fc2-bb265943deca,0|1|2|3|4|5|6|7|8|9|10|11|12|13|21|15|20|17|23...,fast,0.116,0.116
1,021d4073-8c82-42b3-849e-a5236f7d9606,0|1|2|3|4|5|6|7|8|9|10|11|12|13|15|14|22|20|17...,fast,0.117,0.117
2,3b219f7b-270d-484a-bd56-8b0666568a0e,0|1|2|3|4|5|6|7|8|9|10|11|12|13|22|21|15|14|18...,fast,0.116,0.116
3,6e62530b-523c-4fd5-892b-d773f3e557ae,0|1|2|3|4|5|6|7|8|9|10|11|12|13|15|23|18|17|22...,fast,0.117,0.117
4,0a51412b-5d4d-49cf-b14e-d46396061a06,0|1|2|3|4|5|6|7|8|9|10|11|12|13|19|16|21|22|18...,fast,0.117,0.117


## Scatter plots of slow and fast rule