# Statistic Test for AUC
To assess whether the AUC values obtained from my four- or five-fold cross-validation are significantly higher than random guessing (with a one-tailed t-test and a desired p-value < 0.05).<br>
Null hypothesis (H0): the performance of my model is not significantly different from random guessing. 
<br>Alternative hypothesis (Ha): the performance is significantly higher than random guessing.


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

In [3]:
# Change current directory to the folder containing your CSV file
os.chdir('/content/drive/MyDrive/_Masterarbeit/results')

# Verify the current directory
print(os.getcwd())

/content/drive/MyDrive/_Masterarbeit/results


In [4]:
ls -l

total 564
drwx------ 2 root root   4096 May 19 06:40 [0m[01;34mresults[0m/
-rw------- 1 root root 510029 Jun  1 07:42 results_ROC_curves.ipynb
-rw------- 1 root root  30832 May 31 08:55 results_statistic_test2.ipynb
-rw------- 1 root root  30832 May 20 17:50 results_statistic_test.ipynb


In [5]:
from scipy.stats import ttest_ind
def ttest(results_for_one_model, random_guessing):
  """In your code, ttest_ind is used to compare the performance scores of two 
  different models (results_for_one_model and random_guessing). It assesses 
  whether the mean performance of one model is significantly different from the 
  mean performance of another model (random guessing).
  """
  p_value = 0.05
  #random_guessing = 0.5
  tt_test_results = ttest_ind(results_for_one_model, random_guessing)
  cur_p_value = tt_test_results[1]
  if cur_p_value < p_value:
      #print('result is significant')
      return 1
  else:
      #print('NO SIGNIFICANCE DETECT')
      return 0

In [6]:
from scipy.stats import ttest_1samp
def ttest_2(performance_scores):
  """compare the mean performance scores (performance_scores) against a baseline 
  value (e.g., random guessing). It assesses whether the mean performance 
  significantly deviates from the baseline value."""
  baseline_value = 0.5  # The baseline value for random guessing
  alpha = 0.05  # Desired significance level

  t_statistic, p_value = ttest_1samp(performance_scores, baseline_value)

  if p_value < alpha:
    return 1
    #print("Results are significantly higher than random guessing.")
  else:
    return 0
    #print("No significant difference from random guessing.")

# Subject and Page evaluation settings (5 folds)

In [7]:
# get data for subj and book page evaluation settings: 5 folds
# gets path, results and data needed to identify which model the scores belong to
dataset = ['InDiCo', 'SBSAT']
versions = ['s1_rm1_lf1', 's1_rm1_lf0', 's1_rm1_lf1_pos_cont', 's1_rm0_lf0', 
            's0_rm1_lf0', 's0_rm0_lf1']
criteria = ['book-page', 'subj']

all_data = []

for d in dataset: 
  for v in versions: 
    for c in criteria: 
      data_file = {}
      data_file['dataset'] = d
      data_file['version'] = v
      data_file['criterium'] = c
      filename = d.lower() + "_splits_" + v + "_" + c + "_binary_score_64_tanh.csv"
      path = os.path.join("results", d, v, filename)
      data_file['path'] = path
      df = pd.read_csv(path)
      # results of the different folds
      data_file['results'] = df[["fold0_auc","fold1_auc","fold2_auc",
                                 "fold3_auc","fold4_auc"]].values.tolist()[0]
      all_data.append(data_file)

print(all_data)

[{'dataset': 'InDiCo', 'version': 's1_rm1_lf1', 'criterium': 'book-page', 'path': 'results/InDiCo/s1_rm1_lf1/indico_splits_s1_rm1_lf1_book-page_binary_score_64_tanh.csv', 'results': [0.5172525651774664, 0.4990773115773116, 0.551340383669454, 0.5231220392510716, 0.5072198222354636]}, {'dataset': 'InDiCo', 'version': 's1_rm1_lf1', 'criterium': 'subj', 'path': 'results/InDiCo/s1_rm1_lf1/indico_splits_s1_rm1_lf1_subj_binary_score_64_tanh.csv', 'results': [0.5166736535313611, 0.5060508935508936, 0.4915867355304148, 0.5103423925303309, 0.5508565469983614]}, {'dataset': 'InDiCo', 'version': 's1_rm1_lf0', 'criterium': 'book-page', 'path': 'results/InDiCo/s1_rm1_lf0/indico_splits_s1_rm1_lf0_book-page_binary_score_64_tanh.csv', 'results': [0.5092276520142132, 0.5593240093240093, 0.5152330300049188, 0.503275826557734, 0.5127315159640499]}, {'dataset': 'InDiCo', 'version': 's1_rm1_lf0', 'criterium': 'subj', 'path': 'results/InDiCo/s1_rm1_lf0/indico_splits_s1_rm1_lf0_subj_binary_score_64_tanh.csv',

In [8]:
pd.read_csv(all_data[0]['path'])

Unnamed: 0,ahn_baseline,fold0_auc,fold1_auc,fold2_auc,fold3_auc,fold4_auc,fold0_tpr,fold1_tpr,fold2_tpr,fold3_tpr,...,fold2_y_pred,fold3_y_pred,fold4_y_pred,fold0_y_test,fold1_y_test,fold2_y_test,fold3_y_test,fold4_y_test,avg_auc,std_auc
0,nn_laura,0.517253,0.499077,0.55134,0.523122,0.50722,[0. 0.00413223 0.00826446 0.00826446 0...,[0. 0. 0.00384615 0.00384615 0...,[0. 0. 0.00438596 0.00438596 0...,[0. 0. 0. 0.00395257 0...,...,[[0.98806363]\n [0.40452462]\n [0.963297 ]\n ...,[[0.38164088]\n [0.38164088]\n [0.38164088]\n ...,[[0.29683265]\n [0.11949039]\n [0.29683265]\n ...,[0 0 0 1 1 1 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0...,[0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0...,[0 0 0 0 1 0 1 0 0 0 0 1 1 0 0 0 0 0 1 0 0 0 0...,[0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0...,[0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0...,0.519602,0.017888


In [9]:
# significance test for subject and page settings

random_guessing = [0.5, 0.5, 0.5, 0.5, 0.5] # 5 folds
significant_results = []

print("Are the model result significant or not?")

for d in all_data: 
  decision = ttest_2(d['results'])
  if decision == 1: 
    significant_results.append((d['dataset'], d['version'], d['criterium']))
    print(d['dataset'], ",", d['version'], ",",  d['criterium'],": ", "\tresult is significant")
  elif decision == 0: 
    print(d['dataset'], ", ", d['version'], ", ", d['criterium'], ": ", '\tNO SIGNIFICANCE DETECT')

Are the model result significant or not?
InDiCo ,  s1_rm1_lf1 ,  book-page :  	NO SIGNIFICANCE DETECT
InDiCo ,  s1_rm1_lf1 ,  subj :  	NO SIGNIFICANCE DETECT
InDiCo ,  s1_rm1_lf0 ,  book-page :  	NO SIGNIFICANCE DETECT
InDiCo ,  s1_rm1_lf0 ,  subj :  	NO SIGNIFICANCE DETECT
InDiCo ,  s1_rm1_lf1_pos_cont ,  book-page :  	NO SIGNIFICANCE DETECT
InDiCo , s1_rm1_lf1_pos_cont , subj :  	result is significant
InDiCo ,  s1_rm0_lf0 ,  book-page :  	NO SIGNIFICANCE DETECT
InDiCo ,  s1_rm0_lf0 ,  subj :  	NO SIGNIFICANCE DETECT
InDiCo ,  s0_rm1_lf0 ,  book-page :  	NO SIGNIFICANCE DETECT
InDiCo ,  s0_rm1_lf0 ,  subj :  	NO SIGNIFICANCE DETECT
InDiCo ,  s0_rm0_lf1 ,  book-page :  	NO SIGNIFICANCE DETECT
InDiCo ,  s0_rm0_lf1 ,  subj :  	NO SIGNIFICANCE DETECT
SBSAT , s1_rm1_lf1 , book-page :  	result is significant
SBSAT , s1_rm1_lf1 , subj :  	result is significant
SBSAT ,  s1_rm1_lf0 ,  book-page :  	NO SIGNIFICANCE DETECT
SBSAT , s1_rm1_lf0 , subj :  	result is significant
SBSAT , s1_rm1_lf1_po

# Book evaluation setting (4 folds)

In [10]:
# get paths (subj and book page: 5 pages)
# example: SBSAT/s1_rm1_lf1/sbsat_splits_s1_rm1_lf1_book-page_binary_score_64_tanh.csv
dataset = ['InDiCo', 'SBSAT']
versions = ['s1_rm1_lf1', 's1_rm1_lf0', 's1_rm1_lf1_pos_cont', 's1_rm0_lf0', 's0_rm1_lf0', 's0_rm0_lf1']
criteria = ['book']

all_data_book = []

for d in dataset: 
  for v in versions: 
    for c in criteria:
      data_file = {}
      data_file['dataset'] = d
      data_file['version'] = v
      filename = d.lower() + "_splits_" + v + "_" + c + "_binary_score_64_tanh.csv"
      path = os.path.join("results", d, v, filename)
      data_file['path'] = path
      df = pd.read_csv(path)
      data_file['results'] = df[["fold0_auc","fold1_auc","fold2_auc","fold3_auc"]].values.tolist()[0]
      all_data_book.append(data_file)
  
print(all_data_book)

[{'dataset': 'InDiCo', 'version': 's1_rm1_lf1', 'path': 'results/InDiCo/s1_rm1_lf1/indico_splits_s1_rm1_lf1_book_binary_score_64_tanh.csv', 'results': [0.49956130312047, 0.5169334424301311, 0.5178368121442125, 0.4896774193548386]}, {'dataset': 'InDiCo', 'version': 's1_rm1_lf0', 'path': 'results/InDiCo/s1_rm1_lf0/indico_splits_s1_rm1_lf0_book_binary_score_64_tanh.csv', 'results': [0.4749434144604664, 0.5523536781814928, 0.5005249841872232, 0.4900822264389627]}, {'dataset': 'InDiCo', 'version': 's1_rm1_lf1_pos_cont', 'path': 'results/InDiCo/s1_rm1_lf1_pos_cont/indico_splits_s1_rm1_lf1_pos_cont_book_binary_score_64_tanh.csv', 'results': [0.511088222578266, 0.5246874121046307, 0.4862365591397848, 0.5143516761543327]}, {'dataset': 'InDiCo', 'version': 's1_rm0_lf0', 'path': 'results/InDiCo/s1_rm0_lf0/indico_splits_s1_rm0_lf0_book_binary_score_64_tanh.csv', 'results': [0.5235942626077669, 0.53608504436319, 0.4937824161922833, 0.5119165085388994]}, {'dataset': 'InDiCo', 'version': 's0_rm1_lf0'

In [11]:
random_guessing = [0.5, 0.5, 0.5, 0.5] # 4 folds

print("Model result significant or not?")
for d in all_data_book: 
  decision = ttest_2(d['results'])
  if decision == 1: 
    significant_results.append((d['dataset'], d['version'], 'book'))
    print(d['dataset'], ",", d['version'], ",",  'book',": ", "\tresult is significant")
  elif decision == 0: 
    print(d['dataset'], ", ", d['version'], ", ", 'book', ": ", '\tNO SIGNIFICANCE DETECT')


Model result significant or not?
InDiCo ,  s1_rm1_lf1 ,  book :  	NO SIGNIFICANCE DETECT
InDiCo ,  s1_rm1_lf0 ,  book :  	NO SIGNIFICANCE DETECT
InDiCo ,  s1_rm1_lf1_pos_cont ,  book :  	NO SIGNIFICANCE DETECT
InDiCo ,  s1_rm0_lf0 ,  book :  	NO SIGNIFICANCE DETECT
InDiCo ,  s0_rm1_lf0 ,  book :  	NO SIGNIFICANCE DETECT
InDiCo ,  s0_rm0_lf1 ,  book :  	NO SIGNIFICANCE DETECT
SBSAT ,  s1_rm1_lf1 ,  book :  	NO SIGNIFICANCE DETECT
SBSAT ,  s1_rm1_lf0 ,  book :  	NO SIGNIFICANCE DETECT
SBSAT , s1_rm1_lf1_pos_cont , book :  	result is significant
SBSAT , s1_rm0_lf0 , book :  	result is significant
SBSAT ,  s0_rm1_lf0 ,  book :  	NO SIGNIFICANCE DETECT
SBSAT ,  s0_rm0_lf1 ,  book :  	NO SIGNIFICANCE DETECT


# Overview: Statistically significant results

In [12]:
print("\nSummary: Models where the result is significant: ")
for i in sorted(significant_results): 
  print(i)


Summary: Models where the result is significant: 
('InDiCo', 's1_rm1_lf1_pos_cont', 'subj')
('SBSAT', 's0_rm0_lf1', 'book-page')
('SBSAT', 's0_rm0_lf1', 'subj')
('SBSAT', 's0_rm1_lf0', 'book-page')
('SBSAT', 's1_rm0_lf0', 'book')
('SBSAT', 's1_rm0_lf0', 'subj')
('SBSAT', 's1_rm1_lf0', 'subj')
('SBSAT', 's1_rm1_lf1', 'book-page')
('SBSAT', 's1_rm1_lf1', 'subj')
('SBSAT', 's1_rm1_lf1_pos_cont', 'book')
('SBSAT', 's1_rm1_lf1_pos_cont', 'book-page')
