In [27]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind

cv20_df = pd.read_csv('ttest_20CV.csv')
cv20_ACC_df = cv20_df.iloc[:, 1::2]
cv20_AUC_df = cv20_df.iloc[:, 2::2]


# 1-tailed two sample welch t-test code

Algorithm 1 and 2 leave one site out CV results random variables $Y, X$. Let $Y_i, X_i$ be the 20 fold CV results with $i = 1,\ldots,20$. 

Let the population means of the two algorithms be $\mu_1$ and $\mu_2$ respectively. Then the test for statistical significance between distribution means is:

$\mathbf{H_0}: \mu_1 = \mu_2$
$\mathbf{H_a}: \mu_1 \neq \mu_2$ 

Test statistic T:

$$
T=\frac{\overline{Y}_{1}-\overline{Y}_{2}}{\sqrt{s_{1}^{2} / N_{1}+s_{2}^{2} / N_{2}}}
$$

$
\begin{array}{l}{\text { where } N_{1} \text { and } N_{2} \text { are the sample sizes, } \overline{Y}_{1} \text { and } \overline{Y}_{2} \text { are the sample means, and } s_{1}^{2}} {\text { and } s_{2}^{2} \text { are the sample variances. }}\end{array}
$

The test statistic is compared with the student t distribution with $N1 + N2 - 2$ degrees of freedom to obtain a p-value. 

In [28]:
norm_ttest = pd.DataFrame(columns=['model', 'MIDA', 'Raw', 'Parisot et al (1035)', 'Parisot et al (871)'])

# TP MIDA Ridge 1035 subjects
a = cv20_ACC_df['TP MIDA Ridge (ACC) (1035)']
b = cv20_ACC_df['TP raw Ridge (ACC) (1035)']
c = cv20_ACC_df['Parisot et al (2018) (ACC) (1035)']
d = cv20_ACC_df['Parisot et al (2018)   (ACC) (871)']

norm_ttest.loc[0] =['TP MIDA Ridge (1035)', 'NA',
                    ttest_ind(a, b, equal_var = False)[1]/2,
                    ttest_ind(a, c, equal_var = False)[1]/2,
                    ttest_ind(a, d, equal_var = False)[1]/2]

# TP raw Ridge 1035 subjects
a = cv20_ACC_df['TP raw Ridge (ACC) (1035)']
b = cv20_ACC_df['TP raw Ridge (ACC) (1035)']
c = cv20_ACC_df['Parisot et al (2018) (ACC) (1035)']
d = cv20_ACC_df['Parisot et al (2018)   (ACC) (871)']

norm_ttest.loc[1] =['TP raw Ridge (1035)', 'NA',
                    'NA',
                    ttest_ind(a, c, equal_var = False)[1]/2,
                    ttest_ind(a, d, equal_var = False)[1]/2]

# TP MIDA LR 871 subjects
a = cv20_ACC_df['TP MIDA LR (ACC) (871)']
b = cv20_ACC_df['TP raw LR (ACC) (871)']
c = cv20_ACC_df['Parisot et al (2018) (ACC) (1035)']
d = cv20_ACC_df['Parisot et al (2018)   (ACC) (871)']

norm_ttest.loc[2] =['TP MIDA LR (871)', 'NA',
                    ttest_ind(a, b, equal_var = False)[1]/2,
                    ttest_ind(a, c, equal_var = False)[1]/2,
                    ttest_ind(a, d, equal_var = False)[1]/2]

# TP raw LR 871 subjects
a = cv20_ACC_df['TP raw LR (ACC) (871)']
b = cv20_ACC_df['TP raw LR (ACC) (871)']
c = cv20_ACC_df['Parisot et al (2018) (ACC) (1035)']
d = cv20_ACC_df['Parisot et al (2018)   (ACC) (871)']

norm_ttest.loc[3] =['TP raw Ridge (871)', 'NA',
                    'NA',
                    ttest_ind(a, c, equal_var = False)[1]/2,
                    ttest_ind(a, d, equal_var = False)[1]/2]

## Notes

For the p-value results that follow, the following should be noted:

- The p-values are generated from comparing two models accuracy in individual folds
- Comparing performance between different sample sizes (1035, 871) is not valid since the data in individual folds is different. The p-values are given nonetheless.
- Comparison against *Raw* (see p value tables below) refers to the Raw version of the model in the model column

Lastly, the "corrected resampled t-test" methods are not included for leave one site out evaluation since the training/test samples differ greatly from fold to fold.

## Average leave-one-site out results

In [29]:
result_summary = pd.DataFrame(cv20_ACC_df.mean(axis = 0))
result_summary.columns = ['ACC']
result_summary['AUC'] = list(cv20_ACC_df.mean(axis = 0))
result_summary = result_summary
result_summary

Unnamed: 0,ACC,AUC
TP MIDA Ridge (ACC) (1035),0.709354,0.709354
TP raw Ridge (ACC) (1035),0.685462,0.685462
TP MIDA LR (ACC) (871),0.687849,0.687849
TP raw LR (ACC) (871),0.681414,0.681414
Parisot et al (2018) (ACC) (1035),0.682928,0.682928
Parisot et al (2018) (ACC) (871),0.683711,0.683711


### 1-tailed two sample welch t-test p values


In [30]:
norm_ttest

Unnamed: 0,model,MIDA,Raw,Parisot et al (1035),Parisot et al (871)
0,TP MIDA Ridge (1035),,0.201155,0.115821,0.132552
1,TP raw Ridge (1035),,,0.460802,0.473706
2,TP MIDA LR (871),,0.405025,0.409571,0.426988
3,TP raw Ridge (871),,,0.475119,0.463564
