## Statistics

In [1]:
# imports
import pandas as pd; pd.set_option('display.max_columns', None)
import scipy.stats
from tqdm.notebook import tqdm
import pingouin as pg
import warnings; warnings.filterwarnings('ignore')

#### Phonetic Clustering Score

One-sample, two-sided t-tests against chance (0 for H, HS and HR; 0.5 for J and JFL). FDR correction (Benjamini-Hochberg) across the five tests.

In [2]:
def pcs_statistics(pcs_H_data_bsa, pcs_HS_data_bsa, pcs_HR_data_bsa, pcs_J_data_bsa, pcs_JFL_data_bsa):
    stats = []

    # H
    res = scipy.stats.ttest_1samp(pcs_H_data_bsa.pcs, popmean=0, nan_policy='omit', alternative='two-sided')
    stats.append(('H', res.df, res.statistic, res.pvalue))

    # HS
    res = scipy.stats.ttest_1samp(pcs_HS_data_bsa.pcs, popmean=0, nan_policy='omit', alternative='two-sided')
    stats.append(('HS', res.df, res.statistic, res.pvalue))

    # HR
    res = scipy.stats.ttest_1samp(pcs_HR_data_bsa.pcs, popmean=0, nan_policy='omit', alternative='two-sided')
    stats.append(('HR', res.df, res.statistic, res.pvalue))

    res = scipy.stats.ttest_1samp(pcs_J_data_bsa.pcs, popmean=0.5, nan_policy='omit', alternative='two-sided')
    stats.append(('J', res.df, res.statistic, res.pvalue))

    res = scipy.stats.ttest_1samp(pcs_JFL_data_bsa.pcs, popmean=0.5, nan_policy='omit', alternative='two-sided')
    stats.append(('JFL', res.df, res.statistic, res.pvalue))
    
    # save as dataframe
    stats = pd.DataFrame(stats, columns=['metric', 'dof', 't_stat', 'p_val'])

    # FDR correction
    stats['p_val_fdr'] = scipy.stats.false_discovery_control(stats.p_val, method='bh')
    
    return stats

In [3]:
# Load the phonetic clustering score tables, one CSV per metric (H, HS, HR, J, JFL).
pcs_H_data_bsa = pd.read_csv('analyses/dataframes/pcs_H_data_bsa.csv')
pcs_HS_data_bsa = pd.read_csv('analyses/dataframes/pcs_HS_data_bsa.csv')
pcs_HR_data_bsa = pd.read_csv('analyses/dataframes/pcs_HR_data_bsa.csv')
pcs_J_data_bsa = pd.read_csv('analyses/dataframes/pcs_J_data_bsa.csv')
pcs_JFL_data_bsa = pd.read_csv('analyses/dataframes/pcs_JFL_data_bsa.csv')

# Run the chance-level t-tests (FDR-corrected), write the table to disk
# without the index column, and display it as the cell output.
pcs_stats = pcs_statistics(pcs_H_data_bsa, pcs_HS_data_bsa, pcs_HR_data_bsa, pcs_J_data_bsa, pcs_JFL_data_bsa)
pcs_stats.to_csv('statistics/dataframes/pcs_stats.csv', index=False)
pcs_stats

Unnamed: 0,metric,dof,t_stat,p_val,p_val_fdr
0,H,126,10.466725,7.854941999999999e-19,1.963735e-18
1,HS,126,7.373297,1.947343e-11,3.245572e-11
2,HR,126,10.575273,4.2540999999999995e-19,1.963735e-18
3,J,126,0.102761,0.9183157,0.9183157
4,JFL,126,3.370176,0.0009974874,0.001246859


#### Temporal and Semantic Clustering Scores

One-sample, two-sided t-test against chance (0.5), run separately for each score; no multiple-comparison correction is applied.

In [4]:
def cs_statistics(cs_data_bsa, cs):
    """Test a clustering score against chance (0.5) with a one-sample t-test.

    Parameters
    ----------
    cs_data_bsa : DataFrame
        Frame holding the score in a column named ``cs``.
    cs : str
        Column to test; also used as the label in the ``score`` column of
        the result.

    Returns
    -------
    DataFrame
        A single row with columns ``score``, ``dof``, ``t_stat`` and
        ``p_val`` (two-sided test, NaNs omitted).
    """
    columns = ['score', 'dof', 't_stat', 'p_val']

    outcome = scipy.stats.ttest_1samp(cs_data_bsa[cs], popmean=0.5, nan_policy='omit', alternative='two-sided')
    record = (cs, outcome.df, outcome.statistic, outcome.pvalue)

    return pd.DataFrame([record], columns=columns)

In [5]:
# Temporal clustering score: load the table, test the 'tcs' column against
# chance, save the one-row result without the index, and display it.
tcs_data_bsa = pd.read_csv('analyses/dataframes/tcs_data_bsa.csv')
tcs_stats = cs_statistics(tcs_data_bsa, 'tcs')
tcs_stats.to_csv('statistics/dataframes/tcs_stats.csv', index=False)
tcs_stats

Unnamed: 0,score,dof,t_stat,p_val
0,tcs,126,30.158262,1.76578e-59


In [6]:
# Semantic clustering score: load the table, test the 'scs' column against
# chance, save the one-row result without the index, and display it.
scs_data_bsa = pd.read_csv('analyses/dataframes/scs_data_bsa.csv')
scs_stats = cs_statistics(scs_data_bsa, 'scs')
scs_stats.to_csv('statistics/dataframes/scs_stats.csv', index=False)
scs_stats

Unnamed: 0,score,dof,t_stat,p_val
0,scs,126,24.916531,1.5869949999999999e-50


#### Phonetic Intrusions (Word List)

One-sample, two-sided t-tests against 0 for the intrusion-minus-control differences (PLI - PLI control, ELI - ELI control). Paired, two-sided t-test comparing ELI with PLI (ELI - PLI). FDR correction (Benjamini-Hochberg) across all six tests (three per metric, H and J).

In [9]:
def psim_intr_l_statistics(psim_intr_l_H_data_bsa, psim_intr_l_J_data_bsa):
    stats = []
    
    # H
    res_pli = scipy.stats.ttest_1samp(psim_intr_l_H_data_bsa.pli_delta, 0, nan_policy='omit', alternative='two-sided')
    res_eli = scipy.stats.ttest_1samp(psim_intr_l_H_data_bsa.eli_delta, 0, nan_policy='omit', alternative='two-sided')
    res_intr = scipy.stats.ttest_rel(psim_intr_l_H_data_bsa.eli_psim, psim_intr_l_H_data_bsa.pli_psim, nan_policy='omit', alternative='two-sided')
    
    stats.append(('H', 'pli', res_pli.df, res_pli.statistic, res_pli.pvalue))
    stats.append(('H', 'eli', res_eli.df, res_eli.statistic, res_eli.pvalue))
    stats.append(('H', 'intr', res_intr.df, res_intr.statistic, res_intr.pvalue))
    
    # J
    res_pli = scipy.stats.ttest_1samp(psim_intr_l_J_data_bsa.pli_delta, 0, nan_policy='omit', alternative='two-sided')
    res_eli = scipy.stats.ttest_1samp(psim_intr_l_J_data_bsa.eli_delta, 0, nan_policy='omit', alternative='two-sided')
    res_intr = scipy.stats.ttest_rel(psim_intr_l_J_data_bsa.eli_psim, psim_intr_l_J_data_bsa.pli_psim, nan_policy='omit', alternative='two-sided')
    
    stats.append(('J', 'pli', res_pli.df, res_pli.statistic, res_pli.pvalue))
    stats.append(('J', 'eli', res_eli.df, res_eli.statistic, res_eli.pvalue))
    stats.append(('J', 'intr', res_intr.df, res_intr.statistic, res_intr.pvalue))
    
    # save as dataframe
    stats = pd.DataFrame(stats, columns=['metric', 'comparison', 'dof', 't_stat', 'p_val'])
    
    # FDR correction
    stats['p_val_fdr'] = scipy.stats.false_discovery_control(stats.p_val, method='bh')
    
    return stats

In [10]:
# Load the word-list intrusion tables for the H and J metrics.
psim_intr_l_H_data_bsa = pd.read_csv('analyses/dataframes/psim_intr_l_H_data_bsa.csv')
psim_intr_l_J_data_bsa = pd.read_csv('analyses/dataframes/psim_intr_l_J_data_bsa.csv')
# Run the six FDR-corrected tests, save the table without the index, and display it.
psim_intr_l_stats = psim_intr_l_statistics(psim_intr_l_H_data_bsa, psim_intr_l_J_data_bsa)
psim_intr_l_stats.to_csv('statistics/dataframes/psim_intr_l_stats.csv', index=False)
psim_intr_l_stats

Unnamed: 0,metric,comparison,dof,t_stat,p_val,p_val_fdr
0,H,pli,126,7.206218,4.657379e-11,9.314758e-11
1,H,eli,126,14.625708,6.263121e-29,1.878936e-28
2,H,intr,126,4.577638,1.112144e-05,1.668217e-05
3,J,pli,126,4.374155,2.528546e-05,3.034255e-05
4,J,eli,126,15.452771,7.176527000000001e-31,4.305916e-30
5,J,intr,126,1.913268,0.05798253,0.05798253


#### Phonetic Intrusions (Prior Recall)

Repeated-measures ANOVA across response types (CR, PLI, ELI), restricted to subjects with data in all three conditions. Follow-up pairwise tests (CR-PLI, CR-ELI, ELI-PLI) with FDR correction (Benjamini-Hochberg).

In [11]:
# repeated measure anova
def psim_intr_r_rm_anova(psim_intr_r_data_bsa):
    """Run a one-way repeated-measures ANOVA of ``psim`` over response type.

    Only subjects whose rows include all three response types ``'cr'``,
    ``'pli'`` and ``'eli'`` are kept, and rows with ``resp_type == 'control'``
    are dropped before the data are passed to ``pg.rm_anova``.

    Parameters
    ----------
    psim_intr_r_data_bsa : DataFrame
        Frame with ``subject``, ``resp_type`` and ``psim`` columns.

    Returns
    -------
    anova_results
        Whatever ``pg.rm_anova`` returns for ``dv='psim'``,
        ``within='resp_type'``, ``subject='subject'``.
    df : DataFrame
        The filtered rows that were passed to the ANOVA, so follow-up tests
        can reuse exactly the same data.
    """
    required = {'cr', 'pli', 'eli'}

    # Group by the bare column name rather than a one-element list: the group
    # key is then the subject id itself on every pandas version. With a list
    # key, pandas >= 2 yields 1-tuples while older releases yield scalars, so
    # indexing the key with [0] silently takes the first character of a string
    # id (or raises on an integer id) on the older releases.
    subs_balanced = [
        sub
        for sub, data in psim_intr_r_data_bsa.groupby('subject')
        if required.issubset(data.resp_type.unique())
    ]

    df = psim_intr_r_data_bsa[psim_intr_r_data_bsa.subject.isin(subs_balanced)].query("resp_type != 'control'")
    anova_results = pg.rm_anova(data=df, dv='psim', within='resp_type', subject='subject')
    return anova_results, df

In [16]:
# Load the prior-recall intrusion table for the H metric.
psim_intr_r_H_data_bsa = pd.read_csv('analyses/dataframes/psim_intr_r_H_data_bsa.csv')

# Run the repeated-measures ANOVA; dfH is the filtered data that went into it.
# The ANOVA table is saved without the index and displayed as the cell output.
aovH, dfH = psim_intr_r_rm_anova(psim_intr_r_H_data_bsa)
aovH.to_csv('statistics/dataframes/psim_intr_r_H_anova.csv', index=False)
aovH

Unnamed: 0,Source,ddof1,ddof2,F,p-unc,p-GG-corr,ng2,eps,sphericity,W-spher,p-spher
0,resp_type,2,252,2.182309,0.114908,0.129743,0.011439,0.738451,False,0.645815,1.354383e-12


In [19]:
# Load the prior-recall intrusion table for the J metric.
psim_intr_r_J_data_bsa = pd.read_csv('analyses/dataframes/psim_intr_r_J_data_bsa.csv')

# Run the repeated-measures ANOVA; dfJ is the filtered data that went into it.
# The ANOVA table is saved without the index and displayed as the cell output.
aovJ, dfJ = psim_intr_r_rm_anova(psim_intr_r_J_data_bsa)
aovJ.to_csv('statistics/dataframes/psim_intr_r_J_anova.csv', index=False)
aovJ

Unnamed: 0,Source,ddof1,ddof2,F,p-unc,ng2,eps
0,resp_type,2,252,3.868143,0.022149,0.017595,0.90609


In [20]:
# Pairwise within-subject comparisons between response types on dfJ (the
# filtered J data returned by psim_intr_r_rm_anova), with padjust='fdr_bh'.
# The table is saved without the index and displayed as the cell output.
# NOTE(review): pairwise tests are run for J only -- presumably because only
# the J ANOVA reached significance; confirm this is intentional.
pairwiseJ = pg.pairwise_tests(dfJ, dv='psim', within='resp_type', subject='subject', padjust='fdr_bh')
pairwiseJ.to_csv('statistics/dataframes/psim_intr_r_J_pairwise.csv', index=False)
pairwiseJ

Unnamed: 0,Contrast,A,B,Paired,Parametric,T,dof,alternative,p-unc,p-corr,p-adjust,BF10,hedges
0,resp_type,cr,eli,True,True,-2.365367,126.0,two-sided,0.019538,0.038911,fdr_bh,1.438,-0.292637
1,resp_type,cr,pli,True,True,0.459654,126.0,two-sided,0.646557,0.646557,fdr_bh,0.109,0.056558
2,resp_type,eli,pli,True,True,2.253741,126.0,two-sided,0.02594,0.038911,fdr_bh,1.128,0.248465
