In [2]:
import pandas as pd
import numpy as np
import numpy.matlib
import matplotlib.pyplot as plt
import pdb
import scipy
from scipy.optimize import minimize, fmin
from scipy.stats import multivariate_normal
import xlrd
from tqdm.notebook import tqdm
import matplotlib
from mpl_toolkits import mplot3d
import pingouin as pg
from statsmodels.stats.anova import AnovaRM

In [3]:
"""
What is the difference between wide and long data formats?
"""

wide_data = pg.read_dataset('rm_anova_wide')
long_data = pg.read_dataset('rm_anova')
long_data

Unnamed: 0,Subject,Gender,Region,Education,DesireToKill,Disgustingness,Frighteningness
0,1,Female,North,some,10.0,High,High
1,1,Female,North,some,9.0,High,Low
2,1,Female,North,some,6.0,Low,High
3,1,Female,North,some,6.0,Low,Low
4,2,Female,North,advance,10.0,High,High
...,...,...,...,...,...,...,...
367,99,Female,North,some,9.0,Low,Low
368,100,Female,Europe,some,3.0,High,High
369,100,Female,Europe,some,2.0,High,Low
370,100,Female,Europe,some,5.5,Low,High


In [4]:
"""
Creation of dataframe for anova analysis
"""
xls = pd.ExcelFile('clusterResultsForSubsampledData.xls')
numberOfSubjects = 56
performanceInPresenceOfDistractors = pd.read_excel(xls,'PerformanceVsNumDistractors',nrows=numberOfSubjects)
df = pd.DataFrame(columns=['Subject','Category','Distractors','Accuracy'])
df['Accuracy'] = pd.concat([performanceInPresenceOfDistractors['LowCategoryNoDistractors'],
                            performanceInPresenceOfDistractors['LowCategoryOneDistractor'],
                            performanceInPresenceOfDistractors['LowCategoryTwoDistractors'],
                            performanceInPresenceOfDistractors['LowCategoryThreeDistractors'],
                            performanceInPresenceOfDistractors['HighCategoryNoDistractors'],
                            performanceInPresenceOfDistractors['HighCategoryOneDistractor'],
                            performanceInPresenceOfDistractors['HighCategoryTwoDistractors'],
                            performanceInPresenceOfDistractors['HighCategoryThreeDistractors']])
df['Distractors'] = ([0]*numberOfSubjects + [1]*numberOfSubjects + [2]*numberOfSubjects + [3]*numberOfSubjects)*2
df['Category'] = [0]*numberOfSubjects*4 + [1]*numberOfSubjects*4
subjectList = []
for isubj in range(1,numberOfSubjects+1):
    subjectList += [isubj]
df['Subject'] = subjectList*8
df.index += 1
df

Unnamed: 0,Subject,Category,Distractors,Accuracy
1,1,0,0,93.16
2,2,0,0,90.16
3,3,0,0,90.91
4,4,0,0,89.34
5,5,0,0,72.95
...,...,...,...,...
52,52,1,3,40.00
53,53,1,3,80.00
54,54,1,3,70.00
55,55,1,3,50.00


In [5]:
"""
Anova analysis for effect of number of distractors and trial category on accuracy
"""
aov = pg.rm_anova(dv='Accuracy',
                  within=['Category', 'Distractors'],
                  subject='Subject', data=df, effsize='np2')
print(aov)
print(AnovaRM(data=df, depvar='Accuracy', subject='Subject', within=['Category','Distractors']).fit())


                   Source            SS  ddof1  ddof2            MS  \
0                Category   3959.726808      1     55   3959.726808   
1             Distractors  62633.175090      3    165  20877.725030   
2  Category * Distractors   6845.507519      3    165   2281.835840   

            F         p-unc     p-GG-corr       np2       eps  
0   23.604834  1.022952e-05  1.022952e-05  0.300297  1.000000  
1  228.586226  1.584358e-58  7.259899e-28  0.806055  0.451523  
2   36.903924  2.649753e-18  1.576914e-10  0.401549  0.512899  
                       Anova
                     F Value  Num DF  Den DF  Pr > F
----------------------------------------------------
Category              23.6048 1.0000  55.0000 0.0000
Distractors          228.5862 3.0000 165.0000 0.0000
Category:Distractors  36.9039 3.0000 165.0000 0.0000



In [6]:
"""
Posthoc analysis first using paired ttest, and then using wilcoxon sign-rank test
"""
bonferroniPVal = 0.05/4
TtestNoDistractors = scipy.stats.ttest_rel(performanceInPresenceOfDistractors['LowCategoryNoDistractors'],
                                           performanceInPresenceOfDistractors['HighCategoryNoDistractors'])
TtestOneDistractor = scipy.stats.ttest_rel(performanceInPresenceOfDistractors['LowCategoryOneDistractor'],
                                           performanceInPresenceOfDistractors['HighCategoryOneDistractor'])
TtestTwoDistractors = scipy.stats.ttest_rel(performanceInPresenceOfDistractors['LowCategoryTwoDistractors'],
                                           performanceInPresenceOfDistractors['HighCategoryTwoDistractors'])
TtestThreeDistractors = scipy.stats.ttest_rel(performanceInPresenceOfDistractors['LowCategoryThreeDistractors'],
                                            performanceInPresenceOfDistractors['HighCategoryThreeDistractors'])
TtestNoDistractors, TtestOneDistractor, TtestTwoDistractors, TtestThreeDistractors

print("Median low category no distractors",np.median(performanceInPresenceOfDistractors['LowCategoryNoDistractors']))
print("IQR low category no distractors",scipy.stats.iqr(performanceInPresenceOfDistractors['LowCategoryNoDistractors']))
print("Median high category no distractors",np.median(performanceInPresenceOfDistractors['HighCategoryNoDistractors']))
print("IQR high category no distractors",scipy.stats.iqr(performanceInPresenceOfDistractors['HighCategoryNoDistractors']))
print("wilcoxon test for no distractors",pg.wilcoxon(performanceInPresenceOfDistractors['LowCategoryNoDistractors'],
                                                     performanceInPresenceOfDistractors['HighCategoryNoDistractors']))
print("Median low category one distractor",np.median(performanceInPresenceOfDistractors['LowCategoryOneDistractor']))
print("IQR low category one distractor",scipy.stats.iqr(performanceInPresenceOfDistractors['LowCategoryOneDistractor']))
print("Median high category one distractor",np.median(performanceInPresenceOfDistractors['HighCategoryOneDistractor']))
print("IQR high category one distractor",scipy.stats.iqr(performanceInPresenceOfDistractors['HighCategoryOneDistractor']))
print("wilcoxon test for one distractor",pg.wilcoxon(performanceInPresenceOfDistractors['LowCategoryOneDistractor'],
                                                     performanceInPresenceOfDistractors['HighCategoryOneDistractor']))
print("Median low category two distractors",np.median(performanceInPresenceOfDistractors['LowCategoryTwoDistractors']))
print("IQR low category two distractors",scipy.stats.iqr(performanceInPresenceOfDistractors['LowCategoryTwoDistractors']))
print("Median high category two distractors",np.median(performanceInPresenceOfDistractors['HighCategoryTwoDistractors']))
print("IQR high category two distractors",scipy.stats.iqr(performanceInPresenceOfDistractors['HighCategoryTwoDistractors']))
print("wilcoxon test for two distractors",pg.wilcoxon(performanceInPresenceOfDistractors['LowCategoryTwoDistractors'],
                                                     performanceInPresenceOfDistractors['HighCategoryTwoDistractors']))
print("Median low category three distractors",np.median(performanceInPresenceOfDistractors['LowCategoryThreeDistractors']))
print("IQR low category three distractors",scipy.stats.iqr(performanceInPresenceOfDistractors['LowCategoryThreeDistractors']))
print("Median high category three distractors",np.median(performanceInPresenceOfDistractors['HighCategoryThreeDistractors']))
print("IQR high category three distractors",scipy.stats.iqr(performanceInPresenceOfDistractors['HighCategoryThreeDistractors']))
print("wilcoxon test for three distractors",pg.wilcoxon(performanceInPresenceOfDistractors['LowCategoryThreeDistractors'],
                                                     performanceInPresenceOfDistractors['HighCategoryThreeDistractors']))

"""
Bonferroni corrections for the wilcoxon tests
"""
print( pg.multicomp(pvals=[0.618768,0.546092,0.046552,1.911729e-07],method='fdr_bh'))


Median low category no distractors 89.34
IQR low category no distractors 5.107500000000002
Median high category no distractors 89.66
IQR high category no distractors 8.97250000000001
wilcoxon test for no distractors           W-val alternative     p-val       RBC      CLES
Wilcoxon  736.5   two-sided  0.618768 -0.077068  0.481505
Median low category one distractor 79.625
IQR low category one distractor 11.002499999999998
Median high category one distractor 79.7
IQR high category one distractor 11.724999999999994
wilcoxon test for one distractor           W-val alternative     p-val       RBC      CLES
Wilcoxon  723.5   two-sided  0.546092 -0.093358  0.491869
Median low category two distractors 65.585
IQR low category two distractors 8.599999999999994
Median high category two distractors 67.895
IQR high category two distractors 5.372500000000002
wilcoxon test for two distractors           W-val alternative     p-val       RBC      CLES
Wilcoxon  553.5   two-sided  0.046552 -0.306391  0.

In [None]:
computedLikelihoods = pd.read_excel(xls,'NoContextModelFits')
pbackCalcuatedOnSubsampledDataset = computedLikelihoods['pback'].values
pbackCalcuatedOnSubsampledDataset = pbackCalcuatedOnSubsampledDataset[~numpy.isnan(pbackCalcuatedOnSubsampledDataset)]

posteriorNorm = computedLikelihoods['normOfPosterior'].values
posteriorNorm = posteriorNorm[~numpy.isnan(posteriorNorm)]

performance = pd.read_excel(xls,'Strategy_PerformanceAccuracies')
OneIrrelevantToneAccuracy = performance['OneDistractor'].values
OneIrrelevantToneAccuracy = OneIrrelevantToneAccuracy[~numpy.isnan(OneIrrelevantToneAccuracy)]
TwoIrrelevantTonesAccuracy = performance['TwoDistractors'].values
TwoIrrelevantTonesAccuracy = TwoIrrelevantTonesAccuracy[~numpy.isnan(TwoIrrelevantTonesAccuracy)]

pback_df = pd.DataFrame(columns=['Subject','OneDistractorAccuracy','TwoDistractorsAccuracy','Pback'])
pback_df['Subject'] = list(np.arange(53))
pback_df['OneDistractorAccuracy'] = list(OneIrrelevantToneAccuracy)
pback_df['TwoDistractorsAccuracy'] = list(TwoIrrelevantTonesAccuracy)
pback_df['Pback'] = pbackCalcuatedOnSubsampledDataset[0::2]

"""
Anova analysis
"""
aov = pg.rm_anova(dv='Pback',
                  within=['OneDistractorAccuracy', 'TwoDistractorsAccuracy'],
                  subject='Subject', data=pback_df)
print(aov)
print(AnovaRM(data=pback_df, depvar='Pback', subject='Subject', 
              within=['OneDistractorAccuracy','TwoDistractorsAccuracy']).fit())
