In [4]:
# Preparations
def add_values_in_dict(dict, key, list_of_values):
    '''Append multiple values to the list stored under a key.

    Parameters
    ----------
    dict : dict
        Dictionary to update in place. The parameter name shadows the
        builtin; it is kept so existing callers continue to work.
    key : hashable
        Key whose list should receive the values. Created if missing.
    list_of_values : iterable
        Values to append, in order.

    Returns
    -------
    dict
        The same dictionary object that was passed in.
    '''
    # Always extend a list owned by the dictionary. Storing the caller's list
    # object directly would make later extends mutate the caller's list too.
    dict.setdefault(key, []).extend(list_of_values)
    return dict

def compare_two_proportions(n1,n2,p1,p2,alpha=0.05):
    '''Two-sided z-test (pooled standard error) for a difference between two proportions.

    Prints the verdict together with the z statistic and the p value.

    Parameters
    ----------
    n1, n2 : number of observations in the first / second group
    p1, p2 : number of specific instances (a COUNT, not a proportion) in the
             first / second group
             e.g. 12 students, 5 are female; n1 = 12, p1 = 5
    alpha  : significance level; H0 is rejected when p value <= alpha

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If a count is negative or larger than its group size.
    '''
    if not (0 <= p1 <= n1 and 0 <= p2 <= n2):
        raise ValueError('p1 and p2 are counts and must satisfy 0 <= p <= n')

    p_bar1 = p1 / n1
    p_bar2 = p2 / n2
    p_bar = (p1+p2) / (n1+n2)                          # pooled proportion
    std_error = np.sqrt(p_bar*(1-p_bar)*(1/n1+1/n2))  # standard error

    # A pooled proportion of exactly 0 or 1 gives a zero standard error, so the
    # statistic would be 0/0. Report that explicitly instead of printing nothing.
    if std_error == 0:
        print('Failed to reject H0: test statistic undefined (pooled proportion is 0 or 1)')
        return

    test_statistic = (p_bar1 - p_bar2)/std_error      # test statistic
    p_value = 2*stats.norm.sf(abs(test_statistic))    # two-sided p value

    # if/else guarantees that exactly one verdict is always printed.
    if p_value<=alpha:
        print(f'Reject H0: z-statistic={test_statistic:.3f} p-value={p_value:.4f}')
    else:
        print(f'Failed to reject H0: z-statistic={test_statistic:.3f} p-value={p_value:.4f}')

    return


import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import hddm
import os
import kabuki
import sys
import multiprocessing
import scipy.stats as stats
import scipy.stats
import statistics
import math
import pingouin as pg

from pymer4.models import Lmer
# Dismiss every matplotlib figure that is still open.
plt.close('all')

# Root folders of the study data (each one ends with a trailing slash).
beh_path = '/Users/nagrodzkij/data/angry/'
input_path = '/Users/nagrodzkij/data/angry/input/'
output_path = '/Users/nagrodzkij/data/angry/output/'

# Face scores table; the first CSV column is used as the index.
FaceScores = pd.read_csv(f'{input_path}/demog/FaceScores.csv', index_col=[0])

# Headerless single-column file; its only column is named '0' and turned
# into a plain Python list.
prev_depressed = pd.read_csv(f'{input_path}/demog/previously_depressed.csv', names=['0'])
list_depressed = prev_depressed['0'].tolist()

# Demographics table, one row per participant (default integer index).
demog = pd.read_csv(f'{output_path}table_demog_byccid.csv')

In [5]:
# To demog_by_ccid table, add categorical variable education by self-reported highest level

# Codes found in the "qualifications" column:
# 1. College or university degree or higher
# 2. A levels/AS levels or equivalent
# 3. O levels/GCSEs or equivalent
# 4. CSEs or equivalent
# 5. NVQ or HND or HNC or equivalent
# 6. Other professional qualifications e.g.: nursing, teaching
# 0. None of the above
# 8. No answer

# Derived level: 0 = none, 1 = GCSE, 2 = A Levels, 3 = Uni

# (questionnaire code, derived level) pairs, checked in this priority order:
# the first code present in the answer decides the level.
_QUALIFICATION_LEVELS = (
    ('1', 3), ('6', 3),
    ('2', 2), ('4', 2),
    ('3', 1), ('5', 1),
    ('0', 0),
)


def _highest_qualification(quali):
    '''Map one "qualifications" answer to the derived education level.

    Returns 0-3 for the first matching code in priority order, or NaN when the
    answer is missing or contains none of the mapped codes (e.g. only 8).
    '''
    if pd.isna(quali):
        return np.nan
    # A column parsed as numbers yields e.g. 2.0; drop the ".0" so that its
    # zero is not mistaken for code 0.
    if isinstance(quali, float) and quali.is_integer():
        quali = int(quali)
    codes = str(quali)
    for code, level in _QUALIFICATION_LEVELS:
        if code in codes:
            return level
    return np.nan


# Label-independent: works whatever the index of demog looks like, so the cell
# can be re-run after rows have been dropped. Stored as float so NaN fits.
demog["highest_qualification"] = demog["qualifications"].map(_highest_qualification).astype(float)

In [6]:
# Headerless CSV: column 0 holds the ccid, column 1 the value stored below as
# 'diagnosisdelay'.
timesincedep = pd.read_csv(input_path+'demog/gender_discrim_timeSinceDepression.csv',header=None)
cid = timesincedep.loc[:,0]
times = timesincedep.loc[:,1]

# Match delays to participants by ccid rather than by row position, so the
# result does not depend on the CSV being sorted like demog or on every listed
# ccid being present in demog.
delay_by_ccid = dict(zip(cid, times))
has_delay = demog['ccid'].isin(cid)
demog.loc[has_delay,'diagnosisdelay'] = demog.loc[has_delay,'ccid'].map(delay_by_ccid)

# Remove currently depressed participant (rows whose diagnosis delay is 0).
# Rows without a delay (NaN) are kept.
demog = demog.drop(index=demog.index[demog['diagnosisdelay']==0])

ccid_onantidepressants = [220635, 220999,221336,221511,321595,410084,420004,520055,610508]

# 0/1 indicator columns derived from membership of the ccid lists.
demog['antidepressant'] = demog['ccid'].isin(ccid_onantidepressants).astype(int)
demog['prev_depressed'] = demog['ccid'].isin(list_depressed).astype(int)

demog.to_csv(output_path+'table_demog_withedu_byccid.csv')

In [5]:
# COMPARE THE TWO GROUPS - MANN WHITNEY U VS T-TEST

alpha = 0.05            # significance level used for every chi-squared verdict
EXCLUDED_CCID = 520055  # participant who is currently depressed


def _report_chi2(title, table):
    '''Print `title`, run a chi-squared contingency test on `table`, print the
    statistic, p value and dof, and return (stat, p, dof, expected).'''
    print(title)
    stat, p, dof, expected = stats.chi2_contingency(table)
    print('X2 statistic is ' +str(stat))
    print("p value is " + str(p))
    print("dof is "+str(dof))
    return stat, p, dof, expected


def _report_independence(p, alpha):
    '''Print the chi-squared verdict for p value `p` at level `alpha`.'''
    if p <= alpha:
        print('Dependent (reject H0)')
    else:
        print('Independent (H0 holds true)')


def _print_median_iqr(median_label, iqr_label, data):
    '''Print the median and the 25th/75th percentiles of `data`.'''
    print(median_label + str(statistics.median(data)))
    q3, q1 = np.percentile(data, [75 ,25])
    print(iqr_label + str(q1) + ', ' + str(q3))


def _compare_two_groups(data_group1, data_group2, normality_alpha=0.05):
    '''Compare depressed (group 1) and non-depressed (group 2) values.

    Runs Shapiro-Wilk on each group and prints both p values. Only when BOTH
    p values exceed `normality_alpha` is a t-test run (pingouin, with
    correction=True); otherwise medians, quartiles and a Mann-Whitney U test
    are reported. Returns the result object of whichever test was run.
    '''
    stat1, p1 = stats.shapiro(data_group1)
    stat2, p2 = stats.shapiro(data_group2)

    print(p1)
    print(p2)

    # Each p value needs its own comparison: `p1 and p2 > 0.05` would only
    # check that p1 is non-zero.
    if p1 > normality_alpha and p2 > normality_alpha:
        print('Normally distributed, therefore performing t-test')
        tstat = pg.ttest(data_group1,data_group2,correction=True)
        # NOTE(review): np.var is called with its default ddof - confirm this
        # is the variance/std that should be reported.
        print('Variance: ',np.var(data_group1), np.var(data_group2))
        print('Mean depressed: ' + str(statistics.mean(data_group1)) + 'Mean non-depressed:' + str(statistics.mean(data_group2)))
        print('Std depressed: '+ str(math.sqrt(np.var(data_group1))) + 'Std non-depressed: ' + str(math.sqrt(np.var(data_group2))))
        print(tstat)
        print('')
        return tstat

    print('Not normally distributed, performing Mann Whitney U test')
    _print_median_iqr('Median depressed: ', 'IQR depressed: ', data_group1)

    print(' ')

    _print_median_iqr('Median non-depressed:', 'IQR non-depressed: ', data_group2)
    res = stats.mannwhitneyu(data_group1, data_group2)
    print(res)
    print('')

    # Descriptives for both groups pooled together.
    data_total = pd.concat([data_group1,data_group2],axis=0)
    _print_median_iqr('Median all: ', 'IQR all: ', data_total)
    print('')
    return res


# Exclude participant who is currently depressed. This has to happen BEFORE
# the groups are formed, otherwise the participant is still part of them.
demog = demog[demog.ccid != EXCLUDED_CCID]

list_all = list(demog.ccid)
list_nondepressed = list(set(list_all) - set(list_depressed))

demog_depressed = demog[demog['ccid'].isin(list_depressed)]
demog_nondepressed = demog[demog['ccid'].isin(list_nondepressed)]


# SEX
number_depressed = len(demog_depressed)
number_nondepressed = len(demog_nondepressed)

female_depressed = int((demog_depressed['sex']=='F').sum())
female_nondepressed = int((demog_nondepressed['sex']=='F').sum())

male_depressed = int((demog_depressed['sex']=='M').sum())
male_nondepressed = int((demog_nondepressed['sex']=='M').sum())

data_for_test = [[female_depressed,male_depressed],[female_nondepressed,male_nondepressed]]
stat,p,dof,expected = _report_chi2('Testing if difference in proportion of females - chi squared test', data_for_test)
N = female_depressed + female_nondepressed + male_depressed + male_nondepressed
num_female = female_depressed + female_nondepressed
prop_female = num_female / N
print("N is "+str(N))
print("number female is "+str(num_female))
print("proportion of female is " +str(prop_female))
print("Number female depressed is " + str(female_depressed) + " which is " +str(female_depressed/(female_depressed+male_depressed)) +" of all depressed")
print("Number female non-depressed is " + str(female_nondepressed) + " which is " +str(female_nondepressed/(female_nondepressed+male_nondepressed)) +" of all nondepressed")
print('')
_report_independence(p, alpha)

print('')

#########################
# HANDEDNESS
# Scores > 0 are counted as right handed, scores <= 0 as left handed.
rh_depressed = int((demog_depressed['handedness']>0).sum())
rh_nondepressed = int((demog_nondepressed['handedness']>0).sum())

lh_depressed = int((demog_depressed['handedness']<=0).sum())
lh_nondepressed = int((demog_nondepressed['handedness']<=0).sum())

data_for_test = [[rh_depressed,lh_depressed],[rh_nondepressed,lh_nondepressed]]
stat,p,dof,expected = _report_chi2('Testing if difference in proportion of right handed - chi squared test', data_for_test)
_report_independence(p, alpha)

print('')

#########################
# EDUCATION
edu_list = ['None','GCSE','A-Level','Uni']   # position k <-> highest_qualification == k
groups = ['Dep','Not dep']
group_ccids = [list_depressed, list_nondepressed]

edu_records = []     # long format: one row per (education level, group)
data_for_test = []   # contingency table: one row of counts per group
for group_name, ccids in zip(groups, group_ccids):
    filtered = demog[demog['ccid'].isin(ccids)]
    counts = [int((filtered['highest_qualification']==k).sum()) for k in range(len(edu_list))]
    data_for_test.append(counts)
    edu_records.extend(
        {'Edu': edu_name, 'Group': group_name, 'Number': count}
        for edu_name, count in zip(edu_list, counts)
    )

df_for_test = pd.DataFrame(edu_records, columns=['Edu','Group','Number'])

# The test runs on the counts computed above, so it always reflects the data
# that is actually loaded instead of a hand-copied table.
stat,p,dof,expected = _report_chi2('Testing if level of education dependent on group - chi squared test', data_for_test)
_report_independence(p, alpha)

print('')

####################
# ACER
print('Testing difference in ACER - choose the correct test')
data_group1 = demog_depressed['acer']
data_group2 = demog_nondepressed['acer']
_compare_two_groups(data_group1, data_group2)


###################
# AGE
print('Testing difference in age - choose the correct test')
data_group1 = demog_depressed['age']
data_group2 = demog_nondepressed['age']
_compare_two_groups(data_group1, data_group2)

###################
# HADS depression
print('Testing difference in HADS depression - choose the correct test')
data_group1 = demog_depressed['hads_depression']
data_group2 = demog_nondepressed['hads_depression']
_compare_two_groups(data_group1, data_group2)

###################
# Benton
print('Testing difference in Benton scores - choose the correct test')
# FaceScores is looked up by ccid label. The excluded participant is removed
# from the depressed list here as well, so that every test uses the same sample.
benton_ccids_depressed = [ccid for ccid in list_depressed if ccid != EXCLUDED_CCID]
FaceScores_depressed = FaceScores.loc[benton_ccids_depressed,:]
data_group1 = FaceScores_depressed['benton']
FaceScores_nondepressed = FaceScores.loc[list_nondepressed,:]
data_group2 = FaceScores_nondepressed['benton']
_compare_two_groups(data_group1, data_group2)

Testing if difference in proportion of females - chi squared test
X2 statistic is 0.18071582700065641
p value is 0.6707587905263187
dof is 1
N is 134
number female is 68
proportion of female is 0.5074626865671642
Number female depressed is 11 which is 0.5789473684210527 of all depressed
Number female non-depressed is 57 which is 0.4956521739130435 of all nondepressed

Independent (H0 holds true)

Testing if difference in proportion of right handed - chi squared test
X2 statistic is 0.17639564263920682
p value is 0.6744892432031928
dof is 1
Independent (H0 holds true)

Testing if level of education dependent on group - chi squared test
X2 statistic is 3.035539956383002
p value is 0.3861780066613644
dof is 3
Independent (H0 holds true)

Testing difference in ACER - choose the correct test
0.005454909056425095
2.629358732519904e-06
Not normally distributed, performing Mann Whitney U test
Median depressed: 97
IQR depressed: 92.0, 99.5
 
Median non-depressed:96
IQR non-depressed: 94.0, 98.0