In [1]:
import pandas as pd
import scipy
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
from scipy.stats import chi2_contingency
from statsmodels.stats.multicomp import pairwise_tukeyhsd

In [5]:
hcsc_data = pd.read_csv('results/HCSC_data_with_clusters.csv', index_col=0)
adni_data = pd.read_csv('results/ADNI_data_with_clusters.csv', index_col=0)

changes_hcsc = {'Amil1_42R':'AB42', 'RatioR':'Ratio AB42/AB40', 'pTAU':'p-tau',
                  'Sexo':'Sex', 'TAUtotal':'t-tau', 'Edadactual':'Age at baseline',
                  'Escolaridad':'Education years', 'n3_ALL_ratio':'Clustering all biomarkers',
                   'MMSE':'MMSE at baseline', 'DxclinBreve':'Clinical diagnosis'}

changes_adni = {'abeta_0':'AB42', 'ptau_0':'p-tau', 'ttau_0':'t-tau',
                  'AGE':'Age at baseline', 'PTGENDER': 'Sex',
                  'PTEDUCAT':'Education years', 'n3_ALL':'Clustering all biomarkers',
                  'MMSE_bl':'MMSE at baseline', 'DX_bl':'Clinical diagnosis'}

hcsc_data.rename(columns=changes_hcsc, inplace=True)
adni_data.rename(columns=changes_adni, inplace=True)

hcsc_data['Clinical diagnosis'].replace({'EA GDS3':'LMCI', 'EApreMCI':'EMCI', 'Control/QSM':'SMC', 'Otros':'MCI-NN'}, inplace=True)

In [7]:
cont_vars = ['Age at baseline', 'Education years', 'MMSE at baseline']
cat_vars  = ['Clinical diagnosis', 'Sex']

print('HCSC dataset')
print('-------------------------------------------')
print()
print('Means')
print(round(hcsc_data[cont_vars + ['Clinical diagnosis']].groupby('Clinical diagnosis').mean(), 2).T)
print()
print('Standard Deviations')
print(round(hcsc_data[cont_vars + ['Clinical diagnosis']].groupby('Clinical diagnosis').std(), 2).T)
print()
for c in cat_vars:
    print(round(pd.crosstab(hcsc_data[c], hcsc_data['Clinical diagnosis'], normalize='columns'), 4)*100)
    print(pd.crosstab(hcsc_data[c], hcsc_data['Clinical diagnosis']))
    print()
print()
print()

print('ADNI dataset')
print('-------------------------------------------')
print()
print('Means')
print(round(adni_data[cont_vars + ['Clinical diagnosis']].groupby('Clinical diagnosis').mean(), 2).T)
print()
print('Standard Deviations')
print(round(adni_data[cont_vars + ['Clinical diagnosis']].groupby('Clinical diagnosis').std(), 2).T)
print()
for c in cat_vars:
    print(round(pd.crosstab(adni_data[c], adni_data['Clinical diagnosis'], normalize='columns'), 4)*100)
    print(pd.crosstab(adni_data[c], adni_data['Clinical diagnosis']))
    print()

HCSC dataset
-------------------------------------------

Means
Clinical diagnosis   EMCI   LMCI  MCI-NN    SMC
Age at baseline     74.19  74.98   66.52  65.93
Education years      9.04  10.46   12.10  12.08
MMSE at baseline    26.29  24.26   26.88  28.03

Standard Deviations
Clinical diagnosis  EMCI  LMCI  MCI-NN   SMC
Age at baseline     6.11  4.97   10.16  9.56
Education years     5.39  4.42    5.44  4.39
MMSE at baseline    2.59  3.57    2.33  1.97

Clinical diagnosis   EMCI   LMCI  MCI-NN    SMC
Clinical diagnosis                             
EMCI                100.0    0.0     0.0    0.0
LMCI                  0.0  100.0     0.0    0.0
MCI-NN                0.0    0.0   100.0    0.0
SMC                   0.0    0.0     0.0  100.0
Clinical diagnosis  EMCI  LMCI  MCI-NN  SMC
Clinical diagnosis                         
EMCI                  26     0       0    0
LMCI                   0    63       0    0
MCI-NN                 0     0      33    0
SMC                    0     0    

In [8]:
print(hcsc_data.reset_index().groupby('Clustering all biomarkers')['NHC_LCR'].count())
print()

print(adni_data.reset_index().groupby('Clustering all biomarkers')['RID'].count())
print()

Clustering all biomarkers
0    73
1    64
2    28
Name: NHC_LCR, dtype: int64

Clustering all biomarkers
0    175
1    161
2     83
Name: RID, dtype: int64



In [9]:
cont_vars_hcsc = ['AB42', 'Ratio AB42/AB40', 'p-tau', 't-tau', 'Age at baseline', 'Education years', 'MMSE at baseline']
cont_vars_adni = ['AB42', 'p-tau', 't-tau', 'Age at baseline', 'Education years', 'MMSE at baseline']

cat_vars = ['Clinical diagnosis', 'Sex']

print('HCSC dataset')
print('-------------------------------------------')
print()
print(round(hcsc_data[cont_vars_hcsc + ['Clustering all biomarkers']].groupby('Clustering all biomarkers').mean(), 2).T)
print()
print(round(hcsc_data[cont_vars_hcsc + ['Clustering all biomarkers']].groupby('Clustering all biomarkers').std(), 2).T)
print()
for c in cat_vars:
    print(round(pd.crosstab(hcsc_data[c], hcsc_data['Clustering all biomarkers'], normalize='columns'), 4)*100)
    print()
print()
print()

print('ADNI dataset')
print('-------------------------------------------')
print()
print(round(adni_data[cont_vars_adni + ['Clustering all biomarkers']].groupby('Clustering all biomarkers').mean(), 2).T)
print()
print(round(adni_data[cont_vars_adni + ['Clustering all biomarkers']].groupby('Clustering all biomarkers').std(), 2).T)
print()
for c in cat_vars:
    print(round(pd.crosstab(adni_data[c], adni_data['Clustering all biomarkers'], normalize='columns'), 4)*100)
    print()

HCSC dataset
-------------------------------------------

Clustering all biomarkers        0       1        2
AB42                       1254.30  686.16   656.57
Ratio AB42/AB40               0.10    0.05     0.04
p-tau                        34.64   79.01   173.31
t-tau                       273.63  516.19  1060.36
Age at baseline              66.44   74.58    73.57
Education years              12.09    9.69    10.88
MMSE at baseline             27.43   25.26    24.16

Clustering all biomarkers       0       1       2
AB42                       448.85  285.19  177.40
Ratio AB42/AB40              0.01    0.01    0.02
p-tau                       11.05   24.62   36.79
t-tau                       93.41  153.98  242.24
Age at baseline              9.07    7.29    5.59
Education years              4.69    4.78    5.08
MMSE at baseline             2.72    2.72    4.22

Clustering all biomarkers      0      1      2
Clinical diagnosis                            
EMCI                        4.

In [14]:
# Chi-squared tests for categorical variables
print('HCSC dataset')
print('-------------------------------------------')
print()
for c in cat_vars:
    contigency = pd.crosstab(hcsc_data[c], hcsc_data['Clustering all biomarkers'], normalize='columns')
    chstat, pvalue, dof, expected = chi2_contingency(contigency)
    if pvalue < 0.001:
        s = '***'
    elif pvalue < 0.01:
        s = '**'
    elif pvalue < 0.05:
        s = '*'
    else:
        s = 'ns'
    print(f'{round(chstat, 2)}\t{round(pvalue, 4)}\t{s}\t{c}')
print()
print()
    
print('ADNI dataset')
print('-------------------------------------------')
print()
for c in cat_vars:
    contigency = pd.crosstab(adni_data[c], adni_data['Clustering all biomarkers'], normalize='columns')
    chstat, pvalue, dof, expected = chi2_contingency(contigency)
    if pvalue < 0.001:
        s = '***'
    elif pvalue < 0.01:
        s = '**'
    elif pvalue < 0.05:
        s = '*'
    else:
        s = 'ns'
    print(f'{round(chstat, 2)}\t{round(pvalue, 4)}\t{s}\t{c}')


HCSC dataset
-------------------------------------------

2.09	0.9115	ns	Clinical diagnosis
0.08	0.9591	ns	Sex


ADNI dataset
-------------------------------------------

0.44	0.9985	ns	Clinical diagnosis
0.03	0.9832	ns	Sex


In [15]:
# ANOVA tests for continuous variables
print('HCSC dataset')
print('-------------------------------------------')
print()
for b in cont_vars_hcsc:
    cluster0 = hcsc_data.loc[hcsc_data['Clustering all biomarkers'] == 0][b].dropna().values
    cluster1 = hcsc_data.loc[hcsc_data['Clustering all biomarkers'] == 1][b].dropna().values
    cluster2 = hcsc_data.loc[hcsc_data['Clustering all biomarkers'] == 2][b].dropna().values

    # stats f_oneway functions takes the groups as input and returns ANOVA F and p value
    fvalue, pvalue = stats.f_oneway(cluster0, cluster1, cluster2)
    if pvalue < 0.001:
        s = '***'
    elif pvalue < 0.01:
        s = '**'
    elif pvalue < 0.05:
        s = '*'
    else:
        s = 'ns'

    print(b)
    print(f'ANOVA: F = {round(fvalue, 2)}; p-value = {pvalue}; significant = {s}')

    # perform multiple pairwise comparison (Tukey HSD)
    m_comp = pairwise_tukeyhsd(endog=hcsc_data[b], groups=hcsc_data['Clustering all biomarkers'], alpha=0.05)
    print(m_comp)
    print()

print()
print()
    
print('ADNI dataset')
print('-------------------------------------------')
print()
for b in cont_vars_adni:
    cluster0 = adni_data.loc[adni_data['Clustering all biomarkers'] == 0][b].dropna().values
    cluster1 = adni_data.loc[adni_data['Clustering all biomarkers'] == 1][b].dropna().values
    cluster2 = adni_data.loc[adni_data['Clustering all biomarkers'] == 2][b].dropna().values

    fvalue, pvalue = stats.f_oneway(cluster0, cluster1, cluster2)
    if pvalue < 0.001:
        s = '***'
    elif pvalue < 0.01:
        s = '**'
    elif pvalue < 0.05:
        s = '*'
    else:
        s = 'ns'

    print(b)
    print(f'ANOVA F: {round(fvalue, 2)}\tp-value: {pvalue}\tsignificant: {s}')

    m_comp = pairwise_tukeyhsd(endog=adni_data[b], groups=adni_data['Clustering all biomarkers'], alpha=0.05)
    print(m_comp)
    print()

HCSC dataset
-------------------------------------------

AB42
ANOVA: F = 53.69; p-value = 1.2898431754714473e-18; significant = ***
   Multiple Comparison of Means - Tukey HSD, FWER=0.05   
group1 group2  meandiff p-adj    lower     upper   reject
---------------------------------------------------------
     0      1 -568.1366    0.0  -712.156 -424.1172   True
     0      2 -597.7228    0.0 -784.6758 -410.7699   True
     1      2  -29.5863 0.9284 -220.1486  160.9761  False
---------------------------------------------------------

Ratio AB42/AB40
ANOVA: F = 441.33; p-value = 2.716112457795668e-66; significant = ***
Multiple Comparison of Means - Tukey HSD, FWER=0.05 
group1 group2 meandiff p-adj   lower   upper  reject
----------------------------------------------------
     0      1  -0.0515    0.0 -0.0563 -0.0466   True
     0      2  -0.0635    0.0 -0.0698 -0.0572   True
     1      2   -0.012 0.0001 -0.0184 -0.0056   True
----------------------------------------------------

p-