In [63]:
# import libraries
import pandas as pd
import scipy.stats
import statsmodels.stats.multitest
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# disable warnings, use w caution
import warnings
warnings.filterwarnings('ignore')

# project specific libs
import os
import matplotlib.pyplot as plt
import pathlib

In [3]:
# project specific path
path = '/Users/KevinBu/Desktop/clemente_lab/Projects/oa/'

In [184]:
# from AC Q2 run of merged saliva stool
df_map = pd.read_csv(path + 'inputs/Qiime2_0/qiime_mapping_file.tsv', sep='\t', index_col=0)
q2_row = df_map.loc['#q2:types',:]
df_map = df_map.drop('#q2:types')

# change index so it matches metadata file
df_map.index = df_map.index.map(lambda x: x.split('.guma')[0])

# drop MOC and elution buffer
df_map = df_map.drop(['MOC.320','elutionbuffer.plate313'])

# grab metadata
df_meta = pd.read_csv(path + 'inputs/Metadata_OA.csv')

# rename 'Run_ID_Saliva' to be correct
df_meta['Timepoints'] = df_meta['Timepoints'].apply(lambda x: 'pre' if x == '0' else 'post')
df_meta['Patient_ID'] = df_meta['Patient_ID'].apply(lambda x: x[:-3])  
df_meta['Study_ID'] = df_meta['Study_ID'].apply(lambda x: x.split('_')[0][-3:]) 

# create per sample type mapping files
type_to_ST = {'saliva':'Saliva','stool':'fecal'}
type_to_df_map = {}

# split into specimen type
for t in type_to_ST:
    # subset on specimen type
    df_map_type = df_map[df_map['SpecimenType'] == type_to_ST[t]]

    # as to not overwrite df meta
    df_meta_type = df_meta.copy()

    # create new sample ID for specimen type and set as index
    df_meta_type['#SampleID'] = df_meta['Patient_ID'] + '-' + df_meta['Study_ID'] + '.' + df_meta['Timepoints'] + '.' + t
    df_meta_type = df_meta_type.set_index('#SampleID')
    #df_meta_stool.index = df_meta_stool.index.str.replace('saliva','stool')

    # create full mapping file
    df_map_type = pd.concat([df_map_type, df_meta_type],axis=1)

    # use only sequenced samples
    df_map_type = df_map_type.dropna(how='any',subset='BarcodeSequence')

    # drop all na
    df_map_type = df_map_type.dropna(how='all',axis=1)

    # populate dict of mapping files
    type_to_df_map[t] = df_map_type

    # export for q2
    df_q2_type = pd.concat([q2_row.to_frame().T, df_map_type])
    df_q2_type.index.name = '#SampleID'
    df_q2_type.iloc[0,:] = 'categorical'
    df_q2_type.to_csv(path + 'inputs/qiime_mapping_file_' + t + '.tsv', sep='\t')
    df_q2_type = df_q2_type[df_q2_type['Adherece_antiinflam'].isin(['Moderate adherence', 'High adherence','categorical'])]
    df_q2_type.to_csv(path + 'inputs/qiime_mapping_file_' + t + '_adh.tsv', sep='\t')

type_to_df_map['stool'].head()

Unnamed: 0_level_0,BarcodeSequence,LinkerPrimerSequence,Separate,Timepoint,Together,ContactEmail,ContactName,PrimaryInvestigator,Cohort,RawDataNotes,...,broccoli,Garbanzo_beans,pork,beef,burger,Total_omega3,Adherence_omega3,Total_omega6,Adherence_omega6,Total_o3_o6
#SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
OAD-001.pre.stool,TTCAGTTCGTTA,CCGGACTACHVGGGTWTCTAAT,All,pre,OAD-001.pre.stool.guma.plate313,rebecca.blank@nyulangone.org,Rebecca Blank,Jose Scher,NonVA,OAD-001.pre.stool.guma.plate313,...,0.0,0.0,0.0,0.0,0.0,,Low adherence,,Low adherence,0.0
OAD-001.post.stool,CGGCCAGAAGCA,CCGGACTACHVGGGTWTCTAAT,All,post,OAD-001.post.stool.guma.plate313,rebecca.blank@nyulangone.org,Rebecca Blank,Jose Scher,NonVA,OAD-001.post.stool.guma.plate313,...,4.0,0.0,0.0,0.0,0.0,48.0,Low adherence,72.0,Low adherence,131.0
OAD-003.pre.stool,GACGTTAAGAAT,CCGGACTACHVGGGTWTCTAAT,All,pre,OAD-003.pre.stool.guma.plate313,rebecca.blank@nyulangone.org,Rebecca Blank,Jose Scher,NonVA,OAD-003.pre.stool.guma.plate313,...,0.0,0.0,0.0,14.0,0.0,16.8,Low adherence,53.2,High adherence,75.0
OAD-003.post.stool,TCGCTACAGATG,CCGGACTACHVGGGTWTCTAAT,All,post,OAD-003.post.stool.guma.plate313,rebecca.blank@nyulangone.org,Rebecca Blank,Jose Scher,NonVA,OAD-003.post.stool.guma.plate313,...,4.0,0.0,0.0,0.0,0.0,48.0,Low adherence,108.0,High adherence,171.0
OAD-004.pre.stool,ATGGGACCTTCA,CCGGACTACHVGGGTWTCTAAT,All,pre,OAD-004.pre.stool.guma.plate313,rebecca.blank@nyulangone.org,Rebecca Blank,Jose Scher,NonVA,OAD-004.pre.stool.guma.plate313,...,2.8,0.0,0.0,0.0,0.0,2.8,Low adherence,28.0,Low adherence,35.8


In [83]:
# what are we testing?
'''
Hypothesis 1: There will be a measurable difference in WOMAC pain response scores and 
other outcomes from baseline to after the dietary intervention.

Hypothesis 2: There will be an association between oral and gut microbiome and pain outcomes

Hypothesis 3:  The diversity and composition of the saliva and gut microbiome and saliva, 
plasma and gut metabolome profiles will significantly change from baseline to after the 
implementation of ITIS diet among all patients.

Hypothesis 4: The baseline profiles of microbiome and metabolome can be predictive of the 
WOMAC pain response to the diet intervention.
'''

'\nHypothesis 1: There will be a measurable difference in WOMAC pain response scores and \nother outcomes from baseline to after the dietary intervention.\n\nHypothesis 2: There will be an association between oral and gut microbiome and pain outcomes\n\nHypothesis 3:  The diversity and composition of the saliva and gut microbiome and saliva, \nplasma and gut metabolome profiles will significantly change from baseline to after the \nimplementation of ITIS diet among all patients.\n\nHypothesis 4: The baseline profiles of microbiome and metabolome can be predictive of the \nWOMAC pain response to the diet intervention.\n'

In [137]:
###
# Hypothesis 1: There will be a measurable difference in WOMAC pain response scores and 
# other outcomes from baseline to after the dietary intervention.
###

# outcome variables
outcomes = ['VAS_Pt', 'VAS_overall', 'WOMAC_pain', 'WOMAC_stiffness', 'WOMAC_activity', 'WOMAC_total', 'Pain_DETECT', 
            'CES_D', 'Helplesness', 'Magnification', 'Rumination', 'PCS_EN', 'Sleep_distrubance', 'PASE_walk', 'PASE_light', 
            #'PASE_gardening', # Where did this go? gardening_improve is binary
            'BMI']


# hypothesis 1
# create a new df_meta
df_meta = pd.read_csv(path + 'inputs/Metadata_OA.csv')

# rename 'Run_ID_Saliva' to be correct
df_meta['Timepoints'] = df_meta['Timepoints'].apply(lambda x: 'pre' if x == '0' else 'post')
df_meta['Patient_ID'] = df_meta['Patient_ID'].apply(lambda x: x[:-3])  
df_meta['Study_ID'] = df_meta['Study_ID'].apply(lambda x: x.split('_')[0][-3:]) 
df_meta['#SampleID'] = df_meta['Patient_ID'] + '-' + df_meta['Study_ID'] + '.' + df_meta['Timepoints'] + '.stool'
df_meta = df_meta.set_index('#SampleID')

# convert % to floats for calculations down the road
bin = []
cont = []
for w in outcomes:
    df_w = df_meta[w]
    if df_w.nunique() > 2: # do spearman
        df_meta[w] = df_meta[w].astype(str).str.replace('%','').astype(float).values
        cont.append(w)
    else:
        bin.append(w)

print(bin)
print(cont)

# split into all and mod high only
for a in ['all','modhigh']:
    print(a)
    if a == 'all':
        job = 'jobs03'
    if a == 'modhigh':
        job = 'jobs02'
        df_meta = df_meta[df_meta['Adherece_antiinflam'].isin(['Moderate adherence', 'High adherence'])]
    print(len(df_meta))
    
    df_results = pd.DataFrame(columns=['var','effect','pval','stat'])
    # do post treatment vals of binary vars differ from pre treatment 'unpaired'
    for b in bin:
        ct_table_ind=pd.crosstab(df_meta["Timepoints"],df_meta[b])
        chi2_stat, p, dof, expected = scipy.stats.chi2_contingency(ct_table_ind)
        row=pd.DataFrame.from_dict({'var': [b],'effect':[chi2_stat],'pval':[p],'stat':['chi2']})
        df_results = pd.concat([df_results, row])
    
    # fishers exact
    for b in bin:
        ct_table_ind=pd.crosstab(df_meta["Timepoints"],df_meta[b])
        fisher, p = scipy.stats.fisher_exact(ct_table_ind)
        row=pd.DataFrame.from_dict({'var': [b],'effect':[t],'pval':[p],'stat':['fisher']})
        df_results = pd.concat([df_results, row])
        
    # do post treatment vals of continuous vars differ from pre treatment unpaired
    df_pre = df_meta[df_meta['Timepoints'] == 'pre']
    df_post = df_meta[df_meta['Timepoints'] == 'post']
    for c in cont:
        try:
            W,p = scipy.stats.mannwhitneyu(x=df_pre[c].values,y=df_post[c].values, nan_policy='omit')
        except:
            W,p = 0, 1
        row=pd.DataFrame.from_dict({'var': [c],'effect':[W],'pval':[p],'stat':['mwu']})
        if p < 0.05:
            ax = sns.boxplot(data=df_meta, x='Timepoints', y=c, orient='v')
            sns.swarmplot(data=df_meta, x='Timepoints', y=c, palette='dark:grey', hue=None, orient='v')
        
            # ax.axes.set_title("Title",fontsize=48)
            ax.set_ylabel(c,fontsize=16)
            ax.set_xlabel('Timepoints',fontsize=16)                
            ax.tick_params(labelsize=16)
            sns.despine()
            plt.tight_layout()
            plt.savefig(path + 'outputs/' + job + '/mwu_' + c  + '.pdf')
            plt.close()            
        row=pd.DataFrame.from_dict({'var': [c],'effect':[W],'pval':[p],'stat':['MWU']})
        df_results = pd.concat([df_results, row])
        
    df_pre = df_meta[df_meta['Timepoints'] == 'pre']
    df_post = df_meta[df_meta['Timepoints'] == 'post']
    for c in cont:
        t,p = scipy.stats.ttest_ind(a=df_pre[c].values,b=df_post[c].values, nan_policy='omit')
        row=pd.DataFrame.from_dict({'var': [c],'effect':[t],'pval':[p],'stat':['ttest']})
        if p < 0.05:
            ax = sns.boxplot(data=df_meta, x='Timepoints', y=c, orient='v')
            sns.swarmplot(data=df_meta, x='Timepoints', y=c, palette='dark:grey', hue=None, orient='v')
        
            # ax.axes.set_title("Title",fontsize=48)
            ax.set_ylabel(c,fontsize=16)
            ax.set_xlabel('Timepoints',fontsize=16)                
            ax.tick_params(labelsize=16)
            sns.despine()
            plt.tight_layout()
            plt.savefig(path + 'outputs/' + job + '/tt_' + c  + '.pdf')
            plt.close()            
        row=pd.DataFrame.from_dict({'var': [c],'effect':[t],'pval':[p],'stat':['ttest']})
        df_results = pd.concat([df_results, row])

    # unpaired and then paired
    df_pre = df_meta[df_meta['Timepoints'] == 'pre']
    df_post = df_meta[df_meta['Timepoints'] == 'post']
    for c in cont:
        W,p = scipy.stats.wilcoxon(x=df_pre[c].values,y=df_post[c].values, nan_policy='omit')
        row=pd.DataFrame.from_dict({'var': [c],'effect':[W],'pval':[p],'stat':['WSR']})
        if p < 0.05:
            ax = sns.boxplot(data=df_meta, x='Timepoints', y=c, orient='v')
            sns.swarmplot(data=df_meta, x='Timepoints', y=c, palette='dark:grey', hue=None, orient='v')
        
            # ax.axes.set_title("Title",fontsize=48)
            ax.set_ylabel(c,fontsize=16)
            ax.set_xlabel('Timepoints',fontsize=16)                
            ax.tick_params(labelsize=16)
            sns.despine()
            plt.tight_layout()
            plt.savefig(path + 'outputs/' + job + '/wsr_' + c  + '.pdf')
            plt.close()            
        row=pd.DataFrame.from_dict({'var': [c],'effect':[W],'pval':[p],'stat':['WSR']})
        df_results = pd.concat([df_results, row])

    df_pre = df_meta[df_meta['Timepoints'] == 'pre']
    df_post = df_meta[df_meta['Timepoints'] == 'post']
    for c in cont:
        t,p = scipy.stats.ttest_rel(a=df_pre[c].values,b=df_post[c].values, nan_policy='omit')
        row=pd.DataFrame.from_dict({'var': [c],'effect':[W],'pval':[p],'stat':['pairedt']})
        if p < 0.05:
            ax = sns.boxplot(data=df_meta, x='Timepoints', y=c, orient='v')
            sns.swarmplot(data=df_meta, x='Timepoints', y=c, palette='dark:grey', hue=None, orient='v')
        
            # ax.axes.set_title("Title",fontsize=48)
            ax.set_ylabel(c,fontsize=16)
            ax.set_xlabel('Timepoints',fontsize=16)                
            ax.tick_params(labelsize=16)
            sns.despine()
            plt.tight_layout()
            plt.savefig(path + 'outputs/' + job + '/pairedt_' + c  + '.pdf')
            plt.close()          
        row=pd.DataFrame.from_dict({'var': [c],'effect':[t],'pval':[p],'stat':['pairedt']})
        df_results = pd.concat([df_results, row])

    df_results.to_csv(path + 'outputs/' + job + '/outcome_testing.tsv', sep='\t')
df_results.head()
    

[]
['VAS_Pt', 'VAS_overall', 'WOMAC_pain', 'WOMAC_stiffness', 'WOMAC_activity', 'WOMAC_total', 'Pain_DETECT', 'CES_D', 'Helplesness', 'Magnification', 'Rumination', 'PCS_EN', 'Sleep_distrubance', 'PASE_walk', 'PASE_light', 'BMI']
all
42
modhigh
32


Unnamed: 0,var,effect,pval,stat
0,VAS_Pt,198.5,0.008316,MWU
0,VAS_overall,168.5,0.131314,MWU
0,WOMAC_pain,170.0,0.0179,MWU
0,WOMAC_stiffness,172.5,0.013052,MWU
0,WOMAC_activity,164.5,0.032592,MWU


In [96]:
# comparing results
# TODO: which test was used? and why was one patient dropped? len(df_meta) is 32 not 30, as would be expected per slides
df = df_results.copy()
df.sort_values(ascending=True,by='pval')
df = df[df['var'] == 'VAS_Pt']
df.head()

Unnamed: 0,var,effect,pval,stat
0,VAS_Pt,198.5,0.008316,MWU
0,VAS_Pt,2.934071,0.006358,ttest
0,VAS_Pt,14.0,0.003357,WSR
0,VAS_Pt,3.367143,0.004234,pairedt


In [187]:
###
# Hypothesis 2: There will be an association between oral and gut microbiome and pain outcomes
###
# construct alpha, beta and paired alpha dataframes
g_to_dfd = {}

for g in ['saliva_adh']:
    # maps diversity type to dataframe
    g_to_dfd[g] = {}
    
    # get alpha diversities
    df_alpha = pd.read_csv(path + 'outputs/Qiime2_' + g + '/metadata.tsv', sep='\t', index_col=0)
    df_alpha = df_alpha.drop('#q2:types')
    df_alpha['SubjectID'] = df_alpha['Patient_ID'] + df_alpha['Study_ID']
    df_alpha = df_alpha[['SubjectID', 'Timepoints', 'shannon_entropy']]
    g_to_dfd[g]['alpha'] = df_alpha

    # get paired alpha div, first drop unpaired samples
    s_remove = []
    for s in list(df_alpha['SubjectID'].values):
        if len(df_alpha[df_alpha['SubjectID'] == s]) != 2:
            s_remove.append(s)
    df_alpha = df_alpha.loc[~df_alpha['SubjectID'].isin(s_remove),:] # careful not to use ([s_remove])
    
    # set vars
    alpha_metric = 'shannon_entropy'
    group_var = 'Timepoints'
    pair_var = 'SubjectID'
    groups = ['pre','post']
    
    # get paired per indiv pair
    pair_to_diff = {}
    for p in list(df_alpha[pair_var].values):
        df = df_alpha[df_alpha[pair_var] == p]
        alpha_0 = float(df[df[group_var] == groups[0]][alpha_metric].values)
        alpha_1 = float(df[df[group_var] == groups[1]][alpha_metric].values)
        pair_to_diff[p] = alpha_0 - alpha_1
    
    df_paired_alpha = pd.DataFrame.from_dict(pair_to_diff, orient='index', columns=[alpha_metric + '_diff'])
    g_to_dfd[g]['paired_alpha'] = df_paired_alpha

    # get beta div
    df_beta = pd.read_csv(path + 'outputs/Qiime2_' + g + '/core_metrics_results/distance-matrix.tsv',
                              sep='\t', index_col=0)
        
    # grab twin to pair dict
    pair_to_ids = {}
    for p in list(df_alpha[pair_var].values):
        df = df_alpha[df_alpha[pair_var] == p]
        id_0 = str(df[df[group_var] == g0].index.values[0])
        id_1 = str(df[df[group_var] == g1].index.values[0])
        pair_to_ids[p] = (id_0, id_1)
    
    # get distances for each twin pair per beta div matrix    
    pair_to_dist = {}
    for p in list(df_alpha[pair_var].values):
        id_0, id_1 = pair_to_ids[p]
        pair_to_dist[p] = df_beta.loc[id_0, id_1]
    
    df_paired_beta = pd.DataFrame.from_dict(pair_to_dist, orient='index', columns=['Unweighted_Unifrac'])
    g_to_dfd[g]['paired_beta'] = df_paired_alpha

g_to_dfd['saliva_adh']['paired_alpha'].head()

Unnamed: 0,shannon_entropy_diff
OAD001,0.173831
OAD003,0.307759
OAD004,-0.875117
OAD005,-0.374889
OAD006,-0.296954


In [225]:
# boxplot of high versus low adherence, Y axis is delta of alpha pre-post
# boxplot of high (H = high and moderate) versus low adherence, Y axis is distance pre-post

# quartiles of response. Pick top vs bottom quartile, do chi-sq with H vs L adherence
# boxplots of top responders vs non-responders in distance pre-post
# boxplots of top vs non-responders in delta of alpha pre-post
d_to_metric = {
    'paired_alpha': 'shannon_entropy_diff'
}
group_var = 'Adherece_antiinflam'

for g in ['saliva_adh']:
    for d in ['paired_alpha']:#'beta']
        # grab relevant diversity df
        df_div = g_to_dfd[g][d]

        # drop duplicates so you have sample mapping to adh
        type_to_df_map[g.split('_')[0]]
        df_dropdup = type_to_df_map['saliva']
        df_dropdup.index = df_dropdup.index.map(lambda x: x.split('.')[0].replace('-',''))
        df_dropdup = df_dropdup.dropna(how='any',subset=group_var,axis=0)
        df_dropdup = df_dropdup[~df_dropdup.index.duplicated(keep='first')]

        # merge with df of metadata var        
        df_merge = pd.concat([df_dropdup,df_div],axis=1)

        # test difference of paired differences between two adherence groups
        u, p = scipy.stats.mannwhitneyu(inter_twin, inter_0)
        print(u, p)
        
        t, p = scipy.stats.ttest_ind(inter_twin, inter_1)
        print(t, p)
                
        # separate plot
        # df_div_plot = g_to_dfd[g][d.split('_')[-1]]
        div_metric = d_to_metric[d]
        df_merge[div_metric] = df_merge[div_metric].map(lambda x: float(x))
        ax = sns.boxplot(data=df_merge, x=group_var, y=div_metric)
        sns.swarmplot(data=df_merge, x=group_var, y=div_metric, palette='dark:grey')
        sns.despine()
        
        plt.tight_layout()
        plt.savefig(path + 'outputs/Qiime2_' + g + '/' + d + '.pdf')
        plt.close()          


718.0 0.6412422862525627
-2.256946541882196 0.026459283105891383


In [230]:
values = list(df_merge[group_var].unique())
[df_merge.groupby([group_var]).get_group(values[i])[div_metric].values for i in range(len(values))]


[array([ 0.17383056, -0.87511659, -0.37488914, -1.04901814,  0.04106066,
                nan,  0.36961036]),
 array([ 0.30775907, -0.29695443,         nan,  0.51197993, -0.88219064,
        -0.06169172,  0.10118925,         nan, -0.52687471]),
 array([nan, nan])]

In [232]:
# df_merge.groupby([group_var]).get_group(values[0])
df_merge[group_var].value_counts()
# df_merge[[group_var] + [div_metric]]

Adherece_antiinflam
High adherence        9
Moderate adherence    7
Low adherence         2
Name: count, dtype: int64

In [235]:
df_meta[group_var].value_counts()
# df_meta.head()


Adherece_antiinflam
High adherence        18
Moderate adherence    14
Low adherence          4
Name: count, dtype: int64

In [104]:
# check levels of contams
subgroups = ['saliva','stool','saliva_adh']#,'stool_adh']
g_to_otu = {}

for g in subgroups:
    # obtained from 'download csv'
    df_otu_meta = pd.read_csv(path + 'outputs/Qiime2_' + g + '/level-6.csv', index_col=0)

    # remove metadata
    keep_cols = []
    for c in list(df_otu_meta.columns.values):
        if 'k__' in c:
            keep_cols.append(c)
    df_otu = df_otu_meta[keep_cols]
    
    # normalize
    df_otu = df_otu.div(df_otu.sum(axis=1),axis=0)

    # store in dict
    g_to_otu[g] = df_otu

df_otu.head()

Unnamed: 0_level_0,k__Archaea;p__Crenarchaeota;c__MCG;o__;f__;g__,k__Archaea;p__Crenarchaeota;c__Thaumarchaeota;o__Cenarchaeales;f__Cenarchaeaceae;g__Nitrosopumilus,k__Archaea;p__Crenarchaeota;c__Thaumarchaeota;o__Nitrososphaerales;f__Nitrososphaeraceae;g__Candidatus Nitrososphaera,k__Archaea;p__Euryarchaeota;c__Halobacteria;o__Halobacteriales;f__Halobacteriaceae;g__Natronorubrum,k__Archaea;p__Euryarchaeota;c__Methanobacteria;o__Methanobacteriales;f__Methanobacteriaceae;g__Methanobrevibacter,k__Archaea;p__Euryarchaeota;c__Thermoplasmata;o__E2;__;__,k__Archaea;p__Euryarchaeota;c__Thermoplasmata;o__E2;f__;g__,k__Bacteria;__;__;__;__;__,k__Bacteria;p__Acidobacteria;c__Acidobacteria-6;o__iii1-15;f__;g__,k__Bacteria;p__Acidobacteria;c__Acidobacteria-6;o__iii1-15;f__mb2424;g__,...,k__Bacteria;p__TM7;c__TM7-3;o__CW040;f__;g__,k__Bacteria;p__TM7;c__TM7-3;o__CW040;f__F16;g__,k__Bacteria;p__Tenericutes;c__Mollicutes;o__Acholeplasmatales;f__Acholeplasmataceae;g__Acholeplasma,k__Bacteria;p__Tenericutes;c__Mollicutes;o__Mycoplasmatales;f__Mycoplasmataceae;g__Mycoplasma,k__Bacteria;p__Tenericutes;c__Mollicutes;o__RF39;f__;g__,k__Bacteria;p__Verrucomicrobia;c__Opitutae;__;__;__,k__Bacteria;p__Verrucomicrobia;c__Verrucomicrobiae;o__Verrucomicrobiales;f__Verrucomicrobiaceae;g__Akkermansia,k__Bacteria;p__Verrucomicrobia;c__[Pedosphaerae];o__[Pedosphaerales];f__Ellin517;g__,k__Bacteria;p__[Thermi];c__Deinococci;o__Thermales;f__Thermaceae;g__Meiothermus,k__Bacteria;p__[Thermi];c__Deinococci;o__Thermales;f__Thermaceae;g__Thermus
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
OAD-001.post.saliva,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.017899,0.0,0.0,...,0.0,0.0,0.000357,0.000713,0.0,0.0,0.0,0.0,0.0,0.0
OAD-001.pre.saliva,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001921,0.0,0.0,...,0.0,0.000108,0.0,0.000298,0.0,0.0,0.000162,0.0,0.0,0.0
OAD-003.post.saliva,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002727,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
OAD-003.pre.saliva,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0022,0.0,0.0,...,0.0,5.4e-05,0.0,0.0,0.000188,0.0,0.0,0.0,0.0,0.0
OAD-004.post.saliva,0.0,0.0,0.0,0.0,0.0,0.0,5.1e-05,0.011091,0.0,0.0,...,0.0,0.0,0.000279,0.002127,0.0,0.0,0.0,0.0,0.0,0.0


In [136]:
# Alpha div paired
# paired beta, comparing intra-indiv difference pre_post to inter pre and inter post

for g in ['saliva_adh']: #subgroups:
    df_alpha = pd.read_csv(path + 'outputs/Qiime2_' + g + '/metadata.tsv', sep='\t', index_col=0)
    df_alpha = df_alpha.drop('#q2:types')
    df_alpha['SubjectID'] = df_alpha['Patient_ID'] + df_alpha['Study_ID']
    df_alpha = df_alpha[['SubjectID', 'Timepoints', 'shannon_entropy']]
    
    # drop unpaired samples
    s_remove = []
    for s in list(df_alpha['SubjectID'].values):
        if len(df_alpha[df_alpha['SubjectID'] == s]) != 2:
            s_remove.append(s)
    df_alpha = df_alpha.loc[~df_alpha['SubjectID'].isin(s_remove),:] # careful not to use ([s_remove])
    
    # set vars
    alpha_metric = 'shannon_entropy'
    group_var = 'Timepoints'
    pair_var = 'SubjectID'
    groups = ['pre','post']
    
    # get paired per indiv pair
    pair_to_diff = {}
    for p in list(df_alpha[pair_var].values):
        df = df_alpha[df_alpha[pair_var] == p]
        alpha_0 = float(df[df[group_var] == groups[0]][alpha_metric].values)
        alpha_1 = float(df[df[group_var] == groups[1]][alpha_metric].values)
        pair_to_diff[p] = alpha_0 - alpha_1
    
    df_paired_alpha = pd.DataFrame.from_dict(pair_to_diff, orient='index', columns=[alpha_metric + '_diff'])
    
    # one-sided t-test, n.s.; RA-UA values 
    t, p = scipy.stats.ttest_1samp(df_paired_alpha[alpha_metric + '_diff'],popmean=0)
    print(t, p)
    
    s, p = scipy.stats.wilcoxon(df_paired_alpha[alpha_metric + '_diff'])
    print(s, p)
    
    # separate
    df_alpha[alpha_metric] = df_alpha[alpha_metric].map(lambda x: float(x))
    ax = sns.boxplot(data=df_alpha, x=group_var, y=alpha_metric)
    sns.swarmplot(data=df_alpha, x=group_var, y=alpha_metric, palette='dark:grey')
    sns.despine()

    plt.tight_layout()
    plt.savefig(path + 'outputs/Qiime2_' + g + '/alpha.pdf')
    plt.close()          

    # now do beta
    df_beta = pd.read_csv(path + 'outputs/Qiime2_' + g + '/core_metrics_results/distance-matrix.tsv',
                          sep='\t', index_col=0)
    
    # set vars
    alpha_metric = 'shannon_entropy'
    group_var = 'Timepoints'
    pair_var = 'SubjectID'
    groups = ['pre','post']
    g0, g1 = groups[0], groups[1]
    
    # grab twin to pair dict
    pair_to_ids = {}
    for p in list(df_alpha[pair_var].values):
        df = df_alpha[df_alpha[pair_var] == p]
        id_0 = str(df[df[group_var] == g0].index.values[0])
        id_1 = str(df[df[group_var] == g1].index.values[0])
        pair_to_ids[p] = (id_0, id_1)
    
    # get distances for each twin pair per beta div matrix    
    pair_to_dist = {}
    for p in list(df_alpha[pair_var].values):
        id_0, id_1 = pair_to_ids[p]
        pair_to_dist[p] = df_beta.loc[id_0, id_1]
    
    df_paired_beta = pd.DataFrame.from_dict(pair_to_dist, orient='index', columns=['Unweighted_Unifrac'])
    
    # grab inter RA distances
    # this is from unweighted_Timepoint_significance.qzv -> download as tsv
    df_raw = pd.read_csv(path + 'outputs/Qiime2_' + g + '/raw_data.tsv', 
                         sep='\t', index_col=0)
    df_0 = df_raw[df_raw['Group1'] == g0]
    df_0 = df_0[df_0['Group2'] == g0]
    df_1 = df_raw[df_raw['Group1'] == g1]
    df_1 = df_1[df_1['Group2'] == g1]
    
    # compare distances
    inter_twin = df_paired_beta['Unweighted_Unifrac'].values
    inter_0 = df_0['Distance'].values
    inter_1 = df_1['Distance'].values
    
    u, p = scipy.stats.mannwhitneyu(inter_twin, inter_0)
    #print(u, p)
    
    t, p = scipy.stats.ttest_ind(inter_twin, inter_1)
    #print(t, p)
    
    t, p = scipy.stats.ttest_ind(inter_0, inter_1)
    # print(t, p)
    
    f, p = scipy.stats.f_oneway(inter_0, inter_1, inter_twin)
    print(f, p)
    
    category = ['intra_twin_pair']*len(inter_twin) + ['inter_' + g0 + '_only']*len(inter_0) + ['inter_' + g1 + '_only']*len(inter_1)
    distances = list(inter_twin) + list(inter_0) + list(inter_1)
    df_dist = pd.DataFrame(data=np.array([category,distances]).T, columns=['category','distance'])
    df_dist['distance'] = df_dist['distance'].astype(float)
    df_dist.to_csv(path + 'outputs/Qiime2_' + g + '/inter_intra_beta_dist.tsv',sep='\t')
                         
    sns.boxplot(data=df_dist, x='category', y='distance')
    sns.swarmplot(data=df_dist, x='category', y='distance', color='black')
    sns.despine()

    plt.tight_layout()
    plt.savefig(path + 'outputs/Qiime2_' + g + '/beta.pdf')
    plt.close()          

-1.3787450789345674 0.1931323321786041
30.0 0.305419921875
11.976305593222191 1.194161818002086e-05


In [160]:
# test cols for taxa of interest
test_cols = []
for c in list(df_otu_saliva.columns.values):
    if 'g__Arthrobacter' in c  or 'g__Selenomonas' in c: # flexi, 'f__Phyllobacteriaceae;g__' in c
        test_cols.append(c)
df_otu_saliva_test = df_otu_saliva[test_cols]
#df_otu_test.mean(axis=1)
# very low abundance overall, 3 samples inpost and 1 sample in pre with 0.00002 as max rel abundance 
print(test_cols)
# individual box/barplots
df_lefse = pd.concat([df_otu_saliva, df_otu_meta_saliva['Timepoints']], axis=1)

for t in test_cols:
    print(t)
    ax = sns.boxplot(data=df_lefse, x='Timepoints', y=t)
    sns.swarmplot(data=df_lefse, x='Timepoints', y=t, palette='dark:grey', hue=None)
    
    # ax.axes.set_title("Title",fontsize=48)
    ax.set_ylabel("Abundance " + t.split('__')[-1],fontsize=16)
    ax.set_xlabel("Timepoint",fontsize=16)
    ax.tick_params(labelsize=16)
    sns.despine()
    plt.tight_layout()
    plt.savefig(path + 'outputs/jobs00/' + t.split('__')[-1] + '.pdf')
    plt.close()

print('Done')


# test cols for taxa of interest
test_cols = []
for c in list(df_otu_stool.columns.values):
    if 'g__[Eubacterium]' in c  or 'g__Lactococcus' in c: # flexi, 'f__Phyllobacteriaceae;g__' in c
        test_cols.append(c)
df_otu_stool = df_otu_stool[test_cols]
#df_otu_test.mean(axis=1)
# very low abundance overall, 3 samples inpost and 1 sample in pre with 0.00002 as max rel abundance 
print(test_cols)
# individual box/barplots
df_lefse = pd.concat([df_otu_stool, df_otu_meta_stool['Timepoints']], axis=1)

for t in test_cols:
    print(t)
    ax = sns.boxplot(data=df_lefse, x='Timepoints', y=t)
    sns.swarmplot(data=df_lefse, x='Timepoints', y=t, palette='dark:grey', hue=None)
    
    # ax.axes.set_title("Title",fontsize=48)
    ax.set_ylabel("Abundance " + t.split('__')[-1],fontsize=16)
    ax.set_xlabel("Timepoint",fontsize=16)
    ax.tick_params(labelsize=16)
    sns.despine()
    plt.tight_layout()
    plt.savefig(path + 'outputs/jobs01/' + t.split('__')[-1] + '.pdf')
    plt.close()

print('Done')

['k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Micrococcaceae;g__Arthrobacter', 'k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Veillonellaceae;g__Selenomonas']
k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Micrococcaceae;g__Arthrobacter
k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Veillonellaceae;g__Selenomonas
Done


In [253]:
# saliva stool correlations
df_saliva_meta = pd.read_csv(path + 'inputs/saliva_normalized.csv')
df_saliva_meta['#SampleID'] = df_saliva_meta['Study_ID'] + df_saliva_meta['Label'] + '.saliva'
df_saliva_meta['#SampleID'] = df_saliva_meta['#SampleID'].str.replace('_1mAfter diet', '.post')
df_saliva_meta['#SampleID'] = df_saliva_meta['#SampleID'].str.replace('_0Baseline', '.pre')    
df_saliva_meta['#SampleID'] = df_saliva_meta['#SampleID'].str.replace('D0', 'D-0')    
df_saliva_meta = df_saliva_meta.drop(['Study_ID','Label'], axis=1)
df_saliva_meta = df_saliva_meta.set_index('#SampleID')
# df_map_saliva has these VAOAD-001.pre.saliva as sample id index
#df_saliva = pd.concat([df_saliva_map

# saliva stool correlations
df_stool_meta = pd.read_csv(path + 'inputs/stool_normalized.csv')
df_stool_meta['#SampleID'] = df_stool_meta['Study_ID'] + df_stool_meta['Time'] + '.stool'
df_stool_meta['#SampleID'] = df_stool_meta['#SampleID'].str.replace('_1mAfter diet', '.post')
df_stool_meta['#SampleID'] = df_stool_meta['#SampleID'].str.replace('_0Baseline', '.pre')    
df_stool_meta['#SampleID'] = df_stool_meta['#SampleID'].str.replace('D0', 'D-0')    
df_stool_meta = df_stool_meta.drop(['Study_ID','Time'], axis=1)
df_stool_meta = df_stool_meta.set_index('#SampleID')
df_stool_meta.head()#['#SampleID']

Unnamed: 0_level_0,H_Pyridoxamine,H_Thiamine,H_Melezitose,H_Phosphocholine,H_SN_Glycero_3_Phophocholine,H_Gamma_Valerobetaine,H_N_Acetylneuraminic_acid,H_Pyridoxine,H_N_Acetylneuraminic_Acid,H_N_Acetylmuramic_Acid,...,H_Nonhydroxylated_bile_acid,H_Hydroxydodecanoic_acid,H_Omega_Hydroxydodecanoate,H_Delta_Methyldodecenoic_Acid,H_Hydroxydecanoic_acid,H_Hydroxydecanoate,H_Methylpentanoic_acid,H_Monohydroxylated_bile_acid,H_Palmitoyl_ethanolamide,H_Linoleic_Acid
#SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
VAOAD-001.post.stool,0.312342,-0.312477,0.119853,-0.492876,-0.671376,-0.202952,0.203945,-0.455869,0.352628,0.151464,...,0.054336,-0.330037,-0.886045,-0.314427,-0.322677,-0.650357,-0.361695,0.132794,-0.067464,-0.112641
VAOAD-009.post.stool,0.000428,-0.481251,-0.179337,0.983217,0.167836,0.067823,0.364897,-0.371942,0.545435,-0.007664,...,-0.022614,-0.334936,-0.802118,-0.92947,-0.369578,-0.566431,0.075375,-0.519189,-0.180631,-0.418351
OAD-001.post.stool,-0.285212,-0.556445,-0.651325,0.357509,-0.401875,-0.295787,0.137736,-0.165902,0.11217,0.096791,...,-0.923477,-0.589028,-1.535141,-0.525277,-0.278312,-0.600484,0.019657,-0.576831,-0.901479,-1.089906
VAOAD-004.post.stool,0.140904,1.089055,-0.668077,-1.061336,-0.370602,-0.329379,-0.654931,0.142971,-1.006689,-0.100373,...,-0.354297,0.449897,0.489189,0.413723,0.47425,0.661963,0.245553,0.27929,-0.444319,-0.912164
VAOAD-011.post.stool,-0.346855,1.028262,-0.450323,-0.025939,-0.223029,-0.766374,-0.53841,-1.091423,-0.328522,-0.524686,...,-0.385428,0.419748,0.429119,0.333756,0.448461,0.598846,0.390035,0.287682,-0.532141,-0.75228


In [183]:
# CUTIE, binarize womac var and then do correlations with 
# Hypothesis 2: There will be an association between oral and gut microbiome and pain outcomes
# can do paired diff in resp vars and paired diff (or ratio) of taxa
# should throw in diversity as well (alpha, and beta pcs for unpaired) and alpha diff, beta diff for paired

for g in ['saliva_adh']: #subgroups:
    # grab df_alpha
    df_alpha = pd.read_csv(path + 'outputs/Qiime2_' + g + '/metadata.tsv', sep='\t', index_col=0)
    df_alpha = df_alpha.drop('#q2:types')
    df_alpha['SubjectID'] = df_alpha['Patient_ID'] + df_alpha['Study_ID']
    df_alpha = df_alpha[['SubjectID', 'Timepoints', 'shannon_entropy']]
    
    # grab beta ordination
    df_beta = pd.read_csv(path + 'outputs/Qiime2_' + g + '/core_metrics_results/ordination.txt', sep='\t', skiprows=9)#, index_col=0)
    df_beta = df_beta.T.reset_index().T
    df_beta = df_beta.iloc[:,:6]
    df_beta.columns = ['SampleID'] + ['PC' + str(x+1) for x in range(5)]
    df_beta = df_beta.set_index('SampleID')
    df_beta = df_beta.drop(['Biplot','Site constraints'])

    # merge
    df_asv = pd.concat([df_alpha, df_beta, g_to_otu[g]],axis=1)


df_asv.head()

Unnamed: 0,SubjectID,Timepoints,shannon_entropy,PC1,PC2,PC3,PC4,PC5,k__Archaea;p__Crenarchaeota;c__MCG;o__;f__;g__,k__Archaea;p__Crenarchaeota;c__Thaumarchaeota;o__Cenarchaeales;f__Cenarchaeaceae;g__Nitrosopumilus,...,k__Bacteria;p__TM7;c__TM7-3;o__CW040;f__;g__,k__Bacteria;p__TM7;c__TM7-3;o__CW040;f__F16;g__,k__Bacteria;p__Tenericutes;c__Mollicutes;o__Acholeplasmatales;f__Acholeplasmataceae;g__Acholeplasma,k__Bacteria;p__Tenericutes;c__Mollicutes;o__Mycoplasmatales;f__Mycoplasmataceae;g__Mycoplasma,k__Bacteria;p__Tenericutes;c__Mollicutes;o__RF39;f__;g__,k__Bacteria;p__Verrucomicrobia;c__Opitutae;__;__;__,k__Bacteria;p__Verrucomicrobia;c__Verrucomicrobiae;o__Verrucomicrobiales;f__Verrucomicrobiaceae;g__Akkermansia,k__Bacteria;p__Verrucomicrobia;c__[Pedosphaerae];o__[Pedosphaerales];f__Ellin517;g__,k__Bacteria;p__[Thermi];c__Deinococci;o__Thermales;f__Thermaceae;g__Meiothermus,k__Bacteria;p__[Thermi];c__Deinococci;o__Thermales;f__Thermaceae;g__Thermus
OAD-001.pre.saliva,OAD001,pre,5.2272672687325,-0.162871,-0.014977,0.103169,0.065343,0.042138,0.0,0.0,...,0.0,0.000108,0.0,0.000298,0.0,0.0,0.000162,0.0,0.0,0.0
OAD-001.post.saliva,OAD001,post,5.05343670971886,-0.1962674411102426,0.0148029241475558,0.1299940991335384,0.0806438199834523,0.0048333307598193,0.0,0.0,...,0.0,0.0,0.000357,0.000713,0.0,0.0,0.0,0.0,0.0,0.0
OAD-003.pre.saliva,OAD003,pre,4.10550210161817,-0.162979,-0.059797,0.040011,0.000859,-0.015767,0.0,0.0,...,0.0,5.4e-05,0.0,0.0,0.000188,0.0,0.0,0.0,0.0,0.0
OAD-003.post.saliva,OAD003,post,3.79774303082421,-0.173818,-0.060887,-0.060165,0.058827,-0.055001,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
OAD-004.pre.saliva,OAD004,pre,3.67216546403196,-0.172471,0.019728,0.191765,0.023478,0.073976,0.0,0.0,...,0.0,0.0,0.000787,0.004605,0.0,0.0,0.0,0.0,0.0,0.0
