In [1]:
# import libraries
import pandas as pd
import scipy.stats
import statsmodels.stats.multitest
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# disable warnings, use w caution
import warnings
warnings.filterwarnings('ignore')

# project specific libs
import os
import matplotlib.pyplot as plt
import pathlib

In [2]:
# project specific path
path = '/Users/KevinBu/Desktop/clemente_lab/Projects/oa/'

In [3]:
# from AC Q2 run of merged saliva stool
df_map = pd.read_csv(path + 'inputs/Qiime2_0/qiime_mapping_file.tsv', sep='\t', index_col=0)
q2_row = df_map.loc['#q2:types',:]
df_map = df_map.drop('#q2:types')

# change index so it matches metadata file
df_map.index = df_map.index.map(lambda x: x.split('.guma')[0])

# drop MOC and elution buffer
df_map = df_map.drop(['MOC.320','elutionbuffer.plate313'])

# grab metadata
df_meta = pd.read_csv(path + 'inputs/Metadata_OA.csv')

# rename 'Run_ID_Saliva' to be correct
df_meta['Timepoints'] = df_meta['Timepoints'].apply(lambda x: 'pre' if x == '0' else 'post')
df_meta['Patient_ID'] = df_meta['Patient_ID'].apply(lambda x: x[:-3])  
df_meta['Study_ID'] = df_meta['Study_ID'].apply(lambda x: x.split('_')[0][-3:]) 
df_outcomes = df_meta.copy()

# create per sample type mapping files
type_to_ST = {'saliva':'Saliva','stool':'fecal'}
type_to_df_map = {}

# split into specimen type
for t in type_to_ST:
    # subset on specimen type
    df_map_type = df_map[df_map['SpecimenType'] == type_to_ST[t]]

    # as to not overwrite df meta
    df_meta_type = df_meta.copy()

    # create new sample ID for specimen type and set as index
    df_meta_type['#SampleID'] = df_meta['Patient_ID'] + '-' + df_meta['Study_ID'] + '.' + df_meta['Timepoints'] + '.' + t
    df_meta_type = df_meta_type.set_index('#SampleID')

    # create full mapping file
    df_map_type = pd.concat([df_map_type, df_meta_type],axis=1)

    # use only sequenced samples
    df_map_type = df_map_type.dropna(how='any',subset='BarcodeSequence')

    # drop all na columns
    df_map_type = df_map_type.dropna(how='all',axis=1)

    # drop VAD OA 015 because misdx with PsA not OA
    if t == 'saliva':
        df_map_type = df_map_type.drop(['VAOAD-015.pre.saliva','VAOAD-015.post.saliva'])
    if t == 'stool':
        df_map_type = df_map_type.drop(['VAOAD-015.pre.stool','VAOAD-015.post.stool'])

    # populate dict of mapping files
    type_to_df_map[t] = df_map_type

    # export for q2
    df_q2_type = pd.concat([q2_row.to_frame().T, df_map_type])
    df_q2_type.index.name = '#SampleID'
    df_q2_type.iloc[0,:] = 'categorical'
    df_q2_type.to_csv(path + 'inputs/qiime_mapping_file_' + t + '.tsv', sep='\t')
    df_q2_type = df_q2_type[df_q2_type['Adherece_antiinflam'].isin(['Moderate adherence', 'High adherence','categorical'])]
    df_q2_type.to_csv(path + 'inputs/qiime_mapping_file_' + t + '_adh.tsv', sep='\t')
    df_q2_type = df_q2_type[df_q2_type['Adherece_antiinflam'].isin(['Moderate adherence', 'High adherence','categorical'])]
    df_q2_resp = df_q2_type[df_q2_type['WOMAC_P_Response'].isin(['categorical', 'Response'])]
    df_q2_resp.to_csv(path + 'inputs/qiime_mapping_file_' + t + '_adh_response.tsv', sep='\t')
    df_q2_nonresp = df_q2_type[df_q2_type['WOMAC_P_Response'].isin(['categorical', 'No response'])]
    df_q2_nonresp.to_csv(path + 'inputs/qiime_mapping_file_' + t + '_adh_noresponse.tsv', sep='\t')

type_to_df_map['stool'].head()

Unnamed: 0_level_0,BarcodeSequence,LinkerPrimerSequence,Separate,Timepoint,Together,ContactEmail,ContactName,PrimaryInvestigator,Cohort,RawDataNotes,...,broccoli,Garbanzo_beans,pork,beef,burger,Total_omega3,Adherence_omega3,Total_omega6,Adherence_omega6,Total_o3_o6
#SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
OAD-001.pre.stool,TTCAGTTCGTTA,CCGGACTACHVGGGTWTCTAAT,All,pre,OAD-001.pre.stool.guma.plate313,rebecca.blank@nyulangone.org,Rebecca Blank,Jose Scher,NonVA,OAD-001.pre.stool.guma.plate313,...,0.0,0.0,0.0,0.0,0.0,,Low adherence,,Low adherence,0.0
OAD-001.post.stool,CGGCCAGAAGCA,CCGGACTACHVGGGTWTCTAAT,All,post,OAD-001.post.stool.guma.plate313,rebecca.blank@nyulangone.org,Rebecca Blank,Jose Scher,NonVA,OAD-001.post.stool.guma.plate313,...,4.0,0.0,0.0,0.0,0.0,48.0,Low adherence,72.0,Low adherence,131.0
OAD-003.pre.stool,GACGTTAAGAAT,CCGGACTACHVGGGTWTCTAAT,All,pre,OAD-003.pre.stool.guma.plate313,rebecca.blank@nyulangone.org,Rebecca Blank,Jose Scher,NonVA,OAD-003.pre.stool.guma.plate313,...,0.0,0.0,0.0,14.0,0.0,16.8,Low adherence,53.2,High adherence,75.0
OAD-003.post.stool,TCGCTACAGATG,CCGGACTACHVGGGTWTCTAAT,All,post,OAD-003.post.stool.guma.plate313,rebecca.blank@nyulangone.org,Rebecca Blank,Jose Scher,NonVA,OAD-003.post.stool.guma.plate313,...,4.0,0.0,0.0,0.0,0.0,48.0,Low adherence,108.0,High adherence,171.0
OAD-004.pre.stool,ATGGGACCTTCA,CCGGACTACHVGGGTWTCTAAT,All,pre,OAD-004.pre.stool.guma.plate313,rebecca.blank@nyulangone.org,Rebecca Blank,Jose Scher,NonVA,OAD-004.pre.stool.guma.plate313,...,2.8,0.0,0.0,0.0,0.0,2.8,Low adherence,28.0,Low adherence,35.8


In [4]:
###
# Hypothesis 1: There will be a measurable difference in WOMAC pain response scores and 
# other outcomes from baseline to after the dietary intervention.
###

# outcome variables
outcomes = ['VAS_Pt', 'VAS_overall', 'WOMAC_pain', 'WOMAC_stiffness', 'WOMAC_activity', 'WOMAC_total', 'Pain_DETECT', 
            'CES_D', 'Helplesness', 'Magnification', 'Rumination', 'PCS_EN', 'Sleep_distrubance', 'PASE_walk', 'PASE_light', 
            #'PASE_gardening', # Where did this go? gardening_improve is binary
            'BMI']


# hypothesis 1
# create a new df_meta
df_meta = pd.read_csv(path + 'inputs/Metadata_OA.csv')

# rename 'Run_ID_Saliva' to be correct
df_meta['Timepoints'] = df_meta['Timepoints'].apply(lambda x: 'pre' if x == '0' else 'post')
df_meta['Patient_ID'] = df_meta['Patient_ID'].apply(lambda x: x[:-3])  
df_meta['Study_ID'] = df_meta['Study_ID'].apply(lambda x: x.split('_')[0][-3:]) 
df_meta['#SampleID'] = df_meta['Patient_ID'] + '-' + df_meta['Study_ID'] + '.' + df_meta['Timepoints'] + '.stool'
df_meta = df_meta.set_index('#SampleID')

# convert % to floats for calculations down the road
bin = []
cont = []
df_paired_os = []
for w in outcomes:
    df_w = df_meta[w]
    if df_w.nunique() > 2: # do spearman
        df_meta[w] = df_meta[w].astype(str).str.replace('%','').astype(float).values
        cont.append(w)
    else:
        bin.append(w)

    # compute difference and store it
    df_md = df_meta.copy()
    df_md['SubjectID'] = df_md['Patient_ID'] + df_md['Study_ID']
    
    # first drop unpaired samples
    s_remove = []
    for s in list(df_md['SubjectID'].values):
        if len(df_md[df_md['SubjectID'] == s]) != 2:
            s_remove.append(s)
    df_md = df_md.loc[~df_md['SubjectID'].isin(s_remove),:] # careful not to use ([s_remove])
    
    # set vars
    group_var = 'Timepoints'
    pair_var = 'SubjectID'
    groups = ['pre','post']
    
    # get paired per indiv pair
    pair_to_diff = {}
    for p in list(df_md[pair_var].values):
        df = df_md[df_md[pair_var] == p]
        t0 = float(df[df[group_var] == groups[0]][w].values)
        tf = float(df[df[group_var] == groups[1]][w].values)
        pair_to_diff[p] = tf - t0
    
    df_paired_o = pd.DataFrame.from_dict(pair_to_diff, orient='index', columns=[w + '_diff'])
    df_paired_os.append(df_paired_o)

df_meta_paired = pd.concat([*df_paired_os], axis=1)    

print(bin)
print(cont)

# split into all and mod high only
a = 'modhigh'
job = 'jobs02'
df_meta = df_meta[df_meta['Adherece_antiinflam'].isin(['Moderate adherence', 'High adherence'])]
        
print(len(df_meta))

df_results = pd.DataFrame(columns=['var','effect','pval','stat'])
# do post treatment vals of binary vars differ from pre treatment 'unpaired'
for b in bin:
    ct_table_ind=pd.crosstab(df_meta["Timepoints"],df_meta[b])
    chi2_stat, p, dof, expected = scipy.stats.chi2_contingency(ct_table_ind)
    row=pd.DataFrame.from_dict({'var': [b],'effect':[chi2_stat],'pval':[p],'stat':['chi2']})
    df_results = pd.concat([df_results, row])

# fishers exact
for b in bin:
    ct_table_ind=pd.crosstab(df_meta["Timepoints"],df_meta[b])
    fisher, p = scipy.stats.fisher_exact(ct_table_ind)
    row=pd.DataFrame.from_dict({'var': [b],'effect':[t],'pval':[p],'stat':['fisher']})
    df_results = pd.concat([df_results, row])
    
# do post treatment vals of continuous vars differ from pre treatment unpaired
df_pre = df_meta[df_meta['Timepoints'] == 'pre']
df_post = df_meta[df_meta['Timepoints'] == 'post']
for c in cont:
    try:
        W,p = scipy.stats.mannwhitneyu(x=df_pre[c].values,y=df_post[c].values, nan_policy='omit')
    except:
        W,p = 0, 1
    row=pd.DataFrame.from_dict({'var': [c],'effect':[W],'pval':[p],'stat':['mwu']})
    if p < 0.05:
        ax = sns.boxplot(data=df_meta, x='Timepoints', y=c, orient='v')
        sns.swarmplot(data=df_meta, x='Timepoints', y=c, palette='dark:grey', hue=None, orient='v')
    
        # ax.axes.set_title("Title",fontsize=48)
        ax.set_ylabel(c,fontsize=16)
        ax.set_xlabel('Timepoints',fontsize=16)                
        ax.tick_params(labelsize=16)
        sns.despine()
        plt.tight_layout()
        plt.savefig(path + 'outputs/' + job + '/mwu_' + c  + '.pdf')
        plt.close()            
    row=pd.DataFrame.from_dict({'var': [c],'effect':[W],'pval':[p],'stat':['MWU']})
    df_results = pd.concat([df_results, row])
    
df_pre = df_meta[df_meta['Timepoints'] == 'pre']
df_post = df_meta[df_meta['Timepoints'] == 'post']
for c in cont:
    t,p = scipy.stats.ttest_ind(a=df_pre[c].values,b=df_post[c].values, nan_policy='omit')
    row=pd.DataFrame.from_dict({'var': [c],'effect':[t],'pval':[p],'stat':['ttest']})
    if p < 0.05:
        ax = sns.boxplot(data=df_meta, x='Timepoints', y=c, orient='v')
        sns.swarmplot(data=df_meta, x='Timepoints', y=c, palette='dark:grey', hue=None, orient='v')
    
        # ax.axes.set_title("Title",fontsize=48)
        ax.set_ylabel(c,fontsize=16)
        ax.set_xlabel('Timepoints',fontsize=16)                
        ax.tick_params(labelsize=16)
        sns.despine()
        plt.tight_layout()
        plt.savefig(path + 'outputs/' + job + '/tt_' + c  + '.pdf')
        plt.close()            
    row=pd.DataFrame.from_dict({'var': [c],'effect':[t],'pval':[p],'stat':['ttest']})
    df_results = pd.concat([df_results, row])

# unpaired and then paired
df_pre = df_meta[df_meta['Timepoints'] == 'pre']
df_post = df_meta[df_meta['Timepoints'] == 'post']
for c in cont:
    W,p = scipy.stats.wilcoxon(x=df_pre[c].values,y=df_post[c].values, nan_policy='omit')
    row=pd.DataFrame.from_dict({'var': [c],'effect':[W],'pval':[p],'stat':['WSR']})
    if p < 0.05:
        ax = sns.boxplot(data=df_meta, x='Timepoints', y=c, orient='v')
        sns.swarmplot(data=df_meta, x='Timepoints', y=c, palette='dark:grey', hue=None, orient='v')
    
        # ax.axes.set_title("Title",fontsize=48)
        ax.set_ylabel(c,fontsize=16)
        ax.set_xlabel('Timepoints',fontsize=16)                
        ax.tick_params(labelsize=16)
        sns.despine()
        plt.tight_layout()
        plt.savefig(path + 'outputs/' + job + '/wsr_' + c  + '.pdf')
        plt.close()            
    row=pd.DataFrame.from_dict({'var': [c],'effect':[W],'pval':[p],'stat':['WSR']})
    df_results = pd.concat([df_results, row])

df_pre = df_meta[df_meta['Timepoints'] == 'pre']
df_post = df_meta[df_meta['Timepoints'] == 'post']
for c in cont:
    t,p = scipy.stats.ttest_rel(a=df_pre[c].values,b=df_post[c].values, nan_policy='omit')
    row=pd.DataFrame.from_dict({'var': [c],'effect':[W],'pval':[p],'stat':['pairedt']})
    if p < 0.05:
        ax = sns.boxplot(data=df_meta, x='Timepoints', y=c, orient='v')
        sns.swarmplot(data=df_meta, x='Timepoints', y=c, palette='dark:grey', hue=None, orient='v')
    
        # ax.axes.set_title("Title",fontsize=48)
        ax.set_ylabel(c,fontsize=16)
        ax.set_xlabel('Timepoints',fontsize=16)                
        ax.tick_params(labelsize=16)
        sns.despine()
        plt.tight_layout()
        plt.savefig(path + 'outputs/' + job + '/pairedt_' + c  + '.pdf')
        plt.close()          
    row=pd.DataFrame.from_dict({'var': [c],'effect':[t],'pval':[p],'stat':['pairedt']})
    df_results = pd.concat([df_results, row])

df_results.to_csv(path + 'outputs/' + job + '/outcome_testing.tsv', sep='\t')
df_results.head()
    

[]
['VAS_Pt', 'VAS_overall', 'WOMAC_pain', 'WOMAC_stiffness', 'WOMAC_activity', 'WOMAC_total', 'Pain_DETECT', 'CES_D', 'Helplesness', 'Magnification', 'Rumination', 'PCS_EN', 'Sleep_distrubance', 'PASE_walk', 'PASE_light', 'BMI']
32


Unnamed: 0,var,effect,pval,stat
0,VAS_Pt,198.5,0.008316,MWU
0,VAS_overall,168.5,0.131314,MWU
0,WOMAC_pain,170.0,0.0179,MWU
0,WOMAC_stiffness,172.5,0.013052,MWU
0,WOMAC_activity,164.5,0.032592,MWU


In [5]:
###
# Hypothesis 2: There will be an association between oral and gut microbiome and pain outcomes
###
# construct alpha, beta and paired alpha dataframes
g_to_dfd = {}
# g_test = ['stool','saliva_adh', 'stool_adh', 'saliva']
g_test = ['saliva_adh', 'stool_adh']

for g in g_test:
    # maps diversity type to dataframe
    g_to_dfd[g] = {}

    # get alpha diversities
    df_alpha = pd.read_csv(path + 'outputs/Qiime2_' + g + '/metadata.tsv', sep='\t', index_col=0)
    df_alpha = df_alpha.drop('#q2:types')
    df_alpha['SubjectID'] = df_alpha['Patient_ID'] + df_alpha['Study_ID']
    df_alpha = df_alpha[['SubjectID', 'Timepoints', 'shannon_entropy']]
    g_to_dfd[g]['alpha'] = df_alpha

    # get paired alpha div, first drop unpaired samples
    s_remove = []
    for s in list(df_alpha['SubjectID'].values):
        if len(df_alpha[df_alpha['SubjectID'] == s]) != 2:
            s_remove.append(s)
    df_alpha = df_alpha.loc[~df_alpha['SubjectID'].isin(s_remove),:] # careful not to use ([s_remove])
    
    # set vars
    alpha_metric = 'shannon_entropy'
    group_var = 'Timepoints'
    pair_var = 'SubjectID'
    groups = ['pre','post']
    
    # get paired per indiv pair
    pair_to_diff = {}
    for p in list(df_alpha[pair_var].values):
        df = df_alpha[df_alpha[pair_var] == p]
        alpha_0 = float(df[df[group_var] == groups[0]][alpha_metric].values)
        alpha_1 = float(df[df[group_var] == groups[1]][alpha_metric].values)
        pair_to_diff[p] = alpha_1 - alpha_0
    
    df_paired_alpha = pd.DataFrame.from_dict(pair_to_diff, orient='index', columns=[alpha_metric + '_diff'])
    g_to_dfd[g]['paired_alpha'] = df_paired_alpha

    # get beta div
    df_beta = pd.read_csv(path + 'outputs/Qiime2_' + g + '/core_metrics_results/distance-matrix.tsv',
                              sep='\t', index_col=0)
        
    # grab twin to pair dict
    pair_to_ids = {}
    for p in list(df_alpha[pair_var].values):
        df = df_alpha[df_alpha[pair_var] == p]
        id_0 = str(df[df[group_var] == groups[0]].index.values[0])
        id_1 = str(df[df[group_var] == groups[1]].index.values[0])
        pair_to_ids[p] = (id_0, id_1)
    
    # get distances for each twin pair per beta div matrix    
    pair_to_dist = {}
    for p in list(df_alpha[pair_var].values):
        id_0, id_1 = pair_to_ids[p]
        pair_to_dist[p] = df_beta.loc[id_0, id_1]
    
    df_paired_beta = pd.DataFrame.from_dict(pair_to_dist, orient='index', columns=['Unweighted_Unifrac'])
    g_to_dfd[g]['paired_beta'] = df_paired_beta

g_to_dfd['stool_adh']['paired_beta'].head()

Unnamed: 0,Unweighted_Unifrac
OAD001,0.6672
OAD003,0.2837
OAD004,0.4237
OAD005,0.3257
OAD006,0.1022


In [6]:
# Testing for each sample type for (1) all and (2) high adh only
# (A) Chisq of quartiles with adherence 
# (B) MWU/TT unpaired of outcomes against distance

def chisq_of_df_cols(df, c1, c2):
    groupsizes = df.groupby([c1, c2]).size()
    ctsum = groupsizes.unstack(c1)
    # fillna(0) is necessary to remove any NAs which will cause exceptions
    return(scipy.stats.chi2_contingency(ctsum.fillna(0)))

d_to_metric = {
    'alpha': 'shannon_entropy',
}
group_var = 'Adherece_antiinflam'

dfd_to_merge = {}
g_test = ['saliva_adh', 'stool_adh']

for g in g_test:
    dfd_to_merge[g] = {}

gs = []
ds = []
os = []
stats = []
ts = []
ps = []

arr = [gs,ds,os,stats,ts,ps]

def append_results(arr, val):
    for a,v in zip(arr,val):
        a.append(v)
    return arr

# for each sample type, grab relevant mapping file
# g_test = saliva, saliva_adh, stool, etc.
for g in g_test:
    for d in d_to_metric:
        # grab relevant diversity df
        df_div = g_to_dfd[g][d]

        if d == 'alpha':
            # df_div = df_div[df_div['Timepoints'] == 'pre']
            # df_div = df_div.set_index('SubjectID').drop('Timepoints',axis=1)
            # df_div = df_div.set_index('SubjectID').drop('Timepoints',axis=1)
            df_div['shannon_entropy'] = df_div['shannon_entropy'].astype(float)

        # merge with df of metadata var        
        # grab relevant sample IDs
        # g = saliva_adh
        idx = 'Run_ID_' + g.split('_')[0].capitalize()
        df_meta_sub = df_meta.dropna(subset=idx)
        df_meta_sub = df_meta_sub.set_index(idx)
        df_merge = pd.concat([df_meta_sub,df_div],axis=1)

        # subset on adh only
        df_merge = df_merge[df_merge[group_var].isin(['Moderate adherence', 'High adherence'])]

        # test association of div with outcomes
        for o in ['WOMAC_P_Response']: #  outcomes:                    
            # test association of adherence with pain outcomes
            div_metric = d_to_metric[d]
            x,p,dof,ef = chisq_of_df_cols(df_merge, group_var, o)
            arr = append_results(arr, ['metadata',group_var,o,'chisq',x,p])
            
            ax = sns.boxplot(data=df_merge, x=o, y=div_metric)
            sns.swarmplot(data=df_merge, x=o, y=div_metric, palette='dark:grey')
            sns.despine()
        
            plt.tight_layout()
            plt.savefig(path + 'outputs/jobs06/rvnr_nondiff_' + g + '_' + o + '_' + d + '.pdf')
            plt.close()          
        
            u, p = scipy.stats.mannwhitneyu(df_merge[df_merge[o] == 'Response'][div_metric].values, 
                                            df_merge[df_merge[o] == 'No response'][div_metric].values, 
                                            nan_policy='omit')

            arr = append_results(arr, [g,d,o,'mwu',t,p])

            t, p = scipy.stats.ttest_ind(df_merge[df_merge[o] == 'Response'][div_metric].values, 
                                            df_merge[df_merge[o] == 'No response'][div_metric].values, 
                                            nan_policy='omit')

            arr = append_results(arr, [g,d,o,'tt',t,p])

        # save results
        dfd_to_merge[g][d] = df_merge

        # export to Q2
        # df_q2_type = df_merge.set_index(['Together'])
        df_q2_type = df_merge.copy()
        q2_row = pd.Series(data=['categorical' for i in range(len(df_merge.columns))], 
                           index=list(df_merge.columns.values), dtype=str, name='#q2:types')
        df_q2_type = pd.concat([q2_row.to_frame().T, df_q2_type])
        df_q2_type.index.name = '#SampleID'
        df_q2_type.index = df_q2_type.index.map(lambda x: x.split('.guma')[0])
        # df_q2_type.to_csv(path + 'inputs/qiime_mapping_file_' + d + '_' + g + 'aggregate_outcomes.tsv', sep='\t')

df_results = pd.DataFrame.from_dict({
    'group': gs,
    'div': ds,
    'outcome': os,
    'statistic': stats,
    'test_stat': ts,
    'pval': ps
})
# df_results.to_csv(path + 'outputs/df_results_aggregate.tsv', sep='\t')
# df_results.head()
# df_results[df_results['pval'] < 0.05]
df_results

Unnamed: 0,group,div,outcome,statistic,test_stat,pval
0,metadata,Adherece_antiinflam,WOMAC_P_Response,chisq,0.232031,0.630022
1,saliva_adh,alpha,WOMAC_P_Response,mwu,1.153857,0.788413
2,saliva_adh,alpha,WOMAC_P_Response,tt,0.756449,0.45645
3,metadata,Adherece_antiinflam,WOMAC_P_Response,chisq,0.232031,0.630022
4,stool_adh,alpha,WOMAC_P_Response,mwu,0.756449,0.980536
5,stool_adh,alpha,WOMAC_P_Response,tt,0.182238,0.856866


In [7]:
# expand previous to all data, paired and unpaired diversities
d_to_metric = {
    'pre_alpha': 'shannon_entropy',
    'post_alpha': 'shannon_entropy',
    'paired_alpha': 'shannon_entropy_diff',
    'paired_beta': 'Unweighted_Unifrac',
}
group_var = 'Adherece_antiinflam'

dfd_to_merge = {}
g_test = ['stool_adh','saliva_adh']
for g in g_test:
    dfd_to_merge[g] = {}

gs = []
ds = []
os = []
stats = []
ts = []
ps = []

arr = [gs,ds,os,stats,ts,ps]

def append_results(arr, val):
    for a,v in zip(arr,val):
        a.append(v)
    return arr

# for each sample type, grab relevant mapping file
# g_test = saliva, saliva_adh, stool, etc.
d_to_map = {}

for g in g_test:
    # drop duplicates so you have sample mapping to adh
    df_map_sub = type_to_df_map[g.split('_')[0]]
    df_map_sub.index = df_map_sub.index.map(lambda x: x.split('.')[0].replace('-',''))
    df_map_sub = df_map_sub.dropna(how='any',subset=group_var,axis=0)           
    
    # figure out which samples to keep
    # i.e. samples that have a pre and post time point
    keep = []
    for i in list(df_map_sub.index.values):
        if len(df_map_sub.loc[i,:]) == 2:
            keep.append(i)

    # get unique entires in sorted order
    # at this point we are only concerned with differences in values, 
    # as we've dropped samples with only one endpoint val
    save = []
    [save.append(x) for x in keep if x not in save]
    df_map_sub = df_map_sub.loc[save,:]

    # this double populates as OAD001 is an index twice, so the diff fills to both the pre and post col
    for o in outcomes:
        df_map_sub[o + '_diff'] = df_map_sub[df_map_sub['Timepoint'] == 'post'][o] - df_map_sub[df_map_sub['Timepoint'] == 'pre'][o] 

    # here we keep only the pre, but everything is identical b/w pre and post
    df_dropdup = df_map_sub[~df_map_sub.index.duplicated(keep='first')]
    d_to_map[g] = df_dropdup
    
    for d in d_to_metric:
        # grab relevant diversity df
        if d == 'pre_alpha' or d == 'post_alpha':
            df_div = g_to_dfd[g]['alpha']
        else:
            df_div = g_to_dfd[g][d]
        div_metric = d_to_metric[d]

        # when associating alpha div vs outcomes, look at if starting adiv predicts outcome
        if d == 'pre_alpha':
            df_div = df_div[df_div['Timepoints'] == 'pre']
            df_div = df_div.set_index('SubjectID').drop('Timepoints',axis=1)
            df_div = df_div.astype(float)

        if d == 'post_alpha':
            df_div = df_div[df_div['Timepoints'] == 'post']
            df_div = df_div.set_index('SubjectID').drop('Timepoints',axis=1)
            df_div = df_div.astype(float)

        # merge with df of metadata var        
        df_merge = pd.concat([df_dropdup,df_div],axis=1)

        # drop na in barcodes if samples not sequenced both pre and post
        df_merge = df_merge.dropna(how='any',subset='BarcodeSequence')
        
        # test association of div with outcomes
        for o in ['WOMAC_P_Response']:                     
            # test association of adherence with pain outcomes            
            ax = sns.boxplot(data=df_merge, y=o, x=div_metric)
            sns.swarmplot(data=df_merge, y=o, x=div_metric, palette='dark:grey')
            sns.despine()
        
            plt.tight_layout()
            plt.savefig(path + 'outputs/jobs06/WOMAC_barplot_' + g + '_' + o + '_' + d + '.pdf')
            plt.close()          

            # MWU and ttest
            u, p = scipy.stats.mannwhitneyu(df_merge[df_merge[o] == 'Response'][div_metric].values, 
                                            df_merge[df_merge[o] == 'No response'][div_metric].values, 
                                            nan_policy='omit')

            arr = append_results(arr, [g,d,o,'mwu',t,p])

            t, p = scipy.stats.ttest_ind(df_merge[df_merge[o] == 'Response'][div_metric].values, 
                                            df_merge[df_merge[o] == 'No response'][div_metric].values, 
                                            nan_policy='omit')

            arr = append_results(arr, [g,d,o,'tt',t,p])
         
        # save results
        dfd_to_merge[g][d] = df_merge

        # export to Q2
        df_q2_type = df_merge.set_index(['Together'])
        q2_row = pd.Series(data=['categorical' for i in range(len(df_merge.columns))], 
                           index=list(df_merge.columns.values), dtype=str, name='#q2:types')
        df_q2_type = pd.concat([q2_row.to_frame().T, df_q2_type])
        df_q2_type.index.name = '#SampleID'
        df_q2_type.index = df_q2_type.index.map(lambda x: x.split('.guma')[0])
        # df_q2_type.to_csv(path + 'inputs/qiime_mapping_file_' + d + '_' + g + '_outcomes.tsv', sep='\t')

df_results = pd.DataFrame.from_dict({
    'group': gs,
    'div': ds,
    'outcome': os,
    'statistic': stats,
    'test_stat': ts,
    'pval': ps
})
# df_results.to_csv(path + 'outputs/df_results_diff.tsv', sep='\t')

# df_results.head()
# the div==alpha results test whether pre-alpha div state associates with pain outcome changes (differences) quartiles
# the div==paired_alpha and paired_beta test whether the alpha and betas change in a similar way with the pain outcome
print(len(df_results[df_results['pval'] < 0.05]))
df_results


0


Unnamed: 0,group,div,outcome,statistic,test_stat,pval
0,stool_adh,pre_alpha,WOMAC_P_Response,mwu,0.182238,0.343434
1,stool_adh,pre_alpha,WOMAC_P_Response,tt,-0.594256,0.565542
2,stool_adh,post_alpha,WOMAC_P_Response,mwu,-0.594256,0.638889
3,stool_adh,post_alpha,WOMAC_P_Response,tt,0.769556,0.459355
4,stool_adh,paired_alpha,WOMAC_P_Response,mwu,0.769556,0.431818
5,stool_adh,paired_alpha,WOMAC_P_Response,tt,1.289488,0.226254
6,stool_adh,paired_beta,WOMAC_P_Response,mwu,1.289488,0.876263
7,stool_adh,paired_beta,WOMAC_P_Response,tt,-1.03358,0.325681
8,saliva_adh,pre_alpha,WOMAC_P_Response,mwu,-1.03358,0.755051
9,saliva_adh,pre_alpha,WOMAC_P_Response,tt,0.565329,0.58431


In [8]:
# test if the differences in diversity are significantly different from 0 for paired alpha
for g in g_test:
    print(g)
    for d in ['paired_alpha']:
        df_div = g_to_dfd[g][d] 
        df_map = d_to_map[g]

        df_merge = pd.concat([df_div, df_map], axis=1)
        for o in [['Response','No response'],['Response'], ['No response']]:
            df = df_merge[df_merge['WOMAC_P_Response'].isin(o)]
            print(o)
            print(scipy.stats.wilcoxon(df['shannon_entropy_diff'], nan_policy='omit'))
df


stool_adh
['Response', 'No response']
WilcoxonResult(statistic=25.0, pvalue=0.30126953125)
['Response']
WilcoxonResult(statistic=4.0, pvalue=0.109375)
['No response']
WilcoxonResult(statistic=7.0, pvalue=1.0)
saliva_adh
['Response', 'No response']
WilcoxonResult(statistic=24.0, pvalue=0.26611328125)
['Response']
WilcoxonResult(statistic=9.0, pvalue=0.46875)
['No response']
WilcoxonResult(statistic=5.0, pvalue=0.625)


Unnamed: 0,shannon_entropy_diff,BarcodeSequence,LinkerPrimerSequence,Separate,Timepoint,Together,ContactEmail,ContactName,PrimaryInvestigator,Cohort,...,Pain_DETECT_diff,CES_D_diff,Helplesness_diff,Magnification_diff,Rumination_diff,PCS_EN_diff,Sleep_distrubance_diff,PASE_walk_diff,PASE_light_diff,BMI_diff
OAD001,-0.116436,AGGACAAACTAT,CCGGACTACHVGGGTWTCTAAT,Text,pre,OAD-001.pre.saliva,rebecca.blank@nyulangone.org,Rebecca Blank,Jose Scher,NonVA,...,-2.0,-1.0,-2.5,5.5,1.0,4.0,9.0,11.51,14.29,
OAD003,-0.327749,TCTTCCTAAAGT,CCGGACTACHVGGGTWTCTAAT,All,pre,OAD-003.pre.saliva,rebecca.blank@nyulangone.org,Rebecca Blank,Jose Scher,NonVA,...,-4.5,1.0,-1.0,-1.0,-2.0,-5.5,-14.0,0.0,0.0,
OAD004,0.866474,CTTGTGCGACAA,CCGGACTACHVGGGTWTCTAAT,All,pre,OAD-004.pre.saliva,rebecca.blank@nyulangone.org,Rebecca Blank,Jose Scher,NonVA,...,-1.0,3.0,-1.0,-1.0,-1.0,-3.0,0.0,0.0,0.0,3.264032
VAOAD001,0.007278,CTTACACTGCTT,CCGGACTACHVGGGTWTCTAAT,All,pre,VAOAD-001.pre.saliva,rebecca.blank@nyulangone.org,Rebecca Blank,Jose Scher,VA,...,-0.5,5.5,0.0,0.0,0.0,0.0,-2.5,0.0,0.0,-0.081446
VAOAD012,0.535614,GCGTGGTCATTA,CCGGACTACHVGGGTWTCTAAT,All,pre,VAOAD-012.pre.saliva,rebecca.blank@nyulangone.org,Rebecca Blank,Jose Scher,VA,...,-1.5,0.0,2.0,-2.0,-0.5,-0.5,-1.5,0.0,6.72,-1.072664
VAOAD009,,TTGGACGTCCAC,CCGGACTACHVGGGTWTCTAAT,All,pre,VAOAD-009.pre.saliva,rebecca.blank@nyulangone.org,Rebecca Blank,Jose Scher,VA,...,-4.0,2.0,1.0,0.0,0.5,1.5,2.5,0.0,0.0,-0.139974


In [9]:
# Alpha div paired
# paired beta, comparing intra-indiv difference pre_post to inter pre and inter post
g_test = ['stool_adh','saliva_adh']
subgroups = ['All', 'Response', 'No response']

for g in g_test: #subgroups:
    print(g)
    df_a = pd.read_csv(path + 'outputs/Qiime2_' + g + '/metadata.tsv', sep='\t', index_col=0)
    df_a = df_a.drop('#q2:types')

    for sg in subgroups:
        print(sg)
        if sg == 'All':
            df_alpha = df_a.copy()
        else:
            df_alpha = df_a[df_a['WOMAC_P_Response'] == sg]
        
        df_alpha['SubjectID'] = df_alpha['Patient_ID'] + df_alpha['Study_ID']
        df_alpha = df_alpha[['SubjectID', 'Timepoints', 'shannon_entropy']]
        
        # drop unpaired samples
        s_remove = []
        for s in list(df_alpha['SubjectID'].values):
            if len(df_alpha[df_alpha['SubjectID'] == s]) != 2:
                s_remove.append(s)
        df_alpha = df_alpha.loc[~df_alpha['SubjectID'].isin(s_remove),:] # careful not to use ([s_remove])
        
        # set vars
        alpha_metric = 'shannon_entropy'
        group_var = 'Timepoints'
        pair_var = 'SubjectID'
        groups = ['pre','post']
        
        # get paired per indiv pair
        pair_to_diff = {}
        for p in list(df_alpha[pair_var].values):
            df = df_alpha[df_alpha[pair_var] == p]
            alpha_0 = float(df[df[group_var] == groups[0]][alpha_metric].values)
            alpha_1 = float(df[df[group_var] == groups[1]][alpha_metric].values)
            pair_to_diff[p] = alpha_0 - alpha_1
        
        df_paired_alpha = pd.DataFrame.from_dict(pair_to_diff, orient='index', columns=[alpha_metric + '_diff'])
        
        # one-sided t-test, n.s.; RA-UA values 
        # t, p = scipy.stats.ttest_1samp(df_paired_alpha[alpha_metric + '_diff'],popmean=0)
    
        print('paired alpha wilcoxon')
        s, p = scipy.stats.wilcoxon(df_paired_alpha[alpha_metric + '_diff'])
        print(s, p)
        
        # separate
        df_alpha[alpha_metric] = df_alpha[alpha_metric].map(lambda x: float(x))
        ax = sns.boxplot(data=df_alpha, x=group_var, y=alpha_metric)
        sns.swarmplot(data=df_alpha, x=group_var, y=alpha_metric, palette='dark:grey')
        sns.despine()
    
        plt.tight_layout()
        plt.savefig(path + 'outputs/Qiime2_' + g + '/' + sg + '_alpha.pdf')
        plt.close()          
    
        # now do beta
        df_beta = pd.read_csv(path + 'outputs/Qiime2_' + g + '/core_metrics_results/distance-matrix.tsv',
                              sep='\t', index_col=0)
        
        # set vars
        alpha_metric = 'shannon_entropy'
        group_var = 'Timepoints'
        pair_var = 'SubjectID'
        groups = ['pre','post']
        g0, g1 = groups[0], groups[1]
        
        # grab twin to pair dict
        pair_to_ids = {}
        for p in list(df_alpha[pair_var].values):
            df = df_alpha[df_alpha[pair_var] == p]
            id_0 = str(df[df[group_var] == g0].index.values[0])
            id_1 = str(df[df[group_var] == g1].index.values[0])
            pair_to_ids[p] = (id_0, id_1)
        
        # get distances for each twin pair per beta div matrix    
        pair_to_dist = {}
        for p in list(df_alpha[pair_var].values):
            id_0, id_1 = pair_to_ids[p]
            pair_to_dist[p] = df_beta.loc[id_0, id_1]
        
        df_paired_beta = pd.DataFrame.from_dict(pair_to_dist, orient='index', columns=['Unweighted_Unifrac'])
        
        # grab inter RA distances
        # this is from unweighted_Timepoint_significance.qzv -> download as tsv
        df_raw = pd.read_csv(path + 'outputs/Qiime2_' + g + '/raw_data.tsv', 
                             sep='\t', index_col=0)
        df_0 = df_raw[df_raw['Group1'] == g0]
        df_0 = df_0[df_0['Group2'] == g0]
        df_1 = df_raw[df_raw['Group1'] == g1]
        df_1 = df_1[df_1['Group2'] == g1]
        
        # compare distances
        inter_twin = df_paired_beta['Unweighted_Unifrac'].values
        inter_0 = df_0['Distance'].values
        inter_1 = df_1['Distance'].values
        
        u, p = scipy.stats.mannwhitneyu(inter_twin, inter_0)
        #print(u, p)
        
        t, p = scipy.stats.ttest_ind(inter_twin, inter_1)
        #print(t, p)
        
        t, p = scipy.stats.ttest_ind(inter_0, inter_1)
        # print(t, p)
        
        f, p = scipy.stats.f_oneway(inter_0, inter_1, inter_twin)
        # print('inter intra anova')
        p# rint(f, p)
        
        category = ['intra_twin_pair']*len(inter_twin) + ['inter_' + g0 + '_only']*len(inter_0) + ['inter_' + g1 + '_only']*len(inter_1)
        distances = list(inter_twin) + list(inter_0) + list(inter_1)
        df_dist = pd.DataFrame(data=np.array([category,distances]).T, columns=['category','distance'])
        df_dist['distance'] = df_dist['distance'].astype(float)
        df_dist.to_csv(path + 'outputs/Qiime2_' + g + '/' + sg + '_inter_intra_beta_dist.tsv',sep='\t')
                             
        sns.boxplot(data=df_dist, x='category', y='distance')
        sns.swarmplot(data=df_dist, x='category', y='distance', color='black')
        sns.despine()
    
        plt.tight_layout()
        plt.savefig(path + 'outputs/Qiime2_' + g + '/' + sg + '_beta.pdf')
        plt.close()          

stool_adh
All
paired alpha wilcoxon
33.0 0.414306640625
Response
paired alpha wilcoxon
4.0 0.109375
No response
paired alpha wilcoxon
7.0 1.0
saliva_adh
All
paired alpha wilcoxon
26.0 0.19091796875
Response
paired alpha wilcoxon
9.0 0.46875
No response
paired alpha wilcoxon
5.0 0.625


In [10]:
# alpha plots
for g in ['saliva','stool']:
    df_alpha = g_to_dfd[g + '_adh']['alpha']
    df_map = type_to_df_map[g].copy()
    df_map = df_map.set_index('Together')
    df_map = df_map[['Timepoint','WOMAC_P_Response']]
    df_map.index = df_map.index.map(lambda x: x.split('.g')[0])
    
    df_merge = pd.concat([df_alpha, df_map], axis=1)
    df_merge = df_merge.dropna()
    
    # sns.set_theme()
    custom_params = {"axes.spines.right": False, "axes.spines.top": False}
    sns.set_theme(style="ticks", rc=custom_params)
    plt.figure(figsize=(6,4))
    sns.boxplot(x='Timepoint', y='shannon_entropy', data=df_merge,palette=['b','r'])#, hue='Timepoint', palette=['b','r'])
    sns.swarmplot(x='Timepoint', y='shannon_entropy', data=df_merge,color='black')
    sns.despine()#(left=True, bottom=True)
    # ax.axes.set_title("",fontsize=48)\n",
    plt.tight_layout()
    ax.set_ylabel("Shannon Entropy",fontsize=16)
    ax.set_xlabel("Diagnosis",fontsize=16)
    ax.tick_params(labelsize=16)
    
    # plt.tight_layout()
    plt.savefig(path + 'outputs/Qiime2_' + g + '_adh/alpha_all.pdf')
    plt.close()

df_merge.head()

Unnamed: 0,SubjectID,Timepoints,shannon_entropy,Timepoint,WOMAC_P_Response
OAD-001.pre.stool,OAD001,pre,5.097506,pre,No response
OAD-001.post.stool,OAD001,post,5.838205,post,No response
OAD-003.pre.stool,OAD003,pre,5.190428,pre,No response
OAD-003.post.stool,OAD003,post,4.67964,post,No response
OAD-004.pre.stool,OAD004,pre,5.29956,pre,No response


In [11]:
# specific taxa profiling
GL_to_otu = {}

groups = ['saliva_adh', 'stool_adh']
levels = ['6','7']
for g in groups:
    GL_to_otu[g] = {}
    for l in levels:
        # read in csv
        df_otu = pd.read_csv(path + 'outputs/Qiime2_' + g + '/level-' + l + '.csv', index_col=0)

        # filter on OTU only 
        keep = []
        for x in df_otu.columns:
            if 'd__' in x:
                keep.append(x)
        df_otu = df_otu.loc[:,keep]

        # normalize by dividing by row sum
        df_otu = df_otu.div(df_otu.sum(axis=1),axis=0)

        # save to dict
        GL_to_otu[g]['L' + l] = df_otu

                                       
GL_to_otu['stool_adh']['L6'].head()

Unnamed: 0_level_0,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola_A_858004,d__Bacteria;p__Firmicutes_A;c__Clostridia_258483;o__Oscillospirales;f__Ruminococcaceae;g__Faecalibacterium,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella,d__Bacteria;p__Firmicutes_A;c__Clostridia_258483;o__Lachnospirales;f__Lachnospiraceae;g__Agathobacter_164117,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides_H,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Muribaculaceae;g__Paramuribaculum,d__Bacteria;p__Firmicutes_A;c__Clostridia_258483;o__Oscillospirales;f__Ruminococcaceae;g__Gemmiger_A_73129,d__Bacteria;p__Firmicutes_D;c__Bacilli;o__Lactobacillales;f__Streptococcaceae;g__Streptococcus,d__Bacteria;p__Firmicutes_A;c__Clostridia_258483;o__Lachnospirales;f__Lachnospiraceae;g__Acetatifactor,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Rikenellaceae;g__Alistipes_A_871400,...,d__Bacteria;p__Firmicutes_A;__;__;__;__,d__Bacteria;p__Actinobacteriota;c__Actinomycetia;o__Actinomycetales;f__Micrococcaceae;g__Nesterenkonia,d__Bacteria;p__Firmicutes_D;c__Bacilli;o__Lactobacillales;__;__,d__Bacteria;p__Firmicutes_A;c__Clostridia_258483;o__Peptostreptococcales;f__Anaerovoracaceae;g__S5-A14a,d__Bacteria;p__Firmicutes_A;c__Clostridia_258483;o__Clostridiales;f__Clostridiaceae_222000;__,d__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Azospirillales_507929;f__Azospirillaceae_507917;g__Azospirillum,d__Bacteria;p__Firmicutes_D;c__Bacilli;o__Erysipelotrichales;f__Coprobacillaceae;__,d__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Sphingomonadales;f__Sphingomonadaceae;g__Sphingomonas_L_486704,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Muribaculaceae;__,d__Bacteria;p__Actinobacteriota;__;__;__;__
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
OAD-001.post.stool,0.014084,0.039046,0.022027,0.001515,0.173283,0.0,0.02123,0.00016,0.152452,0.033001,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
OAD-001.pre.stool,0.022576,0.007098,0.005128,0.0,0.292202,0.001994,0.013555,0.0,0.000783,0.026849,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
OAD-003.post.stool,0.273829,0.067421,0.184572,0.026929,0.053673,0.037783,0.009619,0.023935,0.005789,0.01206,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
OAD-003.pre.stool,0.181156,0.067844,0.150895,0.063229,0.05136,0.0556,0.012363,0.051513,0.006045,0.025918,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
OAD-004.post.stool,0.174773,0.040785,0.0,0.001855,0.124402,0.0,0.0014,0.000371,0.014265,0.090594,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Metabolome testing
gdt_to_df = {}
for g in ['saliva','stool']:
    gdt_to_df[g] = {}
    for d in ['paired_beta']: # just arbitrarily 
        gdt_to_df[g][d] = {}
        df_merge = dfd_to_merge[g + '_adh'][d]
        df_meta = pd.read_csv(path + 'inputs/' + g + '_normalized.csv')

        # split on timepoint
        for t in ['Baseline','After diet']:
            df = df_meta[df_meta['Time'] == t]
            df = df.set_index('Study_ID')
            df.index = df.index.map(lambda x: x.split('_')[0])
            df = df.drop('Time',axis=1)
            gdt_to_df[g][d][t] = df

        gdt_to_df[g][d]['diff'] = gdt_to_df[g][d]['After diet'] - gdt_to_df[g][d]['Baseline']

gdt_to_df[g][d][t].head()

Unnamed: 0_level_0,H_Pyridoxamine,H_Thiamine,H_Melezitose,H_Phosphocholine,H_SN_Glycero_3_Phophocholine,H_Gamma_Valerobetaine,H_N_Acetylneuraminic_acid,H_Pyridoxine,H_N_Acetylneuraminic_Acid,H_N_Acetylmuramic_Acid,...,H_Nonhydroxylated_bile_acid,H_Hydroxydodecanoic_acid,H_Omega_Hydroxydodecanoate,H_Delta_Methyldodecenoic_Acid,H_Hydroxydecanoic_acid,H_Hydroxydecanoate,H_Methylpentanoic_acid,H_Monohydroxylated_bile_acid,H_Palmitoyl_ethanolamide,H_Linoleic_Acid
Study_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
VAOAD001,0.312342,-0.312477,0.119853,-0.492876,-0.671376,-0.202952,0.203945,-0.455869,0.352628,0.151464,...,0.054336,-0.330037,-0.886045,-0.314427,-0.322677,-0.650357,-0.361695,0.132794,-0.067464,-0.112641
VAOAD009,0.000428,-0.481251,-0.179337,0.983217,0.167836,0.067823,0.364897,-0.371942,0.545435,-0.007664,...,-0.022614,-0.334936,-0.802118,-0.92947,-0.369578,-0.566431,0.075375,-0.519189,-0.180631,-0.418351
OAD001,-0.285212,-0.556445,-0.651325,0.357509,-0.401875,-0.295787,0.137736,-0.165902,0.11217,0.096791,...,-0.923477,-0.589028,-1.535141,-0.525277,-0.278312,-0.600484,0.019657,-0.576831,-0.901479,-1.089906
VAOAD004,0.140904,1.089055,-0.668077,-1.061336,-0.370602,-0.329379,-0.654931,0.142971,-1.006689,-0.100373,...,-0.354297,0.449897,0.489189,0.413723,0.47425,0.661963,0.245553,0.27929,-0.444319,-0.912164
VAOAD011,-0.346855,1.028262,-0.450323,-0.025939,-0.223029,-0.766374,-0.53841,-1.091423,-0.328522,-0.524686,...,-0.385428,0.419748,0.429119,0.333756,0.448461,0.598846,0.390035,0.287682,-0.532141,-0.75228


In [13]:
# Metabolome testing
gt_to_meta = {}
for g in ['saliva','stool']:
    gt_to_meta[g] = {}
    df_meta = pd.read_csv(path + 'inputs/' + g + '_normalized.csv')

    # split on timepoint
    for t in ['Baseline','After diet']:
        df = df_meta[df_meta['Time'] == t]
        df = df.set_index('Study_ID')
        df.index = df.index.map(lambda x: x.split('_')[0])
        df = df.drop('Time',axis=1)
        gt_to_meta[g][t] = df

    pre_df = gt_to_meta[g]['Baseline'].copy()
    pre_df.index = pre_df.index.map(lambda x: x.replace('D0','D-0'))
    pre_df.index = pre_df.index.map(lambda x: x + '.pre.' + g)
    post_df = gt_to_meta[g]['After diet'].copy()
    post_df.index = post_df.index.map(lambda x: x.replace('D0','D-0'))
    post_df.index = post_df.index.map(lambda x: x + '.post.' + g)
    
    gt_to_meta[g]['diff'] = gt_to_meta[g]['After diet'] - gt_to_meta[g]['Baseline']
    gt_to_meta[g]['all'] = pd.concat([pre_df,post_df])
    

gt_to_meta[g][t].head()

Unnamed: 0_level_0,H_Pyridoxamine,H_Thiamine,H_Melezitose,H_Phosphocholine,H_SN_Glycero_3_Phophocholine,H_Gamma_Valerobetaine,H_N_Acetylneuraminic_acid,H_Pyridoxine,H_N_Acetylneuraminic_Acid,H_N_Acetylmuramic_Acid,...,H_Nonhydroxylated_bile_acid,H_Hydroxydodecanoic_acid,H_Omega_Hydroxydodecanoate,H_Delta_Methyldodecenoic_Acid,H_Hydroxydecanoic_acid,H_Hydroxydecanoate,H_Methylpentanoic_acid,H_Monohydroxylated_bile_acid,H_Palmitoyl_ethanolamide,H_Linoleic_Acid
Study_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
VAOAD001,0.312342,-0.312477,0.119853,-0.492876,-0.671376,-0.202952,0.203945,-0.455869,0.352628,0.151464,...,0.054336,-0.330037,-0.886045,-0.314427,-0.322677,-0.650357,-0.361695,0.132794,-0.067464,-0.112641
VAOAD009,0.000428,-0.481251,-0.179337,0.983217,0.167836,0.067823,0.364897,-0.371942,0.545435,-0.007664,...,-0.022614,-0.334936,-0.802118,-0.92947,-0.369578,-0.566431,0.075375,-0.519189,-0.180631,-0.418351
OAD001,-0.285212,-0.556445,-0.651325,0.357509,-0.401875,-0.295787,0.137736,-0.165902,0.11217,0.096791,...,-0.923477,-0.589028,-1.535141,-0.525277,-0.278312,-0.600484,0.019657,-0.576831,-0.901479,-1.089906
VAOAD004,0.140904,1.089055,-0.668077,-1.061336,-0.370602,-0.329379,-0.654931,0.142971,-1.006689,-0.100373,...,-0.354297,0.449897,0.489189,0.413723,0.47425,0.661963,0.245553,0.27929,-0.444319,-0.912164
VAOAD011,-0.346855,1.028262,-0.450323,-0.025939,-0.223029,-0.766374,-0.53841,-1.091423,-0.328522,-0.524686,...,-0.385428,0.419748,0.429119,0.333756,0.448461,0.598846,0.390035,0.287682,-0.532141,-0.75228


In [14]:
# Microbiome testing
gt_to_otu = {}
for g in ['saliva','stool']:
    gt_to_otu[g] = {}
    df_otu = GL_to_otu[g + '_adh']['L6'].copy()

    # split on timepoint
    for t in ['post','pre']:
        df = df_otu[df_otu.index.to_series().str.contains(t)]
        gt_to_otu[g][t] = df

    pre_df = gt_to_otu[g]['pre'].copy()
    pre_df.index = pre_df.index.map(lambda x: x.split('.pre')[0])
    pre_df.index = pre_df.index.map(lambda x: x.replace('-',''))
    post_df = gt_to_otu[g]['post'].copy()
    post_df.index = post_df.index.map(lambda x: x.split('.post')[0])
    post_df.index = post_df.index.map(lambda x: x.replace('-',''))
    
    gt_to_otu[g]['diff'] = post_df - pre_df
    gt_to_otu[g]['all'] = pd.concat([gt_to_otu[g]['pre'],gt_to_otu[g]['post']])

gt_to_otu[g][t].head()

Unnamed: 0_level_0,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola_A_858004,d__Bacteria;p__Firmicutes_A;c__Clostridia_258483;o__Oscillospirales;f__Ruminococcaceae;g__Faecalibacterium,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella,d__Bacteria;p__Firmicutes_A;c__Clostridia_258483;o__Lachnospirales;f__Lachnospiraceae;g__Agathobacter_164117,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides_H,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Muribaculaceae;g__Paramuribaculum,d__Bacteria;p__Firmicutes_A;c__Clostridia_258483;o__Oscillospirales;f__Ruminococcaceae;g__Gemmiger_A_73129,d__Bacteria;p__Firmicutes_D;c__Bacilli;o__Lactobacillales;f__Streptococcaceae;g__Streptococcus,d__Bacteria;p__Firmicutes_A;c__Clostridia_258483;o__Lachnospirales;f__Lachnospiraceae;g__Acetatifactor,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Rikenellaceae;g__Alistipes_A_871400,...,d__Bacteria;p__Firmicutes_A;__;__;__;__,d__Bacteria;p__Actinobacteriota;c__Actinomycetia;o__Actinomycetales;f__Micrococcaceae;g__Nesterenkonia,d__Bacteria;p__Firmicutes_D;c__Bacilli;o__Lactobacillales;__;__,d__Bacteria;p__Firmicutes_A;c__Clostridia_258483;o__Peptostreptococcales;f__Anaerovoracaceae;g__S5-A14a,d__Bacteria;p__Firmicutes_A;c__Clostridia_258483;o__Clostridiales;f__Clostridiaceae_222000;__,d__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Azospirillales_507929;f__Azospirillaceae_507917;g__Azospirillum,d__Bacteria;p__Firmicutes_D;c__Bacilli;o__Erysipelotrichales;f__Coprobacillaceae;__,d__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Sphingomonadales;f__Sphingomonadaceae;g__Sphingomonas_L_486704,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Muribaculaceae;__,d__Bacteria;p__Actinobacteriota;__;__;__;__
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
OAD-001.pre.stool,0.022576,0.007098,0.005128,0.0,0.292202,0.001994,0.013555,0.0,0.000783,0.026849,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
OAD-003.pre.stool,0.181156,0.067844,0.150895,0.063229,0.05136,0.0556,0.012363,0.051513,0.006045,0.025918,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
OAD-004.pre.stool,0.170334,0.135468,0.0,0.002088,0.173617,0.000143,0.0,0.003649,0.008765,0.054802,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
OAD-005.pre.stool,0.040108,0.139385,0.0,0.018806,0.15529,0.037507,0.024457,0.066852,0.008969,0.018328,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
OAD-006.pre.stool,0.037219,0.167448,0.0,0.00794,0.019591,0.0,0.011006,0.001275,0.012769,0.162962,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# specific pathway profiling
g_to_path = {}

groups = ['saliva_adh', 'stool_adh']
g_to_job = {
    'saliva_adh': 'jobs07',
    'stool_adh': 'jobs08'
}
for g in groups:
    # read in csv
    df_path = pd.read_csv(path + 'outputs/' + g_to_job[g] + '/pred_metagenome_unstrat_kegg.tsv', sep ='\t', index_col=0)

    # transpose
    df_path = df_path.T
    
    # normalize by dividing by row sum
    df_path = df_path.div(df_path.sum(axis=1),axis=0)

    # save to dict
    g_to_path[g] = df_path
                                 
g_to_path['saliva_adh'].head()

KEGG_Pathway,Metabolism,Metabolism|Carbohydrate metabolism,Metabolism|Carbohydrate metabolism|Glycolysis / Gluconeogenesis,Metabolism|Carbohydrate metabolism|Pyruvate metabolism,Metabolism|Lipid metabolism,Metabolism|Lipid metabolism|Fatty acid degradation,Metabolism|Amino acid metabolism,Metabolism|Amino acid metabolism|Tyrosine metabolism,Metabolism|Metabolism of cofactors and vitamins,Metabolism|Metabolism of cofactors and vitamins|Retinol metabolism,...,Metabolism|Biosynthesis of other secondary metabolites|Staurosporine biosynthesis,Metabolism|Metabolism of terpenoids and polyketides|Type I polyketide structures,"Metabolism|Metabolism of terpenoids and polyketides|Biosynthesis of 12-, 14- and 16-membered macrolides",Cellular Processes|Cellular community - eukaryotes|Signaling pathways regulating pluripotency of stem cells,Environmental Information Processing|Signal transduction|Plant hormone signal transduction,Metabolism|Metabolism of terpenoids and polyketides|Biosynthesis of enediyne antibiotics,Metabolism|Xenobiotics biodegradation and metabolism|Furfural degradation,Metabolism|Biosynthesis of other secondary metabolites|Biosynthesis of various alkaloids,Organismal Systems|Endocrine system|Relaxin signaling pathway,Human Diseases|Endocrine and metabolic disease|AGE-RAGE signaling pathway in diabetic complications
OAD-001.post.saliva,0.155479,0.025863,0.001866,0.003177,0.008013,0.000273,0.026699,0.000547,0.031793,4.6e-05,...,2.433861e-07,0.0,0.0,0.0,0.0,0.0,0.0,1.216931e-07,0.0,0.0
OAD-001.pre.saliva,0.159679,0.02706,0.001949,0.003082,0.007239,0.000166,0.02747,0.000309,0.034357,4.5e-05,...,1.911111e-06,0.0,0.0,0.0,0.0,0.0,0.0,1.53915e-07,0.0,0.0
OAD-003.post.saliva,0.151338,0.027415,0.002223,0.002702,0.009135,0.00018,0.021911,0.000261,0.030358,4.2e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
OAD-003.pre.saliva,0.152707,0.026974,0.002172,0.002788,0.008887,0.000176,0.022846,0.000257,0.031226,4.1e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
OAD-004.post.saliva,0.159885,0.027003,0.001989,0.002953,0.007319,0.000132,0.026641,0.0002,0.034499,3.8e-05,...,6.002574e-07,0.0,0.0,0.0,0.0,0.0,8.594595e-07,0.0,0.0,0.0


In [16]:
# Pathway testing
gt_to_path = {}
for g in ['saliva','stool']:
    gt_to_path[g] = {}
    df_path = g_to_path[g + '_adh'].copy()

    # split on timepoint
    for t in ['post','pre']:
        df = df_path[df_path.index.to_series().str.contains(t)]
        gt_to_path[g][t] = df

    pre_df = gt_to_path[g]['pre'].copy()
    pre_df.index = pre_df.index.map(lambda x: x.split('.pre')[0])
    pre_df.index = pre_df.index.map(lambda x: x.replace('-',''))
    post_df = gt_to_path[g]['post'].copy()
    post_df.index = post_df.index.map(lambda x: x.split('.post')[0])
    post_df.index = post_df.index.map(lambda x: x.replace('-',''))
    
    gt_to_path[g]['diff'] = post_df - pre_df
    gt_to_path[g]['all'] = pd.concat([gt_to_path[g]['pre'],gt_to_path[g]['post']])

gt_to_path[g][t].head()

KEGG_Pathway,Metabolism,Metabolism|Carbohydrate metabolism,Metabolism|Carbohydrate metabolism|Glycolysis / Gluconeogenesis,Metabolism|Carbohydrate metabolism|Pyruvate metabolism,Metabolism|Lipid metabolism,Metabolism|Lipid metabolism|Fatty acid degradation,Metabolism|Amino acid metabolism,Metabolism|Amino acid metabolism|Tyrosine metabolism,Metabolism|Metabolism of cofactors and vitamins,Metabolism|Metabolism of cofactors and vitamins|Retinol metabolism,...,Human Diseases|Infectious disease: bacterial|Bacterial invasion of epithelial cells,Human Diseases|Infectious disease: bacterial|Yersinia infection,Metabolism|Biosynthesis of other secondary metabolites|Staurosporine biosynthesis,Cellular Processes|Cellular community - eukaryotes|Signaling pathways regulating pluripotency of stem cells,Environmental Information Processing|Signal transduction|Plant hormone signal transduction,Metabolism|Metabolism of terpenoids and polyketides|Type I polyketide structures,Metabolism|Xenobiotics biodegradation and metabolism|Furfural degradation,Environmental Information Processing|Signal transduction|Rap1 signaling pathway,Metabolism|Metabolism of terpenoids and polyketides|Tetracycline biosynthesis,Metabolism|Biosynthesis of other secondary metabolites|Biosynthesis of various alkaloids
OAD-001.pre.stool,0.159285,0.035029,0.00232,0.003018,0.007937,0.000192,0.027626,0.000229,0.026349,1.2e-05,...,8e-06,6.262338e-06,0.0,1.063603e-06,0.0,0.0,1.510913e-06,0.0,0.0,0.0
OAD-003.pre.stool,0.161049,0.033134,0.002184,0.002538,0.008606,0.00018,0.029092,0.000194,0.027987,1.9e-05,...,1.1e-05,2.013513e-07,0.0,0.0,0.0,0.0,8.511851e-07,0.0,0.0,0.0
OAD-004.pre.stool,0.16176,0.034016,0.002254,0.002773,0.008658,0.000157,0.029185,0.000173,0.027522,1.6e-05,...,1e-06,7.082122e-07,0.0,0.0,3.183607e-07,0.0,6.100744e-07,0.0,0.0,0.0
OAD-005.pre.stool,0.158015,0.033087,0.002185,0.002702,0.008593,0.000186,0.028749,0.000216,0.027538,1.6e-05,...,1.4e-05,1.593033e-06,1.103744e-07,3.679146e-08,0.0,0.0,5.29797e-07,0.0,0.0,0.0
OAD-006.pre.stool,0.157791,0.03221,0.002112,0.002819,0.008727,0.000199,0.030094,0.00022,0.026678,1.3e-05,...,2e-06,0.0,0.0,0.0,0.0,0.0,1.127712e-07,0.0,0.0,0.0


In [17]:
# complete block for paired testing in r v nr and non paired testing pre and post

dcgr_to_sig = {}

for dt in ['taxa', 'path']:
    print(dt)
    dcgr_to_sig[dt] = {}
    for c in ['pvp', 'rvnr']: # pvp is pre AND post, comparing rvnr at pre and post time points
        print(c)
        dcgr_to_sig[dt][c] = {}        
    
        # define df arrays
        xs = []
        gs = []
        ws = []
        ps = []
        os = []
        dirs = []
        adj_rs = []
        
        # define analysis specific params
        if c == 'pvp':
            outer_loop = ['pre', 'post']
            comp = ['Response', 'No response']
            var = 'WOMAC_P_Response'
            drop = 'Timepoint'
        elif c == 'rvnr': # pvp is pre and post, comparing rvnr at pre and posttime points
            outer_loop = ['Response', 'No response']
            comp = ['pre', 'post']
            var = 'Timepoint'
            drop = 'WOMAC_P_Response'

        # for each sample type
        for g in ['saliva','stool']:
            print(g)
            dcgr_to_sig[dt][c][g] = {}
            
            # grab relevant table table
            if dt == 'taxa':
                df_abun = gt_to_otu[g]['all']
            elif dt == 'path':
                df_abun = gt_to_path[g]['all']

            df_map = type_to_df_map[g].copy()
            df_map = df_map.set_index('Together')
            if g == 'stool':
                df_map.index = df_map.index.map(lambda x: x.split('.guma')[0])
            df_map = df_map[['Timepoint','WOMAC_P_Response']]

            for r in outer_loop:
                df_merge = pd.concat([df_abun, df_map[df_map[drop] == r]], axis=1)
                df_merge = df_merge.dropna()
                df_merge['SampleID'] = df_merge.index.map(lambda x: x.split('.p')[0])
                if c == 'rvnr':
                    df_vc = df_merge['SampleID'].value_counts()
                    keep = df_vc[df_vc == 2].index.values
                    df_merge = df_merge[df_merge['SampleID'].isin(keep)] 
                df_merge = df_merge.drop(['SampleID',drop],axis=1)
                sig = []
                for x in list(df_merge.columns.values)[:-1]:
                    df_a = df_merge[df_merge[var] == comp[0]][x]
                    df_b = df_merge[df_merge[var] == comp[1]][x]

                    try:
                        if c == 'rvnr':
                            w,p = scipy.stats.wilcoxon(df_a.values, df_b.values, nan_policy='omit')
                        elif c == 'pvp':
                            w,p = scipy.stats.mannwhitneyu(df_a.values, df_b.values, nan_policy='omit')

                        a = df_a.values
                        b = df_b.values
                        # if median in A is greater
                        abr, abp = scipy.stats.mannwhitneyu(a,b,nan_policy='omit')
                        # if median in B is greater
                        bar, bap = scipy.stats.mannwhitneyu(b,a,nan_policy='omit')
                        if abr > bar:
                            dirs.append(comp[0])
                            adj_rs.append(abr)
                        elif abr < bar:
                            dirs.append(comp[1])
                            adj_rs.append(bar * -1)
                        else:
                            dirs.append('same')
                            adj_rs.append(abr)

                    except:
                        w,p = 0, 1
                        dirs.append(np.nan)
                        adj_rs.append(np.nan)

                    os.append(r)
                    xs.append(x)
                    gs.append(g)
                    ws.append(w)
                    ps.append(p)

                    if p < 0.05:
                        ax = sns.boxplot(data=df_merge, y=x, x=var)
                        sns.swarmplot(data=df_merge, y=x, x=var, palette='dark:grey')
                        sns.despine()
                    
                        plt.tight_layout()
                        plt.savefig(path + 'outputs/jobs06/' + c + '_' + dt + '_barplot_' + g + '_' + r + '_' + x.split('g__')[-1] + '.pdf')
                        plt.close() 
                        sig.append(x)
                print('Number of sig var')
                print(len(sig))
                dcgr_to_sig[dt][c][g][r] = sig
                
        df_res = pd.DataFrame({'SampleType':gs, # saliva v stool
                      'fixed': os,
                      'var': xs, # example pathway or taxa
                      'direction': dirs, # direction of enrichment
                      'Statistic': ws, 
                      'pvalue': ps,
                      'adjusted': adj_rs                      
                     })
        df_res[df_res['pvalue'] < 0.05].to_csv(path + 'outputs/jobs06/' + dt + '_' + c + '.tsv', sep='\t')
df_merge.head()


taxa
pvp
saliva
Number of sig var
2
Number of sig var
5
stool
Number of sig var
6
Number of sig var
4
rvnr
saliva
Number of sig var
9
Number of sig var
0
stool
Number of sig var
6
Number of sig var
0
path
pvp
saliva
Number of sig var
3
Number of sig var
6
stool
Number of sig var
6
Number of sig var
11
rvnr
saliva
Number of sig var
32
Number of sig var
0
stool
Number of sig var
22
Number of sig var
0


Unnamed: 0,Metabolism,Metabolism|Carbohydrate metabolism,Metabolism|Carbohydrate metabolism|Glycolysis / Gluconeogenesis,Metabolism|Carbohydrate metabolism|Pyruvate metabolism,Metabolism|Lipid metabolism,Metabolism|Lipid metabolism|Fatty acid degradation,Metabolism|Amino acid metabolism,Metabolism|Amino acid metabolism|Tyrosine metabolism,Metabolism|Metabolism of cofactors and vitamins,Metabolism|Metabolism of cofactors and vitamins|Retinol metabolism,...,Human Diseases|Infectious disease: bacterial|Yersinia infection,Metabolism|Biosynthesis of other secondary metabolites|Staurosporine biosynthesis,Cellular Processes|Cellular community - eukaryotes|Signaling pathways regulating pluripotency of stem cells,Environmental Information Processing|Signal transduction|Plant hormone signal transduction,Metabolism|Metabolism of terpenoids and polyketides|Type I polyketide structures,Metabolism|Xenobiotics biodegradation and metabolism|Furfural degradation,Environmental Information Processing|Signal transduction|Rap1 signaling pathway,Metabolism|Metabolism of terpenoids and polyketides|Tetracycline biosynthesis,Metabolism|Biosynthesis of other secondary metabolites|Biosynthesis of various alkaloids,Timepoint
OAD-001.pre.stool,0.159285,0.035029,0.00232,0.003018,0.007937,0.000192,0.027626,0.000229,0.026349,1.2e-05,...,6.262338e-06,0.0,1e-06,0.0,0.0,1.510913e-06,0.0,0.0,0.0,pre
OAD-003.pre.stool,0.161049,0.033134,0.002184,0.002538,0.008606,0.00018,0.029092,0.000194,0.027987,1.9e-05,...,2.013513e-07,0.0,0.0,0.0,0.0,8.511851e-07,0.0,0.0,0.0,pre
OAD-004.pre.stool,0.16176,0.034016,0.002254,0.002773,0.008658,0.000157,0.029185,0.000173,0.027522,1.6e-05,...,7.082122e-07,0.0,0.0,3.183607e-07,0.0,6.100744e-07,0.0,0.0,0.0,pre
VAOAD-001.pre.stool,0.163414,0.036621,0.002214,0.002779,0.008978,0.000195,0.028735,0.000203,0.026849,2.7e-05,...,1.303797e-07,0.0,0.0,0.0,0.0,1.0265e-06,0.0,0.0,1.425694e-07,pre
VAOAD-012.pre.stool,0.164662,0.036528,0.002363,0.002908,0.009032,0.00017,0.028994,0.000194,0.027414,1.3e-05,...,0.0,0.0,0.0,3.141516e-07,0.0,3.499616e-07,0.0,0.0,0.0,pre


In [83]:
# horizontal barplot for features

d_to_df = {
    'meta': gt_to_meta,
    'otu': gt_to_otu,
    'path': gt_to_path
}
d_to_label = {
    'meta': 'Metabolite',
    'otu': 'Taxa',
    'path': 'Pathway'
}
t = 'all'

gd_to_res = {}
for g in ['stool','saliva']:
    print(g)
    gd_to_res[g] = {}
    for d in ['meta','otu','path']:
        print(d)
        # grab original dfs
        df_feat = d_to_df[d][g][t]

        df_map = type_to_df_map[g].set_index('Together')
        df_map.index = df_map.index.map(lambda x: x.split('.guma')[0])
        
        df_map = df_map[['Timepoint','WOMAC_P_Response','HostSubjectId']]
        
        # merge df
        df_merge = pd.concat([df_feat, df_map],axis=1)
        df_merge = df_merge.dropna()
        
        # do paired test
        # grab all sample IDs that are duplicated
        df_vc = df_merge['HostSubjectId'].value_counts() == 2
        df_vc = df_vc[df_vc == True]
        keep_id = df_vc.index.values
        
        df_merge_complete = df_merge[df_merge['HostSubjectId'].isin(keep_id)]

        for keep in [['Response','No response'],['Response'],['No response']]:
            df_merge = df_merge_complete[df_merge_complete['WOMAC_P_Response'].isin(keep)]
        
            # samples are in order when subsetted on Timepoint
            df_pre = df_merge[df_merge['Timepoint'] == 'pre'].drop(['Timepoint','WOMAC_P_Response','HostSubjectId'], axis=1)
            df_post = df_merge[df_merge['Timepoint'] == 'post'].drop(['Timepoint','WOMAC_P_Response','HostSubjectId'], axis=1)
            
            # compute rank sum
            feat = []
            rs = []
            adj_rs = []
            ps = []
            dir = []
            for x in df_pre.columns.values:
                try:
                    r, p = scipy.stats.wilcoxon(df_post[x].values-df_pre[x].values,nan_policy='omit')
                except ValueError:
                    r, p = 0, 1
                if d == 'otu':
                    feat.append(x.split('__')[-1])
                elif d == 'path':
                    feat.append(x.split('|')[-1])
                else:
                    feat.append(x)
                # rs.append(r)
                ps.append(p)
                # determine direction via MWU
                a = df_post[x].values
                b = df_pre[x].values
                abr, abp = scipy.stats.mannwhitneyu(a,b,nan_policy='omit')
                bar, bap = scipy.stats.mannwhitneyu(b,a,nan_policy='omit')
                if abr > bar:
                    dir.append('post')
                    adj_rs.append(abr)
                    rs.append(bar)
                elif abr < bar:
                    dir.append('pre')
                    adj_rs.append(bar * -1)
                    rs.append(bar)
                else:
                    dir.append('same')
                    adj_rs.append(abr)
                    rs.append(abr)
            # compile results df
            df_results = pd.DataFrame({d: feat, 'test_stat': rs, 'adj_test_stat': adj_rs,'pvals': ps, 'direction': dir})
            gd_to_res[g][d] = df_results
            df = df_results[df_results['pvals'] < 0.05]
            df = df.sort_values(by=['adj_test_stat'],ascending=False)
            # plot
            # create plot
            try:
                plt.figure(figsize=(8,6))
                sns.set_theme(style="whitegrid")
                
                sns.set_color_codes("pastel")
                sns.barplot(x='adj_test_stat', y=d, data=df,
                            hue="direction",orient='horizontal')#,native_scale=True)#width=1)
                
                ax.set_ylabel(d_to_label[d],fontsize=16)
                ax.set_xlabel("Effect Size / Adjusted Test Statistic",fontsize=16)
                ax.tick_params(labelsize=16)
                sns.despine(left=True, bottom=True)
                plt.tight_layout()
                plt.title(g + '_' + d + '_' + str(keep))
                plt.savefig(path + 'outputs/jobs18/DA_hbarplot_' + g + '_' + d + '_' + str(keep) + '.pdf')
                plt.close()
                df.to_csv(path + 'outputs/jobs18/' + g + '_' + d + '_' + str(keep) + '.tsv', sep='\t')
            
            except: 
                print('no plot for :' + g + '_' + d + '_' + str(keep))
df.head()

stool
meta
otu
no plot for :stool_otu_['No response']
path
no plot for :stool_path_['No response']
saliva
meta
no plot for :saliva_meta_['No response']
otu
no plot for :saliva_otu_['No response']
path
no plot for :saliva_path_['No response']


Unnamed: 0,path,test_stat,adj_test_stat,pvals,direction


<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

In [101]:
# compute mediation
for g in ['stool','saliva']:
    df_M = pd.read_csv(path + 'inputs/cutie_df_meta_' + g + '.tsv', sep='\t', index_col=0)
    df_M.iloc[:,:-1].to_csv(path + 'inputs/mediation_meta_' + g + '_post_test.txt', sep='\t')
    
    df_Y = df_meta_paired[['VAS_Pt_diff', 'WOMAC_pain_diff']]
    df_Y.to_csv(path + 'inputs/mediation_WMdiff_test.txt', sep='\t')
    
    df_X = pd.read_csv(path + 'inputs/cutie_df_taxa_' + g + '.tsv', sep='\t', index_col=0)
    df_X.index = df_X.index.map(lambda x: x.split('.post')[0].replace('-',''))
    df_X.to_csv(path + 'inputs/mediation_taxa_' + g + '_post_test.txt', sep='\t')
df_X.head()

Unnamed: 0_level_0,d__Bacteria;p__Firmicutes_D;c__Bacilli;o__Lactobacillales;f__Streptococcaceae;g__Streptococcus,d__Bacteria;p__Firmicutes_C;c__Negativicutes;o__Veillonellales;f__Veillonellaceae;g__Veillonella_A,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales_A_737866;f__Pasteurellaceae;g__Haemophilus_D_735815,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Burkholderiales_597441;f__Neisseriaceae_563222;g__Neisseria_563205,d__Bacteria;p__Actinobacteriota;c__Actinomycetia;o__Actinomycetales;f__Micrococcaceae;g__Rothia,d__Bacteria;p__Fusobacteriota;c__Fusobacteriia;o__Fusobacteriales_993521;f__Fusobacteriaceae_993521;g__Fusobacterium_C,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Porphyromonadaceae;g__Porphyromonas_A_859423,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Alloprevotella,d__Bacteria;p__Actinobacteriota;c__Actinomycetia;o__Actinomycetales;f__Actinomycetaceae;g__Pauljensenia,...,d__Bacteria;p__Firmicutes_D;c__Bacilli;o__Lactobacillales;f__Lactobacillaceae;__,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Steroidobacterales;f__Steroidobacteraceae;g__,d__Bacteria;p__Planctomycetota;c__Planctomycetia;o__Isosphaerales;f__Isosphaeraceae;g__Tautonia,d__Bacteria;p__Planctomycetota;c__Planctomycetia;o__Planctomycetales;f__Planctomycetaceae;__,d__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Sphingomonadales;f__Kordiimonadaceae_482971;g__Kordiimonas_482971,d__Bacteria;p__Actinobacteriota;c__Thermoleophilia;o__Gaiellales;f__Gaiellaceae;g__GMQP-bins7,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales_A_737866;f__Vibrionaceae;g__Photobacterium,d__Bacteria;p__Actinobacteriota;c__Actinomycetia;o__Nitriliruptorales;f__Nitriliruptoraceae;__,d__Bacteria;p__Desulfobacterota_I;c__Desulfovibrionia;o__Desulfovibrionales;f__Desulfovibrionaceae;g__Desulfovibrio_R_446353,d__Archaea;p__Methanobacteriota_A_1229;c__Methanobacteria;o__Methanobacteriales;f__Methanobacteriaceae;g__Methanobrevibacter_A
SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
OAD001,0.095006,0.106939,0.061089,0.036252,0.224384,0.09251,0.047957,0.001719,0.004767,0.00454,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
OAD003,0.321228,0.082309,0.229731,0.201108,0.016364,0.010622,0.012517,0.037379,0.021475,0.004766,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
OAD004,0.291991,0.316705,0.027804,0.130865,0.00666,0.012129,0.023296,0.010813,0.021043,0.011446,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
OAD005,0.189082,0.179829,0.053909,0.248901,0.003899,0.100204,0.004054,0.0,0.007489,0.031225,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
OAD006,0.164564,0.037695,0.112056,0.089802,0.116883,0.007145,0.01483,0.0466,0.150343,0.009637,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [100]:
# Mediation analyses
# check for int_ba
int_ba = {'stool': ['L_methionine', 'Pentahydroxylated_bile_acid', 'Dihydroxylated_bile_acid',
                    'Trihydroxylated_bile_acid', 'Glycocholic_acid', 'Glycocholate',
                    'Glycocholic_acid', 'Tetrahydroxylated_bile_acid', 'Glycochenodeoxycholate',
                    'Deoxycholate', 'Hydroxyhexadecanoyl_lysine', 'Deoxycholic_acid'
                    'Trihydroxycholestanoic_acid', 'Phenylalanine_conjugated_chenodeoxycholic_acid', 'Nonhydroxylated_bile_acid',
                    'Monohydroxylated_bile_acid'],
          'saliva': ['Trihydroxylated_bile_acid', 'Palmitoylcarnitine', 'Palmitoyl_ethanolamide']
         }

# filter on CUTIE results
g_to_cutiejob = {
    'saliva': 'jobs15',
    'stool': 'jobs16'
}

# figure out cutie plots
#X = list(df_X.columns.values)
#M = list(df_M.columns.values)
#x,m = df.iloc[0,0:2].values
#X.index(x), M.index(m)

# define outcome
y = 'WOMAC_pain_diff'



for g in ['stool']: #'saliva']
    # import mediation results
    df_med = pd.read_csv(path + 'outputs/jobs19/20240913/outputs/mediation_taxa_' + g + '_post_mediation_meta_' + g + '_post_mediation_WMdiff_test/mediation_results_all.tsv', 
                         sep='\t')

    for t in ['partial','complete']:
        # copy df
        df = df_med.copy()
        
        # filter on significant total
        df = df[df['total_effect_p'] < 0.05]

        if t == 'complete':
            # find complete mediations
            df = df[df['direct_effect_p'] > 0.05]

        elif t == 'partial':
            # find partial mediations
            df = df[df['direct_effect_p'] < 0.05]
        
        # check how many indirect; this reduces it by alot again
        df = df[df['indirect_effect_p'] < 0.05]

        # create column for metabolites
        df = df[df['M'].isin(['H_' + x for x in int_ba[g]])]
        
        # grab CUTIE results
        df_qt = pd.read_csv(path + 'outputs/' + g_to_cutiejob[g] + '/data_processing/summary_df_resample_1.txt', sep='\t')
        df_qt = df_qt[df_qt['class'] == 'TP']

        for r in range(len(df)):
            x,m = df.iloc[r,0:2].values
            
            # check if it's in CUTIE
            df_q = df_qt.copy()
            # df_q1 = df_q[df_q['var1'] == m]
            df_q2 = df_q[df_q['var2'] == x]
        
            if len(df_q2) > 0: # len(df_q1) > 0 or len(df_q2) > 0:
                # print('plotting ' + m + ' and ' + x)
                # create plotting df
                df_plot = pd.concat([df_X,df_M,df_Y],axis=1)
                # path + 'outputs/jobs19/plots/'
        
                plt.figure(figsize=(8,6))
                sns.set_theme(style="whitegrid")
                # sns.scatterplot(data=df_plot,x=x,y=m,hue=y)
                sns.scatterplot(data=df_plot,x=x,y=y,hue=m,palette='coolwarm')#,size=6)
                plt.tight_layout()
                sns.despine()
                plt.savefig(path + 'outputs/jobs19/plots/' + g + '/' + x.split(';g__')[-1] + '_' + m.replace('/','fslash') + '_' + t + '_MX.pdf')
                plt.close()
        
                plt.figure(figsize=(8,6))
                sns.set_theme(style="whitegrid")        
                sns.scatterplot(data=df_plot,x=x,y=m,hue=m,palette='coolwarm')#,size=6)
                sns.despine()
                plt.tight_layout()
                plt.savefig(path + 'outputs/jobs19/plots/' + g + '/' + x.split(';g__')[-1] + '_' + m.replace('/','fslash') + '_' + t + '_YMX.pdf')
                plt.close()
        
df.head()


Unnamed: 0,X,M,Y,split_value,total_effect,total_effect_p,total_effect_ci_lower,total_effect_ci_upper,direct_effect,direct_effect_p,...,b2_effect_ci_lower,b2_effect_ci_upper,b3_effect,b3_effect_p,b3_effect_ci_lower,b3_effect_ci_upper,X_sparsity,M_sparsity,Y_sparsity,n
21648,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,H_Phenylalanine_conjugated_chenodeoxycholic_acid,VAS_Pt_diff,split,-45.118556,0.02,-70.120374,0.0,-28.694709,0.14,...,-28.715465,13.258995,-28.694709,0.593195,-144.560389,87.17097,0.923077,0,0.0,13
21737,d__Bacteria;p__Firmicutes_A;c__Clostridia_2584...,H_Phenylalanine_conjugated_chenodeoxycholic_acid,VAS_Pt_diff,split,-1965.94974,0.04,-4451.959464,0.0,1174.776186,0.46,...,-2946.998999,449.707653,1174.776186,0.818154,-9913.964569,12263.516941,0.923077,0,0.0,13
21847,d__Bacteria;p__Firmicutes_A;c__Clostridia_2584...,H_Phenylalanine_conjugated_chenodeoxycholic_acid,VAS_Pt_diff,split,5630.002193,0.02,0.0,9494.0404,3768.546747,0.12,...,-2218.748288,3919.104982,3768.546747,0.623249,-12798.21432,20335.307813,0.923077,0,0.0,13
21857,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,H_Phenylalanine_conjugated_chenodeoxycholic_acid,VAS_Pt_diff,split,-4597.425287,0.0,-6509.363395,0.0,-2923.891944,0.34,...,-2926.006824,1351.045875,-2923.891944,0.593195,-14730.205112,8882.421223,0.923077,0,0.0,13
21902,d__Bacteria;p__Firmicutes_A;c__Clostridia_2584...,H_Phenylalanine_conjugated_chenodeoxycholic_acid,VAS_Pt_diff,split,-33331.333333,0.0,-51375.872254,0.0,-21198.216597,0.16,...,-21213.549476,9795.082591,-21198.216597,0.593195,-106793.987059,64397.553864,0.923077,0,0.0,13


In [17]:
# correlations of taxa of interest with metabolites
gtr_to_otucorr = {}
for g in ['saliva','stool']:
    print(g)
    gtr_to_otucorr[g] = {}
    for time in ['diff','all']:
        print(time)
        gtr_to_otucorr[g][time] = {}
        # grab otu table of differences or of all samples
        df_otu = gt_to_otu[g][time]
        for comp in ['horizontal', 'vertical']:
            if comp == 'vertical':
                gr_to_sigotu = to_sigotu[comp]
                rvs = ['pre', 'post']
            elif comp == 'horizontal':
                gr_to_sigotu = to_sigotu[comp]
                rvs = ['Response', 'No response']
            for rv in rvs: # ['pre', 'post']: # Response No response
                print(rv)
                sigcorr = []
                # grab taxa of interest
                taxa_int = gr_to_sigotu[g][rv]
    
                # merge otu table with metabolites            
                df_merge = pd.concat([df_otu.loc[:,taxa_int], gt_to_meta[g][time]], axis=1)
                meta = gt_to_meta[g][time].columns
                
                # look at taxa int and correlate with metabolites
                # df_merge 
                for t in taxa_int:
                    for m in meta:
                        df_sub = df_merge.loc[:,[t,m]]
                        df = type_to_df_map[g].copy()
                        if time is 'all':
                            df = df.set_index('Together')
                            if g == 'stool':
                                df.index = df.index.map(lambda x: x.split('.guma')[0])
                        elif time is 'diff':
                            df = df[df['Timepoint'] == 'pre']
                        df_sub = pd.concat([df_sub, df['WOMAC_P_Response']],axis=1)
                        df_sub = df_sub.dropna(subset=[t,m])
                        r, p = scipy.stats.spearmanr(df_sub[t].values, df_sub[m].values)
                        if p < 0.05:
                            sigcorr.append([t,m])
                            sns.scatterplot(data=df_sub, x=t, y=m, hue='WOMAC_P_Response')
                            sns.despine()
                        
                            plt.tight_layout()
                            plt.savefig(path + 'outputs/' + g_to_job[g + '_adh'] + '/corr_taxa_' + comp + '_' + time + '_' + t.split(';g__')[-1] + '_' + m.replace('/','fslash') + '.pdf')
                            plt.close()   
                gtr_to_otucorr[g][time][rv] = sigcorr
                print(len(sigcorr))
           


saliva
diff


NameError: name 'to_sigotu' is not defined

In [None]:
# correlations of pathways of interest with metabolites
gtr_to_pathcorr = {}
for g in ['saliva','stool']:
    print(g)
    gtr_to_pathcorr[g] = {}
    for time in ['diff','all']:
        print(time)
        gtr_to_pathcorr[g][time] = {}
        # grab otu table of differences or of all samples
        df_path = gt_to_path[g][time]
        for comp in ['horizontal', 'vertical']:
            if comp == 'vertical':
                gr_to_sigpath = to_sigpath[comp]
                rvs = ['pre', 'post']
            elif comp == 'horizontal':
                gr_to_sigpath = to_sigpath[comp]
                rvs = ['Response', 'No response']
            for rv in rvs: # ['pre', 'post']: # Response No response
                print(rv)
                sigcorr = []
                # grab taxa of interest
                taxa_int = gr_to_sigpath[g][rv]
    
                # merge otu table with metabolites            
                df_merge = pd.concat([df_path.loc[:,taxa_int], gt_to_meta[g][time]], axis=1)
                meta = gt_to_meta[g][time].columns
                
                # look at taxa int and correlate with metabolites
                # df_merge 
                for t in taxa_int:
                    for m in meta:
                        df_sub = df_merge.loc[:,[t,m]]
                        df = type_to_df_map[g].copy()
                        if time is 'all':
                            df = df.set_index('Together')
                            if g == 'stool':
                                df.index = df.index.map(lambda x: x.split('.guma')[0])
                        elif time is 'diff':
                            df = df[df['Timepoint'] == 'pre']
                        df_sub = pd.concat([df_sub, df['WOMAC_P_Response']],axis=1)
                        df_sub = df_sub.dropna(subset=[t,m])
                        r, p = scipy.stats.spearmanr(df_sub[t].values, df_sub[m].values)
                        if p < 0.05:
                            sigcorr.append([t,m])
                            sns.scatterplot(data=df_sub, x=t, y=m, hue='WOMAC_P_Response')
                            sns.despine()
                        
                            plt.tight_layout()
                            plt.savefig(path + 'outputs/' + g_to_job[g + '_adh'] + '/corr_path_' + comp + '_' + time + '_' + t.split(';g__')[-1] + '_' + m.replace('/','fslash') + '.pdf')
                            plt.close()   
                gtr_to_pathcorr[g][time][rv] = sigcorr
                print(len(sigcorr))



In [None]:
# L Methionine
# Kaempferol
# Tryptophan
# Tyrosine Stool
# Usnic acid
# Xanthurenic acid
# Tryhydroxycholestenoic acid Stool
# Isofraxadine
int = ['Isofrax', 'Xanth', 'ydroxychol', 'Usnic', 'Tyrosine', 'Tryptophan', 'Kaempferol', 'Methion']

for g in ['stool','saliva']:
    for time in ['diff','all']:
        for r in ['Response', 'No response']:
            test = gtr_to_pathcorr[g][time][r]
            for t in test:
                p, m = t # unpack pathway, metabolite, from tuple representing sig corr
                for i in int:
                    if i in m:
                        print(t)
    

In [None]:
int = ['Isofrax', 'Xanth', 'ydroxychol', 'Usnic', 'Tyrosine', 'Tryptophan', 'Kaempferol', 'Methion']

for g in ['stool','saliva']:
    for time in ['diff','all']:
        for r in ['Response', 'No response']:
            test = gtr_to_otucorr[g][time][r]
            for t in test:
                p, m = t # unpack pathway, metabolite, from tuple representing sig corr
                for i in int:
                    if i in m:
                        print(t)
    

In [46]:
# CUTIE
# In POST samples only
# Color by Resp and Nonresp

for g in ['saliva', 'stool']:

    df_taxa = gt_to_otu[g]['post']
    df_taxa.to_csv(path + 'inputs/cutie_df_taxa_' + g + '.tsv', sep='\t', index_label='SampleID')
    
    df_abun = pd.concat([gt_to_otu[g]['post'], gt_to_path[g]['post']],axis=1)
    df_abun.index = df_abun.index.map(lambda x: x.split('.post')[0].replace('-',''))
    
    df_map = type_to_df_map[g].set_index('Together')
    df_map = df_map[df_map['Timepoint'] == 'post']['WOMAC_P_Response']
    df_map.index = df_map.index.map(lambda x: x.split('.post')[0].replace('-',''))
    
    df_meta = pd.concat([gt_to_meta[g]['After diet'],df_map], axis=1)

    df_abun.to_csv(path + 'inputs/cutie_df_abun_' + g + '.tsv', sep='\t', index_label='SampleID')
    df_meta.to_csv(path + 'inputs/cutie_df_meta_' + g + '.tsv', sep='\t', index_label='SampleID')

df_meta.head()

Unnamed: 0,H_Pyridoxamine,H_Thiamine,H_Melezitose,H_Phosphocholine,H_SN_Glycero_3_Phophocholine,H_Gamma_Valerobetaine,H_N_Acetylneuraminic_acid,H_Pyridoxine,H_N_Acetylneuraminic_Acid,H_N_Acetylmuramic_Acid,...,H_Hydroxydodecanoic_acid,H_Omega_Hydroxydodecanoate,H_Delta_Methyldodecenoic_Acid,H_Hydroxydecanoic_acid,H_Hydroxydecanoate,H_Methylpentanoic_acid,H_Monohydroxylated_bile_acid,H_Palmitoyl_ethanolamide,H_Linoleic_Acid,WOMAC_P_Response
VAOAD001,0.312342,-0.312477,0.119853,-0.492876,-0.671376,-0.202952,0.203945,-0.455869,0.352628,0.151464,...,-0.330037,-0.886045,-0.314427,-0.322677,-0.650357,-0.361695,0.132794,-0.067464,-0.112641,No response
VAOAD009,0.000428,-0.481251,-0.179337,0.983217,0.167836,0.067823,0.364897,-0.371942,0.545435,-0.007664,...,-0.334936,-0.802118,-0.92947,-0.369578,-0.566431,0.075375,-0.519189,-0.180631,-0.418351,No response
OAD001,-0.285212,-0.556445,-0.651325,0.357509,-0.401875,-0.295787,0.137736,-0.165902,0.11217,0.096791,...,-0.589028,-1.535141,-0.525277,-0.278312,-0.600484,0.019657,-0.576831,-0.901479,-1.089906,No response
VAOAD004,0.140904,1.089055,-0.668077,-1.061336,-0.370602,-0.329379,-0.654931,0.142971,-1.006689,-0.100373,...,0.449897,0.489189,0.413723,0.47425,0.661963,0.245553,0.27929,-0.444319,-0.912164,Response
VAOAD011,-0.346855,1.028262,-0.450323,-0.025939,-0.223029,-0.766374,-0.53841,-1.091423,-0.328522,-0.524686,...,0.419748,0.429119,0.333756,0.448461,0.598846,0.390035,0.287682,-0.532141,-0.75228,Response


In [146]:
# anaerobic vs aerobic in response
# bile acids -> list? saliva and stool of interest 
#
# Bile acids found in Stool: 
int_ba = {'stool': ['L_methionine', 'Pentahydroxylated_bile_acid', 'Dihydroxylated_bile_acid',
                    'Trihydroxylated_bile_acid', 'Glycocholic_acid', 'Glycocholate',
                    'Glycocholic_acid', 'Tetrahydroxylated_bile_acid', 'Glycochenodeoxycholate',
                    'Deoxycholate', 'Hydroxyhexadecanoyl_lysine', 'Deoxycholic_acid'
                    'Trihydroxycholestanoic_acid', 'Phenylalanine_conjugated_chenodeoxycholic_acid', 'Nonhydroxylated_bile_acid',
                    'Monohydroxylated_bile_acid'],
          'saliva': ['Trihydroxylated_bile_acid', 'Palmitoylcarnitine', 'Palmitoyl_ethanolamide']
         }

mod_ba = {
    'stool': [],
    'saliva': []
}
full = list(gt_to_meta[g]['After diet'].columns.values)
for g in ['stool','saliva']:
    ba = int_ba[g]
    for b in ba:
        for f in full:
            if b in f:
                mod_ba[g].append(f)

g_to_cutiejob = {
    'saliva': 'jobs15',
    'stool': 'jobs16'
}


# from difflib import SequenceMatcher

for g in ['stool','saliva']:
    print(g)
    n_corr = 0
    # import CUITIE results
    df_qt = pd.read_csv(path + 'outputs/' + g_to_cutiejob[g] + '/data_processing/summary_df_resample_1.txt', sep='\t')
    df_qt = df_qt[df_qt['class'] == 'TP']
    # iterate through rows
    for r in range(len(df_qt)):
        meta = df_qt.iloc[r,0]
        abun = df_qt.iloc[r,1]
        
        #for i in int_ba:
        #    s = SequenceMatcher(None, meta, i)
        #    if s.ratio() > 0.9:
        #        print(i)
        
        #if 'bile' in meta:
        #    print(meta)
        output = [r]
        
        for i in int_ba[g]:
            if i in meta:
                output.append(meta)

        for c in ['horizontal','vertical']:
            if c == 'vertical':
                queries = ['pre', 'post']
            if c == 'horizontal':
                queries = ['Response','No response']
            output.append(c)
            for r in queries:
                int_taxa = to_sigotu[c][g][r]
                for i in int_taxa:
                    if i in abun:
                        output.append('taxa')
                        output.append(i)
                int_path = to_sigpath[c][g][r]
                for i in int_path:
                    if i in abun:
                        output.append('path')
                        output.append(i)
        #output.append()
        #output.append()
        if len(output) > 5:
            # print(output)
            n_corr += 1

    print(n_corr)

# pathway prediction of response vs non response @ baseline
# specific taxa at baseline


# pathways of interest
# ex inflammation in non responders
# heatmap of differential pathways in responders/nonresponders 
# and correlation of those pathways with bacteria
# send list of upregulated pathways 

int_taxa

stool


NameError: name 'to_sigotu' is not defined

In [147]:
# annotations for pathways
int_ba = {'stool': ['L_methionine', 'Pentahydroxylated_bile_acid', 'Dihydroxylated_bile_acid',
                    'Trihydroxylated_bile_acid', 'Glycocholic_acid', 'Glycocholate',
                    'Glycocholic_acid', 'Tetrahydroxylated_bile_acid', 'Glycochenodeoxycholate',
                    'Deoxycholate', 'Hydroxyhexadecanoyl_lysine', 'Deoxycholic_acid'
                    'Trihydroxycholestanoic_acid', 'Phenylalanine_conjugated_chenodeoxycholic_acid', 'Nonhydroxylated_bile_acid',
                    'Monohydroxylated_bile_acid'],
          'saliva': ['Trihydroxylated_bile_acid', 'Palmitoylcarnitine', 'Palmitoyl_ethanolamide']
         }

for g in ['saliva','stool']:
    # int_ba[g] for metabolite (row) annotations
    #df_map = type_to_df_map[g].set_index('Together')
    #df_map = df_map[df_map['Timepoint'] == 'post']['WOMAC_P_Response']
    #df_map.index = df_map.index.map(lambda x: x.split('.post')[0].replace('-',''))
    
    #df_meta = pd.concat([gt_to_meta[g]['After diet'],df_map], axis=1)
    df_meta = gt_to_meta[g]['After diet']
    df_meta_R = df_meta.T
    df_meta_R['var_type'] = df_meta_R.index.map(lambda x: 'BA' if x in mod_ba[g] else 'NonBA')
    df_meta_R = df_meta_R['var_type'].reset_index()
    # df_meta_R['index'] = df_meta_R['index'].str.lower()
    df_meta_R = df_meta_R.sort_values(by='index',ascending=True)
    df_meta_R.to_csv(path + 'inputs/row_' + g + '.tsv', sep='\t')
    
    # dcgr_to_sig for column (taxa) annotations
    df_abun = gt_to_otu[g]['post']
    df_abun.index = df_abun.index.map(lambda x: x.split('.post')[0].replace('-',''))
    
    df_abun_R = df_abun.T
    df_abun_R['var_type'] = df_abun_R.index.map(lambda x: 'Sig' if x in dcgr_to_sig['taxa']['rvnr'][g]['Response'] else 'Nonsig')
    df_abun_R = df_abun_R['var_type'].reset_index()
    #df_abun_R['index'] = df_abun_R['index'].str.lower()
    df_abun_R = df_abun_R.sort_values(by='index',ascending=True)
    df_abun_R.to_csv(path + 'inputs/col_' + g + '.tsv', sep='\t')

df_abun_R.head()

Unnamed: 0,index,var_type
97,d__Archaea;p__Methanobacteriota_A_1229;c__Meth...,Nonsig
199,d__Archaea;p__Methanobacteriota_A_1229;c__Meth...,Nonsig
268,d__Archaea;p__Thermoplasmatota;c__Thermoplasma...,Nonsig
11,d__Bacteria;__;__;__;__;__,Nonsig
373,d__Bacteria;p__Actinobacteriota;__;__;__;__,Nonsig


In [175]:
# play testing mediation
g = 'stool'
df_M = pd.read_csv(path + 'inputs/cutie_df_meta_stool.tsv', sep='\t', index_col=0)
df_M.iloc[:,:-1].to_csv(path + 'inputs/mediation_meta_stool_post_test.txt', sep='\t')

df_Y = df_meta_paired[['VAS_Pt_diff', 'WOMAC_pain_diff']]
df_Y.to_csv(path + 'inputs/mediation_WMdiff_test.txt', sep='\t')

df_X = pd.read_csv(path + 'inputs/cutie_df_taxa_stool.tsv', sep='\t', index_col=0)
df_X.index = df_X.index.map(lambda x: x.split('.post')[0].replace('-',''))
df_X.to_csv(path + 'inputs/mediation_taxa_stool_post_test.txt', sep='\t')
df_X.head()


Unnamed: 0_level_0,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola_A_858004,d__Bacteria;p__Firmicutes_A;c__Clostridia_258483;o__Oscillospirales;f__Ruminococcaceae;g__Faecalibacterium,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella,d__Bacteria;p__Firmicutes_A;c__Clostridia_258483;o__Lachnospirales;f__Lachnospiraceae;g__Agathobacter_164117,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides_H,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Muribaculaceae;g__Paramuribaculum,d__Bacteria;p__Firmicutes_A;c__Clostridia_258483;o__Oscillospirales;f__Ruminococcaceae;g__Gemmiger_A_73129,d__Bacteria;p__Firmicutes_D;c__Bacilli;o__Lactobacillales;f__Streptococcaceae;g__Streptococcus,d__Bacteria;p__Firmicutes_A;c__Clostridia_258483;o__Lachnospirales;f__Lachnospiraceae;g__Acetatifactor,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Rikenellaceae;g__Alistipes_A_871400,...,d__Bacteria;p__Firmicutes_A;__;__;__;__,d__Bacteria;p__Actinobacteriota;c__Actinomycetia;o__Actinomycetales;f__Micrococcaceae;g__Nesterenkonia,d__Bacteria;p__Firmicutes_D;c__Bacilli;o__Lactobacillales;__;__,d__Bacteria;p__Firmicutes_A;c__Clostridia_258483;o__Peptostreptococcales;f__Anaerovoracaceae;g__S5-A14a,d__Bacteria;p__Firmicutes_A;c__Clostridia_258483;o__Clostridiales;f__Clostridiaceae_222000;__,d__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Azospirillales_507929;f__Azospirillaceae_507917;g__Azospirillum,d__Bacteria;p__Firmicutes_D;c__Bacilli;o__Erysipelotrichales;f__Coprobacillaceae;__,d__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Sphingomonadales;f__Sphingomonadaceae;g__Sphingomonas_L_486704,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Muribaculaceae;__,d__Bacteria;p__Actinobacteriota;__;__;__;__
SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
OAD001,0.014084,0.039046,0.022027,0.001515,0.173283,0.0,0.02123,0.00016,0.152452,0.033001,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
OAD003,0.273829,0.067421,0.184572,0.026929,0.053673,0.037783,0.009619,0.023935,0.005789,0.01206,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
OAD004,0.174773,0.040785,0.0,0.001855,0.124402,0.0,0.0014,0.000371,0.014265,0.090594,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
OAD005,0.043945,0.103652,0.0,0.005956,0.135246,0.046674,0.022837,0.168961,0.001596,0.034567,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
OAD006,0.038928,0.185602,0.0,0.006453,0.020567,0.000106,0.012512,0.000831,0.010729,0.174328,...,0.0,0.0,0.0,0.0,0.0,3e-05,0.0,0.0,0.0,0.0


In [None]:
# for corr heatmaps 
for g in ['saliva','stool']:
    df_abun = pd.concat([gt_to_otu[g]['post'], gt_to_path[g]['post']],axis=1)
    df_abun.index = df_abun.index.map(lambda x: x.split('.post')[0].replace('-',''))
    
    df_map = type_to_df_map[g].set_index('Together')
    df_map = df_map[df_map['Timepoint'] == 'post']['WOMAC_P_Response']
    df_map.index = df_map.index.map(lambda x: x.split('.post')[0].replace('-',''))
    
    df_meta = pd.concat([gt_to_meta[g]['After diet'],df_map], axis=1)

    df_quant_R = df_abun.copy()
    #df_quant_R = pd.concat([df_abun, df_meta],axis=1)# .copy() #([df_quant, df_asv100, df_path_top100], axis=1)
    # df_quant_R = df_quant_R.drop(['WOMAC_P_Response'],axis=1)
    # df_quant_R.to_csv(path + 'inputs/df_quant_' + g + '.tsv', sep='\t')
    df_quant_R = df_quant_R.T
    # this is in the order they were entered i.e. 
    # [df_olink, df_fa, df_acpa_fecal, df_acpa_plasma, df_plasma, df_rbfa, df_mb]
     # ['rbfa']*len(df_rbfa.columns) + \  
    df_quant_R['var_type'] = ['Taxa']*len(gt_to_otu[g]['post'].columns) + \
                             ['Pathways']*len(gt_to_path[g]['post'].columns) # + \
                             # ['Metabolite']*len(gt_to_meta[g]['After diet'].columns)
    # df_quant_R = df_quant_R[~df_quant_R['var_type'].isin(['Metagenomic_ASVs','Metagenomic_Pathways'])]
    df_quant_R = df_quant_R['var_type'].reset_index()
    df_quant_R['index'] = df_quant_R['index'].str.lower()
    df_quant_R = df_quant_R.sort_values(by='index',ascending=True)
    df_quant_R.to_csv(path + 'inputs/df_quant_R_corr_labels_' + g + '.tsv', sep='\t')
    print(len(df_quant_R))
df_quant_R.head()

In [None]:
# 4 way paired boxplot trial
g = 'saliva'
df_alpha = g_to_dfd[g + '_adh']['alpha']

df_map = type_to_df_map[g].copy()
df_map = df_map.set_index('Together')
df_map = df_map[['Timepoint','WOMAC_P_Response']]

df_merge = pd.concat([df_alpha, df_map], axis=1)
df_merge = df_merge.dropna()

sns.boxplot(x='Timepoint', y='shannon_entropy',
            hue='WOMAC_P_Response', palette=["r", "g"],
            data=df_merge)
sns.despine()