In [1]:
# import libraries
import pandas as pd
import scipy.stats
import statsmodels.stats.multitest
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# disable warnings, use w caution
import warnings
warnings.filterwarnings('ignore')

# project specific libs
import os
import matplotlib.pyplot as plt
import pathlib

In [2]:
# project specific path
path = '/Users/KevinBu/Desktop/clemente_lab/Projects/ampaim/'

##### Meta Analyses Setup #####

In [36]:
# define replacements prior to analysis
sub_dict=  {'healthy control': 'HC',
            'healthy': 'HC',
            'Healthy': 'HC',
            'HLT': 'HC',
            'NORA':'RA',
            'SLE-G': 'SLE',
            'axial spondyloarthritis': 'axSpA',
            'sle': 'SLE',
            'ss': 'SjD',
            'psa': 'PsA',
            'pso': 'PsO',
            'nss': 'NSS',
            'pSS patients without treament': 'SjD'
           }

dx_sub_dict = {'host_disease':'Diagnosis'}

jobs = ['jobs09','jobs10','jobs12','jobs13','jobs41','jobs44','jobs45','jobs46','jobs49']


### all data
j_to_res = {
    'jobs09': {
        'Diagnosis': 'RA',
        'Cohort': 'AMPAIM',
        'LEfSe_path': '/Users/KevinBu/Desktop/clemente_lab/Projects/ampaim/outputs/jobs09/lefse_results.res'
    },
    'jobs10':{
        'Diagnosis': 'PsO',
        'Cohort': 'AMPAIM',
        'LEfSe_path': '/Users/KevinBu/Desktop/clemente_lab/Projects/ampaim/outputs/jobs10/lefse_results.res'
    },
    'jobs11':{
        'Diagnosis': 'PsA',
        'Cohort': 'AMPAIM',
        'LEfSe_path': '/Users/KevinBu/Desktop/clemente_lab/Projects/ampaim/outputs/jobs11/lefse_results.res'
    },
    'jobs12':{
        'Diagnosis': 'SjD',
        'Cohort': 'AMPAIM',
        'LEfSe_path': '/Users/KevinBu/Desktop/clemente_lab/Projects/ampaim/outputs/jobs12/lefse_results.res'
    },
    'jobs13':{
        'Diagnosis': 'SLE',
        'Cohort': 'AMPAIM',
        'LEfSe_path': '/Users/KevinBu/Desktop/clemente_lab/Projects/ampaim/outputs/jobs13/lefse_results.res'
    },
    'jobs14':{
        'Diagnosis': 'NSS',
        'Cohort': 'AMPAIM',
        'LEfSe_path': '/Users/KevinBu/Desktop/clemente_lab/Projects/ampaim/outputs/jobs14/lefse_results.res'
    },
    'jobs41':{
        'Diagnosis': 'SLE',
        'Cohort': 'Su2020',
        'LEfSe_path': path + 'outputs/jobs41/Analysis_lefse_0/results/Diagnosis/lefse_results.taxa_table_L7.Diagnosis-HC-or-SLE-G.Diagnosis.NA.tsv'
    },
    'jobs43':{
        'Diagnosis': 'AxSpA',
        'Cohort': 'Gill2022',
        'LEfSe_path': path + 'outputs/jobs43/Analysis_lefse_0/results/Diagnosis/lefse_results.taxa_table_L7.Diagnosis.NA.tsv'
    },
    'jobs44':{
        'Diagnosis': 'SjD',
        'Cohort': 'Wang2022',
        'LEfSe_path': path + 'outputs/jobs44/Analysis_lefse_0/results/Diagnosis/lefse_results.taxa_table_L7.Diagnosis.NA.tsv'
    },
    'jobs45':{
        'Diagnosis': 'PsO',
        'Cohort': 'Luca2024',
        'LEfSe_path': path + 'outputs/jobs45/Analysis_lefse_0/results/Diagnosis/lefse_results.taxa_table_L7.Diagnosis.NA.tsv'
    },
    'jobs46':{
        'Diagnosis': 'RA',
        'Cohort': 'Yu2022',
        'LEfSe_path': path + 'outputs/jobs46/Analysis_lefse_0/results/Diagnosis/lefse_results.taxa_table_L7.Diagnosis.NA.tsv'
    },
    'jobs49':{
        'Diagnosis': 'RA',
        'Cohort': 'Rooney2024',
        'LEfSe_path': path + 'outputs/jobs49/Analysis_lefse_0/results/Diagnosis/lefse_results.taxa_table_L7.Diagnosis.NA.tsv'
    }

}


##### Process LEfSe results prior to UpSetR #####

In [26]:
# for AMPAIM

for j in jobs:
    # grab res dict
    res = j_to_res[j]

    # grab df and name columns
    if '.res' in res['LEfSe_path']:
        header = None
    elif '.taxa' in res['LEfSe_path']:
        header = 0
    
    df_lefse = pd.read_csv(res['LEfSe_path'], sep='\t', header=header, names =['Taxa','X','Direction','LDA','p'])
   
    # drop na
    df_lefse = df_lefse[df_lefse['p'] != '-']
    
    # cast to float
    df_lefse['p'] = df_lefse['p'].astype(float)

    # filter on NA 
    df_lefse = df_lefse[~np.isnan(df_lefse['LDA'])]

    # save
    for d in df_lefse.Direction.unique():    
        df = df_lefse[df_lefse['Direction'] == d] 
        df.to_csv(path + 'outputs/jobs100/' + res['Cohort'] + '_' + res['Diagnosis'] + '_' + d + '.tsv', sep='\t')
        

#df = df[np.isnan(df['LDA'])]
#LDAnan = df.Taxa.values

# print(LDAnan)
print(len(df))



8


##### jobs39 #####

In [27]:
# jobs39 is where the first PRJNA will be tested, 16S sequencing
# PRJNA317370
# we need to generate a Q2 mapping file like in OA
q2_oa = pd.read_csv(path + 'outputs/jobs39/oaq2.tsv', sep='\t')

# take columns i think we need
keep = ['BarcodeSequence','LinkerPrimerSequence']

# grab sample IDs
# df_map = q2_oa.loc[:,keep]

df_sra = pd.read_csv(path + 'outputs/jobs39/SraRunTable.txt', sep='\t')
df_sra = df_sra.rename(columns={'Run':'#SampleID'})

# insert in reverse order
for i in keep:
    df_sra.insert(1, i, ['NA' for _ in range(len(df_sra))])

df_sra.to_csv(path + 'outputs/jobs39/qiime_mapping_file.tsv', sep='\t')
df_sra.columns.values
# df_sra['Sample Name']

array(['#SampleID', 'LinkerPrimerSequence', 'BarcodeSequence',
       'Assay Type', 'AvgSpotLen', 'Bases', 'BioProject', 'BioSample',
       'BioSampleModel', 'Bytes', 'Center Name', 'collection_date',
       'Consent', 'DATASTORE filetype', 'DATASTORE provider',
       'DATASTORE region', 'Experiment', 'geo_loc_name_country',
       'geo_loc_name_country_continent', 'geo_loc_name', 'host',
       'Instrument', 'isolation_source', 'lat_lon', 'Library Name',
       'LibraryLayout', 'LibrarySelection', 'LibrarySource', 'Organism',
       'Platform', 'ReleaseDate', 'create_date', 'version', 'Sample Name',
       'SRA Study', '1-Methylhistidine', '3-Methylhistidine', 'Alanine',
       'allo-Isoleucine', 'alpha-Amino-N-butyric-acid',
       'alpha-Aminoadipic-acid', 'Anserine', 'Arginine', 'Asparagine',
       'Aspartic_Acid', 'beta-Alanine', 'beta-Aminoisobutyric-acid',
       'betaine', 'carnitine', 'Carnosine', 'choline', 'Citrulline',
       'Cystathionine_1', 'Cystathionine_2', 'Cystin

##### jobs44 #####

In [93]:
# create mapping file
df = pd.read_csv(path + 'outputs/jobs44/gutonlyCRR.csv')
df = df[df['Run title'].str.contains('fecal')]
df = df.set_index('Accession')

# get diagnosis column
df['DiagnosisOG'] = df['Run title'].map(lambda x: x.split('sample:')[-1])

# create new simplified one
df = df[df['DiagnosisOG'].isin(['healthy control','pSS patients without treament'])]
# pSS patients without treament            90
# pSS patients with 3-6 month treament     46
# non-pSS patient                          43
# healthy control                          40
# pSS patients with 6-12 month treament    11                       

# create new diagnosis
df['Diagnosis'] = df['DiagnosisOG'].map({'healthy control': 'HC', 'pSS patients without treament': 'SjD'})

# create categorical
df = df.reset_index()
df.loc[-1] = ['categorical' for _ in range(len(df.columns))] # adding a row
df.index = df.index + 1  # shifting index
df.sort_index(inplace=True) 
df.iloc[0,0] = '#q2:types'
df = df.set_index('Accession')
df.index.name = '#SampleID'

# create host subject id
df['HostSubjectId'] = df['ID']

# drop "ID" it is protected in QIIME
df = df.drop('ID',axis=1)

# export
df.to_csv(path + 'outputs/jobs44/qiime_mapping_file.tsv', sep='\t')
df.head()

Unnamed: 0_level_0,Run title,BioProject accession,Experiment accession,Run data file type,Read filename 1,Read file1 MD5,DownLoad Read file1,Read filename 2,Read file2 MD5,DownLoad Read file2,...,Reference file name,MD5 for reference file,Assembly Name or Accession,Assembly Accession URL,other_db,accession_in_other_db,other_db_url,DiagnosisOG,Diagnosis,HostSubjectId
#SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
#q2:types,categorical,categorical,categorical,categorical,categorical,categorical,categorical,categorical,categorical,categorical,...,categorical,categorical,categorical,categorical,categorical,categorical,categorical,categorical,categorical,categorical
CRR442600,D8:Amplicon sequence of human fecal sample:pSS...,PRJCA008752,CRX385346,fastq,CRR442600_f1.fq.gz (12975507 bytes),3149e0d5588d0cea4b58c1a2706c7e43,ftp://download.big.ac.cn/gsa/CRA006415/CRR4426...,CRR442600_r2.fq.gz (12152647 bytes),038b8b7280f629bba6e6fa50eb5b78f2,ftp://download.big.ac.cn/gsa/CRA006415/CRR4426...,...,,,,,,,,pSS patients without treament,SjD,8
CRR442603,D11:Amplicon sequence of human fecal sample:pS...,PRJCA008752,CRX385349,fastq,CRR442603_f1.fq.gz (10712320 bytes),854db81c703df4130019c80e717ccad7,ftp://download.big.ac.cn/gsa/CRA006415/CRR4426...,CRR442603_r2.fq.gz (10213484 bytes),692bcab19cb7f412000af2869de5355f,ftp://download.big.ac.cn/gsa/CRA006415/CRR4426...,...,,,,,,,,pSS patients without treament,SjD,11
CRR442606,D14:Amplicon sequence of human fecal sample:pS...,PRJCA008752,CRX385352,fastq,CRR442606_f1.fq.gz (11509869 bytes),aa3c4d14208bb22f40b7a1813fe5c4fc,ftp://download.big.ac.cn/gsa/CRA006415/CRR4426...,CRR442606_r2.fq.gz (11022278 bytes),49cdb6984acb61ce32e31d7851bf9468,ftp://download.big.ac.cn/gsa/CRA006415/CRR4426...,...,,,,,,,,pSS patients without treament,SjD,14
CRR442608,D16:Amplicon sequence of human fecal sample:pS...,PRJCA008752,CRX385354,fastq,CRR442608_f1.fq.gz (12899732 bytes),6fb9bfd4498e608a33d0bfdcbbf32c48,ftp://download.big.ac.cn/gsa/CRA006415/CRR4426...,CRR442608_r2.fq.gz (12309020 bytes),8491d2cd4dfe9d639f256b307f459eda,ftp://download.big.ac.cn/gsa/CRA006415/CRR4426...,...,,,,,,,,pSS patients without treament,SjD,16


##### Jobs 45 #####

In [26]:
# jobs45
df = pd.read_csv(path + 'outputs/jobs45/SraRunTable.txt', sep='\t')

# create host subject ID col
df['HostSubjectId'] = df['Run']
df = df.set_index('Run')

# get diagnosis column
df['Diagnosis'] = df['Sample Name'].map(lambda x: 'PsO' if 'PSORI' in x else 'HC')

# create categorical
df = df.reset_index()
df.loc[-1] = ['categorical' for _ in range(len(df.columns))] # adding a row
df.index = df.index + 1  # shifting index
df.sort_index(inplace=True) 
df.iloc[0,0] = '#q2:types'
df = df.set_index('Run')
df.index.name = '#SampleID'

# export
df.to_csv(path + 'outputs/jobs45/qiime_mapping_file.tsv', sep='\t')

# check breakdown
df.Diagnosis.value_counts()

# df.head()

Diagnosis
PsO            39
HC             21
categorical     1
Name: count, dtype: int64

##### jobs46 #####

In [24]:
# jobs46
df = pd.read_csv(path + 'outputs/jobs46/SraRunTable.txt', sep='\t')

# create host subject ID col
df['HostSubjectId'] = df['Run']

# df = df[df['Run title'].str.contains('fecal')]
df = df.set_index('Run')

# get diagnosis column
df['Diagnosis'] = df['Sample Name'].map(lambda x: 'RA' if 'RA' in x else 'HC')

# create categorical
df = df.reset_index()
df.loc[-1] = ['categorical' for _ in range(len(df.columns))] # adding a row
df.index = df.index + 1  # shifting index
df.sort_index(inplace=True) 
df.iloc[0,0] = '#q2:types'
df = df.set_index('Run')
df.index.name = '#SampleID'

# export
df.to_csv(path + 'outputs/jobs46/qiime_mapping_file.tsv', sep='\t')

    
# check 26 HC 26 RA
df.Diagnosis.value_counts()

# df.head()


Diagnosis
RA             26
HC             26
categorical     1
Name: count, dtype: int64

##### jobs47 #####

In [22]:
# jobs47
df = pd.read_csv(path + 'outputs/jobs47/SraRunTable.txt', sep='\t')

# create host subject ID col
df['HostSubjectId'] = df['Run']

# df = df[df['Run title'].str.contains('fecal')]
df = df.set_index('Run')

# get diagnosis column
candidates = ['C' + str(x) for x in range(1,28)] # 27 healthy controls, C1...C27

df['Diagnosis'] = df['Sample Name'].map(lambda x: 'HC' if x.split('_')[0] in candidates else 'PsO')

# create categorical
df = df.reset_index()
df.loc[-1] = ['categorical' for _ in range(len(df.columns))] # adding a row
df.index = df.index + 1  # shifting index
df.sort_index(inplace=True) 
df.iloc[0,0] = '#q2:types'
df = df.set_index('Run')
df.index.name = '#SampleID'

# export
df.to_csv(path + 'outputs/jobs47/qiime_mapping_file.tsv', sep='\t')

# check
df.Diagnosis.value_counts()

# df.head()

Diagnosis
PsO            55
HC             27
categorical     1
Name: count, dtype: int64

##### jobs48 #####

In [23]:
# jobs48
df = pd.read_csv(path + 'outputs/jobs48/SraRunTable.txt', sep='\t')

# create host subject ID col
df['HostSubjectId'] = df['Run']

# df = df[df['Run title'].str.contains('fecal')]
df = df.set_index('Run')

# get diagnosis column
df['Diagnosis'] = df['Sample Name'].map(lambda x: 'SLE' if 'SF' in x else 'HC')

# create categorical
df = df.reset_index()
df.loc[-1] = ['categorical' for _ in range(len(df.columns))] # adding a row
df.index = df.index + 1  # shifting index
df.sort_index(inplace=True) 
df.iloc[0,0] = '#q2:types'
df = df.set_index('Run')
df.index.name = '#SampleID'

# export
df.to_csv(path + 'outputs/jobs48/qiime_mapping_file.tsv', sep='\t')

# check
df.Diagnosis.value_counts()

# df.head()

Diagnosis
SLE            26
HC             21
categorical     1
Name: count, dtype: int64

##### Jobs49 #####

In [34]:
# jobs49
df = pd.read_csv(path + 'outputs/jobs49/metadata_kb.txt', sep='\t')

# drop random na rows
df = df.dropna()

# dictionary
# CCP   at- risk individual 
# NORA	new onset RA 
# HLT 	Healthy individual
# NG-XXX-XX	Longitudinal study-particiapnt number-timepoint

# create host subject ID col
df['HostSubjectId'] = df['sample_id'].apply(lambda x: x.split('-')[1] if 'NG' in x else x)

# create timepoint column
df['Timepoint'] = df['sample_id'].apply(lambda x: x.split('-')[-1] if 'NG' in x else 1)
                                                            
# probably will keep only first timepoint
df = df.drop_duplicates(subset='HostSubjectId', keep='first', inplace=False)

# # get diagnosis column
df['Diagnosis'] = df['Group']

# create categorical
df = df.reset_index()
df.loc[-1] = ['categorical' for _ in range(len(df.columns))] # adding a row
df.index = df.index + 1  # shifting index
df.sort_index(inplace=True) 
df.iloc[0,0] = '#q2:types'
df = df.set_index('sample_id')
df.index.name = '#SampleID'

# export
df.to_csv(path + 'outputs/jobs49/qiime_mapping_file.tsv', sep='\t')

# check
print(df.Diagnosis.value_counts())
# df.sample_id.values
df.head()

Diagnosis
at_risk        124
HLT             22
NORA             8
categorical      1
Name: count, dtype: int64


Unnamed: 0_level_0,index,Group,HostSubjectId,Timepoint,Diagnosis
#SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
categorical,#q2:types,categorical,categorical,categorical,categorical
13,0,at_risk,13,1,at_risk
168,1,at_risk,168,1,at_risk
172,2,at_risk,172,1,at_risk
188,3,at_risk,188,1,at_risk


##### Alpha ##### 

In [38]:
# decide what to analyze
keep = ['SjD','HC','SLE','SLE-G','RA','PsO']#,'axial spondyloarthritis']
jobs = ['jobs41','jobs44', 'jobs45', 'jobs46','jobs49']

In [39]:
# alpha plots
for alpha_metric in ['shannon_entropy']:#, 'faith_pd']:
    # create list of dfs to concat
    dfs = []

    # for q2 mapping
    df = pd.read_csv(path + 'inputs/Q2_MSQ138_141_noctrl_noeiser_nocd_correct_new/' + alpha_metric + '_metadata.tsv', sep='\t')
    df = df.set_index('id')
    df = df.drop('#q2:types')

    # create cohort and do replacements
    df['cohort'] = 'AMPAIM'
    df = df.replace(sub_dict)

    # keep things only in consideratino
    df = df[df['Diagnosis'].isin(keep)]
        
    dfs.append(df)
    
    for job in jobs:
        # load df
        df = pd.read_csv(path + 'outputs/' + job + '/Analysis_core_pipeline_taxonomic_0/diversity/' + alpha_metric + '_metadata.tsv', sep='\t')
    
        # rename
        df = df.rename(columns={'host_disease':'Diagnosis'})
    
        # replace
        df = df.replace(sub_dict)
        
        # q2 modifications
        df = df.set_index('id')
        df = df.drop('#q2:types')
    
        # set cohort
        df['cohort'] = j_to_res[job]['Cohort']
    
        # append to list
        dfs.append(df)
    
    df_merge = pd.concat(dfs)
    df_merge = df_merge.dropna(how='any',axis=1)
    
    df_merge[alpha_metric] = df_merge[alpha_metric].astype(float)
    
    # filter
    df_merge = df_merge[df_merge['Diagnosis'].isin(keep)]

    # export
    df_merge.to_csv(path + 'outputs/jobs99/df_alpha_' + alpha_metric + '.tsv', sep='\t')

    # draw figure    
    plt.figure(figsize=(6,4))
    sns.boxplot(data=df_merge,x='cohort',y=alpha_metric,hue='Diagnosis',showfliers=False)
    #sns.stripplot(data=df_merge,x='cohort',y='shannon_entropy',hue='Diagnosis',legend=None,dodge=True,color='gray')
    sns.despine()
    plt.savefig(path + 'outputs/jobs99/alpha_meta_' + alpha_metric + '.pdf')
    plt.close()
#  plt.tight_layout()

df_merge.head()

Unnamed: 0_level_0,Diagnosis,shannon_entropy,cohort
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
518-0-twin-psa-plate308,PsO,5.983555,AMPAIM
524-0-twin-psa-plate308,PsO,4.126833,AMPAIM
525-0-twin-psa-plate308,PsO,5.451687,AMPAIM
528-0-twin-psa-plate308,PsO,4.15375,AMPAIM
529-0-twin-psa-plate308,PsO,4.034945,AMPAIM


In [41]:
df_merge[df_merge['cohort'] == 'Rooney2024']

Unnamed: 0,Group 1,Group 2,Sample size,Permutations,pseudo-F,p-value,q-value,cohort,-log10p
25,HC,PsO,35,999,1.001743,0.446,0.446,Rooney2024,0.350665


##### Beta #####

In [40]:
# beta plots
dfs = []
# ampaim
df = pd.read_csv(path + 'inputs/Q2_MSQ138_141_noctrl_noeiser_nocd_correct_new/permanova-pairwise.csv')
df['cohort'] = 'AMPAIM'
df = df.replace(sub_dict)
df = df[df['Group 1'].isin(keep)]
df = df[df['Group 2'].isin(keep)]
df['comparison'] = df['Group 1'] + '_' + df['Group 2']
dfs.append(df)
# validation
for job in jobs:
    print(job)
    # load df
    df = pd.read_csv(path + 'outputs/' + job + '/Analysis_core_pipeline_taxonomic_0/diversity/permanova-pairwise.csv')
    
    # set cohort
    df['cohort'] = j_to_res[job]['Cohort']
    print(j_to_res[job]['Cohort'])

    # replace
    df = df.replace(sub_dict)
    
    # append to list
    dfs.append(df)

df_merge = pd.concat(dfs)
df_merge.index = [i for i in range(len(df_merge))]
df_merge = df_merge.dropna(how='any',axis=1)

# swap group2 HC with group1
# df_merge_final = df_merge.copy()
for i,row in df_merge.iterrows():
    print(i)
    if row.iloc[1] == 'HC':
        print('checking')
        temp = row.iloc[0]
        #print(temp)
        df_merge.iloc[i, 0] = 'HC'
        df_merge.iloc[i, 1] = temp

# df_merge['shannon_entropy'] = df_merge['shannon_entropy'].astype(float)

# filter
#keep = ['PsA','HC','SLE','SLE+G','SLE-G','RA','axial spondyloarthritis']
#df_merge = df_merge[df_merge['Diagnosis'].isin(keep)]

df_merge['-log10p'] = -1 * np.log10(df_merge['p-value'])
df_merge['Group 1'] = df_merge['Group 1'].replace(sub_dict)
df_merge['Group 2'] = df_merge['Group 2'].replace(sub_dict)


df = df_merge[df_merge['Group 1'] == 'HC']


df = df[df['Group 1'].isin(keep)]
df = df[df['Group 2'].isin(keep)]
df['Diagnosis'] = df['Group 2']

# save for R
df.to_csv(path + 'outputs/jobs99/df_beta.tsv', sep='\t')

plt.figure(figsize=(6,4))
sns.barplot(data=df,x='cohort',y='-log10p',hue='Diagnosis')#,showfliers=False)
sns.despine()
plt.savefig(path + 'outputs/jobs99/beta_permanova_pvals.pdf')
plt.close()
#sns.despine()
df.head()

jobs41
Su2020
jobs44
Wang2022
jobs45
Luca2024
jobs46
Yu2022
jobs49
Rooney2024
0
checking
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25


Unnamed: 0,Group 1,Group 2,Sample size,Permutations,pseudo-F,p-value,q-value,cohort,-log10p,Diagnosis
0,HC,RA,50,999,1.349725,0.101,0.238,AMPAIM,0.995679,RA
4,HC,PsO,34,999,1.397645,0.082,0.238,AMPAIM,1.086186,PsO
5,HC,SLE,11,999,1.192808,0.208,0.364,AMPAIM,0.681937,SLE
6,HC,SjD,16,999,1.548308,0.053,0.238,AMPAIM,1.275724,SjD
11,HC,SLE,37,999,8.483951,0.001,0.0015,Su2020,3.0,SLE


##### LEfSe #####

In [32]:
# automatic parsing of pairwise lefses for queries
import itertools
# list(itertools.combinations(['A','B','C'], 2))

jobns = []
taxas = []
mwus = []
mwups = []
tts = []
ttps = []


# dict to store resluts
q_to_t = {}
queries = ['f__Rikenellaceae', 'g__Phasco','g__Prevotella']
# queries = ['f__Tannerell','g__Parabacter']
# queries = [x.replace('.','|') for x in LDAnan]
#queries = [x.replace('__','protect') for x in queries]
#queries = [x.replace('_','-') for x in queries]
#queries = [x.replace('protect','__') for x in queries]



# do MWU on each col and the aggregate
for q in queries:
    print('query: ' + q)

    # compile dfs 
    dfs = []
    for job in jobs:
        print(job)
        if 'jobs4' in job:
            df = pd.read_csv(path + 'outputs/' + job + '/Analysis_lefse_0/tables/Diagnosis/lefse_format.taxa_table_L7.Diagnosis.NA.tsv', 
                             sep='\t', 
                             header=None)
        else:
            # for AMPAIM jobs
            df = pd.read_csv(path + 'outputs/' + job + '/lefse_table.tsv', 
                             sep='\t',
                             header=None)
        df = df.T
        df.columns = df.iloc[0,:]
        df = df.drop(0) # drops the row of col names
        df = df.iloc[:,1:] # drops one of the Diagnosis cols
                
        # rename dx
        # df = df.rename(columns=dx_sub_dict)
                
        df['SampleID'] = job + '_' + df['HostSubjectId'] + '_' + df['Diagnosis'].replace(sub_dict)
        df = df.drop(['Diagnosis','HostSubjectId'], axis=1)
        df = df.set_index('SampleID')

        
        # normalize columns
        df = df.astype(float)
        df = df.div(df.sum(axis=1),axis=0)
        
        # put back diagnosis col
        df['Diagnosis'] = df.index.map(lambda x: x.split('_')[-1])

        # if AMPAIM job not jobs09, drop the healthy samples
        if job in ['jobs10','jobs11','jobs12','jobs13','jobs14']:
            df = df[df['Diagnosis'] != 'HC']
        
        # find relevant taxa to the query
        int_taxa = []
        for f in df.columns.values:
            if q in f:
                # print(f)
                int_taxa.append(f)
        print(len(int_taxa))
            
        # subset on taxa of interest
        df_sub = df[int_taxa] 
        
        # create new total 'collapsed genus'
        df_sub['total_' + q] = df_sub.sum(axis=1)
        
        # bring back Diagnosis column
        df_sub['Diagnosis'] = df['Diagnosis']
        
        # make cohort col
        df_sub['Cohort'] = j_to_res[job]['Cohort']

        #
        print(df_sub.Diagnosis.value_counts())
        # add to list of dfs
        dfs.append(df_sub)

    # create big df
    df_sub = pd.concat(dfs)

    # for the taxa
    for t in df_sub.columns:
        if '__' in t:
            diagnoses = df_sub['Diagnosis'].unique()
            
            # do all pairwise combos
            # for pair in list(itertools.combinations(diagnoses, 2)):
            #     d0, d1 = pair
            #     #print(t)
            #     #print(d0,d1)
            #     df_d0 = df_sub[df_sub['Diagnosis'] == d0]
            #     df_d1 = df_sub[df_sub['Diagnosis'] == d1]
            #     u,p1 = scipy.stats.mannwhitneyu(df_d0[t],df_d1[t])
            #     #print('MWU: u=' + str(np.round(u,2)) + ', p=' + str(np.round(p1,3)))
            #     r,p2 = scipy.stats.ttest_ind(df_d0[t],df_d1[t])
            #     #print('TT: t=' + str(np.round(r,2)) + ', p=' + str(np.round(p2,3)))
            #     #print('\n')
            #     jobns.append(job)
            #     taxas.append(t)
            #     mwus.append(u)
            #     mwups.append(p1)
            #     tts.append(r)
            #     ttps.append(p2)
        
            #     plt.figure(figsize=(4,3))
            #     sns.boxplot(data=df_sub, x='Diagnosis', y=t, showfliers=False)
            #     sns.stripplot(data=df_sub, x='Diagnosis', y=t,legend=None)
            #     plt.tight_layout()
            #     sns.despine()
            #     plt.savefig(path + 'outputs/jobs100/plots/' + d0 + '_' + d1 + '_' + t + '.pdf')
            #     plt.close()
        
            # boxplot across all studies
            plt.figure(figsize=(8,6))
            sns.boxplot(data=df_sub, x='Cohort',hue='Diagnosis', y=t, showfliers=False)
            # sns.stripplot(data=df_sub, x='Cohort', hue='Diagnosis', y=t,legend=None)
            plt.tight_layout()
            sns.despine()
            plt.savefig(path + 'outputs/jobs100/plots/all_' + t.split('f__')[-1] + '.pdf')
            plt.close()


df_ref = pd.DataFrame({'job': jobns, 'taxa': taxas, 'MWU': mwus, 'MWU_p': mwups, 'ttest': tts, 'ttest_p': ttps})
df_ref

query: f__Rikenellaceae
jobs41
11
Diagnosis
SLE+G    20
HC       20
SLE      17
Name: count, dtype: int64
jobs44
13
Diagnosis
SjD    63
HC     24
Name: count, dtype: int64
jobs45
11
Diagnosis
PsO    39
HC     21
Name: count, dtype: int64
jobs46
7
Diagnosis
RA    26
HC    26
Name: count, dtype: int64
jobs49
8
Diagnosis
PsO    26
HC      9
Name: count, dtype: int64
query: g__Phasco
jobs41
4
Diagnosis
SLE+G    20
HC       20
SLE      17
Name: count, dtype: int64
jobs44
6
Diagnosis
SjD    63
HC     24
Name: count, dtype: int64
jobs45
5
Diagnosis
PsO    39
HC     21
Name: count, dtype: int64
jobs46
4
Diagnosis
RA    26
HC    26
Name: count, dtype: int64
jobs49
1
Diagnosis
PsO    26
HC      9
Name: count, dtype: int64
query: g__Prevotella
jobs41
15
Diagnosis
SLE+G    20
HC       20
SLE      17
Name: count, dtype: int64
jobs44
55
Diagnosis
SjD    63
HC     24
Name: count, dtype: int64
jobs45
14
Diagnosis
PsO    39
HC     21
Name: count, dtype: int64
jobs46
14
Diagnosis
RA    26
HC    26
Name:

Unnamed: 0,job,taxa,MWU,MWU_p,ttest,ttest_p
