In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from copy import deepcopy
from skbio.diversity.alpha import shannon
from sklearn.metrics import pairwise_distances
from collections import Counter
import scipy.cluster.hierarchy as sch
from scipy.spatial.distance import squareform
from scipy.stats import spearmanr,kendalltau
from scipy.cluster.hierarchy import leaves_list
from scipy import stats
from statsmodels.stats.multitest import multipletests

In [2]:
# read sample meta data and eliminate samples without transplant day
df_sample = pd.read_csv('tblASVsamples.csv', index_col=0)
df_sample = df_sample[df_sample.DayRelativeToNearestHCT.notnull()]

# read count data
df_count_stacked = pd.read_csv('tblcounts_asv_melt.csv')
df_count_stacked = pd.pivot_table(df_count_stacked, index='SampleID', columns='ASV', values='Count', aggfunc=np.sum).fillna(0)
df_count_stacked = df_count_stacked[df_count_stacked.sum(axis=1)>=1000]
df_count_stacked = df_count_stacked.loc[:, (df_count_stacked != 0).any(axis=0)]
print("min seq depth = %d" %(df_count_stacked.sum(axis=1).min()))
df_relab_asv =  df_count_stacked.div(df_count_stacked.sum(axis=1), axis=0)

# find commmon samples
common_samples = set(df_sample.index).intersection(set(df_relab_asv.index))
df_sample = df_sample.loc[common_samples]
df_relab_asv = df_relab_asv.loc[common_samples]

min seq depth = 1001


In [8]:
# get oral bacterial fraction
df_blast_100 = pd.read_csv("blast_HMPv35oral/blast_HMPv35oral_p100.txt", sep="\t", comment="#", header=None)
df_blast_100.columns = ['query_accver', 'subject_accver', 'perc_identity', 'alignment_length', 'mismatches', 'gap_opens', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore']#
df_oral_total = df_relab_asv[set(df_blast_100.query_accver).intersection(df_relab_asv.columns)].sum(axis=1).to_frame()
df_oral_total.columns = ['OralFrac_HMPv35oral']
df_oral_total = df_oral_total.reset_index('SampleID').sort_values(['OralFrac_HMPv35oral','SampleID']).set_index('SampleID')
df_oral_asv = pd.merge(df_oral_total, df_relab_asv, left_index=True, right_index=True, how='inner')

In [17]:
res = []
for col in df_oral_asv.columns:
    rho, p = stats.spearmanr(df_oral_asv.OralFrac_HMPv35oral, df_oral_asv[col])
    res.append([col, rho, p])
df_res = pd.DataFrame(res, columns=['ASV','Coef','P'])
df_res = df_res[df_res.P.notnull()]
df_res['Padj'] = multipletests(df_res['P'], alpha=0.05, method='bonferroni')[1]
df_res['Is_Oral'] = 0
df_res.loc[df_res.ASV.isin(df_blast_100.query_accver), 'Is_Oral'] = 1
df_tax = pd.read_csv('tblASVtaxonomy_silva138_v4v5_filter.csv')
df_res = pd.merge(df_res, df_tax[['ASV','LowestClassifiedTaxa']], left_on='ASV', right_on='ASV', how='left')
df_res.head()



Unnamed: 0,ASV,Coef,P,Padj,Is_Oral,LowestClassifiedTaxa
0,OralFrac_HMPv35oral,1.0,0.0,0.0,0,
1,ASV_1,0.056296,8.713971e-09,0.0001375152,0,Lactobacillus
2,ASV_10,-0.035679,0.0002673379,1.0,0,Blautia
3,ASV_100,-0.095006,2.3592890000000003e-22,3.723194e-18,0,Bacteroides
4,ASV_1000,0.009375,0.3382954,1.0,0,Romboutsia


In [26]:
df_res[(df_res.Padj<0.05) & (df_res.Is_Oral==1)].sort_values('Coef', ascending=False).head(20)

Unnamed: 0,ASV,Coef,P,Padj,Is_Oral,LowestClassifiedTaxa
14881,ASV_8,0.722416,0.0,0.0,1,Streptococcus
6161,ASV_32,0.433785,0.0,0.0,1,Streptococcus
10657,ASV_51,0.404635,0.0,0.0,1,Actinomyces
1129,ASV_128,0.401745,0.0,0.0,1,Actinomyces
13440,ASV_635,0.348211,3.785401e-295,5.973741e-291,1,Granulicatella
5497,ASV_264,0.307141,9.292643e-227,1.4664720000000002e-222,1,Veillonella
3189,ASV_175,0.288073,1.603179e-198,2.5299760000000002e-194,1,Streptococcus
5403,ASV_258,0.283145,1.426814e-191,2.251656e-187,1,Actinomyces
5989,ASV_310,0.281201,7.193112e-189,1.1351449999999999e-184,1,Rothia
5218,ASV_247,0.278383,5.459238e-185,8.615224000000001e-181,1,Veillonellaceae


In [21]:
df_res[(df_res.Padj<0.05) & (df_res.Is_Oral==0)].sort_values('Padj').head(20)

Unnamed: 0,ASV,Coef,P,Padj,Is_Oral,LowestClassifiedTaxa
0,OralFrac_HMPv35oral,1.0,0.0,0.0,0,
2636,ASV_16,0.444504,0.0,0.0,0,Streptococcus
14948,ASV_81,0.360918,0.0,0.0,0,Veillonella
10444,ASV_500,0.324865,5.680867000000001e-255,8.964976e-251,0,Scardovia
12029,ASV_5760,0.29632,1.803815e-210,2.8466e-206,0,Lactobacillales
8206,ASV_4172,0.267265,3.992807e-170,6.301049e-166,0,Lactobacillales
7190,ASV_379,0.25693,6.228283e-157,9.828854000000002e-153,0,Firmicutes
710,ASV_115,0.244145,1.994888e-141,3.148133e-137,0,Veillonella
3807,ASV_1905,0.229846,3.9146470000000005e-125,6.1777050000000005e-121,0,Erysipelatoclostridiaceae
4916,ASV_239,0.22953,8.703054000000001e-125,1.373429e-120,0,Lactococcus
