In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from copy import deepcopy
from skbio.diversity.alpha import shannon
from sklearn.metrics import pairwise_distances
from collections import Counter
import scipy.cluster.hierarchy as sch
from scipy.spatial.distance import squareform
from scipy.stats import spearmanr,kendalltau
from scipy.cluster.hierarchy import leaves_list
from scipy import stats
from statsmodels.stats.multitest import multipletests

In [2]:
# Read sample metadata (first CSV column becomes the index) and drop samples
# that have no transplant-relative day.
df_sample = pd.read_csv('tblASVsamples.csv', index_col=0)
df_sample = df_sample[df_sample.DayRelativeToNearestHCT.notnull()]

# Read the long-format count table and pivot it to a SampleID x ASV matrix.
# The string 'sum' is used instead of np.sum: pandas already dispatches np.sum
# to its own sum, and passing the callable emits a FutureWarning on pandas >= 2.1.
df_count_stacked = pd.read_csv('tblcounts_asv_melt.csv')
df_count_stacked = pd.pivot_table(df_count_stacked, index='SampleID', columns='ASV', values='Count', aggfunc='sum').fillna(0)
# Keep samples whose total count is at least 1000.
df_count_stacked = df_count_stacked[df_count_stacked.sum(axis=1)>=1000]
# Drop ASV columns that are zero in every remaining sample.
df_count_stacked = df_count_stacked.loc[:, (df_count_stacked != 0).any(axis=0)]
print("min seq depth = %d" %(df_count_stacked.sum(axis=1).min()))
# Convert counts to per-sample relative abundance (each row sums to 1).
df_relab_asv =  df_count_stacked.div(df_count_stacked.sum(axis=1), axis=0)

# Find common samples between the metadata and the abundance table.
# `common_samples` stays a set, but .loc is given a sorted list: set indexers
# raise TypeError on pandas >= 2.0, and sorting makes the row order reproducible.
common_samples = set(df_sample.index).intersection(set(df_relab_asv.index))
common_sample_ids = sorted(common_samples)
df_sample = df_sample.loc[common_sample_ids]
df_relab_asv = df_relab_asv.loc[common_sample_ids]

min seq depth = 1001


In [8]:
# Get the oral bacterial fraction of every sample.
# The BLAST hits are read as a headerless, tab-separated table; lines starting
# with '#' are skipped as comments.
df_blast_100 = pd.read_csv("blast_HMPv35oral/blast_HMPv35oral_p100.txt", sep="\t", comment="#", header=None)
df_blast_100.columns = ['query_accver', 'subject_accver', 'perc_identity', 'alignment_length', 'mismatches', 'gap_opens', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore']
# Select the ASV columns that appear as a BLAST query. A boolean mask is used
# instead of indexing with a set: set indexers raise TypeError on pandas >= 2.0,
# and the mask keeps the column order (and so the float summation) deterministic.
is_oral_asv = df_relab_asv.columns.isin(df_blast_100.query_accver)
df_oral_total = df_relab_asv.loc[:, is_oral_asv].sum(axis=1).to_frame()
df_oral_total.columns = ['OralFrac_HMPv35oral']
# Order samples by oral fraction, breaking ties by SampleID.
df_oral_total = df_oral_total.reset_index('SampleID').sort_values(['OralFrac_HMPv35oral','SampleID']).set_index('SampleID')
# Attach the per-ASV relative abundances next to the total oral fraction.
df_oral_asv = pd.merge(df_oral_total, df_relab_asv, left_index=True, right_index=True, how='inner')

In [None]:
# Spearman correlation between the total oral fraction and each ASV's relative
# abundance across samples.
res = []
# Skip the oral-fraction column itself: its self-correlation (rho=1, p=0) is not
# a real hypothesis test and would otherwise enter the FDR correction below,
# making every other adjusted p-value slightly too small.
for col in df_oral_asv.columns.drop('OralFrac_HMPv35oral'):
    rho, p = stats.spearmanr(df_oral_asv.OralFrac_HMPv35oral, df_oral_asv[col])
    res.append([col, rho, p])
df_res = pd.DataFrame(res, columns=['ASV','Coef','P'])
# Drop rows whose p-value is NaN; .copy() so the column assignments below write
# to an independent frame rather than a slice (avoids SettingWithCopyWarning).
df_res = df_res[df_res.P.notnull()].copy()
# Benjamini-Hochberg FDR adjustment; element [1] of the result is the adjusted p-values.
df_res['Padj'] = multipletests(df_res['P'], alpha=0.05, method='fdr_bh')[1]
# Flag ASVs that appear as a query in the BLAST hit table.
df_res['Is_Oral'] = 0
df_res.loc[df_res.ASV.isin(df_blast_100.query_accver), 'Is_Oral'] = 1
# Annotate each ASV with its lowest classified taxon; the left join keeps ASVs
# that have no taxonomy row.
df_tax = pd.read_csv('tblASVtaxonomy_silva138_v4v5_filter.csv')
df_res = pd.merge(df_res, df_tax[['ASV','LowestClassifiedTaxa']], on='ASV', how='left')
df_res.head()

