In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import linregress
import warnings
# Silence every warning category for the remainder of the session.
# NOTE(review): a blanket 'ignore' also hides numpy RuntimeWarnings (e.g. from
# log10 of a zero load) and library deprecation notices — consider narrowing
# the filter to the specific warnings that are known to be harmless.
warnings.filterwarnings('ignore')

# Load relative abundance data from MSKCC alloHCT

In [2]:
# Per-sample metadata table, read from sheet 'Table S4b' of the workbook.
# The new column labels are applied by position, so they rely on the sheet
# keeping its current column order — presumably 'Sample ID' first and
# 'Patient ID' second; verify against the workbook if the sheet changes.
sample_columns = [
    'SampleID', 'PatientID', 'DayRelativeToNearestHCT',
    'TotalLoad', 'OralFraction', 'OralLoad', 'GutFraction', 'GutLoad',
    'Timepoint', 'StoolConsistency', 'FungalCulturability',
]
df_sample = (
    pd.read_excel(
        "../Fig3|FigS1|EDFig6_9_10|TableS5//ST4_oralASV_alloHCT.xlsx",
        sheet_name='Table S4b',
    )
    # Identifiers are handled as strings rather than numbers.
    .astype({'Patient ID': str, 'Sample ID': str})
    .set_axis(sample_columns, axis=1)
    .set_index('SampleID')
    # log10 of the total load; stays NaN wherever TotalLoad is missing.
    .assign(log10_TotalLoad=lambda frame: np.log10(frame['TotalLoad']))
)
df_sample.head()

Unnamed: 0_level_0,PatientID,DayRelativeToNearestHCT,TotalLoad,OralFraction,OralLoad,GutFraction,GutLoad,Timepoint,StoolConsistency,FungalCulturability,log10_TotalLoad
SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1000A,1000,-9,,0.06544,,0.93456,,0,formed,,
1000B,1000,-4,,0.270878,,0.729122,,5,liquid,,
1000C,1000,6,,0.000752,,0.999248,,15,liquid,,
1000D,1000,9,,0.149727,,0.850273,,18,semi-formed,,
1000E,1000,13,,0.010265,,0.989735,,22,formed,,


In [3]:
# Wide-format relative-abundance table: the first CSV column becomes the row
# index (SampleID in the displayed output) and every other column is one taxon.
df_lowtaxa = pd.read_csv(
    "../Fig3|FigS1|EDFig6_9_10|TableS5/lowest_taxa_relative_abundance_wide_format.csv.gz",
    index_col=0,
    compression='gzip',
)
df_lowtaxa.head()

Unnamed: 0_level_0,28-4,ASF356,Abiotrophia,Absconditabacteriales (SR1),Acetanaerobacterium,Acetobacteraceae,Achromobacter,Acidaminococcaceae,Acidaminococcus,Acidiphilium,...,[Eubacterium] nodatum group,[Eubacterium] ruminantium group,[Eubacterium] saphenum group,[Eubacterium] siraeum group,[Eubacterium] ventriosum group,[Eubacterium] xylanophilum group,[Ruminococcus] gauvreauii group,[Ruminococcus] gnavus group,[Ruminococcus] torques group,vadinBE97
SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.087771,0.001554,0.0
1000B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1000C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1000D,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1000E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Find taxa that dominate (relative abundance > 0.3) in at least 100 samples

In [4]:
# Flag, per sample and taxon, whether the relative abundance exceeds 0.3.
# NOTE(review): iloc[:, 4:-2] leaves the first four and the last two columns of
# df_lowtaxa out of the count — confirm that these exclusions are intended.
is_dominated = (df_lowtaxa.iloc[:, 4:-2] > 0.3).astype(int)

# Number of flagged samples per taxon, in ascending order.
df_domination = (
    is_dominated
    .sum(axis=0)
    .sort_values()
    .to_frame(name='NSampleDomination')
)

# Keep only the taxa flagged in at least 100 samples.
df_domination = df_domination.loc[df_domination['NSampleDomination'] >= 100]
selected_taxa = df_domination.index.tolist()

# Samples with a total load of at least 1e3 taken from day -20 through day 40
# (both bounds inclusive) relative to the nearest HCT.
load_ok = df_sample['TotalLoad'] >= 1e3
day_ok = df_sample['DayRelativeToNearestHCT'].between(-20, 40)
df2_sample = df_sample.loc[load_ok & day_ok]

# Pair each retained sample's log10 total load with the relative abundances of
# the selected taxa; the inner join keeps only samples present in both tables.
df_selected_taxa = pd.merge(
    left=df2_sample[['log10_TotalLoad']],
    right=df_lowtaxa[selected_taxa],
    how='inner',
    left_index=True,
    right_index=True,
)
df_selected_taxa.head()

Unnamed: 0_level_0,log10_TotalLoad,Lachnospiraceae,Actinomyces,Pediococcus,Klebsiella,Bifidobacterium,Staphylococcus,Escherichia-Shigella,Bacteroides,Akkermansia,Erysipelatoclostridium,[Clostridium] innocuum group,Lactobacillus,Blautia,Streptococcus,Enterococcus
SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1015A,7.730733,0.0,0.00793,0.033501,0.0,0.206506,0.0,0.000647,0.0,0.0,0.0,0.0,0.0,0.0,0.033986,0.659654
1015D,3.58855,0.0004,0.0,0.0,0.0003,0.0,0.002503,0.0,0.0,0.0,0.0004,0.0,0.002402,0.000801,0.002803,0.910711
1016A,6.759872,0.0,0.041761,0.0,0.0,0.0,0.001589,0.636632,0.008171,0.0,0.003177,0.066046,0.027009,0.016568,0.103722,0.0
1042A,9.15809,0.006825,0.006279,0.0,0.0,0.009555,0.0,0.001365,0.0,0.000956,0.001911,0.001365,0.001638,0.149058,0.017472,0.000956
1042AA,7.812392,0.001194,0.004449,0.0,0.0,0.0,0.0,0.0,0.002496,0.000597,0.0,0.050236,0.020778,0.039006,0.053057,0.005751


In [5]:
# For every selected taxon, regress log10 total load (y) on log10 relative
# abundance (x) and record the slope together with its standard error.
res = []
for taxon in selected_taxa:
    # log10 is undefined at zero, so only samples in which the taxon was
    # detected (relative abundance > 0) enter the fit.
    df2_selected_taxa = df_selected_taxa.loc[df_selected_taxa[taxon] > 0]
    # NOTE(review): alternative='less' only changes the p-value, which is not
    # stored below — keep it only if the p-value is going to be reported.
    slope, intercept, r_value, p_value, std_err = linregress(
        x=np.log10(df2_selected_taxa[taxon]),
        y=df2_selected_taxa['log10_TotalLoad'],
        alternative='less',
    )
    res.append([taxon, len(df2_selected_taxa), slope, std_err])

# NOTE(review): the 'NSampleDomination' column holds the number of samples used
# in each fit (non-zero abundance), not a count of dominated samples.
df_res = pd.DataFrame(res, columns=['Taxon', 'NSampleDomination', 'Slope', 'SE'])
df_res = df_res.set_index('Taxon')
# Lachnospiraceae is removed from the summary; the [Clostridium] label is
# rewritten with hyphens instead of spaces.
df_res = df_res.drop(index='Lachnospiraceae')
df_res = df_res.rename(index={"[Clostridium] innocuum group": "[Clostridium]-innocuum-group"})
df_res = df_res.sort_values(by='Slope', ascending=False)
df_res

Unnamed: 0_level_0,NSampleDomination,Slope,SE
Taxon,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Blautia,2202,0.254893,0.019089
Erysipelatoclostridium,1860,0.159166,0.021338
[Clostridium]-innocuum-group,1961,0.135317,0.02092
Akkermansia,1294,0.13315,0.026874
Bifidobacterium,1500,0.113423,0.025632
Enterococcus,2227,0.100613,0.018884
Bacteroides,1598,0.094938,0.025773
Escherichia-Shigella,1020,0.052193,0.033072
Klebsiella,759,0.041822,0.040207
Pediococcus,777,0.041583,0.035695
