In [1]:
import pandas as pd
import numpy as np
from copy import deepcopy
from scipy.optimize import lsq_linear

# Load metadata and count table

In [2]:
# Read sample metadata (first CSV column is the index) and drop samples
# that have no transplant-relative day.
df_sample = pd.read_csv('tblASVsamples.csv', index_col=0)
df_sample = df_sample[df_sample.DayRelativeToNearestHCT.notnull()]

# Read the long-format count table and pivot it into a SampleID x ASV matrix.
# Repeated (SampleID, ASV) rows are summed; absent combinations become 0.
# aggfunc is the string 'sum' because passing np.sum is deprecated in recent pandas.
df_count_stacked = pd.read_csv('tblcounts_asv_melt.csv')
df_count_stacked = pd.pivot_table(df_count_stacked, index='SampleID', columns='ASV', values='Count', aggfunc='sum').fillna(0)

# Relative abundance: every row is divided by its own row total.
df_relab_asv = df_count_stacked.div(df_count_stacked.sum(axis=1), axis=0)

# Find common samples and restrict both tables to them.
# A list is used instead of a set: pandas 2.x raises a TypeError when a set is
# passed to .loc, and a set also makes the resulting row order hash-dependent.
# A plain list (rather than an Index) keeps each frame's own index name intact.
common_samples = df_sample.index.intersection(df_relab_asv.index).tolist()
df_sample = df_sample.loc[common_samples]
df_relab_asv = df_relab_asv.loc[common_samples]

# Compute oral fraction

In [3]:
# Oral bacterial fraction per sample: the summed relative abundance of all
# ASVs that appear as a query in the BLAST table read below.
df_blast_99 = pd.read_csv("blast_ungapped_filteredHOMD/blast_filteredHOMD_p99.txt", sep="\t", comment="#", header=None)
df_blast_99.columns = ['query_accver', 'subject_accver', 'perc_identity', 'alignment_length', 'mismatches', 'gap_opens', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore']

# Select the matching ASV columns with a boolean mask. The original indexed
# with a set, which pandas 2.x rejects with a TypeError; the mask also keeps
# the columns in table order, so the row sums are reproducible between runs.
is_oral_asv = df_relab_asv.columns.isin(df_blast_99.query_accver)
df_oral_total = df_relab_asv.loc[:, is_oral_asv].sum(axis=1).to_frame()
df_oral_total.columns = ['OralFrac_CuratedHOMD']

# Sort by oral fraction, breaking ties by SampleID, then restore SampleID as the index.
df_oral_total = df_oral_total.reset_index('SampleID').sort_values(['OralFrac_CuratedHOMD','SampleID']).set_index('SampleID')
df_oral_total.head()

Unnamed: 0_level_0,OralFrac_CuratedHOMD
SampleID,Unnamed: 1_level_1
1002,0.0
1006,0.0
1008,0.0
1013,0.0
1017,0.0


# Load qPCR table

In [4]:
# Load the qPCR table; the first CSV column becomes the row index.
df_qpcr = pd.read_csv(
    "tblqpcr.csv",
    index_col=0,
)
# Preview the first five rows.
df_qpcr.head(n=5)

Unnamed: 0_level_0,qPCR16S
SampleID,Unnamed: 1_level_1
1015A,53793850.0
1015D,3877.487
1015G,5280994.0
1015H,733244.6
1016A,5752710.0


# Load antibiotic table

In [5]:
# Load the drug table and keep only rows flagged as anti-infective.
# The explicit `== True` comparison is kept on purpose: it yields False for
# missing values instead of failing on a non-boolean mask.
df_drug = pd.read_csv("tbldrug.csv", low_memory=False)
df_drug = df_drug[df_drug.AntiInfective==True].reset_index(drop=True)

# Expand every administration interval into one record per timepoint.
# Each record repeats the original row and appends the timepoint plus the
# matching transplant-relative day (start day shifted by the elapsed timepoints).
single_day_columns = list(df_drug.columns) + ['Timepoint', 'DayRelativeToNearestHCT']
single_day_records = []
for row_label in df_drug.index:
    first_tp = df_drug.loc[row_label, 'StartTimepoint']
    last_tp = df_drug.loc[row_label, 'StopTimepoint']
    first_day = df_drug.loc[row_label, 'StartDayRelativeToNearestHCT']
    row_values = list(df_drug.loc[row_label])
    single_day_records.extend(
        row_values + [tp, tp - first_tp + first_day]
        for tp in np.arange(first_tp, last_tp + 1)
    )
df_drug_single_day = pd.DataFrame(single_day_records, columns=single_day_columns)

# The interval columns are redundant once each row describes a single day.
df_drug_single_day = df_drug_single_day.drop(
    columns=['StartTimepoint', 'StopTimepoint', 'StartDayRelativeToNearestHCT', 'StopDayRelativeToNearestHCT']
)

# Work on an independent copy, drop rows without a transplant-relative day,
# then remove exact duplicate rows.
df_drug = deepcopy(df_drug_single_day)
df_drug = df_drug[df_drug.DayRelativeToNearestHCT.notnull()]
df_drug = df_drug.drop_duplicates()

# only keep antibiotics
antibiotic_categories = [
    'aminoglycosides', 'carbapenems', 'cephalosporins', 'glycopeptide antibiotics', 'glycylcyclines',
    'leprostatics', 'lincomycin derivatives', 'macrolide derivatives', 'miscellaneous antibiotics',
    'oxazolidinone antibiotics', 'penicillins', 'quinolones', 'sulfonamides', 'tetracyclines',
]
df_drug = df_drug[df_drug.Category.isin(antibiotic_categories)]


df_drug.head()

Unnamed: 0,PatientID,Factor,Category,AntiInfective,Route,Timepoint,DayRelativeToNearestHCT
0,1000,ciprofloxacin,quinolones,True,intravenous,-160,-169
2,1000,aztreonam,miscellaneous antibiotics,True,intravenous,-151,-160
3,1000,vancomycin,glycopeptide antibiotics,True,intravenous,-151,-160
4,1000,aztreonam,miscellaneous antibiotics,True,intravenous,-150,-159
6,1000,vancomycin,glycopeptide antibiotics,True,intravenous,-150,-159


# Find samples with >=X% oral bacteria, no antibiotic on that day, and qPCR available

# Compute K_g/K_o ratio per patient

In [6]:
# Per-patient fit.
# A patient is considered once a sample with oral fraction >= 0.8 is found that
# was collected at a timepoint with no antibiotic record. All of that patient's
# antibiotic-free samples with an oral fraction and a qPCR value then enter a
# least-squares fit of
#     qPCR * oral / K_o + qPCR * (1 - oral) / K_g = 1
# so K_o and K_g are the reciprocals of the two fitted coefficients.
res = []
pid_visited = []
high_oral_samples = df_oral_total.loc[df_oral_total.OralFrac_CuratedHOMD >= 0.8]
for sid in high_oral_samples.index:
    tps = df_sample.loc[sid, 'Timepoint']
    pid = df_sample.loc[sid, 'PatientID']
    if pid in pid_visited:
        continue

    # Skip this sample if any antibiotic was recorded at its timepoint.
    same_day_drug = df_drug[(df_drug.PatientID == pid) & (df_drug.Timepoint == tps)]
    if len(same_day_drug) != 0:
        continue

    # Gather every sample of this patient taken at a timepoint without
    # antibiotics, joined (inner, on the index) with oral fraction and qPCR.
    patient_samples = df_sample[df_sample.PatientID == pid]
    drug_timepoints = list(df_drug[df_drug.PatientID == pid].Timepoint)
    drug_free_samples = (
        patient_samples[~patient_samples.Timepoint.isin(drug_timepoints)]
        .merge(df_oral_total, left_index=True, right_index=True)
        .merge(df_qpcr, left_index=True, right_index=True)
        .sort_values('Timepoint')
    )
    n_samples = len(drug_free_samples)
    if n_samples <= 1:
        # Fewer than two rows: skip this patient (two coefficients are fitted).
        continue

    # Design matrix: one row [qPCR * oral, qPCR * (1 - oral)] per sample;
    # the right-hand side is a vector of ones.
    oral = drug_free_samples.OralFrac_CuratedHOMD.to_numpy()
    qpcr = drug_free_samples.qPCR16S.to_numpy()
    a = np.column_stack((qpcr * oral, qpcr * (1 - oral)))
    b = np.ones(n_samples, dtype=int)

    res = lsq_linear(a, b, lsmr_tol='auto', verbose=0)
    K_o, K_g = 1./res.x
    pid_visited.append(pid)
    print("Patient ID = %s, number of samples = %d, K_g/K_o = %2.2f" % (pid, n_samples, K_g/K_o))

Patient ID = FMT.0181, number of samples = 2, K_g/K_o = 15.86
Patient ID = FMT.0126, number of samples = 11, K_g/K_o = -4.45
Patient ID = FMT.0105, number of samples = 2, K_g/K_o = -0.11
Patient ID = 1167, number of samples = 8, K_g/K_o = -15.42
Patient ID = 1186, number of samples = 5, K_g/K_o = 3.69
Patient ID = FMT.0158, number of samples = 9, K_g/K_o = 0.88
Patient ID = FMT.0154, number of samples = 8, K_g/K_o = 9.69
Patient ID = 1261, number of samples = 6, K_g/K_o = 21.76


# Compute K_g/K_o ratio by combining all patients

In [7]:
# Pooled fit across patients, repeated for several oral-fraction thresholds.
# For each threshold, a patient contributes once a sample with oral fraction
# >= perc is found at a timepoint with no antibiotic record; all of that
# patient's antibiotic-free samples with oral fraction and qPCR are then added
# as rows of one shared least-squares problem
#     qPCR * oral / K_o + qPCR * (1 - oral) / K_g = 1
# so K_o and K_g are the reciprocals of the two fitted coefficients.
for perc in [0.5,0.6,0.7,0.8,0.9]:
    pid_visited = []
    a = []
    b = []
    for sid in df_oral_total[df_oral_total.OralFrac_CuratedHOMD>=perc].index:
        tps = df_sample.loc[sid,'Timepoint']
        pid = df_sample.loc[sid,'PatientID']
        if pid in pid_visited:
            continue
        curr_df_drug = df_drug[(df_drug.PatientID==pid) & (df_drug.Timepoint==tps)]

        # Only proceed when no antibiotic was recorded at the sample's timepoint.
        if len(curr_df_drug) != 0:
            continue

        # Collect all samples of this patient taken at timepoints without
        # antibiotics, joined (inner, on the index) with oral fraction and qPCR.
        curr_df_sample = df_sample[df_sample.PatientID==pid]
        curr_df_drug = df_drug[(df_drug.PatientID==pid)]
        filtered_curr_df_sample = curr_df_sample[~curr_df_sample.Timepoint.isin(list(curr_df_drug.Timepoint))]
        filtered_curr_df_sample = pd.merge(filtered_curr_df_sample, df_oral_total, left_index=True, right_index=True)
        filtered_curr_df_sample = pd.merge(filtered_curr_df_sample, df_qpcr, left_index=True, right_index=True).sort_values('Timepoint')
        if len(filtered_curr_df_sample) == 0:
            continue

        # Append one design row [qPCR * oral, qPCR * (1 - oral)] per sample,
        # each with a right-hand side of 1.
        oral = filtered_curr_df_sample.OralFrac_CuratedHOMD.to_numpy()
        qpcr = filtered_curr_df_sample.qPCR16S.to_numpy()
        a.extend(np.column_stack((qpcr * oral, qpcr * (1 - oral))).tolist())
        b.extend([1] * len(filtered_curr_df_sample))
        pid_visited.append(pid)

    # With no eligible rows lsq_linear would fail on mismatched shapes;
    # report the situation and move on to the next threshold instead.
    if len(a) == 0:
        print(perc, len(pid_visited), "K_g/K_o not estimated (no eligible samples)")
        continue

    a = np.array(a)
    b = np.array(b)
    res=lsq_linear(a,b,lsmr_tol='auto', verbose=0)
    K_o,K_g = 1./res.x
    print(perc, len(pid_visited), "K_g/K_o = %2.2f" % (K_g/K_o))

0.5 23 K_g/K_o = 1.25
0.6 17 K_g/K_o = -1.08
0.7 14 K_g/K_o = -1.89
0.8 10 K_g/K_o = 21.07
0.9 5 K_g/K_o = 35.69


In [10]:
# Divide 2.92e10 by 21.07, the pooled K_g/K_o value printed above for the 0.8 threshold.
# NOTE(review): the origin and units of 2.92e10 are not stated in this notebook — confirm.
2.92e10/21.07

1385856668.248695