In [1]:
import pandas as pd
import numpy as np
from copy import deepcopy
from scipy.optimize import lsq_linear

# Load metadata and count table

In [6]:
# Read sample metadata and drop samples without a transplant-relative day.
df_sample = pd.read_csv('tblASVsamples.csv', index_col=0)
df_sample = df_sample[df_sample.DayRelativeToNearestHCT.notnull()]

# Read the long-format count table and pivot it to a SampleID x ASV matrix.
# aggfunc='sum' (string) is the spelling pandas recommends; passing np.sum
# triggers a FutureWarning in recent pandas versions.
df_count = pd.read_csv('tblcounts_asv_melt.csv')
df_count = pd.pivot_table(df_count, index='SampleID', columns='ASV', values='Count', aggfunc='sum').fillna(0)

# Keep samples with at least 1000 reads that also have metadata.
# A boolean mask is used instead of `.loc[set(...)]`: pandas >= 2.0 rejects
# set indexers, and set iteration order made the row order non-reproducible.
reads_per_sample = df_count.sum(axis=1)
sample_ids_w_suff_reads = list(reads_per_sample[reads_per_sample >= 1000].index)
df_count = df_count.loc[(reads_per_sample >= 1000) & df_count.index.isin(df_sample.index)]
df_relab = df_count.div(df_count.sum(axis=1), axis=0)

# Find common samples and align both tables on them (order follows df_sample).
# A plain list is used as the indexer so each frame keeps its own index name.
common_sample_ids = df_sample.index.intersection(df_relab.index).tolist()
common_samples = set(common_sample_ids)  # kept as a set for backward compatibility
df_sample = df_sample.loc[common_sample_ids]
df_relab = df_relab.loc[common_sample_ids]

# Compute oral fraction

In [7]:
# Get the oral bacterial fraction of every sample.
# Tabular BLAST results; lines starting with '#' are skipped as comments.
# NOTE(review): the file name suggests ASVs matched against an HMPv35 oral
# reference — confirm the exact BLAST settings with whoever produced the file.
df_blast = pd.read_csv("blast_HMPv35oral/blast_HMPv35oral_p100.txt", sep="\t", comment="#", header=None)
df_blast.columns = ['query_accver', 'subject_accver', 'perc_identity', 'alignment_length', 'mismatches', 'gap_opens', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore']

# ASVs that have a BLAST hit, listed in df_relab column order. A list is used
# instead of a set because pandas >= 2.0 rejects set indexers and set
# iteration order is not reproducible between runs.
asvs_with_hit = set(df_blast.query_accver)
oral_asvs = [asv for asv in df_relab.columns if asv in asvs_with_hit]

# Per-sample sum of the relative abundances of those ASVs.
df_oral_total = df_relab[oral_asvs].sum(axis=1).to_frame()
df_oral_total.columns = ['OralFrac_HMPv35oral']

# Sort by oral fraction, breaking ties by SampleID.
df_oral_total = df_oral_total.reset_index('SampleID').sort_values(['OralFrac_HMPv35oral','SampleID']).set_index('SampleID')
df_oral_total.head()

Unnamed: 0_level_0,OralFrac_HMPv35oral
SampleID,Unnamed: 1_level_1
1001,0.0
1002,0.0
1008,0.0
1017,0.0
103A,0.0


# Load qPCR table

In [8]:
# Load the qPCR table; the first CSV column becomes the row index.
df_qpcr = pd.read_csv(
    "tblqpcr.csv",
    index_col=0,
)
# Preview the first five rows.
df_qpcr.head(5)

Unnamed: 0_level_0,qPCR16S
SampleID,Unnamed: 1_level_1
1015A,53793850.0
1015D,3877.487
1015G,5280994.0
1015H,733244.6
1016A,5752710.0


# Load antibiotic table

In [9]:
# Drug categories that are treated as antibiotics in this analysis.
ANTIBIOTIC_CATEGORIES = [
    'aminoglycosides','carbapenems','cephalosporins','glycopeptide antibiotics','glycylcyclines',
    'leprostatics','lincomycin derivatives','macrolide derivatives','miscellaneous antibiotics',
    'oxazolidinone antibiotics', 'penicillins','quinolones','sulfonamides','tetracyclines',
]

df_drug = pd.read_csv("tbldrug.csv", low_memory=False)
# Keep rows explicitly flagged as anti-infective (a missing flag compares as False).
df_drug = df_drug[df_drug.AntiInfective.eq(True)].reset_index(drop=True)

# Convert to single-day entries: every administration interval from
# StartTimepoint to StopTimepoint (inclusive for integer timepoints) becomes
# one row per day, with the transplant-relative day shifted accordingly.
single_day_rows = []
for index in df_drug.index:
    start_tps = df_drug.loc[index,'StartTimepoint']
    stop_tps = df_drug.loc[index,'StopTimepoint']
    start_day = df_drug.loc[index,'StartDayRelativeToNearestHCT']
    # The row's own values do not change from day to day: look them up once
    # per interval instead of once per expanded day.
    row_values = list(df_drug.loc[index])
    for timepoint in np.arange(start_tps, stop_tps+1):
        single_day_rows.append(row_values + [timepoint, timepoint-start_tps+start_day])
df_drug_single_day = pd.DataFrame(single_day_rows, columns=list(df_drug.columns)+['Timepoint','DayRelativeToNearestHCT'])
df_drug_single_day = df_drug_single_day.drop(['StartTimepoint','StopTimepoint','StartDayRelativeToNearestHCT','StopDayRelativeToNearestHCT'], axis=1)

# Boolean filtering already returns a new frame, so no explicit deep copy of
# df_drug_single_day is needed before filtering.
df_drug = df_drug_single_day[df_drug_single_day.DayRelativeToNearestHCT.notnull()]
df_drug = df_drug.drop_duplicates()

# only keep antibiotics
df_drug = df_drug[df_drug.Category.isin(ANTIBIOTIC_CATEGORIES)]


df_drug.head()

Unnamed: 0,PatientID,Factor,Category,AntiInfective,Route,Timepoint,DayRelativeToNearestHCT
0,1000,ciprofloxacin,quinolones,True,intravenous,-160,-169
2,1000,aztreonam,miscellaneous antibiotics,True,intravenous,-151,-160
3,1000,vancomycin,glycopeptide antibiotics,True,intravenous,-151,-160
4,1000,aztreonam,miscellaneous antibiotics,True,intravenous,-150,-159
6,1000,vancomycin,glycopeptide antibiotics,True,intravenous,-150,-159


# Find samples with >=X% oral bacteria, no antibiotic on that day, and qPCR available

In [None]:
# For each oral-fraction threshold, select patients that have at least one
# sample with oral fraction >= threshold taken on a day without antibiotics.
# All antibiotic-free samples of those patients (with qPCR available) then
# contribute one equation each to the least-squares system
#     qPCR * x * (1/K_o) + qPCR * (1 - x) * (1/K_g) = 1
# where x is the oral fraction.
# NOTE(review): K_o / K_g are presumably the oral and gut carrying
# capacities — confirm against the accompanying write-up.
for perc in [0.3,0.4,0.5,0.6,0.7]:
    pid_visited = []          # patients that contributed rows to the fit
    pid_without_data = set()  # patients already found to have no usable samples
    design_rows = []
    for sid in df_oral_total[df_oral_total.OralFrac_HMPv35oral>=perc].index:
        tps = df_sample.loc[sid,'Timepoint']
        pid = df_sample.loc[sid,'PatientID']
        if pid in pid_visited or pid in pid_without_data:
            continue

        # all antibiotic days of this patient (filtered once, reused below)
        curr_df_drug = df_drug[df_drug.PatientID==pid]

        # the qualifying sample must have no antibiotic use on its collection day
        if (curr_df_drug.Timepoint==tps).any():
            continue

        # find all samples of this patient collected on antibiotic-free days
        # for which both the oral fraction and a qPCR value are available
        curr_df_sample = df_sample[df_sample.PatientID==pid]
        filtered_curr_df_sample = curr_df_sample[~curr_df_sample.Timepoint.isin(list(curr_df_drug.Timepoint))]
        filtered_curr_df_sample = pd.merge(filtered_curr_df_sample, df_oral_total, left_index=True, right_index=True)
        filtered_curr_df_sample = pd.merge(filtered_curr_df_sample, df_qpcr, left_index=True, right_index=True).sort_values('Timepoint')
        # missing values would put NaN into the linear system and poison the fit
        filtered_curr_df_sample = filtered_curr_df_sample.dropna(subset=['OralFrac_HMPv35oral','qPCR16S'])
        if len(filtered_curr_df_sample) == 0:
            # this result depends only on the patient, so do not recompute it
            # for the patient's other samples
            pid_without_data.add(pid)
            continue

        # one equation per sample: [qPCR*x, qPCR*(1-x)] . [1/K_o, 1/K_g] = 1
        design_rows.extend(
            [y*x, y*(1-x)]
            for x,y in zip(filtered_curr_df_sample.OralFrac_HMPv35oral, filtered_curr_df_sample.qPCR16S)
        )
        pid_visited.append(pid)

    # lsq_linear cannot be called on an empty system; report and move on
    if not design_rows:
        print(perc, 0, "no antibiotic-free samples with qPCR available; fit skipped")
        continue

    # solve the linear least-squares problem; res.x holds (1/K_o, 1/K_g)
    a = np.array(design_rows)
    b = np.ones(len(design_rows))
    res=lsq_linear(a,b,lsmr_tol='auto', verbose=0)
    K_o,K_g = 1./res.x
    print(perc, len(pid_visited), "K_g/K_o = %2.2f" % (K_g/K_o))

In [10]:
# NOTE(review): bare literals with no link to any variable computed in this
# notebook; presumably a K_g estimate divided by a K_g/K_o ratio (i.e. K_o) —
# confirm where both numbers come from.
2.92e10/21.07

1385856668.248695