In [1]:
# imports
import sys
import pandas as pd
import re

# constants
MIN_T = 3
MAX_T = 28

In [2]:
# previous label sets
mdro_pos_large = pd.read_csv('./mdro_large_metadata.csv')
mdro_pos_large.sort_values(['StudyID', 'SampleDate'], inplace=True)
mdro_pos_large.index = mdro_pos_large['Stool ID']
mdro_pos_small = pd.read_csv('./mdro_pos.csv')
mdro_pos_small.sort_values(['StudyID', 'SampleDate'], inplace=True)
mdro_pos_small.index = mdro_pos_small['Stool ID']


# previous StudyID
mdro_pos_large_sid = mdro_pos_large.StudyID.unique()
mdro_pos_small_sid = mdro_pos_small.StudyID.unique()

In [3]:
# raw data
md = pd.read_excel('PLT_all_stool_isolates_seq_old.xlsx')
md.sort_values(['StudyID', 'SampleDate'], inplace=True)

# select only the patients relevant for each dataset
md_large = md.loc[md.StudyID.isin(mdro_pos_large_sid),:]
md_large.reset_index(drop=True, inplace=True)
md_small = md.loc[md.StudyID.isin(mdro_pos_small_sid),:]
md_small.reset_index(drop=True, inplace=True)

# assert patient list is the same, in the same order
assert(all(md_large.StudyID.unique() == mdro_pos_large_sid))
assert(all(md_small.StudyID.unique() == mdro_pos_small_sid))

In [4]:
def any_MDRO(s):
    rgx=r'3\d{4}'
    esbl, kpc, vre = s[['ESBLisolate', 'KPCisolate', 'VRE +/-']]
    esbl = str(esbl)
    kpc = str(kpc)
    vre = str(vre)
    if ('+' in esbl or '+' in kpc or '+' in vre):
        return True
    elif re.search(rgx, esbl) or re.search(rgx, kpc) or re.search(rgx, vre):
        return True
    else:
        return False

In [5]:
# MDRO_POS_SMALL ANALYSIS

min_t = pd.Timedelta(days=MIN_T)
max_t = pd.Timedelta(days=MAX_T)
corrected_mdro_pos_small = pd.DataFrame(
    columns = ['Stool ID', 'SampleDate', 'StudyID', 'TxDate', 'ESBLisolate', 'KPCisolate', 'VRE +/-', 'MDRO'] + [
        'Future_Stool_ID', 'Future_SampleDate', 'Future_ESBLisolate', 'Future_KPCisolate', 'Future_VRE_+/-', 'MIN_T', 'MAX_T', 'persistence'
    ]
)

# Get a pointer to the location of each sample in mdro_pos to its location in metadata. 
small_stool_mdidx = md_small.index[md_small['Stool ID'].isin(mdro_pos_small['Stool ID'])]

# make sure the samples pointed to are the same
assert(all(md_small.loc[small_stool_mdidx, 'Stool ID'].values == mdro_pos_small['Stool ID'].values))

# md_small and mdro_pos_small are both ordered by ['Study ID', 'Sample Date'], and there is one sample per Study ID (patient) in mdro_pos_small
# Therefore the ith sample in mdro_stool_mdidx is from the same StudyID as the ith iterate of md_small.groupby('StudyID) 
for i, (sid, df) in enumerate(md_small.groupby('StudyID')):
    
    # For the the current patient (Study ID), 
    # get the sample selected in mdro_pos_small plus all samples collected after it
    df_sample_future = df.loc[small_stool_mdidx[i]:, ]
    
    # filter on the time window
    # Get the delta in days between the sample in mdro_pos and future sample
    delta_t = df_sample_future.loc[:, 'SampleDate'] - df.loc[small_stool_mdidx[i], 'SampleDate']
    
    # only select samples in the [MIN_T, MAX_T]  window
    t_filter = ((min_t <= delta_t) & (delta_t <= max_t))
    df_sample_future = df_sample_future.loc[t_filter, :]
    
    # assign persistence label based on closest collected future sample
    persistence = any_MDRO(df_sample_future.iloc[0, :])
    
    # store mdro_pos_small metadata
    corrected_mdro_pos_small.loc[len(corrected_mdro_pos_small), :] = \
        list(df.loc[small_stool_mdidx[i], ['Stool ID', 'SampleDate', 'StudyID', 'TxDate', 'ESBLisolate', 'KPCisolate', 'VRE +/-']]) + \
        [any_MDRO(df.loc[small_stool_mdidx[i], :])] + \
        list(df_sample_future.iloc[0, :][['Stool ID', 'SampleDate', 'ESBLisolate', 'KPCisolate', 'VRE +/-']]) + \
        [MIN_T, MAX_T, persistence]
corrected_mdro_pos_small.to_csv('corrected_mdro_pos_small.csv')
corrected_mdro_pos_small.persistence.value_counts() 
    
    

persistence
True     39
False    10
Name: count, dtype: int64

In [6]:
# MDRO_POS_LARGE_ANALYSIS

min_t = pd.Timedelta(days=MIN_T)
max_t = pd.Timedelta(days=MAX_T)
corrected_mdro_pos_large = pd.DataFrame(
    columns = ['Stool ID', 'SampleDate', 'StudyID', 'TxDate', 'ESBLisolate', 'KPCisolate', 'VRE +/-', 'MDRO'] + [
        'Future_Stool_ID', 'Future_SampleDate', 'Future_ESBLisolate', 'Future_KPCisolate', 'Future_VRE_+/-', 'MIN_T', 'MAX_T', 'persistence'
    ]
)

# Get a pointer to the location of each sample in mdro_pos to its location in metadata. 
large_stool_mdidx = md_large.index[md_large['Stool ID'].isin(mdro_pos_large['Stool ID'])]

# make sure the samples pointed to are the same
assert(all(md_large.loc[large_stool_mdidx, 'Stool ID'].values == mdro_pos_large['Stool ID'].values))

# md_large and mdro_pos_large are both ordered by ['Study ID', 'Sample Date'], and there is one sample per Study ID (patient) in mdro_pos_large
# Therefore the ith sample in mdro_stool_mdidx is from the same StudyID as the ith iterate of md_large.groupby('StudyID) 
for i, (sid, df) in enumerate(md_large.groupby('StudyID')):
    
    # For the the current patient (Study ID), 
    # get the sample selected in mdro_pos_large plus all samples collected after it
    df_sample_future = df.loc[large_stool_mdidx[i]:, ]
    
    # filter on the time window
    # Get the delta in days between the sample in mdro_pos and future sample
    delta_t = df_sample_future.loc[:, 'SampleDate'] - df.loc[large_stool_mdidx[i], 'SampleDate']
    
    # only select samples in the [MIN_T, MAX_T]  window
    t_filter = ((min_t <= delta_t) & (delta_t <= max_t))
    df_sample_future = df_sample_future.loc[t_filter, :]
    
    # assign persistence label based on closest collected future sample
    persistence = any_MDRO(df_sample_future.iloc[0, :])
    
    # store mdro_pos_large metadata
    corrected_mdro_pos_large.loc[len(corrected_mdro_pos_large), :] = \
        list(df.loc[large_stool_mdidx[i], ['Stool ID', 'SampleDate', 'StudyID', 'TxDate', 'ESBLisolate', 'KPCisolate', 'VRE +/-']]) + \
        [any_MDRO(df.loc[large_stool_mdidx[i], :])] + \
        list(df_sample_future.iloc[0, :][['Stool ID', 'SampleDate', 'ESBLisolate', 'KPCisolate', 'VRE +/-']]) + \
        [MIN_T, MAX_T, persistence]
corrected_mdro_pos_large.to_csv('corrected_mdro_pos_large.csv')
corrected_mdro_pos_large.persistence.value_counts() 
    

persistence
True     58
False    40
Name: count, dtype: int64

In [8]:
import numpy as np


In [9]:
islts = pd.read_excel('/Users/talkorem/Library/CloudStorage/Dropbox/Columbia/Projects/ACU_PLT/PLT_all_stool_isolates.xlsx')

islts.rename(columns = {'Stool ID':'StoolID', 'VRE +/-':'VRE'}, inplace = True)

islts = islts[islts.StudyID.notnull()] #lose 2 samples
islts = islts[~islts.ESBLisolate.apply(lambda v: type(v) is str and "SAME AS" in v)]#remove 4 duplciates
islts.head()

islts['ESBL'] = islts.ESBLisolate.replace([0, 'N', 'No', '-', 'n', 'N (NO MAC GROWTH)', 'N (ODD)'], 'No').\
    replace(['?', '35379- FALSE + DUE TO BAD PLATES', '35387 (FALSE + DUE TO BAD PLATES)',
            '35393 (FALSE + DUE TO BAD PLATES)'], np.nan).\
     apply(lambda v: np.nan if type(v) is str and 'BOTH FALSE + DUE TO BAD PLATES' in v else v)

islts['KPC'] = islts.KPCisolate.replace(['N', 'n', '-', 0, 'N (AFTER SUB CX)', 'N (NO MAC GROWTH)'], 'No').\
                 replace(['?', 'N (SMALL BLUE?)', '?N '], np.nan)

islts['VRE'] = islts.VRE.replace(['N', '-', 'n', '0'], 'No').replace(['+ (NEG on reculture 3/31)', 'NEG on reculture 4/12'], np.nan).\
    replace(['+', '+ (not pink)', '+ (mauve + blue)', '+ (pink + white)', '+ (mauve)', '+ (white)'], 'Yes')


islts = islts.drop(['ESBLisolate', 'KPCisolate'], axis = 1)

#PTNT 186 IS WEIRD W 31331 31332
dupstlid = [31012, 30522, 30100, 30176, 30947, 30327, 30483, 30496, 30919, 31098, 
            31332, 30973, 31277, 31820, 31501, 31899, 31828, 31884, 31808, 32033]

print(islts.shape)
islts = islts[~islts.StoolID.isin(dupstlid)]
print(islts.shape)

(2056, 7)
(2036, 7)


In [28]:
islts['Clear'] = (islts[['VRE', 'ESBL', 'KPC']] == 'No').all(1)


In [37]:
islts['Positive'] = islts[['VRE', 'ESBL', 'KPC']].applymap(lambda v: v != 'No' and pd.notnull(v)).any(axis=1)


  islts['Positive'] = islts[['VRE', 'ESBL', 'KPC']].applymap(lambda v: v != 'No' and pd.notnull(v)).any(axis=1)


In [84]:
git = iter(islts.groupby('StudyID'))

In [95]:
def get_outcome(g):
    g = g.sort_values('SampleDate')
    g = g[(g[['Clear', 'Positive']] == True).any(axis=1)]
    gj = g.join(g.shift(-1), rsuffix='_nxt')
    gj = gj[((gj.SampleDate_nxt - gj.SampleDate) >= min_t) & ((gj.SampleDate_nxt - gj.SampleDate) < max_t)]
    gj['Outcome'] = gj.apply(lambda r: 'Clearance' if r['Clear_nxt'] == True else 'Persistence' if r['Positive_nxt'] == True else np.nan, axis = 1)
    return gj

In [109]:
outcome = islts.groupby('StudyID').apply(get_outcome, include_groups=False).reset_index().drop('level_1', axis = 1)
outcome = outcome[outcome.Positive]


In [114]:
corrected_mdro_pos_small[['Stool ID', 'persistence']].merge(outcome, left_on='Stool ID', right_on = 'StoolID', how='left')[['persistence', 'Outcome']].value_counts(dropna=False).sort_index()


persistence  Outcome    
False        Clearance       8
             Persistence     1
             NaN             1
True         Persistence    36
             NaN             3
Name: count, dtype: int64

In [115]:
corrected_mdro_pos_large[['Stool ID', 'persistence']].merge(outcome, left_on='Stool ID', right_on = 'StoolID', how='left')[['persistence', 'Outcome']].value_counts(dropna=False).sort_index()

persistence  Outcome    
False        Clearance      34
             NaN             6
True         Persistence    54
             NaN             4
Name: count, dtype: int64