In [1]:
# imports
import sys
import pandas as pd
import re

# constants
# Bounds, in days, of the follow-up window used by the analysis cells: each value is
# converted with pd.Timedelta(days=...) and a later sample qualifies when
# MIN_T <= (its SampleDate - the labelled sample's SampleDate) <= MAX_T (both ends inclusive).
MIN_T = 3
MAX_T = 28

In [2]:
# previous label sets
# Each table is ordered by patient (StudyID) and then by SampleDate, and is indexed by
# its 'Stool ID' values; drop=False keeps 'Stool ID' available as a regular column too.
mdro_pos_large = (
    pd.read_csv('./mdro_large_metadata.csv')
    .sort_values(['StudyID', 'SampleDate'])
    .set_index('Stool ID', drop=False)
)
mdro_pos_small = (
    pd.read_csv('./mdro_pos.csv')
    .sort_values(['StudyID', 'SampleDate'])
    .set_index('Stool ID', drop=False)
)

# previous StudyID
# unique() keeps first-appearance order, i.e. the sorted order established above.
mdro_pos_large_sid = mdro_pos_large['StudyID'].unique()
mdro_pos_small_sid = mdro_pos_small['StudyID'].unique()

In [3]:
# raw data, ordered by patient (StudyID) and then by SampleDate
md = pd.read_excel('PLT_all_stool_isolates_seq_old.xlsx').sort_values(['StudyID', 'SampleDate'])

# select only the patients relevant for each dataset;
# each subset gets a fresh 0..n-1 index that follows the sorted order
md_large = md.loc[md['StudyID'].isin(mdro_pos_large_sid), :].reset_index(drop=True)
md_small = md.loc[md['StudyID'].isin(mdro_pos_small_sid), :].reset_index(drop=True)

# assert patient list is the same, in the same order
assert all(md_large['StudyID'].unique() == mdro_pos_large_sid)
assert all(md_small['StudyID'].unique() == mdro_pos_small_sid)

In [24]:
# Spot check for patient 33: for each 'KPCisolate' entry (cast to str), show the regex
# match for a '3' followed by four digits, or None when the entry contains no such run.
(
    md.loc[md['StudyID'] == 33, 'KPCisolate']
    .astype(str)
    .map(lambda text: re.search(r'3\d{4}', text))
)

1071    <re.Match object; span=(0, 5), match='35411'>
1073                                             None
1074                                             None
1072                                             None
1066                                             None
1075                                             None
1067                                             None
1076                                             None
1068    <re.Match object; span=(0, 5), match='35665'>
1069    <re.Match object; span=(0, 5), match='35673'>
1070                                             None
Name: KPCisolate, dtype: object

In [35]:
def any_MDRO(s):
    """Report whether a sample row is flagged in any of its three MDRO fields.

    The fields 'ESBLisolate', 'KPCisolate' and 'VRE +/-' are read from ``s``
    (selected with a list of labels, so ``s`` must support that, e.g. a
    pandas Series holding one row) and each value is converted with ``str``.

    Returns True when at least one of the three strings contains a '+' or
    contains a '3' followed by four digits anywhere in it; otherwise False.
    """
    isolate_id = re.compile(r'3\d{4}')
    # Unpacking into three names keeps the requirement that exactly three values come back.
    esbl, kpc, vre = (str(value) for value in s[['ESBLisolate', 'KPCisolate', 'VRE +/-']])
    for text in (esbl, kpc, vre):
        if '+' in text or isolate_id.search(text):
            return True
    return False

In [36]:
# MDRO_POS_SMALL ANALYSIS
# For every patient in mdro_pos_small: take the labelled sample, find the first later
# sample collected between MIN_T and MAX_T days after it, and set `persistence` to
# any_MDRO() of that follow-up sample. One output row is written per patient.

min_t = pd.Timedelta(days=MIN_T)
max_t = pd.Timedelta(days=MAX_T)
corrected_mdro_pos_small = pd.DataFrame(
    columns = ['Stool ID', 'SampleDate', 'StudyID', 'TxDate', 'ESBLisolate', 'KPCisolate', 'VRE +/-', 'MDRO'] + [
        'Future_Stool_ID', 'Future_SampleDate', 'Future_ESBLisolate', 'Future_KPCisolate', 'Future_VRE_+/-', 'MIN_T', 'MAX_T', 'persistence'
    ]
)

# Get a pointer to the location of each sample in mdro_pos_small to its location in md_small.
small_stool_mdidx = md_small.index[md_small['Stool ID'].isin(mdro_pos_small['Stool ID'])]

# make sure the samples pointed to are the same
assert(all(md_small.loc[small_stool_mdidx, 'Stool ID'].values == mdro_pos_small['Stool ID'].values))

# md_small and mdro_pos_small are both ordered by ['StudyID', 'SampleDate'], and there is
# one sample per StudyID (patient) in mdro_pos_small. Therefore the ith sample in
# small_stool_mdidx is from the same StudyID as the ith iterate of md_small.groupby('StudyID').
# The loop below relies on that pairing, so verify it instead of assuming it.
assert len(small_stool_mdidx) == md_small['StudyID'].nunique(), \
    'expected exactly one mdro_pos_small sample per StudyID in md_small'

for i, (sid, df) in enumerate(md_small.groupby('StudyID')):

    # Row label (in md_small) of the sample selected in mdro_pos_small for this patient
    anchor_idx = small_stool_mdidx[i]
    assert anchor_idx in df.index, \
        f'sample at md_small row {anchor_idx} does not belong to StudyID {sid}'

    # For the current patient (StudyID),
    # get the sample selected in mdro_pos_small plus all samples collected after it
    df_sample_future = df.loc[anchor_idx:, :]

    # filter on the time window
    # Get the time delta between the sample in mdro_pos_small and each future sample
    delta_t = df_sample_future.loc[:, 'SampleDate'] - df.loc[anchor_idx, 'SampleDate']

    # only select samples in the [MIN_T, MAX_T] window
    # (the selected sample itself has a delta of 0, so it is excluded whenever MIN_T > 0)
    t_filter = ((min_t <= delta_t) & (delta_t <= max_t))
    df_sample_future = df_sample_future.loc[t_filter, :]

    # Without a follow-up sample no label can be assigned; fail with a message that
    # names the patient rather than an anonymous IndexError from .iloc[0].
    if df_sample_future.empty:
        raise ValueError(
            f'StudyID {sid}: no sample collected between {MIN_T} and {MAX_T} days '
            f"after Stool ID {df.loc[anchor_idx, 'Stool ID']}"
        )

    # assign persistence label based on closest collected future sample
    # (rows are ordered by SampleDate, so the first row in the window is the closest one)
    persistence = any_MDRO(df_sample_future.iloc[0, :])

    # store mdro_pos_small metadata
    corrected_mdro_pos_small.loc[len(corrected_mdro_pos_small), :] = \
        list(df.loc[anchor_idx, ['Stool ID', 'SampleDate', 'StudyID', 'TxDate', 'ESBLisolate', 'KPCisolate', 'VRE +/-']]) + \
        [any_MDRO(df.loc[anchor_idx, :])] + \
        list(df_sample_future.iloc[0, :][['Stool ID', 'SampleDate', 'ESBLisolate', 'KPCisolate', 'VRE +/-']]) + \
        [MIN_T, MAX_T, persistence]
corrected_mdro_pos_small.to_csv('corrected_mdro_pos_small.csv')
corrected_mdro_pos_small.persistence.value_counts()
    

persistence
True     39
False    10
Name: count, dtype: int64

In [38]:
# MDRO_POS_LARGE_ANALYSIS
# For every patient in mdro_pos_large: take the labelled sample, find the first later
# sample collected between MIN_T and MAX_T days after it, and set `persistence` to
# any_MDRO() of that follow-up sample. One output row is written per patient.

min_t = pd.Timedelta(days=MIN_T)
max_t = pd.Timedelta(days=MAX_T)
corrected_mdro_pos_large = pd.DataFrame(
    columns = ['Stool ID', 'SampleDate', 'StudyID', 'TxDate', 'ESBLisolate', 'KPCisolate', 'VRE +/-', 'MDRO'] + [
        'Future_Stool_ID', 'Future_SampleDate', 'Future_ESBLisolate', 'Future_KPCisolate', 'Future_VRE_+/-', 'MIN_T', 'MAX_T', 'persistence'
    ]
)

# Get a pointer to the location of each sample in mdro_pos_large to its location in md_large.
large_stool_mdidx = md_large.index[md_large['Stool ID'].isin(mdro_pos_large['Stool ID'])]

# make sure the samples pointed to are the same
assert(all(md_large.loc[large_stool_mdidx, 'Stool ID'].values == mdro_pos_large['Stool ID'].values))

# md_large and mdro_pos_large are both ordered by ['StudyID', 'SampleDate'], and there is
# one sample per StudyID (patient) in mdro_pos_large. Therefore the ith sample in
# large_stool_mdidx is from the same StudyID as the ith iterate of md_large.groupby('StudyID').
# The loop below relies on that pairing, so verify it instead of assuming it.
assert len(large_stool_mdidx) == md_large['StudyID'].nunique(), \
    'expected exactly one mdro_pos_large sample per StudyID in md_large'

for i, (sid, df) in enumerate(md_large.groupby('StudyID')):

    # Row label (in md_large) of the sample selected in mdro_pos_large for this patient
    anchor_idx = large_stool_mdidx[i]
    assert anchor_idx in df.index, \
        f'sample at md_large row {anchor_idx} does not belong to StudyID {sid}'

    # For the current patient (StudyID),
    # get the sample selected in mdro_pos_large plus all samples collected after it
    df_sample_future = df.loc[anchor_idx:, :]

    # filter on the time window
    # Get the time delta between the sample in mdro_pos_large and each future sample
    delta_t = df_sample_future.loc[:, 'SampleDate'] - df.loc[anchor_idx, 'SampleDate']

    # only select samples in the [MIN_T, MAX_T] window
    # (the selected sample itself has a delta of 0, so it is excluded whenever MIN_T > 0)
    t_filter = ((min_t <= delta_t) & (delta_t <= max_t))
    df_sample_future = df_sample_future.loc[t_filter, :]

    # Without a follow-up sample no label can be assigned; fail with a message that
    # names the patient rather than an anonymous IndexError from .iloc[0].
    if df_sample_future.empty:
        raise ValueError(
            f'StudyID {sid}: no sample collected between {MIN_T} and {MAX_T} days '
            f"after Stool ID {df.loc[anchor_idx, 'Stool ID']}"
        )

    # assign persistence label based on closest collected future sample
    # (rows are ordered by SampleDate, so the first row in the window is the closest one)
    persistence = any_MDRO(df_sample_future.iloc[0, :])

    # store mdro_pos_large metadata
    corrected_mdro_pos_large.loc[len(corrected_mdro_pos_large), :] = \
        list(df.loc[anchor_idx, ['Stool ID', 'SampleDate', 'StudyID', 'TxDate', 'ESBLisolate', 'KPCisolate', 'VRE +/-']]) + \
        [any_MDRO(df.loc[anchor_idx, :])] + \
        list(df_sample_future.iloc[0, :][['Stool ID', 'SampleDate', 'ESBLisolate', 'KPCisolate', 'VRE +/-']]) + \
        [MIN_T, MAX_T, persistence]
corrected_mdro_pos_large.to_csv('corrected_mdro_pos_large.csv')
print(corrected_mdro_pos_large.to_string())
# Kept as the cell's last expression so the label counts are actually displayed;
# placed before the print() the result was computed and silently discarded.
corrected_mdro_pos_large.persistence.value_counts()

   Stool ID           SampleDate StudyID               TxDate                                    ESBLisolate       KPCisolate           VRE +/-   MDRO Future_Stool_ID    Future_SampleDate                                  Future_ESBLisolate Future_KPCisolate              Future_VRE_+/- MIN_T MAX_T persistence
0     30004  2014-04-07 00:00:00       5                  NaT                                            NaN            35022             35888   True           30011  2014-04-17 00:00:00                                                 NaN                No                       35892     3    28        True
1     30029  2014-05-20 00:00:00       6  2014-03-26 00:00:00                                             No               No             35834   True           30036  2014-06-04 00:00:00                                                  No                No                       35835     3    28        True
2     30006  2014-04-11 00:00:00       9  2014-04-07 00:00:00         