In [1]:
# Import Modules

import pandas as pd
import numpy as np
import datetime as dt


In [2]:
# Load Datasets

microlab = pd.read_csv('../Data/2017-10-05 - full_microlab_data.txt', sep = "\t", low_memory = False)
medications = pd.read_csv('../Data/2017-10-05 - full_medications_data.txt', sep = "\t", low_memory = False)
dose_concentration = pd.read_csv('../Data/DOSE_Concentration.txt', sep = "\t", low_memory = False)


### ESKAPE Pathogens
### _

In [3]:
# Extract ESKAPE Pathogens.


# ESKAPE Pathogens cited in literatureL:
    #Enterococcus faecium
    #Staphylococcus aureus
    #Klebsiella pneumoniae
    #Acinetobacter baumannii
    #Pseudomonas aeruginosa
    #Enterobacter spp. (ENTEROBACTER AEROGENES, ENTEROBACTER CLOACAE)

eskape = [
    "ACINETOBACTER BAUMANNII",
    "ENTEROCOCCUS FAECALIS",
    "ENTEROCOCCUS FAECIUM",
    "STAPHYLOCOCCUS AUREUS",
    "KLEBSIELLA PNEUMONIAE",
    "PSEUDOMONAS AERUGINOSA",
    "ENTEROBACTER CLOACAE",
    "ENTEROBACTER AEROGENES"
    ]

# Clean Organism names
microlab = microlab.replace("ACINETOBACTER BAUMANNII COMPLEX", "ACINETOBACTER BAUMANNII")
microlab = microlab.replace("ESCHERICHIA COLI 0157:H7", "ESCHERICHIA COLI")
microlab = microlab.replace("ESCHERICHIA COLI, AN ESBL PRODUCER", "ESCHERICHIA COLI")
microlab = microlab.replace("HVISA POSITIVE STAPHYLOCOCCUS AUREUS-OXACILLIN RESISTANT", "STAPHYLOCOCCUS AUREUS")
microlab = microlab.replace("HVISA POSITIVE STAPHYLOCOCCUS AUREUS-OXACILLIN SUSCEPTIBLE", "STAPHYLOCOCCUS AUREUS")
microlab = microlab.replace("STAPHYLOCOCCUS AUREUS-OXACILLIN RESISTANT", "STAPHYLOCOCCUS AUREUS")
microlab = microlab.replace("STAPHYLOCOCCUS AUREUS-OXACILLIN SUSCEPTIBLE", "STAPHYLOCOCCUS AUREUS")
microlab = microlab.replace("KLEBSIELLA PNEUMONIAE/OXYTOCA", "KLEBSIELLA PNEUMONIAE")
microlab = microlab.replace("PSEUDOMONAS AERUGINOSA-MUCOID PHENOTYPE", "PSEUDOMONAS AERUGINOSA")
microlab = microlab.replace("ENTEROBACTER CLOACAE SSP. CLOACAE", "ENTEROBACTER CLOACAE")

medications = medications.replace("ACINETOBACTER BAUMANNII COMPLEX", "ACINETOBACTER BAUMANNII")
medications = medications.replace("ESCHERICHIA COLI 0157:H7", "ESCHERICHIA COLI")
medications = medications.replace("ESCHERICHIA COLI, AN ESBL PRODUCER", "ESCHERICHIA COLI")
medications = medications.replace("HVISA POSITIVE STAPHYLOCOCCUS AUREUS-OXACILLIN RESISTANT", "STAPHYLOCOCCUS AUREUS")
medications = medications.replace("HVISA POSITIVE STAPHYLOCOCCUS AUREUS-OXACILLIN SUSCEPTIBLE", "STAPHYLOCOCCUS AUREUS")
medications = medications.replace("STAPHYLOCOCCUS AUREUS-OXACILLIN RESISTANT", "STAPHYLOCOCCUS AUREUS")
medications = medications.replace("STAPHYLOCOCCUS AUREUS-OXACILLIN SUSCEPTIBLE", "STAPHYLOCOCCUS AUREUS")
medications = medications.replace("KLEBSIELLA PNEUMONIAE/OXYTOCA", "KLEBSIELLA PNEUMONIAE")
medications = medications.replace("PSEUDOMONAS AERUGINOSA-MUCOID PHENOTYPE", "PSEUDOMONAS AERUGINOSA")
medications = medications.replace("ENTEROBACTER CLOACAE SSP. CLOACAE", "ENTEROBACTER CLOACAE")

# Subset ESKAPE
microlab = microlab[microlab['Organism'].isin(eskape)]
medications = medications[medications['Organism'].isin(eskape)]    


### Microlab Data Cleaning
### _

In [4]:
# Clean Microlab Data

# Clean the Susceptibility Results
#microlab = microlab.loc[microlab['Source2'] == "Blood"]
microlab = microlab.replace('+', "Resistant")
microlab = microlab.replace('POS', "Resistant")
microlab = microlab.replace('Intermediate', "Resistant")
microlab = microlab.replace('Not Susceptible', "Resistant")
microlab = microlab.replace('', "Susceptible")
microlab = microlab.replace('-', "Susceptible")
microlab = microlab.replace('NEG', "Susceptible")
microlab = microlab.replace('Susceptible dose dependent', "Susceptible")
microlab = microlab.dropna(subset = ['Suscept'])

# Fix Abx Names
microlab['Abx Name'] = microlab['Abx Name'].replace('Cefazolin.', 'Cefazolin')
microlab['Abx Name'] = microlab['Abx Name'].replace('Cefazolin-Urine', 'Cefazolin')
microlab['Abx Name'] = microlab['Abx Name'].replace('Trimethroprim-sulfamethazole', 'Trimethoprim/Sulfamethoxazole')

# Select combinations of pathogens and Vitek tests where at least 25% of isolates were tested
output = pd.DataFrame([])
for organism in microlab['Organism'].unique(): 
    subset = microlab.loc[microlab['Organism'] == organism]
    length = len(subset['ACC_NUM'].unique())
    
    for agent in subset['Abx Name'].unique():
        subset2 = subset.loc[subset['Abx Name'] == agent]
        length2 = len(subset2['ACC_NUM'].unique())
        ratio = (length2/length)*100
        
        if ratio >= 25:
            output = output.append(pd.DataFrame({'Organism': organism,
                                                 'Abx Name': agent,
                                                 'Total Count': length,
                                                 'Abx Length': length2,
                                                 'Ratio': ratio},
                                                index=[0]),
                                   ignore_index=True)
output = output[['Organism', 'Abx Name']]
microlab = pd.merge(microlab, output, on = ['Organism', 'Abx Name'], how = 'right')

# Clean the MICs
# Remove Symbols
microlab['SENSITIVITY_VALUE'] = microlab['SENSITIVITY_VALUE'].replace('>', '')
microlab['SENSITIVITY_VALUE'] = microlab['SENSITIVITY_VALUE'].replace('<', '')
microlab['SENSITIVITY_VALUE'] = microlab['SENSITIVITY_VALUE'].replace('=', '')

# # Binary tests, like GENT Synergy
microlab.loc[(microlab['Suscept'] == 'Susceptible') & (microlab['Abx Name'] == 'Gent. Synergy'), 'SENSITIVITY_VALUE'] = '0'
microlab.loc[(microlab['Suscept'] == 'Resistant') & (microlab['Abx Name'] == 'Gent. Synergy'), 'SENSITIVITY_VALUE'] = '1'
microlab.loc[(microlab['Suscept'] == 'Susceptible') & (microlab['Abx Name'] == 'Strep. Synergy'), 'SENSITIVITY_VALUE'] = '0'
microlab.loc[(microlab['Suscept'] == 'Resistant') & (microlab['Abx Name'] == 'Strep. Synergy'), 'SENSITIVITY_VALUE'] = '1'
microlab.loc[(microlab['Suscept'] == 'Susceptible') & (microlab['Abx Name'] == 'ESBL Scrn'), 'SENSITIVITY_VALUE'] = '0'
microlab.loc[(microlab['Suscept'] == 'Resistant') & (microlab['Abx Name'] == 'ESBL Scrn'), 'SENSITIVITY_VALUE'] = '1'

# # Disk Measurements
microlab.loc[(microlab['Suscept'] == 'Susceptible') & (microlab['SENSITIVITY_VALUE'].str.contains('mm') == True), 'SENSITIVITY_VALUE'] = '0'
microlab.loc[(microlab['Suscept'] == 'Resistant') & (microlab['SENSITIVITY_VALUE'].str.contains('mm') == True), 'SENSITIVITY_VALUE'] = '1'

# # Other
microlab['SENSITIVITY_VALUE'] = microlab['SENSITIVITY_VALUE'].replace('I', 1.0)
microlab['SENSITIVITY_VALUE'] = microlab['SENSITIVITY_VALUE'].str.replace('-', '')
microlab['SENSITIVITY_VALUE'] = microlab['SENSITIVITY_VALUE'].str.replace('/', './')

# Convert to numeric format and drop the remainder (we assume there isn't enough information to work with)
microlab['SENSITIVITY_VALUE'] = pd.to_numeric(microlab['SENSITIVITY_VALUE'], errors='coerce')

# Process Data
# Order the data and keep the first sample from each patient visit
microlab['SPECIMEN_TAKEN_DATE'] = pd.to_datetime(microlab['SPECIMEN_TAKEN_DATE'], format = "%Y-%m-%d")
microlab = microlab.loc[microlab['SPECIMEN_TAKEN_DATE'] >= dt.date(2012, 1, 1)]
microlab = microlab.loc[microlab['SPECIMEN_TAKEN_DATE'] <= dt.date(2016, 12, 31)]
microlab['HOSP_DISCHRG_TIME'] = pd.to_datetime(microlab['HOSP_DISCHRG_TIME'], format = "%Y-%m-%d")
microlab['HOSP_ADMSN_TIME'] = pd.to_datetime(microlab['HOSP_ADMSN_TIME'], format = "%Y-%m-%d")
microlab = microlab.sort_values(by = ['ID', 'SPECIMEN_TAKEN_DATE', 'VISIT_ID'])
microlab = microlab.dropna(subset=['ID', 'Organism', 'Abx Name', 'VISIT_ID', 'SPECIMEN_TAKEN_DATE'])

# Convert the susceptibility profile into a number. Resistant = 1, 0 for everything else

def resistance(microlab):
    if microlab['Suscept'] == "Resistant":
        return 1.0
    else:
        return 0.0

# Because your dates do not start on the first of each year, convert the days into years by difference.

def quarter(microlab):
    quarter = int(( microlab['Day'] % 91.25 ) / 25)
    quarter = quarter * 0.25
    return ( (microlab['Day'] // 91.3125) + quarter )

def year(microlab):
    year = int(microlab['Day'] / 365.25)
    year = year + 1
    return year
    
microlab['Status_Numeric'] = microlab.apply(resistance, axis = 1)
microlab['Day'] = (microlab['SPECIMEN_TAKEN_DATE'] - dt.date(2012, 1, 1)).astype('timedelta64[D]')
microlab['Quarter'] = microlab.apply(quarter, axis =1)
microlab['Year'] = microlab.apply(year, axis = 1)

# Calculate LOS and HAI cases
microlab['LOS'] = (microlab['HOSP_DISCHRG_TIME'] - microlab['HOSP_ADMSN_TIME']).astype('timedelta64[D]')
microlab['HAI'] = (microlab['SPECIMEN_TAKEN_DATE'] - microlab['HOSP_ADMSN_TIME']).astype('timedelta64[D]')

ValueError: cannot set a Timestamp with a non-timestamp

### Medications Data Cleaning
### _

In [None]:
# Clean Medications Data

# Combine the separate files together
medications = pd.merge(medications, dose_concentration, on='MED_NAME_x', how='left')

# Drop rows with missing data
medications = medications.dropna(subset=['WHO_DOSE', 'SPECIMEN_TAKEN_DATE'])

# Convert all Dose into numeric structure
medications['DOSE'] = medications['DOSE'].str.split('-').str[0]
medications['DOSE'] = pd.to_numeric(medications['DOSE'])
medications['DOSE_CONCENTRATION'] = pd.to_numeric(medications['DOSE_CONCENTRATION'])

# Standardize Units for DOSE_UNIT that will NOT be calculated
medications.replace({'drop': 'Topical / Drop'}, regex=True)
medications.replace({'applicator': 'Topical / Drop'}, regex=True)
medications.replace({'strip': 'Topical / Drop'}, regex=True)
medications.replace({'%': 'Topical / Drop'}, regex=True)
medications.replace({'Squirt': 'Topical / Drop'}, regex=True)
medications.replace({'packet': 'Topical / Drop'}, regex=True)
medications.replace({'each': 'Topical / Drop'}, regex=True)
medications.replace({'tsp': 'Tablet'}, regex=True)
medications.replace({'Tube': 'Topical / Drop'}, regex=True)
medications.replace({'Dose': 'Topical / Drop'}, regex=True)
medications.replace({'Applicatorful': 'Applicator'}, regex=True)
medications.replace({'inch': 'Applicator'}, regex=True)

# DOSE_UNITS = g and DOSE > 1000 (Not happy about this...these are one off fixes without a secure method)
medications.loc[(medications['DOSE_UNIT'] == 'g') & (medications['WHO_DOSE_UNIT'] == 'g') & (medications['DOSE'] >= 10), 'DOSE'] = (medications['DOSE'] / 1000)
medications.loc[(medications['DOSE_UNIT'] == 'tablet') & (medications['WHO_DOSE_UNIT'] == 'g') & (medications['DOSE'] >= 10), 'DOSE'] = (medications['DOSE'] / 1000)
# DOSE_UNITS = mg
medications.loc[(medications['DOSE_UNIT'] == 'mg') & (medications['WHO_DOSE_UNIT'] == 'g'), 'DOSE'] = (medications['DOSE'] / 1000)
medications.loc[(medications['DOSE_UNIT'] == 'mg') & (medications['WHO_DOSE_UNIT'] == 'g'), 'DOSE_UNIT'] = 'g'
# DOSE_UNITS = mL
medications.loc[(medications['DOSE_UNIT'] == 'mL'), 'DOSE'] = (medications['DOSE'] * medications['DOSE_CONCENTRATION'])/1000
medications.loc[(medications['DOSE_UNIT'] == 'mL'), 'DOSE_UNIT'] = 'g'
# DOSE_UNITS = Tablets
medications.loc[(medications['DOSE_UNIT'] == 'tablet'), 'DOSE_UNIT'] = 'Tablet'
medications.loc[(medications['DOSE_UNIT'] == 'capsule'), 'DOSE_UNIT'] = 'Tablet'
medications.loc[(medications['DOSE_UNIT'] == 'packet'), 'DOSE_UNIT'] = 'Tablet'
medications.loc[(medications['DOSE_UNIT'] == 'tsp'), 'DOSE_UNIT'] = 'Tablet'
medications.loc[(medications['DOSE_UNIT'] == 'Tablet') & (medications['WHO_DOSE_UNIT'] == 'g'), 'DOSE'] = medications['DOSE'] * medications['DOSE_CONCENTRATION'] / 1000
medications.loc[(medications['DOSE_UNIT'] == 'Tablet') & (medications['WHO_DOSE_UNIT'] == 'g'), 'DOSE_UNIT'] = 'g'
medications.loc[(medications['DOSE_UNIT'] == 'Tablet') & (medications['WHO_DOSE_UNIT'] == 'mg'), 'DOSE'] = medications['DOSE'] * medications['DOSE_CONCENTRATION']
medications.loc[(medications['DOSE_UNIT'] == 'Tablet') & (medications['WHO_DOSE_UNIT'] == 'mg'), 'DOSE_UNIT'] = 'mg'
# DOSE_UNITS = Topical / Drops
medications.loc[(medications['DOSE_UNIT'] == 'drop'), 'DOSE_UNIT'] = 'Topical / Drop'
medications.loc[(medications['DOSE_UNIT'] == 'applicator'), 'DOSE_UNIT'] = 'Topical / Drop'
medications.loc[(medications['DOSE_UNIT'] == 'strip'), 'DOSE_UNIT'] = 'Topical / Drop'
medications.loc[(medications['DOSE_UNIT'] == '%'), 'DOSE_UNIT'] = 'Topical / Drop'
medications.loc[(medications['DOSE_UNIT'] == 'Squirt'), 'DOSE_UNIT'] = 'Topical / Drop'
medications.loc[(medications['DOSE_UNIT'] == 'each'), 'DOSE_UNIT'] = 'Topical / Drop'
medications.loc[(medications['DOSE_UNIT'] == 'Tube'), 'DOSE_UNIT'] = 'Topical / Drop'
medications.loc[(medications['DOSE_UNIT'] == 'Dose'), 'DOSE_UNIT'] = 'Topical / Drop'
# DOSE_UNITS = Million Units
medications.loc[(medications['DOSE_UNIT'] == 'Million Units'), 'DOSE_CONCENTRATION'] = 0.00059988
medications.loc[(medications['DOSE_UNIT'] == 'Million Units') & (medications['WHO_DOSE_UNIT'] == 'g') & (medications['DOSE'] >= 100.0), 'DOSE_CONCENTRATION'] = (medications['DOSE'] * medications['DOSE_CONCENTRATION']) / 1000
medications.loc[(medications['DOSE_UNIT'] == 'Million Units') & (medications['WHO_DOSE_UNIT'] == 'g') & (medications['DOSE'] < 100.0), 'DOSE_CONCENTRATION'] = (medications['DOSE'] * 1000000 * medications['DOSE_CONCENTRATION']) / 1000
medications.loc[(medications['DOSE_UNIT'] == 'Million Units') & (medications['WHO_DOSE_UNIT'] == 'g'), 'DOSE_UNIT'] = 'g'
# DOSE_UNITS = Units
medications.loc[(medications['DOSE_UNIT'] == 'Units'), 'DOSE_CONCENTRATION'] = 0.00059988
medications.loc[(medications['DOSE_UNIT'] == 'Units') & (medications['WHO_DOSE_UNIT'] == 'g'), 'DOSE'] = (medications['DOSE'] * medications['DOSE_CONCENTRATION']) / 1000
medications.loc[(medications['DOSE_UNIT'] == 'Units') & (medications['WHO_DOSE_UNIT'] == 'g'), 'DOSE_UNIT'] = 'g'
# DOSE_UNITS = mg/kg/day
medications.loc[(medications['DOSE_UNIT'] == 'mg/kg/day') & (medications['WHO_DOSE_UNIT'] == 'g'), 'DOSE'] = (medications['DOSE'] * 62)/1000
medications.loc[(medications['DOSE_UNIT'] == 'mg/kg/day') & (medications['WHO_DOSE_UNIT'] == 'g'), 'DOSE_UNIT'] = 'g'
# DOSE_UNITS = mg/kg
medications.loc[(medications['DOSE_UNIT'] == 'mg/kg') & (medications['WHO_DOSE_UNIT'] == 'g'), 'DOSE'] = (medications['DOSE'] * 62)/1000
medications.loc[(medications['DOSE_UNIT'] == 'mg/kg') & (medications['WHO_DOSE_UNIT'] == 'g'), 'DOSE_UNIT'] = 'g'
# DOSE_UNITS = mg/mL
medications.loc[(medications['DOSE_UNIT'] == 'mg/mL') & (medications['WHO_DOSE_UNIT'] == 'g'), 'DOSE'] = (medications['DOSE'] * medications['DOSE_CONCENTRATION']) / 1000
medications.loc[(medications['DOSE_UNIT'] == 'mg/mL') & (medications['WHO_DOSE_UNIT'] == 'g'), 'DOSE_UNIT'] = 'g'
# DOSE_UNITS = g/day
medications.loc[(medications['DOSE_UNIT'] == 'g/day'), 'DOSE_UNIT'] = 'g'
# DOSE_UNITS = mg of piperacillin
medications.loc[(medications['DOSE_UNIT'] == 'mg of piperacillin'), 'DOSE'] = medications['DOSE'] / 1000
medications.loc[(medications['DOSE_UNIT'] == 'mg of piperacillin'), 'DOSE_UNIT'] = 'g'
# DOSE_UNITS = mg/hr
medications.loc[(medications['DOSE_UNIT'] == 'mg/hr'), 'DOSE'] = medications['DOSE'] / 1000
medications.loc[(medications['DOSE_UNIT'] == 'mg/hr'), 'DOSE_UNIT'] = 'g'
# DOSE_UNITS = mg of ampicillin
medications.loc[(medications['DOSE_UNIT'] == 'mg of ampicillin'), 'DOSE'] = medications['DOSE'] / 1000
medications.loc[(medications['DOSE_UNIT'] == 'mg of ampicillin'), 'DOSE_UNIT'] = 'g'
# DOSE_UNITS = mg of mg/day
medications.loc[(medications['DOSE_UNIT'] == 'mg/day'), 'DOSE'] = medications['DOSE'] / 1000
medications.loc[(medications['DOSE_UNIT'] == 'mg/day'), 'DOSE_UNIT'] = 'g'
# DOSE_UNITS = Units/kg/day
medications.loc[(medications['DOSE_UNIT'] == 'Units/kg/day'), 'DOSE_CONCENTRATION'] = 0.00059988
medications.loc[(medications['DOSE_UNIT'] == 'Units/kg/day'), 'DOSE'] = (medications['DOSE'] * medications['DOSE_CONCENTRATION'] * 62) / 1000
medications.loc[(medications['DOSE_UNIT'] == 'Units/kg/day'), 'DOSE_UNIT'] = 'g'
# DOSE_UNITS = Units/kg
medications.loc[(medications['DOSE_UNIT'] == 'Units/kg'), 'DOSE_CONCENTRATION'] = 0.00059988
medications.loc[(medications['DOSE_UNIT'] == 'Units/kg'), 'DOSE'] = (medications['DOSE'] * medications['DOSE_CONCENTRATION'] * 62) / 1000
medications.loc[(medications['DOSE_UNIT'] == 'Units/kg'), 'DOSE_UNIT'] = 'g'
# DOSE_UNITS = Million Units/day
medications.loc[(medications['DOSE_UNIT'] == 'Million Units/day'), 'DOSE_CONCENTRATION'] = 0.00059988
medications.loc[(medications['DOSE_UNIT'] == 'Million Units/day') & (medications['DOSE'] >= 100.0), 'DOSE'] = (medications['DOSE'] * medications['DOSE_CONCENTRATION']) / 1000
medications.loc[(medications['DOSE_UNIT'] == 'Million Units/day') & (medications['DOSE'] < 100.0), 'DOSE'] = (medications['DOSE'] * medications['DOSE_CONCENTRATION'] * 1000000) / 1000
medications.loc[(medications['DOSE_UNIT'] == 'Million Units/day'), 'DOSE_UNIT'] = 'g'
# DOSE_UNITS = mg/kg of piperacillin
medications.loc[(medications['DOSE_UNIT'] == 'mg/kg of piperacillin'), 'DOSE'] = (medications['DOSE'] * 62) / 1000
medications.loc[(medications['DOSE_UNIT'] == 'mg/kg of piperacillin'), 'DOSE_UNIT'] = 'g'

# Fix Date Formats
medications['SPECIMEN_TAKEN_DATE'] = pd.to_datetime(medications['SPECIMEN_TAKEN_DATE'])
medications['ORDER_START_TIME'] = pd.to_datetime(medications['ORDER_START_TIME'])
medications['ORDER_END_TIME'] = pd.to_datetime(medications['ORDER_END_TIME'])
medications['HOSP_ADMSN_TIME'] = pd.to_datetime(medications['HOSP_ADMSN_TIME'])
medications['HOSP_DISCHRG_TIME'] = pd.to_datetime(medications['HOSP_DISCHRG_TIME'])
medications = medications.loc[medications['ORDER_START_TIME'] >= dt.date(2012, 1, 1)] # We are doing this because there is missing data in 2010 and 2011
medications = medications.loc[medications['ORDER_START_TIME'] <= dt.date(2016, 12, 31)]

# Calculate DDD
medications['Total_ABX_Consumption'] = medications['DOSE'] * medications['FREQ_ADJ']
medications['DDD'] = medications['Total_ABX_Consumption'] / medications['WHO_DOSE']
medications = medications.replace([np.inf, -np.inf], np.nan)
medications = medications.dropna(subset=['DDD'])

# Restack the dataframe
medications = medications[[
        'ORDER_START_TIME',
        'ORDER_END_TIME',
        'ID',
        'VISIT_ID',
        'Organism',
        'Antibiotic-Treatment',
        'WHO_ROUTE',
        'Total_ABX_Consumption',
        'DDD',
        'HOSP_ADMSN_TIME',
        'HOSP_DISCHRG_TIME',
        'SPECIMEN_TAKEN_DATE'
    ]]

medications = pd.concat([
        
        pd.DataFrame({
                'Date': pd.date_range(row['ORDER_START_TIME'], row['ORDER_END_TIME']),
                'ID': row['ID'],
                'VISIT_ID': row['VISIT_ID'],
                'Organism': row['Organism'],
                'Antibiotic-Treatment': row['Antibiotic-Treatment'],
                'WHO_ROUTE': row['WHO_ROUTE'],
                'Total_ABX_Consumption': row['Total_ABX_Consumption'],
                'DDD': row['DDD'],
                'ORDER_START_TIME': row['ORDER_START_TIME'],
                'ORDER_END_TIME': row['ORDER_END_TIME'],
                'HOSP_ADMSN_TIME': row['HOSP_ADMSN_TIME'],
                'HOSP_DISCHRG_TIME': row['HOSP_DISCHRG_TIME'],
                'SPECIMEN_TAKEN_DATE': row['SPECIMEN_TAKEN_DATE']                
                     },
                     
                     columns=[
                'Date',
                'ID',
                'VISIT_ID',
                'Organism',
                'Antibiotic-Treatment',
                'WHO_ROUTE',
                'Total_ABX_Consumption',
                'DDD',
                'ORDER_START_TIME',
                'ORDER_END_TIME',
                'HOSP_ADMSN_TIME',
                'HOSP_DISCHRG_TIME',
                'SPECIMEN_TAKEN_DATE'
                            ]
                    )
        
           for i, row in medications.iterrows()],
                        ignore_index=True)

# Because your dates do not start on the first of each year, convert the days into years by difference.

def quarter(medications):
    quarter = int(( medications['Day'] % 91.25 ) / 25)
    quarter = quarter * 0.25
    return ( (medications['Day'] // 91.3125) + quarter )

def year(medications):
    year = int(medications['Day'] / 365.25)
    year = year + 1
    return year
    
medications['Day'] = (medications['Date'] - dt.date(2012, 1, 1)).astype('timedelta64[D]')
medications['Quarter'] = medications.apply(quarter, axis =1)
medications['Year'] = medications.apply(year, axis = 1)

# Calculate LOS, LOT and HAI cases
medications['LOT'] = (medications['ORDER_END_TIME'] - medications['ORDER_START_TIME']).astype('timedelta64[D]')
medications['LOS'] = (medications['HOSP_DISCHRG_TIME'] - medications['HOSP_ADMSN_TIME']).astype('timedelta64[D]')
medications['HAI'] = (medications['SPECIMEN_TAKEN_DATE'] - medications['HOSP_ADMSN_TIME']).astype('timedelta64[D]')

### Write output files
### _

In [9]:
# Save the data
microlab.to_csv('clean_microlab_data.txt', sep='\t', index=False)
medications.to_csv('clean_medications_data.txt', sep='\t', index=False)