## Combining Datasets

#### - This script combines the datasets provided by EPIC, e.g. adding the Visit IDs to each susceptibility profile
#### - This will need to be updated each time a new dataset is incorporated
#### _

In [1]:
# Import Modules

import pandas as pd
import numpy as np


In [2]:
# Load Datasets
#
# The three reports are tab-separated text exports that share the same read
# options, so they are read in a single pass and unpacked in the order
# microlab, encounters, medications.

microlab, encounters, medications = [
    pd.read_csv(report_path, sep="\t", low_memory=False)
    for report_path in (
        '../Data/2017-10-05 - Micro_Bacterium_Report-Deidentified.txt',
        '../Data/2017-10-05 - Encounters_Report-Deidentified.txt',
        '../Data/2017-10-05 - Medications_Report-Deidentified.txt',
    )
]


#### Combine Encounters into Microlab report
#### _

In [None]:
# Match hospital admission, discharge and VISIT IDs with the microbiology data (Run on MINERVA)
#
# Inputs : `microlab` and `encounters` DataFrames loaded earlier in the notebook.
# Outputs: `full_microlab_data` (every microlab row, plus the encounter columns
#          of the visit(s) whose admission-discharge window contains the
#          specimen date) and the file 'full_microlab_data.txt'.

# Subset the Microlab data to reduce the memory burden. Only the distinct
# (patient, specimen date) pairs are needed to look up visits, so duplicates
# are dropped BEFORE the merge: every repeated pair would otherwise be
# multiplied by all of that patient's encounters in the intermediate join.
microlab_subset = microlab[[
        'ID',
        'SPECIMEN_TAKEN_DATE',
    ]].drop_duplicates()

# Subset the encounters data to reduce memory burden
encounters_subset = encounters[[
        'ID',
        'AGE',
        'SEX',
        'BMI',
        'DEPARTMENT_NAME',
        'ENCOUNTER_TYPE',
        'SOCIAL_HX_TOBACCO_USER',
        'HOSP_ADMSN_TIME',
        'HOSP_DISCHRG_TIME',
        'VISIT_ID'
    ]].drop_duplicates()

# Merge the two subset datasets together to create a VISIT_ID list
visit_IDs = pd.merge(microlab_subset, encounters_subset, how = 'left', on = 'ID') # We only care about Patients that have Microlab results, therefore "LEFT"

# Keep only the encounters whose admission-discharge window contains the
# specimen date. The files were read without date parsing, so these columns
# hold the raw file values (text, unless the export is numeric); comparing
# text orders it lexicographically, which is only chronological for
# zero-padded ISO timestamps. Parse into temporary Series for the comparison
# and leave the stored values untouched, so the merge keys below and the saved
# file keep their original formatting.
# NOTE(review): pd.to_datetime raises on values it cannot parse - confirm the
# export uses one consistent timestamp format.
specimen_time = pd.to_datetime(visit_IDs['SPECIMEN_TAKEN_DATE'])
admission_time = pd.to_datetime(visit_IDs['HOSP_ADMSN_TIME'])
discharge_time = pd.to_datetime(visit_IDs['HOSP_DISCHRG_TIME'])

# Rows with a missing admission, discharge or specimen time compare as False
# and are therefore excluded.
visit_IDs = visit_IDs[(admission_time <= specimen_time) & (discharge_time >= specimen_time)]
visit_IDs = visit_IDs.drop_duplicates() # Defensive: both inputs were already de-duplicated

# Merge the Visit IDs to the original Microlab data.
# NOTE(review): a specimen that falls inside more than one encounter window
# yields one output row per matching encounter - confirm this is intended.
full_microlab_data = pd.merge(microlab, visit_IDs, how = 'left', on = ['ID', 'SPECIMEN_TAKEN_DATE'])

# Save the data
full_microlab_data.to_csv('full_microlab_data.txt', sep='\t', index=False)


#### Combine Microlab into Medications report
#### _

In [None]:
# Match hospital admission, discharge and VISIT IDs with the medications data (Run on MINERVA)
#
# Inputs : `full_microlab_data` (built in the previous cell) and `medications`.
# Outputs: `full_medications_data` (every medication row, plus the organism and
#          specimen information recorded for the same visit) and the file
#          'full_medications_data.txt'.

# Subset the full_microlab_data to reduce memory burden. Using this dataset (and not encounters) because we want the matching organism information
microlab_subset = full_microlab_data[[
        'VISIT_ID',
        'HOSP_ADMSN_TIME',
        'HOSP_DISCHRG_TIME',
        'Organism',
        'SPECIMEN_TAKEN_DATE'
        ]]

# Microlab rows that were never matched to an encounter carry a null VISIT_ID.
# pandas joins null keys to each other, so leaving them in would attach every
# unmatched specimen to every medication row that is itself missing a
# VISIT_ID. Drop them: they cannot identify a visit.
microlab_subset = microlab_subset.dropna(subset=['VISIT_ID'])

microlab_subset = microlab_subset.drop_duplicates()

# Merge the Microlab visit/organism information onto the Medications data.
# Each medication row is repeated once per distinct organism/specimen row
# recorded for its visit.
full_medications_data = pd.merge(medications, microlab_subset, how = 'left', on=['VISIT_ID']) # Merge left because we want all of the medication data.

# Save the data
full_medications_data.to_csv('full_medications_data.txt', sep='\t', index=False)
