In [1]:
# Import libraries
import os
import sys

import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split

In [2]:
#### TODO: delete this cell before sharing - leftover (commented-out) warning suppression from development
#import warnings
#warnings.filterwarnings('ignore')
#pd.options.mode.chained_assignment = None  # default='warn'

In [3]:
# Set up paths & import functions
# The project root is taken to be the parent of the current working directory,
# so this cell only resolves correctly when the kernel's cwd is a direct
# sub-folder of the project.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
src_folder = os.path.join(project_root, 'src')
# Put src/ at the front of sys.path so the modules below are found first.
sys.path.insert(0, src_folder)
# NOTE(review): star imports hide where helper names come from, and on a name
# clash the later module silently wins - keep this order. Helpers used later in
# the notebook but not defined in it (e.g. get_diagnosis_groups, add_chart_data,
# to_s3) presumably come from these modules; consider explicit imports.
from stats_and_visualisations import *
from patient_selection import *
from utilities import *
from modeling import *

In [4]:
### --- OPTIONS
# These values are passed to select_patients_and_select_chartevents in the
# final cell of the notebook.

# Extra exclusions applied when selecting the subject and base groups
optional_exclusions = ['first_diagnosis_only', 'exclude_newborns', 'exclude_deaths']
# Demographic columns the base group is down-sampled on so that its proportions
# match the subject group
match_on = ['age_adm_bucket', 'gender']
# Demographic columns added to the output datasets
profile_data = ['age_adm_bucket', 'gender']
# If True, comparison graphs between the subject and base groups are produced
show_graphs = True
# Proportion of rows placed in the test set (between 0 and 1)
test_size = 0.2

In [5]:
def select_patients_and_select_chartevents(diagnosis_id, diagnosis_name,
                                           test_size=0.33,
                                           optional_exclusions=None,
                                           profile_data=None,
                                           match_on=False,
                                           show_graphs=False,
                                           bucket='mimic-jamesi',
                                           random_state=8):

    '''

    Function that for a given diagnosis icd9 code:
        1) Finds patients who were diagnosed with the condition (target=1) and a 'base' group who were never diagnosed
           with the condition (target=0)
        2) Adds the final chart and lab events for each admission, and (optionally) additional patient demographic
           data such as gender and age. It also optionally provides visualisations of the chart, lab and demographic
           data so that comparisons can be seen between the subject and base groups
        3) Finally, it splits the data into a training set and a test set, both of which are uploaded to AWS S3
           (via to_s3) as .npy files

    The function returns nothing; its outputs are the uploaded files and the printed class counts.

    The arguments are as follows:
        1) diagnosis_id: the icd9 code of the diagnosis that we wish to find patients for, with an appropriate base
           group who never had the diagnosis.
        2) diagnosis_name: the corresponding name for the diagnosis_id. Used to name the output training and test
           sets, so can be in short form/ easy to understand language.
        3) test_size: the proportion of total patients that should be placed in the test dataset (between 0 and 1).
           The default is 0.33.
        4) optional_exclusions: There are additional optional exclusions that can be applied to the subject and
           base group. These should be passed as a list into the optional_exclusions argument. This argument can
           be omitted if no exclusions are needed:
              a) first_diagnosis_only - this means that only one admission per patient
                 will be included in the output dataframes. In the subject dataframe,
                 the first admission where they were diagnosed with the condition will
                 be included (not necessarily their first admission overall, if they
                 were not diagnosed on their first admission)
              b) exclude_newborns - excludes all admissions with admission_type ==
                 'NEWBORN'
              c) exclude_deaths - excludes all admissions that resulted in the
                 patient dying
        5) profile_data: the patient demographic data that is required in the final datasets. Needs to be passed
           as a list, eg ['gender', 'age_adm_bucket']. Must be a column in the admission_diagnosis_table dataset.
        6) match_on: The subject and base groups can be matched on their patient demographic data with the match_on
           argument. The match can take place on any demographic data in the input dataframe, and the chosen columns
           for the match should be passed as a list, eg ['gender', 'ethnicity_simple']. Leave as False for no
           matching.

           The match works by randomly sampling the base group so the proportions in each demographic bucket match
           the proportions in the subject group. For example, if the proportion of males to females in the
           subject group is 60/40 whereas in the base group it is 50/50, then the base group will be randomly
           sampled so that the male to female ratio is also 60/40.

           The match is taken on all combinations of matched columns together rather than separately. Eg, if 10% of
           the sampled group is white female, then the base group will be sampled so that 10% are also white female,
           rather than sampling ethnicity and gender separately. Therefore, it's recommended to only match on groups
           where there are large volumes in each bucket, otherwise the base group could become quite small

        7) show_graphs: if True, graphs are output which show comparisons between the subject and base groups
           for chart, lab and demographic data
        8) bucket: name of the S3 bucket the output files are written to. Defaults to 'mimic-jamesi'.
        9) random_state: seed used for both the row shuffle and the train/test split, so that repeated runs
           produce the same datasets. Defaults to 8.

    '''

    df = select_test_groups(diagnosis_id, optional_exclusions=optional_exclusions,
                            match_on=match_on, show_graphs=show_graphs)

    df = add_chart_data(df)

    # Columns that are identifiers / labels / demographics rather than chart or lab measurements
    non_chart_cols = ['subject_id', 'hadm_id', 'target']
    if profile_data:
        df = add_profile_data(df, profile_data=profile_data)
        non_chart_cols += list(profile_data)

    if show_graphs:
        # Plot a KDE for all remaining (chart / lab) cols, comparing target == 1 against target == 0
        for c in [c for c in df.columns if c not in non_chart_cols]:
            plot_KDE(df, 'target', 1, 0, c)

    # Create dummy variables for categorical variables so that ML models can be used
    df = pd.get_dummies(df)

    # Shuffle and reset index so that the subject and base groups are mixed together.
    # The shuffle is seeded: an unseeded shuffle here would make the seeded split below
    # return different rows on every run.
    df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Take test and train splits
    train, test = train_test_split(df, test_size=test_size, shuffle=True, random_state=random_state)

    print("--> Training set counts: ","\n",train.target.value_counts())
    print("--> Test set counts: ","\n",test.target.value_counts())

    # Do final cleaning
    X_train, X_test, y_train, y_test, feature_names = final_cleaning(ids = ['subject_id', 'hadm_id'],
                                                                     target ='target', train=train, test=test)

    # Upload each output to S3 as '<diagnosis_name>_<suffix>.npy'
    outputs = [('X_train', X_train),
               ('X_test', X_test),
               ('y_train', y_train),
               ('y_test', y_test),
               ('feature_names', feature_names)]
    for suffix, obj in outputs:
        to_s3(obj=obj, bucket=bucket, filename='{}_{}.npy'.format(diagnosis_name, suffix))

In [6]:
def select_test_groups(diagnosis,
                       optional_exclusions=None,
                       match_on=False,
                       show_graphs=False):

    '''

    For a given diagnosis icd9 code, returns a single dataframe showing patients and admissions that either did or
    didn't have the diagnosis (denoted by target == 1 or 0).

    Arguments:
        diagnosis: the icd9 code passed to get_diagnosis_groups.
        optional_exclusions: optional list of exclusions, forwarded to get_diagnosis_groups.
        match_on: optional list of columns; if given, the base group is down-sampled with take_match_control
                  so its proportions match the subject group on those columns.
        show_graphs: if True, graph_comparisons is called on the combined dataframe.

    Returns:
        A dataframe with the columns subject_id (int), hadm_id (int) and target (1 = subject, 0 = base).

    '''

    # Find initial subject and base group for the given diagnosis
    subject_adm, base_adm = get_diagnosis_groups(diagnosis, optional_exclusions=optional_exclusions)

    if match_on:
        base_adm = take_match_control(subject_adm, base_adm, match_on=match_on)

    # Combine into a single DF. `assign` returns new frames, so the frames handed back by the helpers
    # are not mutated (avoids SettingWithCopy warnings), and `pd.concat` replaces DataFrame.append,
    # which was removed in pandas 2.0.
    df = pd.concat([subject_adm.assign(target=1),
                    base_adm.assign(target=0)]).reset_index(drop=True)

    if show_graphs:
        graph_comparisons(df = df, ids = 'hadm_id', group_col = 'target', group_a = 1, group_b = 0)

    # Keep only the identifiers and the label, with the identifiers as integers
    return df[['subject_id', 'hadm_id', 'target']].astype({'subject_id': int, 'hadm_id': int})

In [7]:
def take_match_control(subject_adm, base_adm, match_on):

    '''

    For a given subject and base group, returns a new base group whose proportions across the match_on
    variables are (up to rounding) the same as in the subject group.

    Arguments:
        subject_adm: dataframe of subject admissions; must contain 'hadm_id' and the match_on columns.
        base_adm: dataframe of base admissions; must contain 'hadm_id' and the match_on columns.
        match_on: a column name or list of column names to match on. The match is made on the combination
                  of all columns together.

    Returns:
        A dataframe of rows sampled (without replacement, fixed seed) from base_adm. It has the same columns
        as base_adm and keeps the original index labels.

    Raises:
        ValueError: if no combination of the match_on values occurs in both groups, so no match is possible.

    Notes:
        - Combinations that only occur in the base group are dropped entirely (the subject proportion is 0).
        - Combinations that only occur in the subject group cannot be sampled; they are skipped and a
          notice is printed, so the proportions of the result will not match exactly in that case.

    '''

    # Accept a single column name as well as a list of names
    match_cols = [match_on] if isinstance(match_on, str) else list(match_on)

    # === 1 === For the subject and base groups, calculate the proportion of patients in each combination of the
    #           match_on variables. This is so the proportions can be compared, and ultimately the base group can
    #           be sampled until its proportions are equal to the subject proportions

    # Subjects
    subject_segments = (subject_adm.groupby(match_cols)
                                   .agg({'hadm_id':'nunique'})
                                   .rename(columns={'hadm_id':'subjects_n'})
                                   .reset_index())
    subject_segments['subjects_prop'] = subject_segments['subjects_n'] / subject_segments['subjects_n'].sum()

    # Base
    base_segments = (base_adm.groupby(match_cols)
                             .agg({'hadm_id':'nunique'})
                             .rename(columns={'hadm_id':'base_n'})
                             .reset_index())
    base_segments['base_prop'] = base_segments['base_n'] / base_segments['base_n'].sum()

    # Outer merge: combinations missing from one side get NaN counts / proportions on that side
    proportions_compare = pd.merge(subject_segments, base_segments, how='outer', on=match_cols)

    # === 2 === Compare proportions: For each combination, the proportion % of the base and the subject group should
    #           be compared. The goal is to find the combination where there is the lowest ratio of base group to
    #           subject group. This is because this is the combination group that cannot be down sampled any further
    #           if we want to maximise the size of the base group. Therefore, if we know the size of this combination
    #           group we can use it as a basis for calculating the target size of all other combination groups

    proportions_compare['ratio'] = proportions_compare['base_prop'] / proportions_compare['subjects_prop']
    ratios = proportions_compare['ratio']
    if not ratios.notna().any():
        raise ValueError('Cannot match base group to subject group: no combination of {} '
                         'occurs in both groups'.format(match_cols))

    # idxmin picks a single row even when several combinations tie for the lowest ratio (tied rows all
    # give the same total size), and scalar access avoids calling math.floor on a Series
    limiting_idx = ratios.idxmin()
    total_sample_size = math.floor(proportions_compare.at[limiting_idx, 'base_n']
                                   / proportions_compare.at[limiting_idx, 'subjects_prop'])

    # Combinations absent from the subject group have a NaN proportion -> target size 0
    proportions_compare['new_base_grp_size'] = (
        np.floor(total_sample_size * proportions_compare['subjects_prop'].fillna(0)).astype(int))

    # === 3 === With the target group size known for each combination, loop through each combination and randomly
    #           sample from the base group the desired number of admissions

    sampled_segments = []
    short_segments = 0

    for _, segment in proportions_compare.iterrows():

        n_wanted = int(segment['new_base_grp_size'])
        if n_wanted <= 0:
            continue

        # Narrow the base group down to the rows in this combination of match_on values
        candidates = base_adm
        for col in match_cols:
            candidates = candidates[candidates[col] == segment[col]]

        # Never ask for more rows than exist (only happens for combinations the base group lacks)
        n_take = min(n_wanted, len(candidates))
        if n_take < n_wanted:
            short_segments += 1
        if n_take == 0:
            continue

        sampled_segments.append(candidates.sample(n=n_take, random_state=8))

    # Concatenate once at the end (DataFrame.append was removed in pandas 2.0 and is quadratic in a loop)
    if sampled_segments:
        base_adm_sampled = pd.concat(sampled_segments)
    else:
        base_adm_sampled = base_adm.iloc[0:0]

    if short_segments:
        print('--> Note: {} subject segment(s) had too few base admissions to match'.format(short_segments))

    print('--> Original base group size: ', len(base_adm))
    print('--> Sampled base group size: ', len(base_adm_sampled))
    print('--> Subject group size: ', len(subject_adm))

    return base_adm_sampled

In [8]:
# Build and upload the train/test datasets for acute kidney failure (icd9 code '5849'),
# using the options defined in the OPTIONS cell above.
select_patients_and_select_chartevents(
    diagnosis_id='5849',
    diagnosis_name='acute_kidney_failure',
    test_size=test_size,
    optional_exclusions=optional_exclusions,
    profile_data=profile_data,
    match_on=match_on,
    show_graphs=show_graphs,
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  subject_adm.drop(columns=['diagnosis_name', 'diagnosis_icd9'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  subject_adm.drop_duplicates(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  base_adm.drop(columns=['diagnosis_name', 'diagnosis_icd9'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  base_adm.drop_duplicates(inplace=True)


--> Original base group size:  27967
--> Sampled base group size:  9479
--> Subject group size:  6096
Index(['subject_id', 'hadm_id', 'target', 'Admission weight', 'Anion Gap',
       'BP diastolic', 'BP mean', 'BP systolic', 'BUN', 'Basophils',
       'Bicarbonate', 'Calcium (Total)', 'Chloride', 'Creatinine',
       'Eosinophils', 'Glucose', 'HR', 'Hematocrit', 'Hemoglobin', 'Lactate',
       'Lymphocytes', 'MCH', 'MCHC', 'MCV', 'Magnesium', 'Monocytes',
       'Neutrophils', 'Oxygen saturation', 'PCO2', 'PO2', 'PTT', 'Phosphorus',
       'Platelet Count', 'Potassium', 'RDW', 'Red Blood Cells',
       'Respiratory rate', 'Sodium', 'Temperature F', 'Urea Nitrogen',
       'White blood cells', 'pH', 'age_adm_bucket', 'gender'],
      dtype='object')
Index(['subject_id', 'hadm_id', 'target', 'Admission weight', 'Anion Gap',
       'BP diastolic', 'BP mean', 'BP systolic', 'BUN', 'Basophils',
       'Bicarbonate', 'Calcium (Total)', 'Chloride', 'Creatinine',
       'Eosinophils', 'Glucos