In [171]:
# Importing libraries
import pandas as pd
import matplotlib.pyplot as plt
import pathlib
import os
import re
import itertools
from datetime import date
import datetime

We have two files:
1- diabetes_data.csv: the admission encounters of diabetes patients.
2- IDs_mapping.csv: a mapping file that can be used to give the coded variables a clear meaning.

In [18]:
# Reading data
# The mapping file holds several tables in one CSV, so it is read as raw text
# here and parsed by hand further down. The context manager closes the file
# handle as soon as the text has been read.
with open('../data/raw/IDs_mapping.csv', 'r') as f:
    ids_mappings = f.read()
# The encounter data is a regular CSV and loads directly into a DataFrame.
diabetes_data = pd.read_csv('../data/raw/diabetes_data.csv')



The mapping dataset is not typical: it holds more than one table in a single CSV file, with the tables separated by rows that contain only a comma. To use them for mapping the features later, I convert each table to a dictionary.
The logic:
Take the mapping text "ids_mappings" and first split it into tables. Then split each table into rows, and each row into its individual elements. The individual elements will serve as either key or value in the final dictionary.

In [147]:
'''
 IDs_mapping.csv has multiple tables in one file, which is not a typical use of CSV. So I decided to read it as text and then create dictionaries to map the numeric values
 in the feature dataframe to more understandable values.
'''
def convert_list_to_dict(lst):
    """Build a dictionary from a flat list of alternating keys and values.

    Elements at even positions become keys and the element following each one
    becomes its value. When the list has an odd number of elements, the final
    key is paired with None. If a key occurs more than once, the last value
    wins.
    """
    elements = iter(lst)
    # Passing the same iterator twice makes zip_longest consume two items per pair.
    return dict(itertools.zip_longest(elements, elements, fillvalue=None))

# Split the text into one chunk per table; tables are separated by a line
# that contains only a comma.
codes_mapping_tables = ids_mappings.lower().split('\n,\n')

# Build one {code: description} dictionary per table, keyed by the table's
# first header cell.
mapping_tables_dict = {}
for mapping_table in codes_mapping_tables:
    clean_cols = []
    for row in mapping_table.split('\n'):
        # Skip blank lines (such as the one left by a trailing newline) so a
        # lone empty cell cannot shift the key/value pairing below.
        if not row.strip():
            continue
        # The first comma separates the code from its description; any further
        # commas belong to the description text and are dropped.
        code, sep, description = row.partition(',')
        clean_cols += [code, description.replace(',', '')]
    if not clean_cols:
        # The chunk held nothing but blank lines, so there is no table to add.
        continue
    # Normalise the text: '/' and spaces become underscores, dots and double
    # quotes are removed.
    clean_mapping_table = [
        x.replace('/', '_').replace(' ', '_').replace('.', '').replace('"', '')
        for x in clean_cols
    ]
    # The header row is kept as the first entry of each table's dictionary.
    mapping_tables_dict[clean_cols[0]] = convert_list_to_dict(clean_mapping_table)

# Clean up the dictionary
# Remove the empty-key entry if parsing produced one; pop() with a default
# does nothing (instead of raising KeyError) when it is absent.
mapping_tables_dict['admission_source_id'].pop('', None)
mapping_tables_dict['discharge_disposition_id']['18'] = 'not_mapped'
mapping_tables_dict

{'admission_type_id': {'admission_type_id': 'description',
  '1': 'emergency',
  '2': 'urgent',
  '3': 'elective',
  '4': 'newborn',
  '5': 'not_available',
  '6': 'null',
  '7': 'trauma_center',
  '8': 'not_mapped'},
 'discharge_disposition_id': {'discharge_disposition_id': 'description',
  '1': 'discharged_to_home',
  '2': 'discharged_transferred_to_another_short_term_hospital',
  '3': 'discharged_transferred_to_snf',
  '4': 'discharged_transferred_to_icf',
  '5': 'discharged_transferred_to_another_type_of_inpatient_care_institution',
  '6': 'discharged_transferred_to_home_with_home_health_service',
  '7': 'left_ama',
  '8': 'discharged_transferred_to_home_under_care_of_home_iv_provider',
  '9': 'admitted_as_an_inpatient_to_this_hospital',
  '10': 'neonate_discharged_to_another_hospital_for_neonatal_aftercare',
  '11': 'expired',
  '12': 'still_patient_or_expected_to_return_for_outpatient_services',
  '13': 'hospice___home',
  '14': 'hospice___medical_facility',
  '15': 'discharged_t

2- After reading the dataset diabetes_data, we start exploring the following:
- Variables that make sense to use.
In the table below, the discharge_disposition_id column will not be used because the model is going to be used in production before discharge, when this value is not yet known.
- birth_date can be used to compute the patient's age during this inpatient encounter (age = admit_date - birth_date).
- encounter_id & patient_nbr are going to be used for data preparation but not as features.


In [148]:
# Display the dataframe summary: number of entries, column names,
# non-null counts and the dtype of each column
diabetes_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 50 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   encounter_id              101766 non-null  int64 
 1   patient_nbr               101766 non-null  int64 
 2   race                      101766 non-null  object
 3   gender                    101766 non-null  object
 4   weight                    101766 non-null  object
 5   admission_type_id         101766 non-null  int64 
 6   discharge_disposition_id  101766 non-null  int64 
 7   admission_source_id       101766 non-null  int64 
 8   time_in_hospital          101766 non-null  int64 
 9   payer_code                101766 non-null  object
 10  medical_specialty         101766 non-null  object
 11  num_lab_procedures        101766 non-null  int64 
 12  num_procedures            101766 non-null  int64 
 13  num_medications           101766 non-null  int64 
 14  numb

In [154]:
# Inspect the raw admit_date column: it is loaded with dtype object, so it
# has to be converted to datetime before it can be used in date arithmetic
diabetes_data['admit_date']

0         2002-10-22 06:30:00
1         2004-07-24 08:35:00
2         2005-11-30 06:35:00
3         2003-05-26 00:32:00
4         2004-04-25 04:04:00
                 ...         
101761    2008-08-10 03:37:00
101762    2008-10-26 09:19:00
101763    2007-10-13 05:08:00
101764    2009-02-19 21:53:00
101765    2007-06-14 15:49:00
Name: admit_date, Length: 101766, dtype: object

In [151]:
# Removing discharge_disposition_id from the data since it is not going to be used.
# errors='ignore' keeps this cell re-runnable: once the column is gone, running
# the cell again does nothing instead of raising KeyError.
diabetes_data = diabetes_data.drop(columns=['discharge_disposition_id'], errors='ignore')

In [212]:
# Calculate age (in years) at admission.
# Parse both columns from object to datetime64 once and reuse the parsed values.
admit_ts = pd.to_datetime(diabetes_data['admit_date'])
birth_ts = pd.to_datetime(diabetes_data['birth_date'])
# The frame keeps storing plain calendar dates (time of day dropped).
diabetes_data['admit_date'] = admit_ts.dt.date
diabetes_data['birth_date'] = birth_ts.dt.date
# Whole days between the two calendar dates (normalize() resets the time of
# day to midnight), divided by the mean length of a Gregorian year.
# Rows with a missing date get a NaN age.
DAYS_PER_YEAR = 365.2425
diabetes_data['age'] = (admit_ts.dt.normalize() - birth_ts.dt.normalize()).dt.days / DAYS_PER_YEAR
