In [3]:
# importing relevant packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly as px

In [29]:
# Load the raw hospital CSV export into a DataFrame.
rawdfcsv = pd.read_csv(filepath_or_buffer='../data/raw/hospitaldf.csv')

In [30]:
# Convert the data to a Parquet file for efficiency.
# The file is written under ../data/raw/ because that is where the following
# cell reads 'rawdata.parquet' from; writing to the working directory left
# that read pointing at a file this notebook never produced.
rawdfcsv.to_parquet('../data/raw/rawdata.parquet')

In [41]:
# Reload the dataset from its Parquet copy.
rawdfparquet = pd.read_parquet(path='../data/raw/rawdata.parquet')

In [42]:
# Preview the first five rows to check the formatting.
rawdfparquet.head(5)

Unnamed: 0.1,Unnamed: 0,dep_name,esi,age,gender,ethnicity,race,lang,religion,maritalstatus,...,cc_vaginaldischarge,cc_vaginalpain,cc_weakness,cc_wheezing,cc_withdrawal-alcohol,cc_woundcheck,cc_woundinfection,cc_woundre-evaluation,cc_wristinjury,cc_wristpain
0,1,B,4.0,40.0,Male,Hispanic or Latino,White or Caucasian,English,,Single,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,B,4.0,66.0,Male,Hispanic or Latino,Native Hawaiian or Other Pacific Islander,English,Pentecostal,Married,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,B,2.0,66.0,Male,Hispanic or Latino,Native Hawaiian or Other Pacific Islander,English,Pentecostal,Married,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,A,2.0,66.0,Male,Hispanic or Latino,Native Hawaiian or Other Pacific Islander,English,Pentecostal,Married,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,A,3.0,84.0,Female,Hispanic or Latino,Other,Other,Pentecostal,Widowed,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


There seems to be an index column already in the dataset. We will set the 'Unnamed: 0' column as the index of the loaded data.

In [45]:
# Set the index to be the 'Unnamed: 0' column.
# The frame is rebound rather than mutated with inplace=True, and the step is
# guarded on the column still being present, so re-running this cell is a
# no-op instead of a KeyError.
if 'Unnamed: 0' in rawdfparquet.columns:
    rawdfparquet = rawdfparquet.set_index('Unnamed: 0')

In [50]:
# Preview the first five rows to confirm the index is now as desired.
rawdfparquet.head(5)

Unnamed: 0_level_0,dep_name,esi,age,gender,ethnicity,race,lang,religion,maritalstatus,employstatus,...,cc_vaginaldischarge,cc_vaginalpain,cc_weakness,cc_wheezing,cc_withdrawal-alcohol,cc_woundcheck,cc_woundinfection,cc_woundre-evaluation,cc_wristinjury,cc_wristpain
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,B,4.0,40.0,Male,Hispanic or Latino,White or Caucasian,English,,Single,Full Time,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,B,4.0,66.0,Male,Hispanic or Latino,Native Hawaiian or Other Pacific Islander,English,Pentecostal,Married,Not Employed,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,B,2.0,66.0,Male,Hispanic or Latino,Native Hawaiian or Other Pacific Islander,English,Pentecostal,Married,Not Employed,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,A,2.0,66.0,Male,Hispanic or Latino,Native Hawaiian or Other Pacific Islander,English,Pentecostal,Married,Not Employed,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,A,3.0,84.0,Female,Hispanic or Latino,Other,Other,Pentecostal,Widowed,Retired,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [51]:
# List every column name available in the frame.
[column_name for column_name in rawdfparquet.columns]

['dep_name',
 'esi',
 'age',
 'gender',
 'ethnicity',
 'race',
 'lang',
 'religion',
 'maritalstatus',
 'employstatus',
 'insurance_status',
 'disposition',
 'arrivalmode',
 'arrivalmonth',
 'arrivalday',
 'arrivalhour_bin',
 'previousdispo',
 '2ndarymalig',
 'abdomhernia',
 'abdomnlpain',
 'abortcompl',
 'acqfootdef',
 'acrenlfail',
 'acutecvd',
 'acutemi',
 'acutphanm',
 'adjustmentdisorders',
 'adltrespfl',
 'alcoholrelateddisorders',
 'allergy',
 'amniosdx',
 'analrectal',
 'anemia',
 'aneurysm',
 'anxietydisorders',
 'appendicitis',
 'artembolism',
 'asppneumon',
 'asthma',
 'attentiondeficitconductdisruptivebeha',
 'backproblem',
 'biliarydx',
 'birthasphyx',
 'birthtrauma',
 'bladdercncr',
 'blindness',
 'bnignutneo',
 'bonectcncr',
 'bph',
 'brainnscan',
 'breastcancr',
 'breastdx',
 'brnchlngca',
 'bronchitis',
 'burns',
 'cardiaarrst',
 'cardiacanom',
 'carditis',
 'cataract',
 'cervixcancr',
 'chestpain',
 'chfnonhp',
 'chrkidneydisease',
 'coaghemrdx',
 'coloncancer',
 'com

Our target variable is 'esi'. The project will investigate the effect of various columns (feature variables) on 'esi', and attempt to predict the 'esi' score from inputs for selected feature variables. 

In order to select the feature variables, we will first separate them into groups, and investigate their effect on 'esi' within each group. 

In [52]:
# Check the possible values of the target variable 'esi'.
# Casting to str keeps missing values visible as their own 'nan' bucket.
# This reads from rawdfparquet: none of the cells above define a frame named
# `rawdf`, so the previous reference only worked from leftover kernel state.
rawdfparquet['esi'].astype(str).value_counts()


esi
3.0    236229
2.0    163534
4.0    125003
5.0     27992
1.0      5271
nan      2457
Name: count, dtype: int64

First we will select the group 'Demographic', which will contain all feature variables relating to a patient's demographics.

It seems that the first 10 columns ('dep_name' through 'employstatus') are, apart from the 'esi' target itself, related to a patient's demographics.

In [57]:
# Create a DataFrame holding the demographic feature variables plus the 'esi' target.
# Label-based slicing replaces the positional iloc[:, 0:10]: per the column
# listing above these are the same ten columns ('dep_name' .. 'employstatus'),
# but the selection no longer depends on column positions and matches how the
# other groups are selected. .copy() makes the frame independent of rawdfparquet.
rawdemdf = rawdfparquet.loc[:, 'dep_name':'employstatus'].copy()

In [63]:
# Preview the first five rows to confirm the demographic dataframe is as desired.
rawdemdf.head(5)

Unnamed: 0_level_0,dep_name,esi,age,gender,ethnicity,race,lang,religion,maritalstatus,employstatus
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,B,4.0,40.0,Male,Hispanic or Latino,White or Caucasian,English,,Single,Full Time
2,B,4.0,66.0,Male,Hispanic or Latino,Native Hawaiian or Other Pacific Islander,English,Pentecostal,Married,Not Employed
3,B,2.0,66.0,Male,Hispanic or Latino,Native Hawaiian or Other Pacific Islander,English,Pentecostal,Married,Not Employed
4,A,2.0,66.0,Male,Hispanic or Latino,Native Hawaiian or Other Pacific Islander,English,Pentecostal,Married,Not Employed
5,A,3.0,84.0,Female,Hispanic or Latino,Other,Other,Pentecostal,Widowed,Retired


In [64]:
# Save the demographic dataframe to a Parquet file.
# Stored under ../data/raw/ so it sits alongside the raw data and the other
# group files instead of landing in the notebook's working directory.
rawdemdf.to_parquet('../data/raw/rawdemdata.parquet')

The next group we will separate will be the 'cc_' group.

In [68]:
# Create the 'cc_' group: every column from 'cc_abdominalcramping' through 'cc_wristpain'.
# .copy() gives an independent frame, so adding columns to it afterwards does
# not raise SettingWithCopyWarning (a bare .loc slice is still tied to rawdfparquet).
rawccdf = rawdfparquet.loc[:, 'cc_abdominalcramping':'cc_wristpain'].copy()

In [69]:
# Attach the target variable to the dataframe.
rawccdf['esi'] = rawdfparquet.loc[:, 'esi']

In [72]:
# Save the cc dataframe to a Parquet file.
# Stored under ../data/raw/ so it sits alongside the raw data and the other
# group files instead of landing in the notebook's working directory.
rawccdf.to_parquet('../data/raw/rawccdata.parquet')

We repeat the process with the 'meds_' group.

In [75]:
# Create the 'meds_' group: every column from
# 'meds_analgesicandantihistaminecombination' through 'meds_vitamins'.
# .copy() gives an independent frame, so adding columns to it afterwards does
# not raise SettingWithCopyWarning (a bare .loc slice is still tied to rawdfparquet).
rawmedsdf = rawdfparquet.loc[:, 'meds_analgesicandantihistaminecombination':'meds_vitamins'].copy()

In [76]:
# Attach the 'esi' target column to the meds dataframe.
rawmedsdf['esi'] = rawdfparquet.loc[:, 'esi']

In [77]:
# Persist the meds dataframe as a Parquet file.
rawmedsdf.to_parquet(path='../data/raw/rawmedsdata.parquet')

The next group will be 'medical readings'. These will not include the triage or vital readings, which will form separate groups.

In [80]:
# Create the medical readings group: every column from
# 'absolutelymphocytecount_last' through 'urineculture,routine_count'.
# .copy() gives an independent frame, so adding columns to it afterwards does
# not raise SettingWithCopyWarning (a bare .loc slice is still tied to rawdfparquet).
rawreadingsdf = rawdfparquet.loc[:, 'absolutelymphocytecount_last':'urineculture,routine_count'].copy()

In [82]:
# Attach the 'esi' target column to the readings dataframe.
rawreadingsdf['esi'] = rawdfparquet.loc[:, 'esi']

In [84]:
# Persist the readings dataframe as a Parquet file.
rawreadingsdf.to_parquet(path='../data/raw/rawreadingsdata.parquet')

The next group will be triage readings.

In [86]:
# Create the triage group: every column from 'triage_vital_hr' through 'triage_vital_temp'.
# .copy() gives an independent frame, so adding columns to it afterwards does
# not raise SettingWithCopyWarning (a bare .loc slice is still tied to rawdfparquet).
rawtriagedf = rawdfparquet.loc[:, 'triage_vital_hr':'triage_vital_temp'].copy()

In [87]:
# Attach the 'esi' target column to the triage dataframe.
rawtriagedf['esi'] = rawdfparquet.loc[:, 'esi']

In [90]:
# Persist the triage dataframe as a Parquet file.
rawtriagedf.to_parquet(path='../data/raw/rawtriagedata.parquet')

The next group will be vitals readings (not including triage readings).

In [92]:
# Create the vitals group: every column from 'pulse_last' through 'o2_device_median'.
# .copy() gives an independent frame, so adding columns to it afterwards does
# not raise SettingWithCopyWarning (a bare .loc slice is still tied to rawdfparquet).
rawvitalsdf = rawdfparquet.loc[:, 'pulse_last':'o2_device_median'].copy()

In [93]:
# Attach the 'esi' target column to the vitals dataframe.
rawvitalsdf['esi'] = rawdfparquet.loc[:, 'esi']

In [95]:
# Persist the vitals dataframe as a Parquet file.
rawvitalsdf.to_parquet(path='../data/raw/rawvitalsdata.parquet')

The next group will be previous scans. This will include the number of surgeries, so as not to leave it in a group of its own.

In [97]:
# Create the previous scans group: every column from 'cxr_count' through 'otherxr_count'.
# .copy() gives an independent frame, so adding columns to it afterwards does
# not raise SettingWithCopyWarning (a bare .loc slice is still tied to rawdfparquet).
rawscansdf = rawdfparquet.loc[:, 'cxr_count':'otherxr_count'].copy()

In [100]:
# Bring the 'n_surgeries' and 'esi' columns over from the full dataset,
# in that order.
for extra_column in ('n_surgeries', 'esi'):
    rawscansdf[extra_column] = rawdfparquet[extra_column]

In [101]:
# Persist the scans dataframe as a Parquet file.
rawscansdf.to_parquet(path='../data/raw/rawscansdata.parquet')

The next group will be conditions.

In [107]:
# Create the conditions group: every column from '2ndarymalig' through 'whtblooddx'.
# .copy() gives an independent frame, so adding columns to it afterwards does
# not raise SettingWithCopyWarning (a bare .loc slice is still tied to rawdfparquet).
rawconditionsdf = rawdfparquet.loc[:, '2ndarymalig':'whtblooddx'].copy()

In [108]:
# Attach the 'esi' target column to the conditions dataframe.
rawconditionsdf['esi'] = rawdfparquet.loc[:, 'esi']

In [110]:
# Persist the conditions dataframe as a Parquet file.
rawconditionsdf.to_parquet(path='../data/raw/rawconditionsdata.parquet')

The final group we will call 'info' (stored as `rawotherdf`); it will include information such as arrival time, 'insurance_status', 'n_edvisits' and 'n_admissions'.

In [112]:
# Create the final group: every column from 'insurance_status' through 'previousdispo'.
# .copy() gives an independent frame, so adding columns to it afterwards does
# not raise SettingWithCopyWarning (a bare .loc slice is still tied to rawdfparquet).
rawotherdf = rawdfparquet.loc[:, 'insurance_status':'previousdispo'].copy()

In [113]:
# Bring the 'esi', 'n_edvisits' and 'n_admissions' columns over from the
# full dataset, in that order.
for extra_column in ('esi', 'n_edvisits', 'n_admissions'):
    rawotherdf[extra_column] = rawdfparquet[extra_column]

In [115]:
# Persist the other dataframe as a Parquet file.
rawotherdf.to_parquet(path='../data/raw/rawotherdata.parquet')

We will check that we have accounted for all columns by summing the columns of each of our dataframes.

In [118]:
# Total the column counts across the nine group dataframes.
group_frames = (
    rawotherdf,
    rawconditionsdf,
    rawdemdf,
    rawmedsdf,
    rawreadingsdf,
    rawccdf,
    rawscansdf,
    rawtriagedf,
    rawvitalsdf,
)
sum(frame.shape[1] for frame in group_frames)

980

This looks good! We had 972 columns in the initial dataframe, which we have split into 9 dataframes. Each new dataframe contains an 'esi' column (9 copies versus 1 in the original), which accounts for the 8 extra columns in the total.