# Exploratory data analysis

In [1]:
# Import libraries
import pandas as pd
import numpy as np

In [2]:
# Load the dataset from a path relative to the notebook's working directory.
# low_memory=False makes pandas read the file in a single pass rather than in
# chunks, which avoids per-chunk dtype inference.
data  = pd.read_csv('data/D1.csv', low_memory=False)

In [5]:
data.head(10)

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,length_of_stay,...,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,insulin,change,diabetesMed,readmitted,single_day_admission
0,12522,48330783,Caucasian,Female,[80-90),?,2,1,4,13,...,No,No,Steady,No,No,Steady,Ch,Yes,NO,No
1,15738,63555939,Caucasian,Female,[90-100),?,3,3,4,12,...,No,No,No,No,No,Steady,Ch,Yes,NO,No
2,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,No,Steady,No,No,Steady,Ch,Yes,NO,Yes
3,28236,89869032,AfricanAmerican,Female,[40-50),?,1,1,7,9,...,No,No,No,No,No,Steady,No,Yes,>30,No
4,35754,82637451,Caucasian,Male,[50-60),?,2,1,2,3,...,No,No,No,No,No,Steady,No,Yes,>30,No
5,36900,77391171,AfricanAmerican,Male,[60-70),?,2,1,4,7,...,No,No,No,Up,No,Steady,Ch,Yes,<30,No
6,40926,85504905,Caucasian,Female,[40-50),?,1,3,7,7,...,No,No,No,No,No,Down,Ch,Yes,<30,No
7,42570,77586282,Caucasian,Male,[80-90),?,1,6,7,10,...,No,No,No,No,No,Steady,No,Yes,NO,No
8,55842,84259809,Caucasian,Male,[60-70),?,3,1,2,4,...,Steady,No,No,No,No,Steady,Ch,Yes,NO,No
9,62256,49726791,AfricanAmerican,Female,[60-70),?,3,1,2,1,...,No,No,No,No,No,Steady,No,Yes,>30,Yes


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50031 entries, 0 to 50030
Data columns (total 39 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   encounter_id              50031 non-null  int64 
 1   patient_nbr               50031 non-null  int64 
 2   race                      50031 non-null  object
 3   gender                    50031 non-null  object
 4   age                       50031 non-null  object
 5   weight                    50031 non-null  object
 6   admission_type_id         50031 non-null  int64 
 7   discharge_disposition_id  50031 non-null  int64 
 8   admission_source_id       50031 non-null  int64 
 9   length_of_stay            50031 non-null  int64 
 10  payer_code                50031 non-null  object
 11  medical_specialty         50031 non-null  object
 12  num_lab_procedures        50031 non-null  int64 
 13  num_procedures            50031 non-null  int64 
 14  num_medications       

In [7]:
def value_counts(df, normalize=False):
    """
    Print the frequency of each unique value for every object-dtype column of a DataFrame.

    Each qualifying column's tally is followed by a separator line of 50 dashes.
    Columns of any other dtype are skipped.

    :param df: A pandas DataFrame object.
    :param normalize: If False (default), print absolute counts. If True, print
        relative frequencies instead, as computed by
        ``Series.value_counts(normalize=True)``.
    :return: None
    """
    for column in df.columns:
        if df[column].dtype == 'object':
            print(df[column].value_counts(normalize=normalize))
            print('-' * 50)


In [8]:
value_counts(data)

race
Caucasian          35732
AfricanAmerican    11149
?                   1257
Hispanic            1020
Other                610
Asian                263
Name: count, dtype: int64
--------------------------------------------------
gender
Female             27000
Male               23030
Unknown/Invalid        1
Name: count, dtype: int64
--------------------------------------------------
age
[70-80)     13109
[60-70)     10874
[50-60)      8775
[80-90)      7530
[40-50)      5064
[30-40)      2053
[90-100)     1178
[20-30)       842
[10-20)       468
[0-10)        138
Name: count, dtype: int64
--------------------------------------------------
weight
?            48169
[75-100)       794
[50-75)        550
[100-125)      316
[125-150)       73
[25-50)         68
[0-25)          37
[150-175)       17
[175-200)        6
>200             1
Name: count, dtype: int64
--------------------------------------------------
payer_code
?     32665
MC     9405
HM     1573
BC     1492
UN     1256
SP 

In [9]:
def replace_question_mark(df, columns):
    """
    Convert the specified columns of a DataFrame to numeric in place, turning '?' into NaN.

    Any entry that cannot be parsed as a number (including the '?' placeholder)
    becomes NaN. All requested columns are checked before anything is modified,
    so a bad column name never leaves the DataFrame partially converted.

    :param df: A pandas DataFrame object. It is modified in place.
    :param columns: A list of column names, or a single column name as a string.
    :return: None
    :raises KeyError: If any requested column is not present in ``df``.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(columns, str):
        columns = [columns]
    else:
        columns = list(columns)

    # Validate first so that a typo cannot cause a half-applied in-place update.
    missing = [column for column in columns if column not in df.columns]
    if missing:
        raise KeyError(f"Columns not found in DataFrame: {missing}")

    for column in columns:
        # errors='coerce' already maps every unparseable entry (such as '?')
        # to NaN, so no separate replace pass is needed.
        df[column] = pd.to_numeric(df[column], errors='coerce')



In [10]:
# Convert the three `number_*` columns to numeric in place; '?' and any other
# unparseable entries become NaN.
replace_question_mark(data, ['number_outpatient', 'number_inpatient', 'number_emergency'])

In [11]:
data[['number_outpatient', 'number_inpatient', 'number_emergency']]

Unnamed: 0,number_outpatient,number_inpatient,number_emergency
0,0.0,0.0,0.0
1,0.0,0.0,0.0
2,0.0,0.0,0.0
3,0.0,0.0,0.0
4,0.0,0.0,0.0
...,...,...,...
50026,0.0,0.0,0.0
50027,1.0,0.0,0.0
50028,0.0,0.0,0.0
50029,0.0,0.0,0.0


In [12]:
data['number_inpatient'].value_counts(normalize=True)

number_inpatient
0.0     0.685181
1.0     0.184441
2.0     0.068538
3.0     0.030150
4.0     0.014295
5.0     0.007138
6.0     0.004239
7.0     0.002419
8.0     0.001300
9.0     0.000860
10.0    0.000520
11.0    0.000300
12.0    0.000220
13.0    0.000100
16.0    0.000080
15.0    0.000080
14.0    0.000080
17.0    0.000020
21.0    0.000020
18.0    0.000020
Name: proportion, dtype: float64

In [13]:
data['number_outpatient'].value_counts(normalize=True)

number_outpatient
0.0     0.888944
1.0     0.062526
2.0     0.021815
3.0     0.012737
4.0     0.006699
5.0     0.003539
6.0     0.001340
7.0     0.000620
8.0     0.000560
9.0     0.000320
10.0    0.000240
11.0    0.000180
12.0    0.000100
13.0    0.000080
14.0    0.000080
16.0    0.000060
15.0    0.000040
20.0    0.000020
21.0    0.000020
35.0    0.000020
17.0    0.000020
29.0    0.000020
36.0    0.000020
Name: proportion, dtype: float64

In [14]:
data['number_emergency'].value_counts(normalize=True)

number_emergency
0.0     0.924403
1.0     0.052620
2.0     0.012890
3.0     0.004644
4.0     0.002642
5.0     0.000901
6.0     0.000540
7.0     0.000480
8.0     0.000320
9.0     0.000200
10.0    0.000160
11.0    0.000060
22.0    0.000040
25.0    0.000020
13.0    0.000020
42.0    0.000020
16.0    0.000020
28.0    0.000020
Name: proportion, dtype: float64

In [None]:
data['weight'].value_counts(normalize=True)

## Comments about the data types

- The `id` columns are integers, which is fine.
- The `race` and `gender` columns look fine apart from some missing values (`?` in `race`, `Unknown/Invalid` in `gender`). We should convert them to a categorical data type.
- The `age` column is a string based on the formatting of the intervals. We should convert it to an interval data type.
- The `weight` column has about 96% missing values (48,169 of 50,031 rows are `?`). I suggest dropping this column.
- The `payer_code` column has about 65% missing values (32,665 of 50,031 rows are `?`). We should discuss whether it is necessary to keep this column. We might be able to assume that an empty payer code means that the patient does not have insurance. If not, I suggest dropping this column.
- 
- 

## Comments about the goal of the data mining

This looks like a 'length of stay' prediction problem. The goal is to predict the length of stay of a patient in the hospital. The `length_of_stay` column is the target variable. It has no missing values and the data are in a manageable range. The column is already stored as a numeric type (`int64`), so no conversion is needed.

The `readmitted` column could be a secondary target variable. It is a categorical variable with three classes (`NO`, `>30`, `<30`). We should convert this column to a categorical data type.

The `discharge_disposition_id` column could also be used as a secondary target variable. It is a categorical variable with 26 classes. It might be worth reducing the number of classes to a binary outcome variable (all-cause mortality), or to a categorical variable with fewer classes (e.g. discharged home, discharged to another facility, died).

We should discuss whether we want to filter rows based on the `admission_type_id` column. If we choose length of stay as the target variable, we might want to use `admission_type_id` to exclude newborns and elective admissions. The same goes for `single_day_admission`: we might want to filter out the single-day admissions.

In [None]:
# TODO: placeholder for a SQL query. `%%sql` is not a built-in cell magic: load an
# extension that provides it (e.g. `%load_ext sql`) and open a database connection
# in an earlier cell, then restore the `%%sql` header here.


In [None]:
data['length_of_stay'].value_counts(normalize=True)

In [None]:
data['admission_type_id'].value_counts(normalize=True)