# Exploratory data analysis

In [48]:
# Import libraries
import pandas as pd
import numpy as np

In [49]:
# Load the raw dataset. low_memory=False makes pandas parse the file in one pass
# rather than in chunks, which avoids mixed-type inference within a column.
data  = pd.read_csv('data/D1.csv', low_memory=False)

In [50]:
# Overview of the raw frame: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50031 entries, 0 to 50030
Data columns (total 39 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   encounter_id              50031 non-null  int64 
 1   patient_nbr               50031 non-null  int64 
 2   race                      50031 non-null  object
 3   gender                    50031 non-null  object
 4   age                       50031 non-null  object
 5   weight                    50031 non-null  object
 6   admission_type_id         50031 non-null  int64 
 7   discharge_disposition_id  50031 non-null  int64 
 8   admission_source_id       50031 non-null  int64 
 9   length_of_stay            50031 non-null  int64 
 10  payer_code                50031 non-null  object
 11  medical_specialty         50031 non-null  object
 12  num_lab_procedures        50031 non-null  int64 
 13  num_procedures            50031 non-null  int64 
 14  num_medications       

In [59]:
def value_counts(df):
    """
    Print the relative frequency of every distinct value in each object-dtype column.

    Columns are visited in DataFrame order and columns of any other dtype are
    skipped. Each table of proportions is followed by a line of 50 dashes.

    :param df: A pandas DataFrame object.
    :return: None
    """
    separator = '-' * 50
    for name in df.columns:
        series = df[name]
        if series.dtype != 'object':
            # Only object columns are summarised.
            continue
        proportions = series.value_counts(normalize=True)
        print(proportions)
        print(separator)


In [60]:
# Print the value proportions of every object-dtype column in the dataset.
value_counts(data)

race
Caucasian          0.714197
AfricanAmerican    0.222842
?                  0.025124
Hispanic           0.020387
Other              0.012192
Asian              0.005257
Name: proportion, dtype: float64
--------------------------------------------------
gender
Female             0.539665
Male               0.460315
Unknown/Invalid    0.000020
Name: proportion, dtype: float64
--------------------------------------------------
age
[70-80)     0.262018
[60-70)     0.217345
[50-60)     0.175391
[80-90)     0.150507
[40-50)     0.101217
[30-40)     0.041035
[90-100)    0.023545
[20-30)     0.016830
[10-20)     0.009354
[0-10)      0.002758
Name: proportion, dtype: float64
--------------------------------------------------
weight
?            0.962783
[75-100)     0.015870
[50-75)      0.010993
[100-125)    0.006316
[125-150)    0.001459
[25-50)      0.001359
[0-25)       0.000740
[150-175)    0.000340
[175-200)    0.000120
>200         0.000020
Name: proportion, dtype: float64
----------

In [53]:
def replace_question_mark(df, columns):
    """
    Convert the specified columns of a DataFrame to numeric, turning '?' values into NaN.

    The conversion is applied to ``df`` in place. The converted columns are also
    returned so that the result can safely be assigned back, e.g.
    ``df[cols] = replace_question_mark(df, cols)``.

    :param df: A pandas DataFrame object (modified in place).
    :param columns: A list of column names.
    :return: A DataFrame containing the converted columns, in the order given.
    """
    columns = list(columns)
    for column in columns:
        # errors='coerce' turns every value that cannot be parsed as a number,
        # including '?', into NaN, so no separate replace step is needed.
        df[column] = pd.to_numeric(df[column], errors='coerce')
    return df[columns]



In [54]:
# Convert the visit-count columns to numeric in place. The call's return value is
# deliberately not assigned back to `data`: the in-place conversion is the result
# we need. The trailing ';' suppresses any cell output.
replace_question_mark(
    data,
    ['number_outpatient', 'number_inpatient', 'number_emergency'],
);

In [55]:
# Re-inspect non-null counts and dtypes after the numeric conversion step.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50031 entries, 0 to 50030
Data columns (total 39 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   encounter_id              50031 non-null  int64 
 1   patient_nbr               50031 non-null  int64 
 2   race                      50031 non-null  object
 3   gender                    50031 non-null  object
 4   age                       50031 non-null  object
 5   weight                    50031 non-null  object
 6   admission_type_id         50031 non-null  int64 
 7   discharge_disposition_id  50031 non-null  int64 
 8   admission_source_id       50031 non-null  int64 
 9   length_of_stay            50031 non-null  int64 
 10  payer_code                50031 non-null  object
 11  medical_specialty         50031 non-null  object
 12  num_lab_procedures        50031 non-null  int64 
 13  num_procedures            50031 non-null  int64 
 14  num_medications       

In [58]:
# Proportion of each weight band; '?' is presumably the missing-value marker.
data['weight'].value_counts(normalize=True)

weight
?            0.962783
[75-100)     0.015870
[50-75)      0.010993
[100-125)    0.006316
[125-150)    0.001459
[25-50)      0.001359
[0-25)       0.000740
[150-175)    0.000340
[175-200)    0.000120
>200         0.000020
Name: proportion, dtype: float64

## Comments about the data types

- The `id` columns are integers, which is fine.
- The `race` and `gender` columns look fine apart from a few missing values (recorded as `?` in `race` and `Unknown/Invalid` in `gender`). We should convert them to a categorical data type.
- The `age` column is stored as a string because its values are formatted as intervals (e.g. `[70-80)`). We should convert it to an interval data type.
- The `weight` column has about 96% missing values (recorded as `?`). I suggest dropping this column.
- The `payer_code` column has 40% missing values. We should discuss whether it is necessary to keep this column. We might be able to assume that an empty payer code means that the patient does not have insurance. If not, I suggest dropping this column.
- 
- 

## Comments about the goal of the data mining

This looks like a 'length of stay' prediction problem. The goal is to predict the length of stay of a patient in the hospital. The `length_of_stay` column is the target variable. It has no missing values and the data are in a manageable range (1 to 14 days). The column is already stored as an integer (`int64`), so no data type conversion is needed.

The `readmitted` column could be a secondary target variable. It is a categorical variable with three classes. We should convert this column to a categorical data type.

The `discharge_disposition_id` column could also be used as a secondary target variable. It is a categorical variable with 26 classes. It might be worth reducing the number of classes to a binary outcome variable (all-cause mortality), or to a categorical variable with fewer classes (e.g. discharged home, discharged to another facility, died).

We should discuss whether we want to filter rows based on the `admission_type_id` column. If we choose length of stay as the target variable, we might want to use `admission_type_id` to exclude newborn and elective admissions. The same goes for `single_day_admission`: we might want to filter out the single-day admissions.

In [None]:
%%sql


In [61]:
# Distribution of the candidate target variable: share of encounters per length of stay.
data['length_of_stay'].value_counts(normalize=True)

length_of_stay
3     0.166677
2     0.164018
4     0.134497
1     0.133717
5     0.097220
6     0.076233
7     0.058764
8     0.047231
9     0.032060
10    0.026364
11    0.020947
12    0.016770
13    0.013252
14    0.012252
Name: proportion, dtype: float64

In [62]:
# Share of encounters per admission type id, to judge the effect of filtering on it.
data['admission_type_id'].value_counts(normalize=True)

admission_type_id
1    0.484999
2    0.194180
3    0.167816
6    0.083628
5    0.066099
8    0.003098
4    0.000140
7    0.000040
Name: proportion, dtype: float64