In [1]:
import numpy as np
import pandas as pd

import os
# Walk the Kaggle input directory and echo the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

/kaggle/input/healthcare-dataset/healthcare_dataset.csv


## **Load the Data**

Load the data and print information about the data, summary statistics, and the first few rows.

In [2]:
# Location of the healthcare CSV inside the Kaggle input directory.
healthcare_file_path = "/kaggle/input/healthcare-dataset/healthcare_dataset.csv"

# Parse the CSV into a pandas DataFrame.
health_data = pd.read_csv(filepath_or_buffer=healthcare_file_path)

In [3]:
# Summarise the frame (entry count, dtypes, non-null counts). This runs before
# the rename below, so the summary lists the original column headers.
health_data.info()

# Normalise headers to snake_case: spaces become underscores, then lowercase.
health_data.columns = [
    header.replace(' ', '_').lower() for header in health_data.columns
]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55500 entries, 0 to 55499
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Name                55500 non-null  object 
 1   Age                 55500 non-null  int64  
 2   Gender              55500 non-null  object 
 3   Blood Type          55500 non-null  object 
 4   Medical Condition   55500 non-null  object 
 5   Date of Admission   55500 non-null  object 
 6   Doctor              55500 non-null  object 
 7   Hospital            55500 non-null  object 
 8   Insurance Provider  55500 non-null  object 
 9   Billing Amount      55500 non-null  float64
 10  Room Number         55500 non-null  int64  
 11  Admission Type      55500 non-null  object 
 12  Discharge Date      55500 non-null  object 
 13  Medication          55500 non-null  object 
 14  Test Results        55500 non-null  object 
dtypes: float64(1), int64(2), object(12)
memory usage: 6.4

In [4]:
# Preview the first five rows to sanity-check the renamed columns and values.
health_data.head(n=5)

Unnamed: 0,name,age,gender,blood_type,medical_condition,date_of_admission,doctor,hospital,insurance_provider,billing_amount,room_number,admission_type,discharge_date,medication,test_results
0,Bobby JacksOn,30,Male,B-,Cancer,2024-01-31,Matthew Smith,Sons and Miller,Blue Cross,18856.281306,328,Urgent,2024-02-02,Paracetamol,Normal
1,LesLie TErRy,62,Male,A+,Obesity,2019-08-20,Samantha Davies,Kim Inc,Medicare,33643.327287,265,Emergency,2019-08-26,Ibuprofen,Inconclusive
2,DaNnY sMitH,76,Female,A-,Obesity,2022-09-22,Tiffany Mitchell,Cook PLC,Aetna,27955.096079,205,Emergency,2022-10-07,Aspirin,Normal
3,andrEw waTtS,28,Female,O+,Diabetes,2020-11-18,Kevin Wells,"Hernandez Rogers and Vang,",Medicare,37909.78241,450,Elective,2020-12-18,Ibuprofen,Abnormal
4,adrIENNE bEll,43,Female,AB+,Cancer,2022-09-19,Kathleen Hanna,White-White,Aetna,14238.317814,458,Urgent,2022-10-09,Penicillin,Abnormal


In [5]:
# Report, column by column, whether any null entries are present.
# The null scan is done once for the whole frame and then iterated.
null_flags = health_data.isnull().any()
for column_name, has_nulls in null_flags.items():
    status = " has missing values" if has_nulls else " is good"
    print(str(column_name) + status)

name is good
age is good
gender is good
blood_type is good
medical_condition is good
date_of_admission is good
doctor is good
hospital is good
insurance_provider is good
billing_amount is good
room_number is good
admission_type is good
discharge_date is good
medication is good
test_results is good


## Add a column for number of days admitted to the hospital

Use the admission and discharge dates to compute the total number of days each patient spent in the hospital.

In [6]:
# Parse both date columns as datetimes in one pass; values that cannot be
# parsed become NaT because of errors='coerce'.
dt_cols = ['date_of_admission', 'discharge_date']
health_data[dt_cols] = health_data[dt_cols].apply(pd.to_datetime, errors='coerce')

# Length of stay in whole days: discharge date minus admission date.
stay_duration = health_data['discharge_date'].sub(health_data['date_of_admission'])
health_data['length_of_admission'] = stay_duration.dt.days

In [7]:
# Summary statistics (count, mean, min, quartiles, max, std) for the
# non-object columns, using the default quartile cut points spelled out.
health_data.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,age,date_of_admission,billing_amount,room_number,discharge_date,length_of_admission
count,55500.0,55500,55500.0,55500.0,55500,55500.0
mean,51.539459,2021-11-01 01:02:22.443243008,25539.316097,301.134829,2021-11-16 13:15:20.821621504,15.509009
min,13.0,2019-05-08 00:00:00,-2008.49214,101.0,2019-05-09 00:00:00,1.0
25%,35.0,2020-07-28 00:00:00,13241.224652,202.0,2020-08-12 00:00:00,8.0
50%,52.0,2021-11-01 00:00:00,25538.069376,302.0,2021-11-17 00:00:00,15.0
75%,68.0,2023-02-03 00:00:00,37820.508436,401.0,2023-02-18 00:00:00,23.0
max,89.0,2024-05-07 00:00:00,52764.276736,500.0,2024-06-06 00:00:00,30.0
std,19.602454,,14211.454431,115.243069,,8.6596


In [8]:
# Boolean mask over the columns: True where the dtype is 'object'.
s = health_data.dtypes.eq('object')
object_cols = s[s].index.tolist()
print('\nCategorical Columns:')
print(object_cols)

# Cardinality of each categorical column considered for one-hot encoding.
# Maps the printed label to the column it describes; printed in this order.
cardinality_labels = {
    'Hospitals': 'hospital',
    'Medical Conditions': 'medical_condition',
    'Insurance Providers': 'insurance_provider',
    'Medications': 'medication',
    'Blood Types': 'blood_type',
    'Admission Types': 'admission_type',
    'Test Results': 'test_results',
    'Gender': 'gender',
}
print('\nNumber of unique values for categorical data:')
for label, column in cardinality_labels.items():
    print(label + ': ' + str(health_data[column].nunique()))


Categorical Columns:
['name', 'gender', 'blood_type', 'medical_condition', 'doctor', 'hospital', 'insurance_provider', 'admission_type', 'medication', 'test_results']

Number of unique values for categorical data:
Hospitals: 39876
Medical Conditions: 6
Insurance Providers: 5
Medications: 5
Blood Types: 8
Admission Types: 3
Test Results: 3
Gender: 2


In [9]:
# Target: length of stay in days.
y = health_data['length_of_admission']

# Predictors: age plus the categorical patient and admission attributes.
features = [
    'age',
    'gender',
    'blood_type',
    'medical_condition',
    'insurance_provider',
    'admission_type',
    'medication',
    'test_results',
]
X = health_data[features]

## Subset data for training and validation

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Pick out the object-dtype predictors; these are the ones to one-hot encode.
t = X.dtypes.eq('object')
X_object_cols = t[t].index.tolist()

# Fit the encoder and build a dense indicator frame aligned to X's index.
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
OH_array = OH_encoder.fit_transform(X[X_object_cols])
OH_cols = pd.DataFrame(
    OH_array,
    index=X.index,
    columns=OH_encoder.get_feature_names_out(X_object_cols),
)

# Swap the raw categorical columns for their one-hot indicator columns.
num_health_data = X.drop(columns=X_object_cols)
OH_X = pd.concat([num_health_data, OH_cols], axis='columns')
OH_X.columns = OH_X.columns.astype(str)
OH_X.head()

Unnamed: 0,age,gender_Female,gender_Male,blood_type_A+,blood_type_A-,blood_type_AB+,blood_type_AB-,blood_type_B+,blood_type_B-,blood_type_O+,...,admission_type_Emergency,admission_type_Urgent,medication_Aspirin,medication_Ibuprofen,medication_Lipitor,medication_Paracetamol,medication_Penicillin,test_results_Abnormal,test_results_Inconclusive,test_results_Normal
0,30,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,62,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,76,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,28,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,43,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0


In [11]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split the same from run to run.
X_train, X_val, y_train, y_val = train_test_split(
    OH_X,
    y,
    test_size=0.2,
    train_size=0.8,
    random_state=0,
)

## Determine random forest model
Compare the validation MAE of several random forest configurations to determine which performs best.

In [12]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Five candidate forests: the first three vary only the number of trees; the
# last two add a minimum split size and then a depth limit. All share one seed.
forest_settings = [
    {'n_estimators': 50},
    {'n_estimators': 100},
    {'n_estimators': 150},
    {'n_estimators': 150, 'min_samples_split': 10},
    {'n_estimators': 150, 'min_samples_split': 10, 'max_depth': 5},
]

models = [RandomForestRegressor(random_state=0, **settings) for settings in forest_settings]
model_1, model_2, model_3, model_4, model_5 = models

# function for determining MAE of each model to compare
def model_mae(model, X_t=None, X_v=None, y_t=None, y_v=None):
    """Fit ``model`` on the training data and return its validation MAE.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place as a side effect of this call.
    X_t, y_t
        Training features and target. When omitted, the notebook-level
        ``X_train`` and ``y_train`` are used.
    X_v, y_v
        Validation features and target. When omitted, the notebook-level
        ``X_val`` and ``y_val`` are used.

    Returns
    -------
    The result of ``mean_absolute_error(y_v, predictions)``.
    """
    # Resolve the defaults at call time rather than in the signature: defaults
    # written as ``X_t=X_train`` are evaluated once, when the ``def`` runs, so
    # re-running the split cell afterwards would leave this function silently
    # scoring against the old split.
    X_t = X_train if X_t is None else X_t
    X_v = X_val if X_v is None else X_v
    y_t = y_train if y_t is None else y_t
    y_v = y_val if y_v is None else y_v

    model.fit(X_t, y_t)
    predictions = model.predict(X_v)
    return mean_absolute_error(y_v, predictions)

# Fit and score each candidate in turn, labelling them from 1.
for position, candidate in enumerate(models, start=1):
    mae = model_mae(candidate)
    print("Model " + str(position) + " MAE: " + str(mae))

Model 1 MAE: 7.435956840411841
Model 2 MAE: 7.413688653426153
Model 3 MAE: 7.401712603642603
Model 4 MAE: 7.435006011486608
Model 5 MAE: 7.503760965843087
