In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# walk the Kaggle input directory and print the full path of every file found
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/healthcare-dataset/healthcare_dataset.csv


## **Load the Data**

Load the data, convert the column names to snake case, and print information about the dataframe, summary statistics, the first few rows, and a per-column check for missing values.

In [14]:
healthcare_file_path = "/kaggle/input/healthcare-dataset/healthcare_dataset.csv"

# read in csv file with pandas
health_data = pd.read_csv(healthcare_file_path)

# print information about the dataframe
# (DataFrame.info() writes the report itself and returns None, so wrapping it
# in print() would add a stray "None" line after the report)
health_data.info()

# change column names to snake case
health_data.columns = health_data.columns.str.replace(' ', '_').str.lower()

# print summary statistics for the numerical data
print(health_data.describe())

# print the top few rows of the dataframe
print(health_data.head())

# check whether any columns have missing values
# (one vectorised pass over the frame instead of one isnull() scan per column)
for col, has_missing in health_data.isnull().any().items():
    if has_missing:
        print(str(col) + " has missing values")
    else:
        print(str(col) + " is good")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55500 entries, 0 to 55499
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Name                55500 non-null  object 
 1   Age                 55500 non-null  int64  
 2   Gender              55500 non-null  object 
 3   Blood Type          55500 non-null  object 
 4   Medical Condition   55500 non-null  object 
 5   Date of Admission   55500 non-null  object 
 6   Doctor              55500 non-null  object 
 7   Hospital            55500 non-null  object 
 8   Insurance Provider  55500 non-null  object 
 9   Billing Amount      55500 non-null  float64
 10  Room Number         55500 non-null  int64  
 11  Admission Type      55500 non-null  object 
 12  Discharge Date      55500 non-null  object 
 13  Medication          55500 non-null  object 
 14  Test Results        55500 non-null  object 
dtypes: float64(1), int64(2), object(12)
memory usage: 6.4

## Add a column for number of days admitted to the hospital

 Use the date of admission and date of discharge to determine the total number of days the patient was in the hospital.

In [36]:
# convert 'date_of_admission' and 'discharge_date' columns to datetime
# (errors='coerce' turns any value that cannot be parsed into NaT instead of raising)
dt_cols = ['date_of_admission', 'discharge_date']

for col in dt_cols:
    health_data[col] = pd.to_datetime(health_data[col], errors='coerce')

# DataFrame.info() prints its own report and returns None, so it is called
# directly; print(health_data.info()) would emit an extra "None" line
health_data.info()

# subtract 'date_of_admission' from 'discharge_date' to get total number of admission days
health_data['length_of_admission'] = (health_data['discharge_date'] - health_data['date_of_admission']).dt.days

print(health_data.head())

# report how many distinct values each categorical column of interest holds
unique_count_labels = {
    'hospital': "Hospitals: ",
    'medical_condition': "Medical Conditions: ",
    'insurance_provider': "Insurance Providers: ",
    'medication': "Medication: ",
}
for col, label in unique_count_labels.items():
    print(label + str(health_data[col].nunique()))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55500 entries, 0 to 55499
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   name                 55500 non-null  object        
 1   age                  55500 non-null  int64         
 2   gender               55500 non-null  object        
 3   blood_type           55500 non-null  object        
 4   medical_condition    55500 non-null  object        
 5   date_of_admission    55500 non-null  datetime64[ns]
 6   doctor               55500 non-null  object        
 7   hospital             55500 non-null  object        
 8   insurance_provider   55500 non-null  object        
 9   billing_amount       55500 non-null  float64       
 10  room_number          55500 non-null  int64         
 11  admission_type       55500 non-null  object        
 12  discharge_date       55500 non-null  datetime64[ns]
 13  medication           55500 non-

## Subset data for training and validation

In [44]:
from sklearn.model_selection import train_test_split

# predictor columns fed to the model
features = [
    'age',
    'gender',
    'blood_type',
    'medical_condition',
    'insurance_provider',
    'admission_type',
    'medication',
    'test_results',
]

# target column: the computed length of admission
y = health_data['length_of_admission']

# expand the categorical predictors into one-hot indicator columns
X = pd.get_dummies(data=health_data[features])

# hold out 20% of the rows for validation; fixed seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

## Determine random forest model
Compare the mean absolute error (MAE) on the validation set for a few different random forest configurations to determine which performs best.

In [45]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# define 5 different models
# hyper-parameters of the five candidate forests; every candidate shares random_state=0
forest_params = [
    {'n_estimators': 50},
    {'n_estimators': 100},
    {'n_estimators': 150},
    {'n_estimators': 150, 'min_samples_split': 10},
    {'n_estimators': 150, 'min_samples_split': 10, 'max_depth': 5},
]

models = [RandomForestRegressor(random_state=0, **params) for params in forest_params]

# keep the individual names available alongside the list
model_1, model_2, model_3, model_4, model_5 = models

# function for determining MAE of each model to compare
def model_mae(model, X_t=None, X_v=None, y_t=None, y_v=None):
    """Fit ``model`` on the training data and return its MAE on the validation data.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Fitted in place by this call.
    X_t, y_t : training predictors and target; default to the notebook's
        current ``X_train`` / ``y_train``.
    X_v, y_v : validation predictors and target; default to the notebook's
        current ``X_val`` / ``y_val``.

    Returns
    -------
    The value of ``mean_absolute_error(y_v, predictions)``.
    """
    # Resolve defaults at call time. Writing them as `X_t=X_train` in the
    # signature would capture the objects that existed when the `def` ran, so
    # re-running the split cell would leave this function using stale data.
    X_t = X_train if X_t is None else X_t
    X_v = X_val if X_v is None else X_v
    y_t = y_train if y_t is None else y_t
    y_v = y_val if y_v is None else y_v

    model.fit(X_t, y_t)
    preds = model.predict(X_v)
    return mean_absolute_error(y_v, preds)

# fit every candidate in turn and report its validation MAE, numbering from 1
for number, candidate in enumerate(models, start=1):
    mae = model_mae(candidate)
    print("Model " + str(number) + " MAE: " + str(mae))

Model 1 MAE: 7.435956840411841
Model 2 MAE: 7.413688653426153
Model 3 MAE: 7.401712603642603
Model 4 MAE: 7.435006011486608
Model 5 MAE: 7.503760965843087
