# Hospital Stay Length Prediction
Given medical data about different countries from 1990-2018, predict a patients <b>average hospital stay</b>.

## Getting Started

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import LinearSVR, SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

In [2]:
data = pd.read_csv('../input/healthcare-investments-and-length-of-hospital-stay/Healthcare_Investments_and_Hospital_Stay (1).csv')

In [3]:
data

Unnamed: 0,Location,Time,Hospital_Stay,MRI_Units,CT_Scanners,Hospital_Beds
0,AUS,1992,6.6,1.43,16.71,1.43
1,AUS,1994,6.4,2.36,18.48,2.36
2,AUS,1995,6.5,2.89,20.55,2.89
3,AUS,1996,6.4,2.96,21.95,2.96
4,AUS,1997,6.2,3.53,23.34,3.53
...,...,...,...,...,...,...
513,LTU,2014,6.8,10.57,22.17,10.57
514,LTU,2015,6.6,11.02,21.00,11.02
515,LTU,2016,6.6,12.20,23.01,12.20
516,LTU,2017,6.5,12.37,23.33,12.37


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 518 entries, 0 to 517
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Location       518 non-null    object 
 1   Time           518 non-null    int64  
 2   Hospital_Stay  518 non-null    float64
 3   MRI_Units      518 non-null    float64
 4   CT_Scanners    518 non-null    float64
 5   Hospital_Beds  518 non-null    float64
dtypes: float64(4), int64(1), object(1)
memory usage: 24.4+ KB


No NULLs present in the data. <br>
Encoding needed only for Location column.

## Preprocessing

In [5]:
data['Location'].unique()

array(['AUS', 'AUT', 'BEL', 'CAN', 'CZE', 'DNK', 'FIN', 'FRA', 'DEU',
       'GRC', 'HUN', 'IRL', 'ITA', 'JPN', 'KOR', 'LUX', 'NLD', 'NZL',
       'POL', 'PRT', 'SVK', 'ESP', 'TUR', 'GBR', 'USA', 'EST', 'ISR',
       'RUS', 'SVN', 'ISL', 'LVA', 'LTU'], dtype=object)

In [6]:
def onehot_encode(df, column):
    dummies = pd.get_dummies(df[column])
    df = pd.concat([df, dummies], axis=1)
    df.drop(column, axis=1, inplace=True)
    return df

In [35]:
def preprocess_inputs(df):
    # One-Hot encoding to Location column
    df = onehot_encode(df, 'Location')
    
    # Splitting data into X and y
    y = df['Hospital_Stay']
    X = df.drop('Hospital_Stay', axis=1)
    
    # Train Test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)
    
    # Scale X with a standard scaler
    scaler = StandardScaler()
    scaler.fit(X_train)
    
    X_train = pd.DataFrame(scaler.transform(X_train), columns=X.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns)
    
    return X_train, X_test, y_train, y_test

In [36]:
X_train, X_test, y_train, y_test = preprocess_inputs(data)

## Training

In [37]:
models = {
    "                     Linear Regression" : LinearRegression(),
    "                   K-Nearest Neighbors" : KNeighborsRegressor(),
    "                        Neural Network" : MLPRegressor(),
    "Support Vector Machine (Linear Kernel)" : LinearSVR(),
    "    Support Vector Machine(RBF Kernel)" : SVR(),
    "                         Decision Tree" : DecisionTreeRegressor(),
    "                         Random Forest" : RandomForestRegressor(),
    "                     Gradient Boosting" : GradientBoostingRegressor(),
    "                               XGBoost" : XGBRegressor()
}

for name, model in models.items():
    model.fit(X_train, y_train)
    print(name + ' trained')

                     Linear Regression trained
                   K-Nearest Neighbors trained




                        Neural Network trained
Support Vector Machine (Linear Kernel) trained
    Support Vector Machine(RBF Kernel) trained
                         Decision Tree trained
                         Random Forest trained
                     Gradient Boosting trained
                               XGBoost trained


## Results

In [38]:
for name, model in models.items():
    print(name + " R^2 score: {:.5f}".format(model.score(X_test, y_test)))

                     Linear Regression R^2 score: 0.88893
                   K-Nearest Neighbors R^2 score: 0.89832
                        Neural Network R^2 score: 0.90858
Support Vector Machine (Linear Kernel) R^2 score: 0.85063
    Support Vector Machine(RBF Kernel) R^2 score: 0.60437
                         Decision Tree R^2 score: 0.86420
                         Random Forest R^2 score: 0.94979
                     Gradient Boosting R^2 score: 0.94889
                               XGBoost R^2 score: 0.94300
