## Predict the length of hospital stay for a patient

### Import the needed packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from torch.utils.data import DataLoader, TensorDataset
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.optim import Adam
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


### Load the dataset

In [2]:

# Load the dataset
df = pd.read_csv('hospital_stay_training_data.csv')


### Check for null values in our dataset and verify the type of each column

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 318438 entries, 0 to 318437
Data columns (total 18 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   case_id                            318438 non-null  int64  
 1   Hospital_code                      318438 non-null  int64  
 2   Hospital_type_code                 318438 non-null  object 
 3   City_Code_Hospital                 318438 non-null  int64  
 4   Hospital_region_code               318438 non-null  object 
 5   Available Extra Rooms in Hospital  318438 non-null  int64  
 6   Department                         318438 non-null  object 
 7   Ward_Type                          318438 non-null  object 
 8   Ward_Facility_Code                 318438 non-null  object 
 9   Bed Grade                          318325 non-null  float64
 10  patientid                          318438 non-null  int64  
 11  City_Code_Patient                  3139

In [4]:
# Find columns with NaN values
columns_with_nans = df.columns[df.isna().any()].tolist()

# Print the names of columns with NaNs
print("Columns with NaNs:", columns_with_nans)

Columns with NaNs: ['Bed Grade', 'City_Code_Patient']


### Drop rows in the dataset with null values in any of the columns

In [5]:
df = df.dropna()
df.count()

case_id                              313793
Hospital_code                        313793
Hospital_type_code                   313793
City_Code_Hospital                   313793
Hospital_region_code                 313793
Available Extra Rooms in Hospital    313793
Department                           313793
Ward_Type                            313793
Ward_Facility_Code                   313793
Bed Grade                            313793
patientid                            313793
City_Code_Patient                    313793
Type of Admission                    313793
Severity of Illness                  313793
Visitors with Patient                313793
Age                                  313793
Admission_Deposit                    313793
Stay                                 313793
dtype: int64

In [6]:
# The column "Stay" is what we have to predict. Check the distribution of the 'Stay' column
class_distribution = df['Stay'].value_counts(normalize=True) * 100

print("Class Distribution (%):\n", class_distribution)

Class Distribution (%):
 Stay
21-30                 27.507306
11-20                 24.568744
31-40                 17.308225
51-60                 10.982718
0-10                   7.409343
41-50                  3.677902
71-80                  3.217408
More than 100 Days     2.086726
81-90                  1.517242
91-100                 0.864583
61-70                  0.859802
Name: proportion, dtype: float64


### Identify the categorical and numerical columns and transform them in preparation for training

In [7]:

# Encode categorical variables and normalize numerical variables
categorical_columns = ['Hospital_code', 'Hospital_type_code', 'City_Code_Hospital', 'Hospital_region_code', 'Department', 'Ward_Type', 'Ward_Facility_Code', 'Bed Grade', 'City_Code_Patient','Type of Admission', 'Severity of Illness', 'Age']
numerical_columns = ['Available Extra Rooms in Hospital', 'Visitors with Patient', 'Admission_Deposit']

# Encoding and Normalizing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),
        ('cat', OneHotEncoder(), categorical_columns)
        #('cat', OrdinalEncoder(handle_unknown='use_encoded_value'), categorical_columns)

    ])

# Preparing target
label_encoder = LabelEncoder()
df['Stay'] = label_encoder.fit_transform(df['Stay'])

# Splitting the dataset
X = df.drop(['patientid','Stay'], axis=1)
y = df['Stay']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply preprocessing: create a model based on the dataset and then transform the dataset based on the created model
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train.toarray().astype(np.float32))
y_train_tensor = torch.tensor(y_train.values.astype(np.int64))
X_test_tensor = torch.tensor(X_test.toarray().astype(np.float32))
y_test_tensor = torch.tensor(y_test.values.astype(np.int64))

# Prepare DataLoader
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


### Define our neural network class

In [8]:
class HospitalStayNet(nn.Module):
    def __init__(self, num_features, num_classes):
        super(HospitalStayNet, self).__init__()
        self.fc1 = nn.Linear(num_features, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, num_classes)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


### Instantiate our neural network

In [9]:
# Determine the number of features from the column count of the dataset
num_features = X_train.shape[1]
num_classes = 11  # As mentioned, there are 11 classes
# Use GPU, if it is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')
model = HospitalStayNet(num_features, num_classes).to(device)


Using device: cuda


### Define the function that calculates our trained model's accuracy

In [10]:
def calculate_accuracy(loader):
    model.eval()  # Set the model to evaluation mode. 

    correct = 0  # the number of correct predictions.
    total = 0  # the total number of predictions.

    with torch.no_grad():  # Disable the gradient calculation to save memory and speed up the process since gradients are not needed for evaluation.
        # Iterate over the data loader, which provides batches of inputs and their corresponding targets.
        for inputs, targets in loader:  
            inputs, targets = inputs.to(device), targets.to(device)  

            outputs = model(inputs)  # Compute the model's outputs for the given inputs.

            # Find the predicted class with the highest score for each input. 
            # The `torch.max` function returns both the maximum values and their indices (the predicted classes)
            _, predicted = torch.max(outputs.data, 1)  
            # targets.size(0) gives the number of targets in the batch.
            total += targets.size(0)  
            # Calculate the number of correct predictions in the batch by comparing predicted with targets, summing the true predictions, and adding this sum to the correct counter.
            correct += (predicted == targets).sum().item()  

    return 100 * correct / total  


### Define the loss function and the optimzer to use

In [11]:
# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=0.001)

### Train the model

In [12]:

# Training loop
num_epochs = 20

for epoch in range(num_epochs):
    model.train()       # Set the model to training mode. 
    # Iterate over the data loader, which provides batches of inputs and their corresponding targets.
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad() # initialize the gradients for this batch of data
        outputs = model(inputs)
        loss = criterion(outputs, targets) # calculate the losses
        loss.backward()  # compute the gradients based on the loss values
        optimizer.step() # update the weights and biases based on the loss values
    
    train_accuracy = calculate_accuracy(train_loader)
    test_accuracy = calculate_accuracy(test_loader)
    
    print(f'Epoch {epoch+1}, Loss: {loss.item():.4f}, Train Accuracy: {train_accuracy:.2f}%, Test Accuracy: {test_accuracy:.2f}%')


Epoch 1, Loss: 1.4275, Train Accuracy: 42.32%, Test Accuracy: 42.63%
Epoch 2, Loss: 1.3981, Train Accuracy: 42.72%, Test Accuracy: 42.68%
Epoch 3, Loss: 1.3693, Train Accuracy: 43.10%, Test Accuracy: 42.98%
Epoch 4, Loss: 1.3418, Train Accuracy: 43.15%, Test Accuracy: 42.97%
Epoch 5, Loss: 1.5585, Train Accuracy: 43.10%, Test Accuracy: 42.84%
Epoch 6, Loss: 1.5117, Train Accuracy: 43.39%, Test Accuracy: 42.96%
Epoch 7, Loss: 1.4212, Train Accuracy: 43.49%, Test Accuracy: 43.11%
Epoch 8, Loss: 1.8734, Train Accuracy: 43.50%, Test Accuracy: 43.19%
Epoch 9, Loss: 1.8632, Train Accuracy: 43.59%, Test Accuracy: 42.96%
Epoch 10, Loss: 1.4188, Train Accuracy: 43.69%, Test Accuracy: 42.79%
Epoch 11, Loss: 1.3809, Train Accuracy: 43.82%, Test Accuracy: 42.90%
Epoch 12, Loss: 1.4405, Train Accuracy: 43.64%, Test Accuracy: 42.83%
Epoch 13, Loss: 1.6338, Train Accuracy: 44.03%, Test Accuracy: 42.81%
Epoch 14, Loss: 1.5955, Train Accuracy: 43.95%, Test Accuracy: 42.67%
Epoch 15, Loss: 1.2276, Train

#### The difference between training and test accuracy is 1.96
##### Epoch 20, Loss: 1.4211, Train Accuracy: 44.40%, Test Accuracy: 42.44%

### Save the trained model

In [13]:

# Save the model
torch.save(model.state_dict(), 'hospital_stay_model.pth')


### Define a newer model with more layers and nodes

In [14]:
class HospitalStayNet256(nn.Module):
    def __init__(self, num_features, num_classes):
        super(HospitalStayNet256, self).__init__()
        self.fc1 = nn.Linear(num_features, 128)
        self.fc2 = nn.Linear(128, 256)
        self.fc3 = nn.Linear(256, 128)
        self.fc4 = nn.Linear(128, 64)
        self.fc5 = nn.Linear(64, num_classes)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))
        x = self.fc5(x)
        return x
model = HospitalStayNet256(num_features, num_classes).to(device)
optimizer = Adam(model.parameters(), lr=0.001)

### Train the new model

In [15]:
# Training loop
num_epochs = 20

for epoch in range(num_epochs):
    model.train()
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
    
    train_accuracy = calculate_accuracy(train_loader)
    test_accuracy = calculate_accuracy(test_loader)
    
    print(f'Epoch {epoch+1}, Loss: {loss.item():.4f}, Train Accuracy: {train_accuracy:.4f}%, Test Accuracy: {test_accuracy:.4f}%')


Epoch 1, Loss: 1.7144, Train Accuracy: 42.5524%, Test Accuracy: 42.6552%
Epoch 2, Loss: 1.2402, Train Accuracy: 42.5755%, Test Accuracy: 42.7333%
Epoch 3, Loss: 1.7224, Train Accuracy: 43.0081%, Test Accuracy: 43.0424%
Epoch 4, Loss: 1.9438, Train Accuracy: 42.9468%, Test Accuracy: 42.7381%
Epoch 5, Loss: 1.5798, Train Accuracy: 43.2157%, Test Accuracy: 43.0042%
Epoch 6, Loss: 1.5262, Train Accuracy: 43.3858%, Test Accuracy: 42.7859%
Epoch 7, Loss: 1.3699, Train Accuracy: 43.4344%, Test Accuracy: 42.6138%
Epoch 8, Loss: 1.6594, Train Accuracy: 43.8857%, Test Accuracy: 43.0217%
Epoch 9, Loss: 1.2425, Train Accuracy: 43.9717%, Test Accuracy: 42.9006%
Epoch 10, Loss: 1.6484, Train Accuracy: 44.1629%, Test Accuracy: 43.0090%
Epoch 11, Loss: 1.1459, Train Accuracy: 44.2319%, Test Accuracy: 42.7731%
Epoch 12, Loss: 1.7826, Train Accuracy: 44.5346%, Test Accuracy: 42.5533%
Epoch 13, Loss: 1.7301, Train Accuracy: 45.1090%, Test Accuracy: 42.7158%
Epoch 14, Loss: 1.6124, Train Accuracy: 44.9724

#### Note that the more complex model achieves better training accuracy, but the difference the training and test accuracies has increased
##### Epoch 20, Loss: 1.3692, Train Accuracy: 46.4993%, Test Accuracy: 41.6036%

### Define another model with dropout layers

In [16]:
class HospitalStayNetWithDropout(nn.Module):
    def __init__(self, num_features, num_classes):
        super(HospitalStayNetWithDropout, self).__init__()
        self.fc1 = nn.Linear(num_features, 128)
        self.dropout1 = nn.Dropout(0.5)  # Dropout layer with 50% probability
        self.fc2 = nn.Linear(128, 64)
        #self.dropout2 = nn.Dropout(0.5)  # Another Dropout layer
        self.fc3 = nn.Linear(64, num_classes)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.dropout1(x)
        x = F.relu(self.fc2(x))
        # x = self.dropout2(x)
        x = self.fc3(x)
        return x

model = HospitalStayNetWithDropout(num_features, num_classes).to(device)
optimizer = Adam(model.parameters(), lr=0.001)

#### Train the model with the dropout layers

In [17]:
# Training loop
num_epochs = 20

for epoch in range(num_epochs):
    model.train()
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
    
    train_accuracy = calculate_accuracy(train_loader)
    test_accuracy = calculate_accuracy(test_loader)
    
    print(f'Epoch {epoch+1}, Loss: {loss.item():.4f}, Train Accuracy: {train_accuracy:.2f}%, Test Accuracy: {test_accuracy:.2f}%')


Epoch 1, Loss: 1.4371, Train Accuracy: 41.77%, Test Accuracy: 42.19%
Epoch 2, Loss: 1.5844, Train Accuracy: 42.20%, Test Accuracy: 42.49%
Epoch 3, Loss: 1.9316, Train Accuracy: 42.41%, Test Accuracy: 42.67%
Epoch 4, Loss: 1.5281, Train Accuracy: 42.65%, Test Accuracy: 42.91%
Epoch 5, Loss: 1.6003, Train Accuracy: 42.65%, Test Accuracy: 42.88%
Epoch 6, Loss: 1.5556, Train Accuracy: 42.86%, Test Accuracy: 42.95%
Epoch 7, Loss: 1.2876, Train Accuracy: 42.88%, Test Accuracy: 42.98%
Epoch 8, Loss: 1.4784, Train Accuracy: 42.75%, Test Accuracy: 42.77%
Epoch 9, Loss: 1.6877, Train Accuracy: 42.94%, Test Accuracy: 43.01%
Epoch 10, Loss: 1.6729, Train Accuracy: 42.88%, Test Accuracy: 42.97%
Epoch 11, Loss: 1.5647, Train Accuracy: 42.80%, Test Accuracy: 42.99%
Epoch 12, Loss: 1.6370, Train Accuracy: 43.02%, Test Accuracy: 43.07%
Epoch 13, Loss: 1.6155, Train Accuracy: 42.93%, Test Accuracy: 42.97%
Epoch 14, Loss: 1.2246, Train Accuracy: 43.17%, Test Accuracy: 43.10%
Epoch 15, Loss: 1.4217, Train

#### Note that although the model with dropout layers has lesser training accuracy compare to the 5 layer model created earlier, its test accuracy is better and the difference between training and test accuracy is only 1.17
##### Epoch 20, Loss: 1.4657, Train Accuracy: 43.16%, Test Accuracy: 42.99%

### Define another model with Batch Normalization. 

In [18]:
class HospitalStayNetWithBN(nn.Module):
    def __init__(self, num_features, num_classes):
        super(HospitalStayNetWithBN, self).__init__()
        self.fc1 = nn.Linear(num_features, 128)
        self.bn1 = nn.BatchNorm1d(128)
        self.fc2 = nn.Linear(128, 64)
        self.bn2 = nn.BatchNorm1d(64)
        self.fc3 = nn.Linear(64, num_classes)
        
    def forward(self, x):
        x = F.relu(self.bn1(self.fc1(x)))
        x = F.relu(self.bn2(self.fc2(x)))
        x = self.fc3(x)
        return x


model = HospitalStayNetWithBN(num_features, num_classes).to(device)
optimizer = Adam(model.parameters(), lr=0.001)

#### Train the model with batch normalization

In [19]:
# Training loop
num_epochs = 20

for epoch in range(num_epochs):
    model.train()
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
    
    train_accuracy = calculate_accuracy(train_loader)
    test_accuracy = calculate_accuracy(test_loader)
    
    print(f'Epoch {epoch+1}, Loss: {loss.item():.4f}, Train Accuracy: {train_accuracy:.2f}%, Test Accuracy: {test_accuracy:.2f}%')


Epoch 1, Loss: 1.6159, Train Accuracy: 41.81%, Test Accuracy: 42.04%
Epoch 2, Loss: 1.4692, Train Accuracy: 42.60%, Test Accuracy: 42.60%
Epoch 3, Loss: 1.6270, Train Accuracy: 42.81%, Test Accuracy: 42.78%
Epoch 4, Loss: 1.7667, Train Accuracy: 43.06%, Test Accuracy: 42.88%
Epoch 5, Loss: 1.5386, Train Accuracy: 43.26%, Test Accuracy: 42.93%
Epoch 6, Loss: 1.7066, Train Accuracy: 43.36%, Test Accuracy: 42.89%
Epoch 7, Loss: 1.3127, Train Accuracy: 43.36%, Test Accuracy: 43.00%
Epoch 8, Loss: 1.2792, Train Accuracy: 43.53%, Test Accuracy: 42.99%
Epoch 9, Loss: 1.1534, Train Accuracy: 43.61%, Test Accuracy: 43.23%
Epoch 10, Loss: 1.4579, Train Accuracy: 43.57%, Test Accuracy: 42.91%
Epoch 11, Loss: 1.6568, Train Accuracy: 43.81%, Test Accuracy: 43.09%
Epoch 12, Loss: 1.8723, Train Accuracy: 43.62%, Test Accuracy: 42.92%
Epoch 13, Loss: 1.4746, Train Accuracy: 43.70%, Test Accuracy: 42.89%
Epoch 14, Loss: 1.5525, Train Accuracy: 43.84%, Test Accuracy: 42.84%
Epoch 15, Loss: 1.5841, Train

##### Since our problem and thus the model is not highly complex, we do not really need batch normalization in this case. So the performance with and without batch normalization is similar