1. Data Preprocessing
prepare the data for modeling, including dropping unnecessary columns, encoding categorical variables, and splitting the data.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [2]:
# Load your dataset
df = pd.read_csv('modified_output.csv')  # Replace 'your_dataset.csv' with the path to your dataset
#print(df.dtypes)
print(df.head())
#print(f"{col}:\nUnique values: {df[col].unique()}\n")
categorical_features = ['GENDER', 'RURAL(R) /Urban(U)', 'TYPE OF ADMISSION-EMERGENCY/OPD', 'OUTCOME']
# Drop unnecessary columns
df = df.drop(['Serial Number', 'Admission Number', 'Date of Admission', 'Date of Discharge', 'month year'], axis=1)

# Separate categorical and non-categorical columns
non_categorical_df = df.drop(columns=categorical_features)
categorical_df = df[categorical_features]

# Encode categorical variables
one_hot = OneHotEncoder()
one_hot_df = pd.DataFrame(one_hot.fit_transform(categorical_df).toarray(), columns=one_hot.get_feature_names_out(categorical_features))

# Concatenate the one-hot encoded columns to non-categorical columns
transformed_df = pd.concat([non_categorical_df, one_hot_df], axis=1)


   Serial Number Admission Number Date of Admission Date of Discharge  AGE  \
0              1           234735          4/1/2017          4/3/2017   81   
1              2           234696          4/1/2017          4/5/2017   65   
2              3           234882          4/1/2017          4/3/2017   53   
3              4           234635          4/1/2017          4/8/2017   67   
4              5           234486          4/1/2017         4/23/2017   60   

  GENDER RURAL(R) /Urban(U) TYPE OF ADMISSION-EMERGENCY/OPD month year  \
0      M                  R                               E     Apr-17   
1      M                  R                               E     Apr-17   
2      M                  U                               E     Apr-17   
3      F                  U                               E     Apr-17   
4      F                  U                               E     Apr-17   

   DURATION OF STAY  ...  Congenital Heart Disease Urinary tract infection  \
0       

In [3]:
transformed_df.columns
print(transformed_df.dtypes)


AGE                                               int64
DURATION OF STAY                                  int64
duration of intensive unit stay                   int64
SMOKING                                            bool
ALCOHOL                                            bool
Diabetes Mellitus                                  bool
Hypertension                                       bool
Coronary Artery Disease                            bool
CARDIOMYOPATHY                                     bool
CHRONIC KIDNEY DISEASE                             bool
Haemoglobin                                     float64
TOTAL LEUKOCYTES COUNT                          float64
PLATELETS                                       float64
GLUCOSE                                         float64
UREA                                            float64
CREATININE                                      float64
B-TYPE NATRIURETIC PEPTIDE                      float64
RAISED CARDIAC ENZYMES                          

Remove rows where 'one_hot__OUTCOME_DAMA' is present, you can do so by using a boolean mask. and delate the data 

In [4]:
transformed_df = transformed_df[transformed_df['OUTCOME_DAMA'] == 0]
transformed_df = transformed_df.drop(['OUTCOME_DAMA'], axis=1)
transformed_df = transformed_df.dropna()

In [5]:
transformed_df.columns = transformed_df.columns.str.replace(' ', '_')
transformed_df.columns = transformed_df.columns.str.lower()
print(transformed_df.dtypes)
transformed_df = transformed_df.rename(columns={'rural(r)_/urban(u)_u': 'region_u', 'type_of_admission-emergency/opd_e': 'type_of_admission_e', 'outcome_expiry': 'outcome_e'})
transformed_df.drop(['gender_f', 'rural(r)_/urban(u)_r', 'type_of_admission-emergency/opd_o', 'outcome_discharge'], axis=1, inplace=True)
print(transformed_df.dtypes)

age                                               int64
duration_of_stay                                  int64
duration_of_intensive_unit_stay                   int64
smoking                                            bool
alcohol                                            bool
diabetes_mellitus                                  bool
hypertension                                       bool
coronary_artery_disease                            bool
cardiomyopathy                                     bool
chronic_kidney_disease                             bool
haemoglobin                                     float64
total_leukocytes_count                          float64
platelets                                       float64
glucose                                         float64
urea                                            float64
creatinine                                      float64
b-type_natriuretic_peptide                      float64
raised_cardiac_enzymes                          

In [6]:
for col in ['gender_m', 'region_u', 'type_of_admission_e', 'outcome_e']:
    transformed_df[col] = transformed_df[col].astype('int64')
    
print(transformed_df.dtypes)

age                                               int64
duration_of_stay                                  int64
duration_of_intensive_unit_stay                   int64
smoking                                            bool
alcohol                                            bool
diabetes_mellitus                                  bool
hypertension                                       bool
coronary_artery_disease                            bool
cardiomyopathy                                     bool
chronic_kidney_disease                             bool
haemoglobin                                     float64
total_leukocytes_count                          float64
platelets                                       float64
glucose                                         float64
urea                                            float64
creatinine                                      float64
b-type_natriuretic_peptide                      float64
raised_cardiac_enzymes                          

In [7]:
transformed_df.to_csv('clean_data_hospital.csv', index=False)

In [8]:
print(transformed_df.columns)

Index(['age', 'duration_of_stay', 'duration_of_intensive_unit_stay', 'smoking',
       'alcohol', 'diabetes_mellitus', 'hypertension',
       'coronary_artery_disease', 'cardiomyopathy', 'chronic_kidney_disease',
       'haemoglobin', 'total_leukocytes_count', 'platelets', 'glucose', 'urea',
       'creatinine', 'b-type_natriuretic_peptide', 'raised_cardiac_enzymes',
       'ejection_fraction', 'severe_anaemia', 'anaemia', 'stable_angina',
       'acute_coronary_syndrome', 'st_elevation_myocardial_infarction',
       'atypical_chest_pain', 'heart_failure',
       'heart_failure_with_reduced_ejection_fraction',
       'heart_failure_with_normal_ejection_fraction', 'valvular_heart_disease',
       'complete_heart_block', 'sick_sinus_syndrome', 'acute_kidney_injury',
       'cerebrovascular_accident_infract', 'cerebrovascular_accident_bleed',
       'atrial_fibrilation', 'ventricular_tachycardia',
       'paroxysmal_supra_ventricular_tachycardia', 'congenital_heart_disease',
       'urina

In [9]:
for col in transformed_df.columns:
    if transformed_df[col].dtype in ['int64', 'float64']:
        print(f"{col}:\nMin: {transformed_df[col].min()}\nMax: {transformed_df[col].max()}\n")
    else:
        print(f"{col}:\nUnique values: {transformed_df[col].unique()}\n")



age:
Min: 4
Max: 110

duration_of_stay:
Min: 1
Max: 98

duration_of_intensive_unit_stay:
Min: 0
Max: 58

smoking:
Unique values: [False  True]

alcohol:
Unique values: [False  True]

diabetes_mellitus:
Unique values: [ True False]

hypertension:
Unique values: [False  True]

coronary_artery_disease:
Unique values: [False  True]

cardiomyopathy:
Unique values: [False  True]

chronic_kidney_disease:
Unique values: [False  True]

haemoglobin:
Min: 3.0
Max: 22.0

total_leukocytes_count:
Min: 0.1
Max: 261.0

platelets:
Min: 1.38
Max: 1179.0

glucose:
Min: 0.0
Max: 888.0

urea:
Min: 0.1
Max: 479.0

creatinine:
Min: 0.065
Max: 15.63

b-type_natriuretic_peptide:
Min: 0.0
Max: 5000.0

raised_cardiac_enzymes:
Unique values: [ True False]

ejection_fraction:
Min: 0.0
Max: 60.0

severe_anaemia:
Unique values: [False  True]

anaemia:
Unique values: [ True False]

stable_angina:
Unique values: [False  True]

acute_coronary_syndrome:
Unique values: [ True False]

st_elevation_myocardial_infarction:
U

In [10]:
from sklearn.model_selection import train_test_split

# Split the data
if 'duration_of_stay' in transformed_df.columns and 'outcome_e' in transformed_df.columns:
    X = transformed_df.drop(['duration_of_stay', 'outcome_e'], axis=1)
    y_duration = transformed_df['duration_of_stay']
    X_train, X_temp, y_duration_train, y_duration_temp = train_test_split(X, y_duration, test_size=0.3, random_state=42)
    
    X_val, X_test, y_duration_val, y_duration_test = train_test_split(X_temp, y_duration_temp, test_size=0.5, random_state=42)

else:
    print("Columns 'duration_of_stay' and/or 'outcome_e' not found in the dataframe.")


Feature engineering 

In [11]:
# # Example of creating a new feature, e.g., combining AGE and intensive unit stay duration
# X_train['Age_IntensiveStay'] = X_train['AGE'] * X_train['duration of intensive unit stay']
# X_val['Age_IntensiveStay'] = X_val['AGE'] * X_val['duration of intensive unit stay']

# # You need to retrain your chosen regression model with the new feature set
# # Here's how you might retrain the Random Forest as an example
# regressor.fit(X_train, y_duration_train)
# y_pred_val = regressor.predict(X_val)
# mae_new_feature = mean_absolute_error(y_duration_val, y_pred_val)
# print(f'New Feature MAE: {mae_new_feature}')

3. Regression Model for Predicting Duration of Stay
We'll use a Random Forest Regressor as an example. You can apply similar steps for other regression models.


In [12]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Initialize the model
regressor = RandomForestRegressor(random_state=42)

# Train the model
regressor.fit(X_train, y_duration_train)

# Predict on validation set
y_pred_val = regressor.predict(X_val)

# Evaluate the model
mae = mean_absolute_error(y_duration_val, y_pred_val)
print(f'Mean Absolute Error: {mae}')

Mean Absolute Error: 1.781780831739962


In [13]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import numpy as np

# Initialize the base model
regressor = RandomForestRegressor(random_state=42)

# Define the parameter grid to search
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],  # Number of trees in the forest
    'max_depth': [None, 10, 20, 30, 40, 50],  # Maximum number of levels in tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required at each leaf node
    'max_features': ['auto', 'sqrt', 'log2']  # Number of features to consider at every split
}

# Setup the random search with 4-fold cross validation
random_search = RandomizedSearchCV(estimator=regressor, param_distributions=param_grid, n_iter=100, cv=4, verbose=2, random_state=42, n_jobs=-1)

# Fit the random search model
random_search.fit(X_train, y_duration_train)

# Print the best parameters and best score
print("Best Parameters:", random_search.best_params_)
print("Best Score:", -random_search.best_score_)

# Predict on the validation set using the best found parameters
y_pred_val = random_search.predict(X_val)

# Evaluate the model
mae = mean_absolute_error(y_duration_val, y_pred_val)
print(f'Mean Absolute Error after fine-tuning: {mae}')


Fitting 4 folds for each of 100 candidates, totalling 400 fits


120 fits failed out of a total of 400.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
78 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\likhi\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\likhi\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "C:\Users\likhi\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\L

Best Parameters: {'n_estimators': 400, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 30}
Best Score: -0.601078343905942
Mean Absolute Error after fine-tuning: 1.8380118088749051


Alternative Performance Metrics (RMSE and MSE)
To evaluate the model with Root Mean Squared Error (RMSE) and Mean Squared Error (MSE), we can use mean_squared_error from sklearn.metrics and then calculate RMSE from MSE.

In [14]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Calculate MSE
mse = mean_squared_error(y_duration_val, y_pred_val)

# Calculate RMSE
rmse = np.sqrt(mse)

print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')


Mean Squared Error: 8.8322538017619
Root Mean Squared Error: 2.9719107997653462


Regularization with Lasso (L1) or Ridge (L2) Regression
Let's try Lasso for L1 regularization and Ridge for L2 regularization.

In [15]:
from sklearn.linear_model import Lasso, Ridge

# Lasso Regression (L1 Regularization)
lasso_regressor = Lasso(alpha=0.1, random_state=42)  # Adjust alpha as needed
lasso_regressor.fit(X_train, y_duration_train)
y_pred_val_lasso = lasso_regressor.predict(X_val)
mae_lasso = mean_absolute_error(y_duration_val, y_pred_val_lasso)
print(f'Lasso Regression MAE: {mae_lasso}')

# Ridge Regression (L2 Regularization)
ridge_regressor = Ridge(alpha=1, random_state=42)  # Adjust alpha as needed
ridge_regressor.fit(X_train, y_duration_train)
y_pred_val_ridge = ridge_regressor.predict(X_val)
mae_ridge = mean_absolute_error(y_duration_val, y_pred_val_ridge)
print(f'Ridge Regression MAE: {mae_ridge}')


Lasso Regression MAE: 1.9400628037891232
Ridge Regression MAE: 1.8738096251027374


Different Regression Algorithms
Let's implement Linear Regression and Gradient Boosting Regression as alternatives.

In [16]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor

# Linear Regression
linear_regressor = LinearRegression()
linear_regressor.fit(X_train, y_duration_train)
y_pred_val_linear = linear_regressor.predict(X_val)
mae_linear = mean_absolute_error(y_duration_val, y_pred_val_linear)
print(f'Linear Regression MAE: {mae_linear}')

# Gradient Boosting Regression
gbr_regressor = GradientBoostingRegressor(random_state=42)
gbr_regressor.fit(X_train, y_duration_train)
y_pred_val_gbr = gbr_regressor.predict(X_val)
mae_gbr = mean_absolute_error(y_duration_val, y_pred_val_gbr)
print(f'Gradient Boosting Regression MAE: {mae_gbr}')

Linear Regression MAE: 1.8741923990915783
Gradient Boosting Regression MAE: 1.8256202007915394


Using Grid Search CV 

In [17]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

# Define the model
gbr = GradientBoostingRegressor(random_state=42)

# Define the hyperparameter grid
param_grid = {
    'learning_rate': [0.01, 0.1, 0.5, 1],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 8],
    'min_samples_leaf': [10, 20, 30]
}

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=gbr, param_grid=param_grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)

# Fit to the training data
grid_search.fit(X_train, y_duration_train)

# Best parameters and best score
print("Best parameters:", grid_search.best_params_)
print("Best score (negative MAE):", grid_search.best_score_)


Best parameters: {'learning_rate': 0.1, 'max_depth': 8, 'min_samples_leaf': 20, 'n_estimators': 100}
Best score (negative MAE): -1.8583869361249519


Using RandomizedSearchCV

In [18]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint
from sklearn.ensemble import GradientBoostingRegressor

# Define the model
gbr = GradientBoostingRegressor(random_state=42)

# Define the hyperparameter distribution
param_dist = {
    'learning_rate': uniform(0.01, 1),  # Continuous distribution from 0.01 to 1.01
    'n_estimators': randint(100, 400),  # Discrete uniform distribution from 100 to 400
    'max_depth': randint(3, 9),  # Discrete uniform distribution from 3 to 9
    'min_samples_leaf': randint(10, 31)  # Discrete uniform distribution from 10 to 31
}

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=gbr, param_distributions=param_dist, n_iter=50, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1, random_state=42)

# Fit to the training data
random_search.fit(X_train, y_duration_train)

# Best parameters and best score
print("Best parameters:", random_search.best_params_)
print("Best score (negative MAE):", random_search.best_score_)


Best parameters: {'learning_rate': 0.0443885211152184, 'max_depth': 8, 'min_samples_leaf': 26, 'n_estimators': 149}
Best score (negative MAE): -1.8554564818840125


Classification Model for Categorizing Risk Levels

In [19]:
# Extracting the indices of rows that are not 'DAMA' (i.e., either 'DISCHARGE' or 'EXPIRY')
non_dama_indices = transformed_df[transformed_df['outcome_e'].isin([0, 1])].index

# Adjust features dataset to match the risk labels by selecting the same indices
X_risk = X.loc[non_dama_indices]

# Split the data for classification task
X_train_risk, X_temp_risk, y_risk_train, y_risk_temp = train_test_split(X_risk, transformed_df.loc[non_dama_indices, 'outcome_e'], test_size=0.3, random_state=42)
X_val_risk, X_test_risk, y_risk_val, y_risk_test = train_test_split(X_temp_risk, y_risk_temp, test_size=0.5, random_state=42)

# The rest of the classification model code remains the same as before
# Classification Model
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Initialize the model
classifier = RandomForestClassifier(random_state=42)

# Train the model
classifier.fit(X_train_risk, y_risk_train)

# Predict on validation set
y_pred_risk_val = classifier.predict(X_val_risk)

# Evaluate the model
accuracy = accuracy_score(y_risk_val, y_pred_risk_val)
print(f'Accuracy: {accuracy}')


Accuracy: 0.9660611854684512


Saving the training model

In [None]:
import joblib

# Assuming 'regressor' is your duration of stay model and 'classifier' is your risk level classification model
joblib.dump(regressor, 'random_forest_regressor.joblib')
joblib.dump(classifier, 'risk_level_classification_model.joblib')