<a href="https://colab.research.google.com/github/MrDadzie/Sepsis_Classification_Project/blob/master/Data_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Intro
## General

In this project, the CRISP-DM approach is used to analyse a patient dataset and build several machine learning models that predict whether a patient will develop sepsis. The best model is exported and deployed as a web app using FastAPI.


#Setup


#Installation


In [1]:

!pip install pandas_profiling
!pip install shap



#Importation

In [2]:
# Data handling
import pandas as pd
import numpy as np
# Vizualisation (Matplotlib, Plotly, Seaborn, etc. )
import seaborn as sns
import matplotlib.pyplot as plt
import os

# EDA (pandas-profiling, etc. )
from pandas_profiling import ProfileReport
from IPython.display import display

# Feature Processing (Scikit-learn processing, etc. )
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
import scipy.stats as stats
from scipy.stats import skew
from sklearn.pipeline import Pipeline

# Machine Learning (Scikit-learn Estimators, Catboost, LightGBM, etc. )
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier

# Evaluation Metrics
from sklearn.metrics import f1_score, recall_score, precision_score, fbeta_score, accuracy_score, roc_auc_score
import shap
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import RFE

# Hyperparameters Fine-tuning (Scikit-learn hp search, cross-validation, etc. )
from sklearn.model_selection import GridSearchCV

# Other packages
import pickle
from joblib import dump
import warnings
warnings.filterwarnings('ignore')

  @nb.jit


ImportError: cannot import name 'DataError' from 'pandas.core.base' (C:\Users\KWABENABOATENG\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\core\base.py)

#Data Loading

In [None]:
# Raw-file locations of the train and test CSVs in the project's GitHub repository.
train_data_url = (
    'https://raw.githubusercontent.com/MrDadzie/Sepsis_Classification_Project/'
    'master/Datasets/Patients_Files_Train.csv'
)
test_data_url = (
    'https://raw.githubusercontent.com/MrDadzie/Sepsis_Classification_Project/'
    'master/Datasets/Patients_Files_Test.csv'
)

In [None]:
# Read both CSV files directly from their URLs into DataFrames.
train_df, test_df = (pd.read_csv(url) for url in (train_data_url, test_data_url))

In [None]:
# Show the first rows of both frames with the notebook's rich table rendering;
# print() falls back to a plain-text dump that wraps wide frames.
display(train_df.head(), test_df.head())

#Exploratory Data Analysis : EDA

##Renaming Columns
Here, the columns are renamed to help in understanding the fields in the dataset.


In [None]:
# Mapping from the dataset's abbreviated column codes to descriptive names.
new_column_names = {
    'PRG': 'Plasma_Glucose',
    'PL': 'Blood_Work_Result1',
    'PR': 'Blood_Pressure',
    'SK': 'Blood_Work_Result2',
    'TS': 'Blood_Work_Result3',
    'M11': 'Body_mass_index',
    'BD2': 'Blood_Work_Result4',
}

# Apply the same renaming to both frames so their schemas stay aligned.
train_df = train_df.rename(columns=new_column_names)
test_df = test_df.rename(columns=new_column_names)

In [None]:
# Mounting Google Drive is only possible inside Colab. Guard the import so the
# notebook still runs top to bottom on a local kernel, where google.colab does
# not exist.
try:
    from google.colab import drive
except ImportError:
    print("google.colab is not available - skipping Google Drive mount.")
else:
    drive.mount('/content/drive')

##Data Overview

In [None]:
train_df.info()

In [None]:
train_df.describe().T

##Hypothesis

**Null Hypothesis**: There is no significant difference in the likelihood of young and old patients developing sepsis.

**Alternate Hypothesis**: The likelihood of young patients developing sepsis differs significantly from that of old patients.

####Testing

In [None]:
# Split patients into two age cohorts and count sepsis outcomes in each.
# The cohorts must be exhaustive: the previous "> 40" / "< 40" pair silently
# dropped every patient aged exactly 40 from the contingency table.
age_threshold = 40
is_older = train_df['Age'] > age_threshold
is_younger = train_df['Age'] <= age_threshold
is_positive = train_df['Sepssis'] == 'Positive'
is_negative = train_df['Sepssis'] == 'Negative'

# Older patients: Age > 40
Older_pos = train_df[is_older & is_positive]
Older_neg = train_df[is_older & is_negative]
Old = [len(Older_pos), len(Older_neg)]

# Younger patients: Age <= 40
Young_pos = train_df[is_younger & is_positive]
Young_neg = train_df[is_younger & is_negative]
Young = [len(Young_pos), len(Young_neg)]


In [None]:
# Contingency table: one row per age cohort (young, old), columns are the
# [positive, negative] counts built in the previous cell.
observed = np.array([Young, Old])

# Chi-square test of independence between age cohort and sepsis outcome.
chi2_stat, p_value, dof, expected = stats.chi2_contingency(observed)

# Significance level used for the decision below.
alpha = 0.05

for line in (
    f"Chi-square statistic: {chi2_stat}",
    f"P-value: {p_value}",
    f"Degrees of freedom: {dof}",
    "Contingency table of expected frequencies:",
    expected,
):
    print(line)

print(
    "Reject the null hypothesis: There is a significant difference in the likelihood of young and old patients developing sepsis."
    if p_value < alpha
    else "Fail to reject the null hypothesis: There is no significant difference in the likelihood of young and old patients developing sepsis."
)



#Univariate Analysis

In [None]:
train_df.columns


##

In [None]:
# Numeric columns among everything except the last two columns of train_df.
numeric_features = train_df.iloc[:, :-2].select_dtypes(include=np.number)
col_names = numeric_features.columns
col_names

In [None]:
# Visualising the distribution of each numerical column with a KDE plot.
# The grid is sized from the number of columns (two plots per row) instead of a
# hard-coded 4x2 layout, so it neither raises IndexError when there are more
# than eight columns nor leaves blank panels when there are fewer.
plots_per_row = 2
n_rows = max(1, -(-len(col_names) // plots_per_row))  # ceiling division
fig, axes = plt.subplots(nrows=n_rows, ncols=plots_per_row,
                         figsize=(8, 2 * n_rows), squeeze=False)
flat_axes = axes.ravel()

for ax, col in zip(flat_axes, col_names):
    sns.kdeplot(data=train_df, x=col, ax=ax, fill=True)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Density')

    # Summary statistics, computed once per column
    mean_val = train_df[col].mean()
    std_val = train_df[col].std()
    skewness_val = train_df[col].skew()
    kurtosis_val = train_df[col].kurtosis()

    ax.text(0.6, 0.9, f'Mean: {mean_val:.2f}', transform=ax.transAxes)
    ax.text(0.6, 0.8, f'Skewness: {skewness_val:.2f}', transform=ax.transAxes)
    ax.text(0.6, 0.7, f'Kurtosis: {kurtosis_val:.2f}', transform=ax.transAxes)

    ax.axvline(mean_val, color='blue', linestyle='--', label='Mean')

    # Flag values more than three standard deviations from the mean
    is_outlier = (train_df[col] > mean_val + 3 * std_val) | (train_df[col] < mean_val - 3 * std_val)
    outliers = train_df[is_outlier]
    ax.plot(outliers[col], [0] * len(outliers), 'ro', label='Potential Outliers')

    ax.legend(loc="center left")

# Hide any panels left over when the column count is odd
for ax in flat_axes[len(col_names):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()



Insights:


*   List
*   List item



## Bivariate & Multivariate Analysis
Here is the section to explore, analyze, visualize each variable in relation to the others.

In [None]:
# Visualising the distribution of each numeric variable per target class.
# The grid is derived from the number of columns (two plots per row) rather than
# a hard-coded 4x2 layout, so it adapts if the feature set changes.
plots_per_row = 2
n_rows = max(1, -(-len(col_names) // plots_per_row))  # ceiling division
fig, axes = plt.subplots(nrows=n_rows, ncols=plots_per_row,
                         figsize=(12, 3 * n_rows), squeeze=False)
flat_axes = axes.ravel()

# Defining the custom color palettes (also reused by later plotting cells)
color_palette = ['#c7e9ff', '#a1d4ff', '#7ac0ff', '#55abff', '#3296ff']

for ax, col in zip(flat_axes, col_names):
    sns.violinplot(data=train_df, x='Sepssis', y=col, ax=ax, palette=color_palette)
    ax.set_xlabel('Sepssis')
    ax.set_ylabel(col)
    ax.set_title(f'{col} Distribution by Sepssis')

    # Build one text annotation per class: mean, median and quartiles
    annotations = {}
    for label in ('Positive', 'Negative'):
        vals = train_df.loc[train_df['Sepssis'] == label, col]
        annotations[label] = (
            f"{label}:\n"
            f"Mean: {np.mean(vals):.2f}\n"
            f"Median: {np.median(vals):.2f}\n"
            f"25th Percentile: {np.percentile(vals, 25):.2f}\n"
            f"75th Percentile: {np.percentile(vals, 75):.2f}"
        )

    # Positions are in axes coordinates (0-1), hence transform=ax.transAxes
    ax.text(0.3, 0.6, annotations['Positive'], transform=ax.transAxes, color='black', fontsize=8)
    ax.text(0.5, 0.6, annotations['Negative'], transform=ax.transAxes, color='black', fontsize=8)

# Hide any panels left over when the column count is odd
for ax in flat_axes[len(col_names):]:
    ax.set_visible(False)

# Adjust layout and display plot
plt.tight_layout()
plt.show()


In [None]:
# Visualising the correlation between the numerical features.
# Restrict the slice to numeric dtypes first: it still contains the string
# 'ID' column, and DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
corr_matrix = train_df.iloc[:, :-2].select_dtypes(include=np.number).corr()

# Heatmap of the correlation matrix, colour scale fixed to the full [-1, 1] range
plt.figure(figsize=(8, 8))
sns.heatmap(data=corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", vmin=-1, vmax=1)
plt.title("Correlation Matrix")
plt.show()


* Plasma Glucose of patients shows a relatively stronger positive correlation with the age of the patients.
* Beyond this, all features show a weak correlation with each other.




In [None]:
# Categorical variables - bar plots of each category split by target class.
categorical_vars = ['Insurance']
total = len(train_df)

for var in categorical_vars:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.countplot(data=train_df, x=var, hue='Sepssis', palette=color_palette, ax=ax)
    ax.set_xlabel(var)
    ax.set_ylabel('Count')
    ax.set_title(f'{var} Distribution by Sepssis')

    # Label every bar with its own count and its share of all rows.
    # Previously the bars were zipped with the overall target distribution, so
    # only the first two bars were annotated, with percentages unrelated to them.
    for bar in ax.patches:
        count = bar.get_height()
        if not count > 0:  # skips empty and NaN-height placeholder patches
            continue
        ax.annotate(
            f'{int(count)}\n{100 * count / total:.1f}%',
            (bar.get_x() + bar.get_width() / 2, count),
            ha='center', va='bottom',
        )

    plt.tight_layout()
    plt.show()

# Feature Processing & Engineering
Here is the section to **clean**, **process** the dataset and **create new features**.

## Drop Duplicates

In [None]:
# Boolean masks marking rows that repeat an earlier row, one per frame.
train_duplicate_rows, test_duplicate_rows = train_df.duplicated(), test_df.duplicated()

# Report how many duplicated rows each frame contains.
for frame_label, duplicate_mask in (("train_df", train_duplicate_rows), ("test_df", test_duplicate_rows)):
    print(f"Number of duplicate rows in {frame_label}:", duplicate_mask.sum())

## Impute Missing Values

In [None]:
# Count missing values per column in each frame (isna() flags NaN/None cells,
# sum() totals them column-wise) and print both summaries.
missing_traindf = train_df.isna().sum()
missing_testdf = test_df.isna().sum()
print(missing_traindf, missing_testdf)

### Insights:



*   There are no duplicated rows in both the train and test datasets
*   Again, there are no missing values in both datasets



## Features Encoding

In [None]:
# From sklearn.preprocessing use LabelEncoder to encode the target column.
def encode_target_variable(data, target_variable):
    """Return a copy of ``data`` with the target label-encoded and 'ID' removed.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the target column (and optionally an 'ID' column).
    target_variable : str
        Name of the column to encode with ``LabelEncoder``.

    Returns
    -------
    pandas.DataFrame
        The feature columns in their original order (without 'ID'), followed by
        the integer-encoded target as the last column. ``data`` is not modified.

    Raises
    ------
    KeyError
        If ``target_variable`` is not a column of ``data``.
    """
    label_encoder = LabelEncoder()
    encoded_target = label_encoder.fit_transform(data[target_variable])

    # Drop the target by name rather than by position so the function does not
    # depend on it being the last column; 'ID' is dropped only if present.
    data_encoded = data.drop(columns=[target_variable, 'ID'], errors='ignore')

    # Assign the encoded array directly: this keeps the rows aligned even when
    # ``data`` has a non-default index (a concat on a fresh RangeIndex would not).
    data_encoded[target_variable] = encoded_target

    return data_encoded

In [None]:
# Label-encode the 'Sepssis' target of the training frame.
train_df_encoded = encode_target_variable(data=train_df, target_variable='Sepssis')

# Preview the encoded training data.
print(train_df_encoded.head())

## Dataset Splitting

In [None]:
def split_data(X, y, test_size, random_state=42, stratify=None):
    """Split features and target into training and evaluation parts.

    Delegates to ``train_test_split`` with the given ``test_size``,
    ``random_state`` and ``stratify`` arguments and returns the tuple
    ``(X_train, X_eval, y_train, y_eval)``.
    """
    return tuple(
        train_test_split(
            X,
            y,
            test_size=test_size,
            random_state=random_state,
            stratify=stratify,
        )
    )

# Split the data into train and validation sets for both X and y.
# The target is taken as a 1-D Series (iloc[:, -1]) rather than a one-column
# DataFrame (iloc[:, -1:]): scikit-learn estimators expect a 1-D y and emit a
# DataConversionWarning otherwise.
features = train_df_encoded.iloc[:, :-1]
target = train_df_encoded.iloc[:, -1]

# Stratify on the target so both parts keep the original class proportions.
X_train, X_eval, y_train, y_eval = split_data(features, target, test_size=0.2, random_state=42, stratify=target)

# Print the shapes of the train and validation sets
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_eval shape:", X_eval.shape)
print("y_eval shape:", y_eval.shape)

## Features Scaling

In [None]:
# Learn the scaling statistics from the training features only, so nothing
# from the evaluation set influences the scaler.
scaler = StandardScaler().fit(X_train)

# Apply the fitted scaler to both the training and the evaluation features.
X_train_scaled = scaler.transform(X_train)
X_eval_scaled = scaler.transform(X_eval)

## Optional: Train Dataset Balancing

In [None]:
# Count training samples per encoded class. np.ravel accepts y_train as either
# a Series or a one-column DataFrame. Sorting by class label (not by count)
# guarantees each tick label belongs to its bar; the labels come from the
# data instead of a hand-typed list (which contained the letter 'O' for zero).
class_counts = pd.Series(np.ravel(y_train)).value_counts().sort_index()
class_counts.index = class_counts.index.astype(str)

class_colors = ['green', 'red']

# Create a bar plot
ax = class_counts.plot(kind='bar', color=class_colors)
plt.title('Class Imbalance', fontsize=11, fontweight='bold')
plt.xlabel('Class', fontsize=9)
plt.ylabel('Count', fontsize=9)

# Keep the x-axis labels horizontal
ax.set_xticklabels(class_counts.index, rotation=0)
plt.show()

Clearly, there's an imbalance in the dataset. The strategy used here is to oversample the minority class with SMOTE.

In [None]:
# Use Over-sampling/Under-sampling methods, more details here: https://imbalanced-learn.org/stable/install.html
# SMOTE synthesises new samples at random, so it is seeded to keep the
# resampled training set (and every score derived from it) reproducible.
oversample = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = oversample.fit_resample(X_train_scaled, y_train)
X_train_resampled.shape, y_train_resampled.shape

In [None]:
# Check that the target classes are balanced after resampling
y_train_resampled.value_counts()

The target variables are balanced.

# Machine Learning Modeling
Here is the section to **build**, **train**, **evaluate** and **compare** the models to each others.

## Simple Model #001 logistic_regression

In [None]:
# Logistic-regression baseline with a fixed seed, fitted on the resampled
# training data. fit() returns the estimator, which is then displayed.
lr_model = LogisticRegression(random_state=42)
lr_model = lr_model.fit(X_train_resampled, y_train_resampled)
lr_model

In [None]:
# Hard class predictions on the evaluation set (for the threshold-based metrics)
lr_model_preds = lr_model.predict(X_eval_scaled)
# Positive-class probabilities (for ROC AUC)
lr_model_scores = lr_model.predict_proba(X_eval_scaled)[:, 1]

# Calculate evaluation metrics
lr_f1_score = f1_score(y_eval, lr_model_preds)
lr_recall = recall_score(y_eval, lr_model_preds)
lr_precision = precision_score(y_eval, lr_model_preds)
lr_f2_score = fbeta_score(y_eval, lr_model_preds, beta=2)
lr_accuracy = accuracy_score(y_eval, lr_model_preds)

# ROC AUC measures ranking quality, so it must be fed continuous scores;
# passing 0/1 predictions collapses the curve to a single operating point.
lr_auc_score = roc_auc_score(y_eval, lr_model_scores)

In [None]:
# Report the logistic-regression metrics, one per line.
for metric_label, metric_value in (
    ("F1 Score:", lr_f1_score),
    ("Recall Score:", lr_recall),
    ("Precision Score:", lr_precision),
    ("F2 Score:", lr_f2_score),
    ("Accuracy Score:", lr_accuracy),
    ("ROC AUC Score:", lr_auc_score),
):
    print(metric_label, metric_value)

## Simple Model #002 Decision trees

In [None]:
# Instantiate the decision tree with a fixed seed: the tree breaks ties between
# equally good splits at random, so an unseeded model gives different scores
# on each run.
tree_classifier = DecisionTreeClassifier(random_state=42)

# Train on the resampled (balanced) training data
tree_classifier.fit(X_train_resampled, y_train_resampled)

In [None]:
# Hard class predictions on the evaluation set (for the threshold-based metrics)
tree_classifier_preds = tree_classifier.predict(X_eval_scaled)
# Positive-class probabilities (for ROC AUC)
tree_classifier_scores = tree_classifier.predict_proba(X_eval_scaled)[:, 1]

# Calculate evaluation metrics
tree_classifier_f1_score = f1_score(y_eval, tree_classifier_preds)
tree_classifier_recall = recall_score(y_eval, tree_classifier_preds)
tree_classifier_precision = precision_score(y_eval, tree_classifier_preds)
tree_classifier_f2_score = fbeta_score(y_eval, tree_classifier_preds, beta=2)
tree_classifier_accuracy = accuracy_score(y_eval, tree_classifier_preds)

# ROC AUC measures ranking quality, so it must be fed continuous scores;
# passing 0/1 predictions collapses the curve to a single operating point.
tree_classifier_auc_score = roc_auc_score(y_eval, tree_classifier_scores)


In [None]:
# Report the decision-tree metrics, one per line.
for metric_label, metric_value in (
    ("F1 Score:", tree_classifier_f1_score),
    ("Recall Score:", tree_classifier_recall),
    ("Precision Score:", tree_classifier_precision),
    ("F2 Score:", tree_classifier_f2_score),
    ("Accuracy Score:", tree_classifier_accuracy),
    ("ROC AUC Score:", tree_classifier_auc_score),
):
    print(metric_label, metric_value)

## Simple Model #003 - Random Forest Classifier

In [None]:
# Instantiate the random forest with a fixed seed: bootstrap sampling and
# feature sub-sampling are random, so an unseeded forest is not reproducible.
# The seed also carries over to the clones GridSearchCV makes of this estimator.
# NOTE(review): 10000 trees is very large for this dataset and slow to fit -
# consider a few hundred; left unchanged to keep reported results comparable.
rf_classifier = RandomForestClassifier(n_estimators=10000, random_state=42)

# Train on the resampled (balanced) training data
rf_classifier.fit(X_train_resampled, y_train_resampled)

In [None]:
# Hard class predictions on the evaluation set (for the threshold-based metrics)
rf_classifier_preds = rf_classifier.predict(X_eval_scaled)
# Positive-class probabilities (for ROC AUC)
rf_classifier_scores = rf_classifier.predict_proba(X_eval_scaled)[:, 1]

# Calculate evaluation metrics
rf_classifier_f1_score = f1_score(y_eval, rf_classifier_preds)
rf_classifier_recall = recall_score(y_eval, rf_classifier_preds)
rf_classifier_precision = precision_score(y_eval, rf_classifier_preds)
rf_classifier_f2_score = fbeta_score(y_eval, rf_classifier_preds, beta=2)
rf_classifier_accuracy = accuracy_score(y_eval, rf_classifier_preds)

# ROC AUC measures ranking quality, so it must be fed continuous scores;
# passing 0/1 predictions collapses the curve to a single operating point.
rf_classifier_auc_score = roc_auc_score(y_eval, rf_classifier_scores)

In [None]:
# Report the random-forest metrics, one per line.
for metric_label, metric_value in (
    ("F1 Score:", rf_classifier_f1_score),
    ("Recall Score:", rf_classifier_recall),
    ("Precision Score:", rf_classifier_precision),
    ("F2 Score:", rf_classifier_f2_score),
    ("Accuracy Score:", rf_classifier_accuracy),
    ("ROC AUC Score:", rf_classifier_auc_score),
):
    print(metric_label, metric_value)

## Simple Model #004 - XGBClassifier

In [None]:
# Gradient-boosted trees with 10000 boosting rounds, fitted on the resampled
# training data. fit() returns the estimator, which is then displayed.
xgb_classifier = XGBClassifier(n_estimators=10000)
xgb_classifier = xgb_classifier.fit(X_train_resampled, y_train_resampled)
xgb_classifier

In [None]:
# Hard class predictions on the evaluation set (for the threshold-based metrics)
xgb_classifier_preds = xgb_classifier.predict(X_eval_scaled)
# Positive-class probabilities (for ROC AUC)
xgb_classifier_scores = xgb_classifier.predict_proba(X_eval_scaled)[:, 1]

# Calculate evaluation metrics
xgb_classifier_f1_score = f1_score(y_eval, xgb_classifier_preds)
xgb_classifier_recall = recall_score(y_eval, xgb_classifier_preds)
xgb_classifier_precision = precision_score(y_eval, xgb_classifier_preds)
xgb_classifier_f2_score = fbeta_score(y_eval, xgb_classifier_preds, beta=2)
xgb_classifier_accuracy = accuracy_score(y_eval, xgb_classifier_preds)

# ROC AUC measures ranking quality, so it must be fed continuous scores;
# passing 0/1 predictions collapses the curve to a single operating point.
xgb_classifier_auc_score = roc_auc_score(y_eval, xgb_classifier_scores)

In [None]:
# Report the XGBoost metrics, one per line.
for metric_label, metric_value in (
    ("F1 Score:", xgb_classifier_f1_score),
    ("Recall Score:", xgb_classifier_recall),
    ("Precision Score:", xgb_classifier_precision),
    ("F2 Score:", xgb_classifier_f2_score),
    ("Accuracy Score:", xgb_classifier_accuracy),
    ("ROC AUC Score:", xgb_classifier_auc_score),
):
    print(metric_label, metric_value)

## Simple Model #005 - Naive Bayes model

In [None]:
# Gaussian Naive Bayes with default settings, fitted on the resampled
# training data. fit() returns the estimator, which is then displayed.
nb_classifier = GaussianNB()
nb_classifier = nb_classifier.fit(X_train_resampled, y_train_resampled)
nb_classifier

In [None]:
# Hard class predictions on the evaluation set (for the threshold-based metrics)
nb_classifier_preds = nb_classifier.predict(X_eval_scaled)
# Positive-class probabilities (for ROC AUC)
nb_classifier_scores = nb_classifier.predict_proba(X_eval_scaled)[:, 1]

# Calculate evaluation metrics
nb_classifier_f1_score = f1_score(y_eval, nb_classifier_preds)
nb_classifier_recall = recall_score(y_eval, nb_classifier_preds)
nb_classifier_precision = precision_score(y_eval, nb_classifier_preds)
nb_classifier_f2_score = fbeta_score(y_eval, nb_classifier_preds, beta=2)
nb_classifier_accuracy = accuracy_score(y_eval, nb_classifier_preds)

# ROC AUC measures ranking quality, so it must be fed continuous scores;
# passing 0/1 predictions collapses the curve to a single operating point.
nb_classifier_auc_score = roc_auc_score(y_eval, nb_classifier_scores)

In [None]:
# Report the Naive Bayes metrics, one per line.
for metric_label, metric_value in (
    ("F1 Score:", nb_classifier_f1_score),
    ("Recall Score:", nb_classifier_recall),
    ("Precision Score:", nb_classifier_precision),
    ("F2 Score:", nb_classifier_f2_score),
    ("Accuracy Score:", nb_classifier_accuracy),
    ("ROC AUC Score:", nb_classifier_auc_score),
):
    print(metric_label, metric_value)

## Simple Model #006 - Stochastic Gradient Descent

In [None]:
# Instantiate the SGD classifier with a fixed seed: it shuffles the training
# data each epoch, so an unseeded model gives different scores on each run.
sgd_classifier = SGDClassifier(random_state=42)

# Train on the resampled (balanced) training data
sgd_classifier.fit(X_train_resampled, y_train_resampled)

In [None]:
# Hard class predictions on the evaluation set (for the threshold-based metrics)
sgd_classifier_preds = sgd_classifier.predict(X_eval_scaled)
# Signed distances to the decision boundary (for ROC AUC). SGDClassifier with
# its default hinge loss has no predict_proba, and any monotonic score is a
# valid ROC AUC input.
sgd_classifier_scores = sgd_classifier.decision_function(X_eval_scaled)

# Calculate evaluation metrics
sgd_classifier_f1_score = f1_score(y_eval, sgd_classifier_preds)
sgd_classifier_recall = recall_score(y_eval, sgd_classifier_preds)
sgd_classifier_precision = precision_score(y_eval, sgd_classifier_preds)
sgd_classifier_f2_score = fbeta_score(y_eval, sgd_classifier_preds, beta=2)
sgd_classifier_accuracy = accuracy_score(y_eval, sgd_classifier_preds)

# ROC AUC measures ranking quality, so it must be fed continuous scores;
# passing 0/1 predictions collapses the curve to a single operating point.
sgd_classifier_auc_score = roc_auc_score(y_eval, sgd_classifier_scores)

In [None]:
# Report the SGD classifier metrics, one per line.
for metric_label, metric_value in (
    ("F1 Score:", sgd_classifier_f1_score),
    ("Recall Score:", sgd_classifier_recall),
    ("Precision Score:", sgd_classifier_precision),
    ("F2 Score:", sgd_classifier_f2_score),
    ("Accuracy Score:", sgd_classifier_accuracy),
    ("ROC AUC Score:", sgd_classifier_auc_score),
):
    print(metric_label, metric_value)

## Models comparison

In [None]:
# Pair each baseline model's display name with its evaluation-set F1 score.
baseline_f1_scores = [
    ('Logistic Regression', lr_f1_score),
    ('Decision Tree', tree_classifier_f1_score),
    ('Random Forest', rf_classifier_f1_score),
    ('XGBoost', xgb_classifier_f1_score),
    ('Naive Bayes', nb_classifier_f1_score),
    ('Stochastic Gradient Descent', sgd_classifier_f1_score),
]

# Column-oriented dict for the comparison table; 'Details' is left blank.
results = {
    'model': [model_name for model_name, _ in baseline_f1_scores],
    'f1_score': [model_f1 for _, model_f1 in baseline_f1_scores],
    'Details': [''] * len(baseline_f1_scores),
}

results_df = pd.DataFrame(results)

In [None]:
results_df.sort_values(by = 'f1_score', ascending = False)

The best-performing models are Logistic Regression, Random Forest, Naive Bayes and XGBoost.

## Hyperparameters tuning

Fine-tune the Top-k models (3 < k < 5) using `GridSearchCV` (from `sklearn.model_selection`) to find the best hyperparameters and achieve the maximum performance of each of the Top-k models, then compare them again to select the best one.

###### Logistic Regression

In [None]:
# Logistic-regression search space: regularisation strength, penalty type,
# solver and iteration cap.
param_grid = dict(
    C=[10, 30, 50, 70, 80, 100],
    penalty=['l1', 'l2'],
    solver=['liblinear', 'saga'],
    max_iter=[10, 20, 30, 40, 50],
)

# Metrics recorded for every candidate; the dict keys are the names used to
# refer to each metric in the search results.
scoring = {'F1': 'f1', 'ROC AUC': 'roc_auc'}

In [None]:
# Perform grid search to find the best hyperparameters.
# Both metrics are recorded; refit='F1' selects and refits the candidate with
# the best mean cross-validated F1.
lr_grid_search = GridSearchCV(estimator=lr_model, param_grid=param_grid, scoring=scoring, cv=5, refit='F1')
lr_grid_search.fit(X_train_resampled, y_train_resampled)

# Store the results. The ROC AUC is the cross-validated score of the selected
# candidate, so it is measured the same way as best_f1_score; scoring the
# refitted model on its own training data gave an optimistic value.
lr_tuned_results = {
    'best_params': lr_grid_search.best_params_,
    'best_estimator': lr_grid_search.best_estimator_,
    'best_f1_score': lr_grid_search.best_score_,
    'best_roc_auc_score': lr_grid_search.cv_results_['mean_test_ROC AUC'][lr_grid_search.best_index_],
}

In [None]:
lr_tuned_results

##### Random Forest Classifier

In [None]:
# Define the hyperparameters grid.
# 'auto' is no longer listed for max_features: it was deprecated in
# scikit-learn 1.1 and removed in 1.3, and for classifiers it meant 'sqrt',
# so dropping it removes no distinct candidate.
param_grid = {
        'n_estimators': [100,1000],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2,5,4],
        'min_samples_leaf': [1,2,4] ,
        'max_features':['sqrt','log2']
    }

# Define scoring metrics; the dict keys are the names used to refer to each
# metric in the search results.
scoring = {
        'F1': 'f1',
        'ROC AUC': 'roc_auc'
    }

In [None]:
# Perform grid search to find the best hyperparameters.
# Both metrics are recorded; refit='F1' selects and refits the candidate with
# the best mean cross-validated F1.
rf_grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, scoring=scoring, cv=5, refit='F1')
rf_grid_search.fit(X_train_resampled, y_train_resampled)

# Store the results. The ROC AUC is the cross-validated score of the selected
# candidate, so it is measured the same way as best_f1_score; scoring the
# refitted model on its own training data gave an optimistic value.
rf_tuned_results = {
    'best_params': rf_grid_search.best_params_,
    'best_estimator': rf_grid_search.best_estimator_,
    'best_f1_score': rf_grid_search.best_score_,
    'best_roc_auc_score': rf_grid_search.cv_results_['mean_test_ROC AUC'][rf_grid_search.best_index_],
}

In [None]:
rf_tuned_results

##### Naive Bayes

In [None]:
# Define the hyperparameters grid.
# Each priors entry needs one probability per class and must sum to 1. The
# target is binary, so the previous three-element prior [0.1, 0.3, 0.6] made
# every fit that used it fail. None lets the model estimate priors from the data.
# NOTE(review): the two explicit priors are illustrative - confirm the intended values.
param_grid = {
    'priors': [None, [0.3, 0.7], [0.7, 0.3]],
    'var_smoothing': [1e-9, 1e-8, 1e-7]
}
# Define scoring metrics; the dict keys are the names used to refer to each
# metric in the search results.
scoring = {
        'F1': 'f1',
        'ROC AUC': 'roc_auc'
    }

In [None]:
# Perform grid search to find the best hyperparameters.
# Both metrics are recorded; refit='F1' selects and refits the candidate with
# the best mean cross-validated F1.
nb_grid_search = GridSearchCV(estimator=nb_classifier, param_grid=param_grid, scoring=scoring, cv=5, refit='F1')
nb_grid_search.fit(X_train_resampled, y_train_resampled)

# Store the results. The ROC AUC is the cross-validated score of the selected
# candidate, so it is measured the same way as best_f1_score; scoring the
# refitted model on its own training data gave an optimistic value.
nb_tuned_results = {
    'best_params': nb_grid_search.best_params_,
    'best_estimator': nb_grid_search.best_estimator_,
    'best_f1_score': nb_grid_search.best_score_,
    'best_roc_auc_score': nb_grid_search.cv_results_['mean_test_ROC AUC'][nb_grid_search.best_index_],
}

In [None]:
nb_tuned_results

#### XGB Classifier

In [None]:
# XGBoost search space: tree depth, learning rate and number of boosting rounds.
param_grid = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01],
    n_estimators=[100, 200, 300],
)

# Metrics recorded for every candidate; the dict keys are the names used to
# refer to each metric in the search results.
scoring = {'F1': 'f1', 'ROC AUC': 'roc_auc'}

In [None]:
# Perform grid search to find the best hyperparameters.
# Both metrics are recorded; refit='F1' selects and refits the candidate with
# the best mean cross-validated F1.
xgb_grid_search = GridSearchCV(estimator=xgb_classifier, param_grid=param_grid, scoring=scoring, cv=5, refit='F1')
xgb_grid_search.fit(X_train_resampled, y_train_resampled)

# Store the results. The ROC AUC now comes from the XGBoost search itself:
# the previous version computed it from nb_grid_search (a copy-paste slip), so
# the XGBoost row reported the Naive Bayes model's score. It is the
# cross-validated score of the selected candidate, measured the same way as
# best_f1_score.
xgb_tuned_results = {
    'best_params': xgb_grid_search.best_params_,
    'best_estimator': xgb_grid_search.best_estimator_,
    'best_f1_score': xgb_grid_search.best_score_,
    'best_roc_auc_score': xgb_grid_search.cv_results_['mean_test_ROC AUC'][xgb_grid_search.best_index_],
}

In [None]:
xgb_tuned_results

## Hyperparameter tuning results comparison

In [None]:

# Tuned results keyed by display name. Building the table from this single
# mapping keeps each label tied to its own scores; the previous hand-aligned
# lists labelled the Naive Bayes results as 'SGD Classifier'.
tuned_results_by_model = {
    'Logistic_Regression': lr_tuned_results,
    'Random Forest': rf_tuned_results,
    'Naive Bayes': nb_tuned_results,
    'XGB Classifier': xgb_tuned_results,
}

results_new = {
    'model': list(tuned_results_by_model),
    'f1_score': [tuned['best_f1_score'] for tuned in tuned_results_by_model.values()],
    'AUC_score': [tuned['best_roc_auc_score'] for tuned in tuned_results_by_model.values()],
}

results_new_df = pd.DataFrame(results_new)

# Best F1 first
results_new_df.sort_values(by='f1_score', ascending=False)

From the table above , the best performing model is the Random Forest Classifier.


### Evaluation on Test Data

In [None]:
model = rf_tuned_results['best_estimator']
model

In [None]:
# Prepare the test data: remove the 'ID' column, then apply the scaler that
# was fitted on the training features.
X_test_new = test_df.drop(columns='ID')
X_test_scaled = scaler.transform(X_test_new)


In [None]:
#Running Predictions on the test data
test_preds = model.predict(X_test_scaled)
test_preds

In [None]:
# Save the test predictions as a single-column CSV in the working directory.
test_predictions = pd.DataFrame({'Sepsis': test_preds})
test_predictions.to_csv('test_predictions.csv')

# Export Key Components

In [None]:
# Export the fitted scaler and the selected model so the web app can load them.

# Destination: an "Assets" folder inside the current working directory
cwd = os.getcwd()
destination = os.path.join(cwd, "Assets")

# Create the "Assets" directory if it doesn't exist
os.makedirs(destination, exist_ok=True)

# Export the scaler
scaler_filepath = os.path.join(destination, "scaler.joblib")
dump(scaler, scaler_filepath)

# Export the selected model (the tuned random forest chosen above)
model_filepath = os.path.join(destination, "model.joblib")
dump(model, model_filepath)

# Print the paths to the exported components
print(f"Scaler exported to: {scaler_filepath}")
print(f"Random Forest Classifier exported to: {model_filepath}")

In [None]:
#Exporting all libraries
!pip list --format=freeze >Assets/requirements.txt