# Importing Libraries

In [1]:
import pandas as pd
import os
import pickle
import numpy as np
import warnings
from datetime import datetime
import time

In [2]:
import mlflow
from mlflow.exceptions import MlflowException

from mlflow.tracking import MlflowClient

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [3]:
import matplotlib.pyplot as plt

In [4]:
import pandas as pd

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

from sklearn.pipeline import make_pipeline

import xgboost as xgb

import torch
import torch.nn as nn
import torch.optim as optim

# Initiating MLflow

In [112]:
TRACKING_SERVER_HOST = 'EC2_endpoint' # update the endpoint after reboot!!!

# mlflow.set_tracking_uri("sqlite:///mlflow.db") # for local machine
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:5000")
model_name = "xgboost_admission_prediction"
mlflow.set_experiment(model_name)

# initiate Mlflowclient
client = MlflowClient()

In [113]:
print(f"tracking URI: '{mlflow.get_tracking_uri()}'")

tracking URI: 'http://ec2-13-212-244-196.ap-southeast-1.compute.amazonaws.com:5000'


In [114]:
mlflow.search_experiments()

[<Experiment: artifact_location='s3://mlopszoomcamp24-mlflow-artifacts/5', creation_time=1723234445164, experiment_id='5', last_update_time=1723234445164, lifecycle_stage='active', name='xgboost_admission_prediction', tags={}>,
 <Experiment: artifact_location='s3://mlopszoomcamp24-mlflow-artifacts/3', creation_time=1723156401255, experiment_id='3', last_update_time=1723156401255, lifecycle_stage='active', name='admission_prediction', tags={}>,
 <Experiment: artifact_location='s3://mlopszoomcamp24-mlflow-artifacts/1', creation_time=1722006211099, experiment_id='1', last_update_time=1722006211099, lifecycle_stage='active', name='flight_delay_prediction', tags={}>]

# Defining Features

In [8]:
path = "../data/triage.csv"

In [9]:
# Defining numerical and categorical features

numerical_features = ["age", "albumin_last", "albumin_max", "albumin_median", "albumin_min", "bloodculture,routine_count",  "bloodculture,routine_last",
                      "bloodculture,routine_npos", "cc_abdominalcramping", "cc_abdominaldistention", "cc_abdominalpain", "cc_abdominalpainpregnant",
                      "cc_allergicreaction", "cc_bleeding/bruising", "cc_breastpain", "cc_chestpain", "cc_confusion", "cc_diarrhea",
                      "cc_dizziness", "cc_fall>65", "cc_fever", "cc_hallucinations", "cc_headache", "cc_hypertension", "cc_hypotension",
                      "cc_irregularheartbeat", "cc_nausea", "cc_overdose-accidental", "cc_overdose-intentional", "cc_poisoning", "cc_rapidheartrate",
                      "cc_rectalbleeding", "cc_strokealert", "cc_unresponsive", "cc_urinaryretention", "cktotal_last", "cktotal_max",
                      "cktotal_median", "cktotal_min", "d-dimer_last", "d-dimer_max", "d-dimer_median", "d-dimer_min", "esi", "n_admissions", "n_edvisits", "n_surgeries", "platelets_last", "platelets_max", "platelets_median", "platelets_min",
                      "rbc_last", "rbc_max", "rbc_median", "rbc_min", "triage_vital_dbp", "triage_vital_hr", "triage_vital_o2",
                      "triage_vital_o2_device", "triage_vital_rr", "triage_vital_sbp", "triage_vital_temp", "troponini(poc)_last", "troponini(poc)_max",
                      "troponini(poc)_median", "troponini(poc)_min", "troponint_last", "troponint_max", "troponint_median", "troponint_min",
                      "urineculture,routine_count", "urineculture,routine_last", "urineculture,routine_npos", "viralinfect", "wbc_last",
                      "wbc_max", "wbc_median", "wbc_min"]

categorical_features = ['arrivalmode', 'gender', 'previousdispo']

target = 'disposition' # admit or discharge converted into 1 or 0

# Defining Functions

In [117]:
# Functions for data processing

def df_prepare(path):
    df = pd.read_csv(path)
    df.columns = df.columns.str.lower().str.replace(' ', '_')
    
    categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)
    
    for c in categorical_columns:
        df[c] = df[c].str.lower().str.replace(' ', '_')
        df[c] = df[c].str.replace(',', '')
        df[c] = df[c].str.replace(':', '')

    return df

def prepare_dictionaries(df, numerical_features, categorical_features):
    df[numerical_features] = df[numerical_features].fillna(0)
    df = df.dropna(subset=categorical_features)
    
    # Use .loc to avoid SettingWithCopyWarning
    df.loc[:, categorical_features] = df[categorical_features].astype(str)

    return df[numerical_features + categorical_features].to_dict(orient='records')

def split_dataset(path, numerical_features, categorical_features, target):
    df = df_prepare(path)
    
    # Ensure consistent dropping of rows with missing data before splitting
    df[numerical_features] = df[numerical_features].fillna(0)
    df = df.dropna(subset=categorical_features + [target])
    
    train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42)
    valid_df, test_df = train_test_split(temp_df, test_size=0.333, random_state=42) # 0.333 * 0.3 ≈ 0.1

    train_dicts = prepare_dictionaries(train_df, numerical_features, categorical_features)
    valid_dicts = prepare_dictionaries(valid_df, numerical_features, categorical_features)
    test_dicts = prepare_dictionaries(test_df, numerical_features, categorical_features)
    
    y_train = train_df[target].apply(lambda x: 1 if x == 'admit' else 0).values.astype(int)
    y_valid = valid_df[target].apply(lambda x: 1 if x == 'admit' else 0).values.astype(int)
    y_test = test_df[target].apply(lambda x: 1 if x == 'admit' else 0).values.astype(int)
    
    return train_dicts, y_train, valid_dicts, y_valid, test_dicts, y_test


def train_and_log_xgboost(train_dicts, y_train, valid_dicts, y_valid, test_dicts, y_test, params):
    with mlflow.start_run() as run:
        mlflow.set_tag("model", "xgboost")
        
        # Log the parameters
        mlflow.log_params(params)
        
        # Create the pipeline with DictVectorizer and StandardScaler
        pipeline = make_pipeline(
            DictVectorizer(),
            StandardScaler(with_mean=False),
            xgb.XGBClassifier(**params)
        )
        
        # Fit the pipeline on the training data
        pipeline.fit(train_dicts, y_train)
        
        # Cross-validation accuracy on the training data
        accuracies = cross_val_score(estimator=pipeline, X=train_dicts, y=y_train, cv=10)
        cross_val_mean = accuracies.mean()
        mlflow.log_metric("cross_val_mean_accuracy", cross_val_mean)
        
        # Validation accuracy
        y_valid_pred = pipeline.predict(valid_dicts)
        valid_accuracy = accuracy_score(y_valid, y_valid_pred)
        mlflow.log_metric("valid_accuracy", valid_accuracy)
        
        # Test accuracy
        y_test_pred = pipeline.predict(test_dicts)
        test_accuracy = accuracy_score(y_test, y_test_pred)
        mlflow.log_metric("test_accuracy", test_accuracy)
        
        # Log the entire pipeline as a model
        mlflow.sklearn.log_model(pipeline, "xgboost_pipeline_model")
        
        # Optionally, log the model with the XGBoost flavor as well
        xgb_model = pipeline.named_steps['xgbclassifier']
        mlflow.xgboost.log_model(xgb_model, artifact_path="xgboost_model")

        # Register the model
        trained_model = f'{mlflow.search_experiments()[0].artifact_location}/{run.info.run_id}/artifacts/xgboost_pipeline_model'
        registered_model = mlflow.register_model(model_uri=trained_model, name=model_name)
        
        # Wait for the model version to be created
        latest_version = registered_model.version
        
        # Introduce a short delay to ensure the alias is properly assigned
        time.sleep(5)
        
        # Optionally, tag the version with additional information
        date = datetime.today().date()
        client.set_model_version_tag(
            name=model_name,
            version=latest_version,
            key="deployment_date",
            value=str(date)
        )
        
        return valid_accuracy, test_accuracy, run.info.run_id


# Data Processing

In [11]:
# Split the dataset to train/validation/test
train_dicts, y_train, valid_dicts, y_valid, test_dicts, y_test = split_dataset(path, numerical_features, categorical_features, target)

# Model Training and Registry

In [103]:
# training XGBoost with best param
best_param = {
    "objective": 'binary:logistic',
    "n_estimators": 100,
    "max_depth": 8,
    "learning_rate": 0.19,
    "min_child_weight": 5.0,
    "gamma": 0.32,
    "subsample": 0.79,
    "colsample_bytree": 0.85
}

xgb_val, xgb_test, xgb_run_id = train_and_log_xgboost(train_dicts, y_train, valid_dicts, y_valid, test_dicts, y_test)



In [118]:
# XGBoost parameters
params1 = {
    "objective": 'binary:logistic',
    "n_estimators": 100,
    "max_depth": 8,
    "learning_rate": 0.19,
    "min_child_weight": 5.0,
    "gamma": 0.32,
    "subsample": 0.79,
    "colsample_bytree": 0.85
}

params2 = {
    "objective": 'binary:logistic',
    "n_estimators": 100,
    "max_depth": 0,
    "learning_rate": 0.19,
    "min_child_weight": 5.0,
    "gamma": 0.32,
    "subsample": 0.79,
    "colsample_bytree": 0.85
}

params3 = {
    "objective": 'binary:logistic',
    "n_estimators": 100,
    "max_depth": 8,
    "learning_rate": 0.8,
    "min_child_weight": 5.0,
    "gamma": 0.32,
    "subsample": 0.79,
    "colsample_bytree": 0.85
}

In [119]:
for param in [params1, params2, params3]:
    xgb_val, xgb_test, xgb_run_id = train_and_log_xgboost(train_dicts, y_train, valid_dicts, y_valid, test_dicts, y_test, params=param)

Successfully registered model 'xgboost_admission_prediction'.
2024/08/09 21:15:32 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: xgboost_admission_prediction, version 1
Created version '1' of model 'xgboost_admission_prediction'.
Registered model 'xgboost_admission_prediction' already exists. Creating a new version of this model...
2024/08/09 21:21:10 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: xgboost_admission_prediction, version 2
Created version '2' of model 'xgboost_admission_prediction'.
Registered model 'xgboost_admission_prediction' already exists. Creating a new version of this model...
2024/08/09 21:24:34 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: xgboost_admission_prediction, version 3
Created version '3' of model 'xgboost_admission_predi

# Testing Trained Models And Promoting The Best Performing For Production

In [None]:
# Function to test a model version with a pipeline
def test_model(logged_model, test_dicts, y_test):
    try:
        # Load the model as a pipeline from the given URI
        pipeline = mlflow.sklearn.load_model(logged_model)
        
        # Make predictions on the test data
        y_test_pred = pipeline.predict(test_dicts)
        
        # Calculate test accuracy
        test_accuracy = accuracy_score(y_test, y_test_pred)
        print(f"Model URI: {logged_model} - Accuracy: {test_accuracy}")
        return test_accuracy
    except MlflowException as e:
        print(f"Model URI: {logged_model} not found: {e}")
        return 0

# Initialize MLflow client
client = MlflowClient()

# List all model versions and test their accuracy
versions = client.search_model_versions(f"name='{model_name}'")
accuracies = {}
for version in versions:
    # Get the source URI of the model version
    model_uri = version.source
    
    # Test the model and store its accuracy
    accuracy = test_model(logged_model=model_uri, test_dicts=test_dicts, y_test=y_test)
    accuracies[version.version] = accuracy

# Determine the best version
best_version = max(accuracies, key=accuracies.get)
best_accuracy = accuracies[best_version]

# Update aliases
for version in versions:
    if version.version == best_version:
        client.set_registered_model_alias(
            name=model_name,
            alias="Production",
            version=version.version
        )
        client.update_model_version(
            name=model_name,
            version=version.version,
            description=f"The model version {version.version} was assigned alias 'Production' on {date} with accuracy {best_accuracy}"
        )
        print(f"Set alias 'Production' for model version {version.version}")
    else:
        client.set_registered_model_alias(
            name=model_name,
            alias="Staging",
            version=version.version
        )
        print(f"Set alias 'Staging' for model version {version.version}")

# Print final accuracies and versions
print("Model Accuracies:", accuracies)
print(f"Best Model Version: {best_version} with Accuracy: {best_accuracy}")