In [None]:
import warnings
from pprint import pprint

import pandas as pd
import mlflow
from mlflow.models import infer_signature

warnings.filterwarnings("ignore")
warnings.simplefilter("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from dotenv import load_dotenv
import os

dotenv_path = "../.env"
load_dotenv(dotenv_path)

MLFLOW_TRACKING_SERVER_HOST = os.getenv("MLFLOW_TRACKING_SERVER_HOST")
MLFLOW_TRACKING_SERVER_PORT = os.getenv("MLFLOW_TRACKING_SERVER_PORT")
MLFLOW_MODEL_NAME = os.getenv("MLFLOW_MODEL_NAME")
MLFLOW_MODEL_ALIAS = os.getenv("MLFLOW_MODEL_ALIAS")

MINIO_HOST = os.getenv('MINIO_HOST')
MINIO_API_PORT = os.getenv('MINIO_API_PORT')



# Minio


In [3]:
# Set MinIO Credentials
os.environ["AWS_ACCESS_KEY_ID"] = os.getenv("MINIO_ACCESS_KEY")
os.environ["AWS_SECRET_ACCESS_KEY"] = os.getenv("MINIO_SECRET_ACCESS_KEY")
os.environ["AWS_DEFAULT_REGION"] = os.getenv("AWS_REGION")

os.environ["MLFLOW_S3_ENDPOINT_URL"] = (
    f"http://{MINIO_HOST}:{MINIO_API_PORT}"
)

# Test if credentials are set correctly
import boto3

s3 = boto3.client(
    "s3",
    endpoint_url=os.getenv("MLFLOW_S3_ENDPOINT_URL"),
)

# MLFlow


In [4]:
import mlflow

mlflow.set_tracking_uri(uri=f"http://{MLFLOW_TRACKING_SERVER_HOST}:{MLFLOW_TRACKING_SERVER_PORT}")
mlflow.set_experiment("4th year project")

2025/03/04 23:58:45 INFO mlflow.tracking.fluent: Experiment with name '4th year project' does not exist. Creating a new experiment.


<Experiment: artifact_location='s3://mlflow/1', creation_time=1741121925903, experiment_id='1', last_update_time=1741121925903, lifecycle_stage='active', name='4th year project', tags={}>

In [5]:
df = pd.read_csv("data.csv")

In [None]:
categorical_columns = [
    "gender",
    "pregnancy_status",
    "known_allergy",
    "rechallenge",
    "dechallenge",
    "severity",
    "is_serious",
    "criteria_for_seriousness",
    "action_taken",
    "outcome",
]

target_column = "causality_assessment_level"

In [7]:
from sklearn.model_selection import train_test_split

X = df[[*categorical_columns]]
y = df[target_column]


X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.25)


train_df = pd.concat([X_train, y_train], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

del X_train
del X_test
del y_train
del y_test

In [8]:
# from helper_functions import categorical_eda

# categorical_eda(df=train_df, columns=categorical_columns, target_column=target_column)

In [9]:
import joblib

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder


one_hot_encoder = OneHotEncoder(sparse_output=False)

# train_df_cat_encoded = pd.get_dummies(train_df[categorical_columns])
train_df_cat_encoded = one_hot_encoder.fit_transform(train_df[categorical_columns])


train_df_cat_encoded = pd.DataFrame(
    train_df_cat_encoded,
    columns=one_hot_encoder.get_feature_names_out(categorical_columns),
)


# test_df_cat_encoded = pd.get_dummies(test_df[categorical_columns])
test_df_cat_encoded = one_hot_encoder.transform(test_df[categorical_columns])
test_df_cat_encoded = pd.DataFrame(
    test_df_cat_encoded,
    columns=one_hot_encoder.get_feature_names_out(categorical_columns),
)
# target_column_mappings = {
#     "certain": 1,
#     "likely": 2,
#     "possible": 3,
#     "unlikely": 4,
#     "unclassified": 5,
#     "unclassifiable": 6,
# }

ordinal_encoder = OrdinalEncoder(
    categories=[
        ["certain", "likely", "possible", "unlikely", "unclassified", "unclassifiable"]
    ]
)

train_target_column_encoded = pd.DataFrame()
# train_target_column_encoded[target_column] = train_df[target_column].map(
#     target_column_mappings
# )
train_target_column_encoded[target_column] = ordinal_encoder.fit_transform(
    train_df[[target_column]]
).ravel()

test_target_column_encoded = pd.DataFrame()
# test_target_column_encoded[target_column] = test_df[target_column].map(
#     target_column_mappings
# )

test_target_column_encoded[target_column] = ordinal_encoder.transform(
    test_df[[target_column]]
).ravel()

temp_artifacts_path = "temp_artifacts"
one_hot_encoder_path = f"{temp_artifacts_path}/one_hot_encoder.pkl"
ordinal_encoder_path = f"{temp_artifacts_path}/ordinal_encoder.pkl"
joblib.dump(one_hot_encoder, one_hot_encoder_path)
joblib.dump(ordinal_encoder, ordinal_encoder_path)


['temp_artifacts/ordinal_encoder.pkl']

# Feature


In [10]:
from sklearn.feature_selection import RFECV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier

rfecv_cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)

rfecv = RFECV(
    estimator=DecisionTreeClassifier(),
    step=1,
    min_features_to_select=1,
    cv=rfecv_cv,
    scoring="f1_weighted",
    n_jobs=-1,
    verbose=2,
)

prediction_columns = [
    "rechallenge_yes",
    "rechallenge_no",
    "rechallenge_unknown",
    "rechallenge_na",
    "dechallenge_yes",
    "dechallenge_no",
    "dechallenge_unknown",
    "dechallenge_na",
]

X = train_df_cat_encoded[prediction_columns]
y = train_target_column_encoded

rfecv.fit(X, y)

print(f"Optimal number of features: {rfecv.n_features_}")
print(f"Optimal features: {list(rfecv.get_feature_names_out())}")
# print(rfecv.cv_results_)

Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 5 features.
Fitting estimator with 4 features.
Fitting estimator with 3 features.
Fitting estimator with 2 features.
Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 5 features.
Fitting estimator with 4 features.
Fitting estimator with 3 features.
Fitting estimator with 2 features.
Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 5 features.
Fitting estimator with 4 features.
Fitting estimator with 3 features.
Fitting estimator with 2 features.
Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 5 features.
Fitting estimator with 4 features.
Fitting estimator with 3 features.
Fitting estimator with 2 features.
Fitting estimator wi

In [11]:
# X = train_df_cat_encoded[list(rfecv.get_feature_names_out())]
X = train_df_cat_encoded[prediction_columns]
y = train_target_column_encoded


X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=0)


# X_test = test_df_cat_encoded[list(rfecv.get_feature_names_out())]
X_test = test_df_cat_encoded[prediction_columns]
y_test = test_target_column_encoded

In [12]:
from sklearn.metrics import classification_report
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import AdaBoostClassifier

estimators_and_param_grids = {
    "decision_tree": (
        DecisionTreeClassifier(random_state=42),
        {"max_depth": [25, 50, 100]},
    ),
    "ada_boost": (
        AdaBoostClassifier(),
        {
            "n_estimators": [10, 50],
            "learning_rate": [0.1, 1.0],
        },
    ),
    # "xgb": (
    #     xgb.XGBClassifier(scale_pos_weight=1),
    #     {
    #         "n_estimators": [10, 50],
    #         "learning_rate": [0.1, 1.0],
    #         "min_child_weight": [1, 5],
    #         "gamma": [0.5, 1, 1.5],
    #         "subsample": [0.6, 0.8, 1.0],
    #         "colsample_bytree": [0.6, 0.8, 1.0],
    #         "max_depth": [3, 4, 5],
    #     },
    # ),
}


best_estimators = dict()
best_estimators_and_best_param_grids = dict()

cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)

for estimator_name, estimator_and_param_grid in estimators_and_param_grids.items():
    (estimator, param_grid) = estimator_and_param_grid

    # search = GridSearchCV(
    #     estimator=estimator, param_grid=param_grid, scoring="f1", cv=cv, verbose=2
    # )
    print(estimator_name)
    print("---" * 50)

    search = RandomizedSearchCV(
        estimator=estimator,
        param_distributions=param_grid,
        cv=cv,
        n_iter=100,
        scoring="f1_weighted",
        verbose=2,
        n_jobs=-1,
    )

    search.fit(X_train, y_train)

    best_estimator = search.best_estimator_
    # best_estimators[estimator_name] = best_estimator
    best_param_grid = search.best_params_

    best_estimators_and_best_param_grids[estimator_name] = (
        best_estimator,
        best_param_grid,
    )
    # print(estimator_name)
    # pprint(search.best_params_)

    # # Evaluate the model on the test data
    # accuracy = best_estimator.score(X_val, y_val)

    # y_pred = best_estimator.predict(X_val)

    # print("Accuracy:", accuracy)

    # classification_report = classification_report(y_val, y_pred)
    # report_dict = classification_report(y_val, y_pred, output_dict=True)

    # print(classification_report)

    print("-----" * 50)


decision_tree
------------------------------------------------------------------------------------------------------------------------------------------------------
Fitting 15 folds for each of 3 candidates, totalling 45 fits
[CV] END .......................................max_depth=25; total time=   0.0s
[CV] END .......................................max_depth=25; total time=   0.0s
[CV] END .......................................max_depth=25; total time=   0.0s
[CV] END .......................................max_depth=25; total time=   0.0s
[CV] END .......................................max_depth=25; total time=   0.0s
[CV] END .......................................max_depth=25; total time=   0.0s
[CV] END .......................................max_depth=25; total time=   0.0s
[CV] END .......................................max_depth=25; total time=   0.0s
[CV] END .......................................max_depth=25; total time=   0.0s
[CV] END ....................................

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END .................learning_rate=0.1, n_estimators=10; total time=   0.1s
[CV] END .................learning_rate=0.1, n_estimators=10; total time=   0.1s
[CV] END .................learning_rate=0.1, n_estimators=10; total time=   0.1s
[CV] END .................learning_rate=0.1, n_estimators=10; total time=   0.1s
[CV] END .................learning_rate=0.1, n_estimators=10; total time=   0.1s
[CV] END .................learning_rate=0.1, n_estimators=10; total time=   0.1s
[CV] END .................learning_rate=0.1, n_estimators=10; total time=   0.1s
[CV] END .................learning_rate=0.1, n_estimators=10; total time=   0.1s
[CV] END .................learning_rate=0.1, n_estimators=10; total time=   0.1s
[CV] END .................learning_rate=0.1, n_estimators=10; total time=   0.1s
[CV] END .................learning_rate=0.1, n_estimators=10; total time=   0.1s
[CV] END .................learning_rate=0.1, n_estimators=10; total time=   0.1s
[CV] END .................le

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END .................learning_rate=1.0, n_estimators=10; total time=   0.1s
[CV] END .................learning_rate=0.1, n_estimators=50; total time=   0.2s
[CV] END .................learning_rate=1.0, n_estimators=10; total time=   0.1s
[CV] END .................learning_rate=0.1, n_estimators=50; total time=   0.2s
[CV] END .................learning_rate=1.0, n_estimators=10; total time=   0.1s
[CV] END .................learning_rate=1.0, n_estimators=10; total time=   0.1s
[CV] END .................learning_rate=1.0, n_estimators=10; total time=   0.1s
[CV] END .................learning_rate=0.1, n_estimators=50; total time=   0.3s[CV] END .................learning_rate=0.1, n_estimators=50; total time=   0.3s

[CV] END .................learning_rate=1.0, n_estimators=10; total time=   0.1s
[CV] END .................learning_rate=0.1, n_estimators=50; total time=   0.3s
[CV] END .................learning_rate=0.1, n_estimators=50; total time=   0.3s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END .................learning_rate=1.0, n_estimators=10; total time=   0.1s
[CV] END .................learning_rate=1.0, n_estimators=10; total time=   0.1s
[CV] END .................learning_rate=1.0, n_estimators=10; total time=   0.1s
[CV] END .................learning_rate=1.0, n_estimators=10; total time=   0.1s
[CV] END .................learning_rate=1.0, n_estimators=10; total time=   0.1s
[CV] END .................learning_rate=0.1, n_estimators=50; total time=   0.3s
[CV] END .................learning_rate=1.0, n_estimators=10; total time=   0.1s
[CV] END .................learning_rate=1.0, n_estimators=10; total time=   0.1s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END .................learning_rate=1.0, n_estimators=50; total time=   0.2s
[CV] END .................learning_rate=1.0, n_estimators=50; total time=   0.2s
[CV] END .................learning_rate=1.0, n_estimators=50; total time=   0.2s
[CV] END .................learning_rate=1.0, n_estimators=50; total time=   0.2s
[CV] END .................learning_rate=1.0, n_estimators=50; total time=   0.3s
[CV] END .................learning_rate=1.0, n_estimators=50; total time=   0.2s
[CV] END .................learning_rate=1.0, n_estimators=50; total time=   0.2s
[CV] END .................learning_rate=1.0, n_estimators=50; total time=   0.2s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END .................learning_rate=1.0, n_estimators=50; total time=   0.2s
[CV] END .................learning_rate=1.0, n_estimators=50; total time=   0.2s
[CV] END .................learning_rate=1.0, n_estimators=50; total time=   0.2s
[CV] END .................learning_rate=1.0, n_estimators=50; total time=   0.2s
[CV] END .................learning_rate=1.0, n_estimators=50; total time=   0.2s
[CV] END .................learning_rate=1.0, n_estimators=50; total time=   0.2s
[CV] END .................learning_rate=1.0, n_estimators=50; total time=   0.2s
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------


In [13]:
# for estimator_name, estimator in best_estimators.items():
#     print(f"Classification Report for {estimator_name}")
#     print("=" * 80)


#     # Make predictions on the test set
#     y_pred = estimator.predict(X_test)

#     # Print classification report
#     classification_report = classification_report(y_test, y_pred)
#     print(classification_report)

#     print("=" * 80, "\n")

In [14]:
(best_estimator, _) = best_estimators_and_best_param_grids["decision_tree"]

accuracy = best_estimator.score(X_val, y_val)

y_pred = best_estimator.predict(X_val)

pprint(classification_report(y_val, y_pred, output_dict=True))


{'0.0': {'f1-score': 0.8571428571428571,
         'precision': 1.0,
         'recall': 0.75,
         'support': 8.0},
 '1.0': {'f1-score': 0.9064748201438849,
         'precision': 0.8873239436619719,
         'recall': 0.9264705882352942,
         'support': 68.0},
 '2.0': {'f1-score': 0.8285714285714286,
         'precision': 0.7532467532467533,
         'recall': 0.9206349206349206,
         'support': 63.0},
 '3.0': {'f1-score': 0.0, 'precision': 0.0, 'recall': 0.0, 'support': 21.0},
 '4.0': {'f1-score': 0.7543859649122807,
         'precision': 0.6056338028169014,
         'recall': 1.0,
         'support': 43.0},
 '5.0': {'f1-score': 0.0, 'precision': 0.0, 'recall': 0.0, 'support': 22.0},
 'accuracy': 0.7555555555555555,
 'macro avg': {'f1-score': 0.5577625117950752,
               'precision': 0.5410340832876045,
               'recall': 0.5995175848117025,
               'support': 225.0},
 'weighted avg': {'f1-score': 0.6806045649695781,
                  'precision': 0.63037

# SHAP

In [None]:
from mlflow.models import infer_signature
import mlflow
import json

for (
    estimator_name,
    best_estimator_and_best_param_grid,
) in best_estimators_and_best_param_grids.items():
    (best_estimator, best_param_grid) = best_estimator_and_best_param_grid
    # Evaluate the model on the test data
    accuracy = best_estimator.score(X_val, y_val)

    y_pred = best_estimator.predict(X_val)

    report = classification_report(y_val, y_pred)
    report_dict = classification_report(y_val, y_pred, output_dict=True)
    class_report_dict = dict()
    report_renaming_map = {
        "0.0": "certain",
        "1.0": "likely",
        "2.0": "possible",
        "3.0": "unlikely",
        "4.0": "unclassified",
        "5.0": "unclassifiable",
    }

    for key in report_dict.keys():
        if key in report_renaming_map.keys():
            class_report_dict[report_renaming_map[key]] = report_dict[key]
    
    with mlflow.start_run(run_name=estimator_name):
        # Parameters
        mlflow.log_param("estimator_name", estimator_name)
        mlflow.log_params(param_grid)

        # # Convert params to JSON and save as an artifact
        # params_file_path = f"{temp_artifacts_path}/params.json"
        # with open(params_file_path, "w") as f:
        #     json.dump(param_grid, f, indent=4)

        # Metrics
        metrics_dict = {
            "accuracy": report_dict["accuracy"],
            "macro_avg_f1-score": report_dict["macro avg"]["f1-score"],
            "recall_class_certain": class_report_dict["certain"]["recall"],
            "recall_class_likely": class_report_dict["likely"]["recall"],
            "recall_class_possible": class_report_dict["possible"]["recall"],
            "recall_class_unlikely": class_report_dict["unlikely"]["recall"],
            "recall_class_unclassified": class_report_dict["unclassified"]["recall"],
            "recall_class_unclassifiable": class_report_dict["unclassifiable"][
                "recall"
            ],
        }
        mlflow.log_metrics(metrics_dict)

        # # Convert metrics to JSON and save as an artifact
        # metrics_file_path = f"{temp_artifacts_path}/metrics.json"
        # with open(metrics_file_path, "w") as f:
        #     json.dump(metrics_dict, f, indent=4)

        # Artifacts
        mlflow.log_artifact(one_hot_encoder_path, artifact_path="encoders")
        mlflow.log_artifact(ordinal_encoder_path, artifact_path="encoders")
        # mlflow.log_artifact(metrics_file_path, artifact_path="model_metadata")
        # mlflow.log_artifact(params_file_path, artifact_path="model_metadata")

        # Model
        signature = infer_signature(X_train, best_estimator.predict(X_train))
        # mlflow.sklearn.log_model(best_estimator, estimator_name, signature=signature)
        # Log and register the model
        model_info = mlflow.sklearn.log_model(
            sk_model=best_estimator,
            artifact_path="model",
            signature=signature,
            registered_model_name=MLFLOW_MODEL_NAME,  # Register the model with a fixed name
        )

        # Set alias 'champion' to this model version
        client = mlflow.MlflowClient()
        latest_version = client.get_latest_versions(MLFLOW_MODEL_NAME, stages=["None"])[
            0
        ].version  # Get the latest version
        client.set_registered_model_alias(
            MLFLOW_MODEL_NAME, MLFLOW_MODEL_ALIAS, latest_version
        )  # Assign alias

Successfully registered model 'final_ml_model'.
2025/03/04 23:59:10 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: final_ml_model, version 1
Created version '1' of model 'final_ml_model'.


🏃 View run decision_tree at: http://127.0.0.1:5001/#/experiments/1/runs/26219abcbf764e22967f9e2e23a382dd
🧪 View experiment at: http://127.0.0.1:5001/#/experiments/1


Registered model 'final_ml_model' already exists. Creating a new version of this model...
2025/03/04 23:59:14 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: final_ml_model, version 2


🏃 View run ada_boost at: http://127.0.0.1:5001/#/experiments/1/runs/27402c19a0c34cfeb9c7f7fe059ebc57
🧪 View experiment at: http://127.0.0.1:5001/#/experiments/1


Created version '2' of model 'final_ml_model'.


In [16]:
# import os

# import joblib
# from dotenv import load_dotenv

# load_dotenv()

# ML_MODEL_PATH = os.getenv("ML_MODEL_PATH")
# joblib.dump(best_estimators_and_best_param_grids["decision_tree"][0], ML_MODEL_PATH)

In [17]:
# run_id = input("Run ID: ").strip()
# model_name = input("Model Name: ").strip()
# model_uri = f"runs:/{run_id}/{model_name}"

# result = mlflow.register_model(model_uri, model_name)