In [49]:
import warnings
from pprint import pprint

import pandas as pd
import mlflow
from mlflow.models import infer_signature

warnings.filterwarnings("ignore")
warnings.simplefilter("ignore")

# MLFlow

In [50]:
mlflow.set_tracking_uri(uri="http://127.0.0.1:8081")
mlflow.set_experiment("4th year project")

<Experiment: artifact_location='/home/kraigochieng/school/4.1/416_project/code/server/mlruns/1', creation_time=1740647300872, experiment_id='1', last_update_time=1740647300872, lifecycle_stage='active', name='4th year project', tags={}>

In [51]:
df = pd.read_csv("data.csv")

In [52]:
categorical_columns = [
    "gender",
    "pregnancy_status",
    "known_allergy",
    "rechallenge",
    "dechallenge",
    "severity",
    "is_serious",
    "criteria_for_seriousness",
    "action_taken",
    "outcome",
]

target_column = "class"

In [53]:
from sklearn.model_selection import train_test_split

X = df[[*categorical_columns]]
y = df[target_column]


X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.25)


train_df = pd.concat([X_train, y_train], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

del X_train
del X_test
del y_train
del y_test

In [54]:
# from helper_functions import categorical_eda

# categorical_eda(df=train_df, columns=categorical_columns, target_column=target_column)

In [55]:
import os

import joblib
from dotenv import load_dotenv
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

load_dotenv()

ENCODERS_PATH = os.getenv("ENCODERS_PATH")

one_hot_encoder = OneHotEncoder(sparse_output=False)

# train_df_cat_encoded = pd.get_dummies(train_df[categorical_columns])
train_df_cat_encoded = one_hot_encoder.fit_transform(train_df[categorical_columns])


train_df_cat_encoded = pd.DataFrame(
    train_df_cat_encoded,
    columns=one_hot_encoder.get_feature_names_out(categorical_columns),
)


# test_df_cat_encoded = pd.get_dummies(test_df[categorical_columns])
test_df_cat_encoded = one_hot_encoder.transform(test_df[categorical_columns])
test_df_cat_encoded = pd.DataFrame(
    test_df_cat_encoded,
    columns=one_hot_encoder.get_feature_names_out(categorical_columns),
)
# target_column_mappings = {
#     "certain": 1,
#     "likely": 2,
#     "possible": 3,
#     "unlikely": 4,
#     "unclassified": 5,
#     "unclassifiable": 6,
# }

ordinal_encoder = OrdinalEncoder(
    categories=[
        ["certain", "likely", "possible", "unlikely", "unclassified", "unclassifiable"]
    ]
)

train_target_column_encoded = pd.DataFrame()
# train_target_column_encoded[target_column] = train_df[target_column].map(
#     target_column_mappings
# )
train_target_column_encoded[target_column] = ordinal_encoder.fit_transform(
    train_df[[target_column]]
).ravel()

test_target_column_encoded = pd.DataFrame()
# test_target_column_encoded[target_column] = test_df[target_column].map(
#     target_column_mappings
# )

test_target_column_encoded[target_column] = ordinal_encoder.transform(
    test_df[[target_column]]
).ravel()

joblib.dump(one_hot_encoder, f"{ENCODERS_PATH}/one_hot_encoder.pkl")
joblib.dump(ordinal_encoder, f"{ENCODERS_PATH}/ordinal_encoder.pkl")


['encoders/ordinal_encoder.pkl']

# Feature


In [None]:
from sklearn.feature_selection import RFECV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier

rfecv_cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)

rfecv = RFECV(
    estimator=DecisionTreeClassifier(),
    step=1,
    min_features_to_select=1,
    cv=rfecv_cv,
    scoring="f1_weighted",
    n_jobs=-1,
    verbose=2,
)

prediction_columns = [
    "rechallenge_yes",
    "rechallenge_no",
    "rechallenge_unknown",
    "rechallenge_na",
    "dechallenge_yes",
    "dechallenge_no",
    "dechallenge_unknown",
    "dechallenge_na",
]

X = train_df_cat_encoded[prediction_columns]
y = train_target_column_encoded

rfecv.fit(X, y)

print(f"Optimal number of features: {rfecv.n_features_}")
print(f"Optimal features: {list(rfecv.get_feature_names_out())}")
# print(rfecv.cv_results_)

Fitting estimator with 8 features.
Fitting estimator with 8 features.
Fitting estimator with 8 features.
Fitting estimator with 8 features.
Fitting estimator with 8 features.
Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 7 features.
Fitting estimator with 7 features.
Fitting estimator with 7 features.
Fitting estimator with 7 features.
Fitting estimator with 8 features.Fitting estimator with 6 features.
Fitting estimator with 6 features.
Fitting estimator with 6 features.

Fitting estimator with 8 features.
Fitting estimator with 6 features.
Fitting estimator with 6 features.
Fitting estimator with 5 features.
Fitting estimator with 5 features.
Fitting estimator with 7 features.
Fitting estimator with 5 features.
Fitting estimator with 5 features.
Fitting estimator with 7 features.
Fitting estimator with 4 features.
Fitting estimator with 5 features.
Fitting estimator with 4 features.
Fitting estimator with 4 features.
Fitting estimator wi

In [57]:
# X = train_df_cat_encoded[list(rfecv.get_feature_names_out())]
X = train_df_cat_encoded[prediction_columns]
y = train_target_column_encoded


X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=0)


# X_test = test_df_cat_encoded[list(rfecv.get_feature_names_out())]
X_test = test_df_cat_encoded[prediction_columns]
y_test = test_target_column_encoded

In [64]:
from sklearn.metrics import classification_report
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import AdaBoostClassifier

estimators_and_param_grids = {
    "decision_tree": (
        DecisionTreeClassifier(random_state=42),
        {"max_depth": [25, 50, 100]},
    ),
    "ada_boost": (
        AdaBoostClassifier(),
        {
            "n_estimators": [10, 50],
            "learning_rate": [0.1, 1.0],
        },
    ),
    # "xgb": (
    #     xgb.XGBClassifier(scale_pos_weight=1),
    #     {
    #         "n_estimators": [10, 50],
    #         "learning_rate": [0.1, 1.0],
    #         "min_child_weight": [1, 5],
    #         "gamma": [0.5, 1, 1.5],
    #         "subsample": [0.6, 0.8, 1.0],
    #         "colsample_bytree": [0.6, 0.8, 1.0],
    #         "max_depth": [3, 4, 5],
    #     },
    # ),
}


best_estimators = dict()
best_estimators_and_best_param_grids = dict()

cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)

for estimator_name, estimator_and_param_grid in estimators_and_param_grids.items():
    (estimator, param_grid) = estimator_and_param_grid

    # search = GridSearchCV(
    #     estimator=estimator, param_grid=param_grid, scoring="f1", cv=cv, verbose=2
    # )
    print(estimator_name)
    print("---" * 50)

    search = RandomizedSearchCV(
        estimator=estimator,
        param_distributions=param_grid,
        cv=cv,
        n_iter=100,
        scoring="f1_weighted",
        verbose=2,
        n_jobs=-1,
    )

    search.fit(X_train, y_train)

    best_estimator = search.best_estimator_
    # best_estimators[estimator_name] = best_estimator
    best_param_grid = search.best_params_

    best_estimators_and_best_param_grids[estimator_name] = (
        best_estimator,
        best_param_grid,
    )
    # print(estimator_name)
    # pprint(search.best_params_)

    # # Evaluate the model on the test data
    # accuracy = best_estimator.score(X_val, y_val)

    # y_pred = best_estimator.predict(X_val)

    # print("Accuracy:", accuracy)

    # classification_report = classification_report(y_val, y_pred)
    # report_dict = classification_report(y_val, y_pred, output_dict=True)

    # print(classification_report)

    print("-----" * 50)


decision_tree
------------------------------------------------------------------------------------------------------------------------------------------------------
Fitting 15 folds for each of 3 candidates, totalling 45 fits


[CV] END .......................................max_depth=25; total time=   0.0s
[CV] END .......................................max_depth=25; total time=   0.1s
[CV] END .......................................max_depth=25; total time=   0.0s
[CV] END .......................................max_depth=25; total time=   0.0s
[CV] END .......................................max_depth=25; total time=   0.0s
[CV] END .......................................max_depth=25; total time=   0.0s
[CV] END .......................................max_depth=25; total time=   0.1s
[CV] END .......................................max_depth=25; total time=   0.0s
[CV] END .......................................max_depth=25; total time=   0.0s
[CV] END .......................................max_depth=25; total time=   0.0s
[CV] END .......................................max_depth=50; total time=   0.0s
[CV] END .......................................max_depth=50; total time=   0.0s
[CV] END ...................

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END .................learning_rate=0.1, n_estimators=10; total time=   0.1s
[CV] END .................learning_rate=0.1, n_estimators=10; total time=   0.1s
[CV] END .................learning_rate=0.1, n_estimators=10; total time=   0.1s
[CV] END .................learning_rate=0.1, n_estimators=10; total time=   0.1s
[CV] END .................learning_rate=0.1, n_estimators=10; total time=   0.1s
[CV] END .................learning_rate=0.1, n_estimators=10; total time=   0.1s
[CV] END .................learning_rate=0.1, n_estimators=10; total time=   0.1s
[CV] END .................learning_rate=0.1, n_estimators=10; total time=   0.1s
[CV] END .................learning_rate=0.1, n_estimators=10; total time=   0.1s
[CV] END .................learning_rate=0.1, n_estimators=10; total time=   0.1s
[CV] END .................learning_rate=0.1, n_estimators=10; total time=   0.1s
[CV] END .................learning_rate=0.1, n_estimators=10; total time=   0.1s
[CV] END .................le

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END .................learning_rate=0.1, n_estimators=50; total time=   0.2s
[CV] END .................learning_rate=1.0, n_estimators=10; total time=   0.1s
[CV] END .................learning_rate=0.1, n_estimators=50; total time=   0.2s
[CV] END .................learning_rate=0.1, n_estimators=50; total time=   0.2s
[CV] END .................learning_rate=0.1, n_estimators=50; total time=   0.3s
[CV] END .................learning_rate=1.0, n_estimators=10; total time=   0.1s
[CV] END .................learning_rate=0.1, n_estimators=50; total time=   0.3s
[CV] END .................learning_rate=0.1, n_estimators=50; total time=   0.2s
[CV] END .................learning_rate=1.0, n_estimators=10; total time=   0.1s[CV] END .................learning_rate=1.0, n_estimators=10; total time=   0.1s
[CV] END .................learning_rate=1.0, n_estimators=10; total time=   0.1s

[CV] END .................learning_rate=0.1, n_estimators=50; total time=   0.3s
[CV] END .................le

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END .................learning_rate=1.0, n_estimators=10; total time=   0.1s
[CV] END .................learning_rate=1.0, n_estimators=10; total time=   0.1s
[CV] END .................learning_rate=1.0, n_estimators=10; total time=   0.1s
[CV] END .................learning_rate=1.0, n_estimators=10; total time=   0.1s
[CV] END .................learning_rate=1.0, n_estimators=10; total time=   0.1s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END .................learning_rate=1.0, n_estimators=50; total time=   0.3s
[CV] END .................learning_rate=1.0, n_estimators=50; total time=   0.3s
[CV] END .................learning_rate=1.0, n_estimators=50; total time=   0.4s
[CV] END .................learning_rate=1.0, n_estimators=50; total time=   0.4s
[CV] END .................learning_rate=1.0, n_estimators=50; total time=   0.3s
[CV] END .................learning_rate=1.0, n_estimators=50; total time=   0.3s
[CV] END .................learning_rate=1.0, n_estimators=50; total time=   0.3s
[CV] END .................learning_rate=1.0, n_estimators=50; total time=   0.4s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END .................learning_rate=1.0, n_estimators=50; total time=   0.3s
[CV] END .................learning_rate=1.0, n_estimators=50; total time=   0.3s
[CV] END .................learning_rate=1.0, n_estimators=50; total time=   0.3s
[CV] END .................learning_rate=1.0, n_estimators=50; total time=   0.2s
[CV] END .................learning_rate=1.0, n_estimators=50; total time=   0.3s
[CV] END .................learning_rate=1.0, n_estimators=50; total time=   0.3s
[CV] END .................learning_rate=1.0, n_estimators=50; total time=   0.2s
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------


In [65]:
# for estimator_name, estimator in best_estimators.items():
#     print(f"Classification Report for {estimator_name}")
#     print("=" * 80)

    
#     # Make predictions on the test set
#     y_pred = estimator.predict(X_test)

#     # Print classification report
#     classification_report = classification_report(y_test, y_pred)
#     print(classification_report)

#     print("=" * 80, "\n")

In [66]:
(best_estimator, _) = best_estimators_and_best_param_grids["decision_tree"]

accuracy = best_estimator.score(X_val, y_val)

y_pred = best_estimator.predict(X_val)

pprint(classification_report(y_val, y_pred, output_dict=True))


{'0.0': {'f1-score': 0.9411764705882353,
         'precision': 1.0,
         'recall': 0.8888888888888888,
         'support': 9.0},
 '1.0': {'f1-score': 0.8840579710144928,
         'precision': 0.953125,
         'recall': 0.8243243243243243,
         'support': 74.0},
 '2.0': {'f1-score': 0.762589928057554,
         'precision': 0.654320987654321,
         'recall': 0.9137931034482759,
         'support': 58.0},
 '3.0': {'f1-score': 0.0, 'precision': 0.0, 'recall': 0.0, 'support': 21.0},
 '4.0': {'f1-score': 0.7142857142857143,
         'precision': 0.5555555555555556,
         'recall': 1.0,
         'support': 40.0},
 '5.0': {'f1-score': 0.0, 'precision': 0.0, 'recall': 0.0, 'support': 23.0},
 'accuracy': 0.72,
 'macro avg': {'f1-score': 0.5503516806576662,
               'precision': 0.5271669238683128,
               'recall': 0.6045010527769149,
               'support': 225.0},
 'weighted avg': {'f1-score': 0.6519667666183702,
                  'precision': 0.6209070644718793,

In [74]:
for (
    estimator_name,
    best_estimator_and_best_param_grid,
) in best_estimators_and_best_param_grids.items():
    (best_estimator, best_param_grid) = best_estimator_and_best_param_grid
    # Evaluate the model on the test data
    accuracy = best_estimator.score(X_val, y_val)

    y_pred = best_estimator.predict(X_val)

    report = classification_report(y_val, y_pred)
    report_dict = classification_report(y_val, y_pred, output_dict=True)
    class_report_dict = dict()
    report_renaming_map = {
        "0.0": "certain",
        "1.0": "likely",
        "2.0": "possible",
        "3.0": "unlikely",
        "4.0": "unclassified",
        "5.0": "unclassifiable",
    }

    for key in report_dict.keys():
        if key in report_renaming_map.keys():
            class_report_dict[report_renaming_map[key]] = report_dict[key]

    with mlflow.start_run(run_name=estimator_name):
        mlflow.log_param("estimator_name", estimator_name)
        mlflow.log_params(param_grid)
        mlflow.log_metrics(
            {
                "accuracy": report_dict["accuracy"],
                "macro_avg_f1-score": report_dict["macro avg"]["f1-score"],
                "recall_class_certain": class_report_dict["certain"]["recall"],
                "recall_class_likely": class_report_dict["likely"]["recall"],
                "recall_class_possible": class_report_dict["possible"]["recall"],
                "recall_class_unlikely": class_report_dict["unlikely"]["recall"],
                "recall_class_unclassified": class_report_dict["unclassified"][
                    "recall"
                ],
                "recall_class_unclassifiable": class_report_dict["unclassifiable"][
                    "recall"
                ],
            }
        )
        # mlflow.log_metrics(repor)
        mlflow.sklearn.log_model(best_estimator, estimator_name)



🏃 View run decision_tree at: http://127.0.0.1:8081/#/experiments/1/runs/801c3ffbf89b4d499c97c49fec071fce
🧪 View experiment at: http://127.0.0.1:8081/#/experiments/1




🏃 View run ada_boost at: http://127.0.0.1:8081/#/experiments/1/runs/9a317100cb694cf193d1f375bd222e84
🧪 View experiment at: http://127.0.0.1:8081/#/experiments/1


# Export Model


In [70]:
import os

import joblib
from dotenv import load_dotenv

load_dotenv()

ML_MODEL_PATH = os.getenv("ML_MODEL_PATH")
joblib.dump(best_estimators_and_best_param_grids["decision_tree"][0], ML_MODEL_PATH)

['ml_model.pkl']