In [None]:
import os
import joblib
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from types import SimpleNamespace
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score,
    recall_score, f1_score, matthews_corrcoef
)

def load_dry_bean_data(path="../data/Dry_Bean.csv"):
    """Load the Dry Bean CSV and bundle it into a namespace object.

    The ``Class`` column is treated as the label; every other column is
    used as a feature and converted to ``float32``.

    Args:
        path: Location of the CSV file, resolved by ``pandas.read_csv``.

    Returns:
        A ``SimpleNamespace`` with the attributes:

        * ``data`` -- feature matrix as a ``float32`` NumPy array.
        * ``target`` -- labels encoded by ``LabelEncoder.fit_transform``.
        * ``feature_names`` -- list of feature column names.
        * ``target_names`` -- the encoder's ``classes_`` array.
        * ``label_encoder`` -- the fitted ``LabelEncoder`` itself.
    """
    frame = pd.read_csv(path)
    # Strip surrounding whitespace from the header names before looking
    # up the "Class" column.
    frame.columns = frame.columns.str.strip()

    labels = frame["Class"]
    features = frame.drop(columns="Class")

    # Fit the encoder first so that ``classes_`` exists when the bundle
    # is assembled below.
    encoder = LabelEncoder()
    encoded_labels = encoder.fit_transform(labels)

    bundle = {
        "data": features.to_numpy(dtype=np.float32),
        "target": encoded_labels,
        "feature_names": list(features.columns),
        "target_names": encoder.classes_,
        "label_encoder": encoder,
    }
    return SimpleNamespace(**bundle)

def main():
    """Train, compare and persist classifiers for the Dry Bean dataset.

    Pipeline:

    1. Load the dataset via ``load_dry_bean_data``.
    2. Make a stratified 80/20 train/test split (``random_state=5``).
    3. Standardise features; the scaler is fitted on the training split
       only and then applied to the test split.
    4. Fit six classifiers (see ``_train_models``).
    5. Print a table of test-split metrics (see ``_evaluate_models``).
    6. Dump the most accurate model, the scaler and the label encoder to
       ``best_model.pkl``, ``scaler.pkl`` and ``label_encoder.pkl`` in the
       current working directory.

    Returns:
        None. Results are printed and written to disk.
    """
    # Load dataset
    data = load_dry_bean_data()
    X, y = data.data, data.target

    # Split the data (80% train, 20% test)
    train_X, test_X, train_y, test_y = train_test_split(
        X, y, test_size=0.2, random_state=5, stratify=y
    )

    # Feature scaling: fit on the training split only so no test-split
    # statistics leak into the transformation.
    scaler = StandardScaler()
    train_X = scaler.fit_transform(train_X)
    test_X = scaler.transform(test_X)

    # Train and evaluate models
    models = _train_models(train_X, train_y, n_classes=len(data.target_names))
    data_frame_results = _evaluate_models(models, test_X, test_y)
    print(data_frame_results)

    # Select best model by Accuracy. ``idxmax`` returns the first row
    # holding the maximum, so ties resolve deterministically in favour of
    # the model that appears first in ``models``.
    # NOTE(review): the winner is chosen on the test split, so its reported
    # test accuracy is an optimistic estimate -- consider a validation split.
    best_row_label = data_frame_results["Accuracy"].idxmax()
    best_model_name_by_accuracy = data_frame_results.loc[
        best_row_label, "ML Model name"
    ]
    best_model = models[best_model_name_by_accuracy]
    print(f"\nBest model: {best_model_name_by_accuracy}")

    # Create pickle files
    joblib.dump(best_model, "best_model.pkl")
    joblib.dump(scaler, "scaler.pkl")
    joblib.dump(data.label_encoder, "label_encoder.pkl")

    print("Saved pickle files")


def _train_models(train_X, train_y, n_classes):
    """Fit every candidate classifier and return them keyed by display name.

    Args:
        train_X: Scaled training feature matrix.
        train_y: Integer-encoded training labels.
        n_classes: Number of distinct classes, forwarded to XGBoost as
            ``num_class``.

    Returns:
        Dict mapping the model's display name to the fitted estimator, in
        the order the models are trained.
    """
    models = {
        # Multiclass Logistic Regression
        "Logistic Regression": LogisticRegression(max_iter=2000, solver="lbfgs"),
        "Decision Tree": DecisionTreeClassifier(random_state=5),
        "KNN": KNeighborsClassifier(n_neighbors=5),
        "Naive Bayes": GaussianNB(),
        "Random Forest": RandomForestClassifier(n_estimators=100, random_state=5),
        "XGBoost": XGBClassifier(
            objective="multi:softprob",
            num_class=n_classes,
            eval_metric="mlogloss",
            random_state=5,
        ),
    }
    for model in models.values():
        model.fit(train_X, train_y)
    return models


def _evaluate_models(models, test_X, test_y):
    """Score each fitted model on the test split.

    Args:
        models: Dict of display name -> fitted classifier exposing
            ``predict`` and ``predict_proba``.
        test_X: Scaled test feature matrix.
        test_y: Integer-encoded test labels.

    Returns:
        ``pandas.DataFrame`` with one row per model and the columns
        ``ML Model name``, ``Accuracy``, ``AUC``, ``Precision``, ``Recall``,
        ``F1`` and ``MCC``. AUC is one-vs-rest; AUC, precision, recall and
        F1 are macro-averaged.
    """
    results = []
    for name, model in models.items():
        y_pred = model.predict(test_X)
        y_prob = model.predict_proba(test_X)

        results.append(
            {
                "ML Model name": name,
                "Accuracy": accuracy_score(test_y, y_pred),
                "AUC": roc_auc_score(
                    test_y, y_prob, multi_class="ovr", average="macro"
                ),
                "Precision": precision_score(test_y, y_pred, average="macro"),
                "Recall": recall_score(test_y, y_pred, average="macro"),
                "F1": f1_score(test_y, y_pred, average="macro"),
                "MCC": matthews_corrcoef(test_y, y_pred),
            }
        )
    return pd.DataFrame(results)

# Run the training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
