In [1]:
import numpy as np

In [2]:
data_train = np.load("../data/train.npz", allow_pickle=True)
data_val = np.load("../data/val.npz", allow_pickle=True)
data_test = np.load("../data/test.npz", allow_pickle=True)

X_train, y_train = data_train["X"], data_train["y"]
X_val, y_val = data_val["X"], data_val["y"]
X_test, y_test = data_test["X"], data_test["y"]

In [3]:
y_train_enc = (y_train == "MS – Ministério da Saúde").astype(np.int32)
y_val_enc = (y_val == "MS – Ministério da Saúde").astype(np.int32)
y_test_enc = (y_test == "MS – Ministério da Saúde").astype(np.int32)

In [4]:
import time

# LightGBM classifier, a gradient boosting framework that uses tree-based learning algorithms
from lightgbm import LGBMClassifier

# CalibratedClassifierCV for probability calibration of classifiers
from sklearn.calibration import CalibratedClassifierCV

# Ensemble classifiers from scikit-learn
# ExtraTreesClassifier and RandomForestClassifier are ensemble methods that use multiple decision trees
# StackingClassifier allows combining multiple classifiers to improve performance
from sklearn.ensemble import (
    ExtraTreesClassifier,
    RandomForestClassifier,
    StackingClassifier,
)

# Linear models from scikit-learn
# LogisticRegression is a linear model for binary classification
# SGDClassifier is a linear classifier using stochastic gradient descent
from sklearn.linear_model import LogisticRegression, SGDClassifier

# Metrics for evaluating classification performance
# accuracy_score, balanced_accuracy_score, classification_report, confusion_matrix, f1_score, matthews_corrcoef
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    matthews_corrcoef,
)

# Naive Bayes classifier for multinomially distributed data
from sklearn.naive_bayes import MultinomialNB

# K-Nearest Neighbors classifier
from sklearn.neighbors import KNeighborsClassifier

# Neural network-based classifier
from sklearn.neural_network import MLPClassifier

# Support Vector Machine classifiers
# SVC is a support vector classifier with a non-linear kernel
# LinearSVC is a support vector classifier with a linear kernel
from sklearn.svm import SVC, LinearSVC

# Decision tree classifier
from sklearn.tree import DecisionTreeClassifier

# XGBoost classifier, an optimized distributed gradient boosting library
from xgboost import XGBClassifier

In [5]:
import pandas as pd
from typing import List, Tuple


def calculate_evaluation_metrics(
    y_true: pd.Series, y_pred: pd.Series
):
    """
    Calculate evaluation metrics for model predictions.

    Args:
        y_true (pd.Series): The true labels.
        y_pred (pd.Series): The predicted labels.

    Returns:
        Tuple[float, float, float, str, float, np.ndarray]: The calculated metrics including F1 score, balanced accuracy, accuracy, classification report, Matthews correlation coefficient, and confusion matrix.
    """
    f1 = f1_score(y_true, y_pred, average="micro")
    balanced_accuracy = balanced_accuracy_score(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred)
    classification_report_str = classification_report(y_true, y_pred)
    matthews_corr_coeff = matthews_corrcoef(y_true, y_pred)
    confusion_matrix_arr = confusion_matrix(y_true, y_pred)

    return (
        f1,
        balanced_accuracy,
        accuracy,
        classification_report_str,
        matthews_corr_coeff,
        confusion_matrix_arr,
    )


def train_and_evaluate_models(
    X_train: pd.DataFrame,
    y_train: pd.Series,
    X_valid: pd.DataFrame,
    y_valid: pd.Series,
    n_jobs: int = -1,
):
    """
    Train multiple models and evaluate their performance.

    Args:
        X_train (pd.DataFrame): The training data.
        y_train (pd.Series): The training labels.
        X_valid (pd.DataFrame): The validation data.
        y_valid (pd.Series): The validation labels.
        n_jobs (int, optional): The number of jobs to run in parallel. Defaults to -1.

    Returns:
        Tuple[pd.DataFrame, List[List]]: A dataframe of the evaluation results and a list of classification reports.
    """
    # Define the models to be trained
    models = [
        (
            "Calibrated-LSVC",
            CalibratedClassifierCV(
                LinearSVC(random_state=271828, class_weight="balanced", dual="auto")
            ),
        ),
        (
            "LR",
            LogisticRegression(
                random_state=271828, n_jobs=n_jobs, class_weight="balanced"
            ),
        ),
        (
            "RF",
            RandomForestClassifier(
                random_state=271828, n_jobs=n_jobs, class_weight="balanced"
            ),
        ),
        (
            "LGBM",
            LGBMClassifier(
                random_state=271828, n_jobs=n_jobs, class_weight="balanced", verbose=-1
            ),
        ),
        (
            "XGB",
            XGBClassifier(
                random_state=271828, n_jobs=n_jobs, class_weight="balanced", verbosity=0
            ),
        ),
        ("MLP", MLPClassifier(random_state=271828)),
        (
            "SGD",
            SGDClassifier(random_state=271828, n_jobs=n_jobs, class_weight="balanced"),
        ),
        ("NB", MultinomialNB()),
        ("LSVC", LinearSVC(random_state=271828, class_weight="balanced", dual="auto")),
        ("KNN", KNeighborsClassifier(n_jobs=n_jobs)),
        ("DT", DecisionTreeClassifier(random_state=271828, class_weight="balanced")),
        (
            "ExtraTrees",
            ExtraTreesClassifier(
                random_state=271828, n_jobs=n_jobs, class_weight="balanced"
            ),
        ),
    ]

    evaluation_results = []
    classification_reports = []

    # Train each model and evaluate its performance
    for model_name, model in models:
        start_time = time.time()  # Record the start time

        try:
            # Train the model
            model.fit(X_train, y_train)
            # Make predictions on the validation set
            predictions = model.predict(X_valid)
        except Exception as e:
            # Handle any exceptions that occur during training or prediction
            print(f"Error {model_name} - {e}")
            continue

        # Calculate evaluation metrics
        (
            f1,
            balanced_accuracy,
            accuracy,
            classification_report_str,
            matthews_corr_coeff,
            confusion_matrix_arr,
        ) = calculate_evaluation_metrics(y_valid, predictions)
        # Store the classification report and confusion matrix
        classification_reports.append(
            [model_name, classification_report_str, confusion_matrix_arr]
        )

        elapsed_time = time.time() - start_time  # Calculate the elapsed time
        # Append the evaluation results
        evaluation_results.append(
            [
                model_name,
                f1,
                balanced_accuracy,
                accuracy,
                matthews_corr_coeff,
                elapsed_time,
                confusion_matrix_arr,
                classification_report_str,
            ]
        )

        # Print the evaluation results
        print(
            f"Name: {model_name} - F1: {f1:.4f} - BACC: {balanced_accuracy:.4f} - ACC: {accuracy:.4f} - MCC: {matthews_corr_coeff:.4f} - Elapsed: {elapsed_time:.2f}s"
        )
        print(classification_report_str)
        print(confusion_matrix_arr)
        print("*" * 20, "\n")

    # Create a DataFrame to store the evaluation results
    results_df = pd.DataFrame(
        evaluation_results,
        columns=[
            "Model",
            "F1",
            "BACC",
            "ACC",
            "MCC",
            "Total Time",
            "Confusion Matrix",
            "Classification Report",
        ],
    )
    # Convert the confusion matrix to a string for better readability in the DataFrame
    results_df["Confusion Matrix"] = results_df["Confusion Matrix"].apply(
        lambda x: str(x)
    )

    return models, results_df, classification_reports

In [None]:
models, df_results, creports = train_and_evaluate_models(
    X_train, y_train_enc, X_val, y_val_enc, n_jobs=-1
)

Name: Calibrated-LSVC - F1: 0.9630 - BACC: 0.6812 - ACC: 0.9630 - MCC: 0.5061 - Elapsed: 57.40s
              precision    recall  f1-score   support

           0       0.97      0.99      0.98     81917
           1       0.74      0.37      0.49      4181

    accuracy                           0.96     86098
   macro avg       0.85      0.68      0.74     86098
weighted avg       0.96      0.96      0.96     86098

[[81370   547]
 [ 2638  1543]]
******************** 





Name: LR - F1: 0.8614 - BACC: 0.8366 - ACC: 0.8614 - MCC: 0.3864 - Elapsed: 6.10s
              precision    recall  f1-score   support

           0       0.99      0.86      0.92     81917
           1       0.23      0.81      0.36      4181

    accuracy                           0.86     86098
   macro avg       0.61      0.84      0.64     86098
weighted avg       0.95      0.86      0.90     86098

[[70780 11137]
 [  798  3383]]
******************** 

Name: RF - F1: 0.9674 - BACC: 0.7411 - ACC: 0.9674 - MCC: 0.5914 - Elapsed: 208.92s
              precision    recall  f1-score   support

           0       0.97      0.99      0.98     81917
           1       0.75      0.49      0.59      4181

    accuracy                           0.97     86098
   macro avg       0.86      0.74      0.79     86098
weighted avg       0.96      0.97      0.96     86098

[[81236   681]
 [ 2130  2051]]
******************** 





Name: LGBM - F1: 0.8865 - BACC: 0.8446 - ACC: 0.8865 - MCC: 0.4239 - Elapsed: 7.62s
              precision    recall  f1-score   support

           0       0.99      0.89      0.94     81917
           1       0.27      0.80      0.41      4181

    accuracy                           0.89     86098
   macro avg       0.63      0.84      0.67     86098
weighted avg       0.95      0.89      0.91     86098

[[72992  8925]
 [  844  3337]]
******************** 

Name: XGB - F1: 0.9671 - BACC: 0.7285 - ACC: 0.9671 - MCC: 0.5816 - Elapsed: 18.68s
              precision    recall  f1-score   support

           0       0.97      0.99      0.98     81917
           1       0.77      0.46      0.58      4181

    accuracy                           0.97     86098
   macro avg       0.87      0.73      0.78     86098
weighted avg       0.96      0.97      0.96     86098

[[81327   590]
 [ 2240  1941]]
******************** 

Name: MLP - F1: 0.9593 - BACC: 0.7713 - ACC: 0.9593 - MCC: 0.5519 - El

In [22]:
df_results.sort_values(by="MCC", ascending=False)

Unnamed: 0,Model,F1,BACC,ACC,MCC,Total Time,Confusion Matrix,Classification Report
2,RF,0.967351,0.74112,0.967351,0.591364,208.920608,[[81236 681]\n [ 2130 2051]],precision recall f1-score ...
4,XGB,0.96713,0.72852,0.96713,0.581592,18.68109,[[81327 590]\n [ 2240 1941]],precision recall f1-score ...
11,ExtraTrees,0.965191,0.722167,0.965191,0.557685,404.565088,[[81207 710]\n [ 2287 1894]],precision recall f1-score ...
5,MLP,0.959279,0.771263,0.959279,0.551893,1479.776974,[[80238 1679]\n [ 1827 2354]],precision recall f1-score ...
0,Calibrated-LSVC,0.963007,0.681186,0.963007,0.506115,57.401644,[[81370 547]\n [ 2638 1543]],precision recall f1-score ...
3,LGBM,0.886536,0.844591,0.886536,0.423884,7.619628,[[72992 8925]\n [ 844 3337]],precision recall f1-score ...
10,DT,0.934981,0.74794,0.934981,0.420602,832.289294,[[78239 3678]\n [ 1920 2261]],precision recall f1-score ...
6,SGD,0.875746,0.828821,0.875746,0.394651,12.927703,[[72152 9765]\n [ 933 3248]],precision recall f1-score ...
8,LSVC,0.86232,0.83595,0.86232,0.386784,12.98953,[[70871 11046]\n [ 808 3373]],precision recall f1-score ...
1,LR,0.861379,0.836591,0.861379,0.386444,6.095995,[[70780 11137]\n [ 798 3383]],precision recall f1-score ...


In [None]:
from joblib import dump
import sklearn, numpy as np

classifier = models[0][1]
metadata = {
    "sklearn": sklearn.__version__,
    "numpy": np.__version__
}
dump((classifier, metadata), "../models/lai_ms_model_classifier.joblib")
classifier

0,1,2
,"estimator  estimator: estimator instance, default=None The classifier whose output need to be calibrated to provide more accurate `predict_proba` outputs. The default classifier is a :class:`~sklearn.svm.LinearSVC`. .. versionadded:: 1.2",LinearSVC(cla..._state=271828)
,"method  method: {'sigmoid', 'isotonic', 'temperature'}, default='sigmoid' The method to use for calibration. Can be: - 'sigmoid', which corresponds to Platt's method (i.e. a binary logistic  regression model). - 'isotonic', which is a non-parametric approach. - 'temperature', temperature scaling. Sigmoid and isotonic calibration methods natively support only binary classifiers and extend to multi-class classification using a One-vs-Rest (OvR) strategy with post-hoc renormalization, i.e., adjusting the probabilities after calibration to ensure they sum up to 1. In contrast, temperature scaling naturally supports multi-class calibration by applying `softmax(classifier_logits/T)` with a value of `T` (temperature) that optimizes the log loss. For very uncalibrated classifiers on very imbalanced datasets, sigmoid calibration might be preferred because it fits an additional intercept parameter. This helps shift decision boundaries appropriately when the classifier being calibrated is biased towards the majority class. Isotonic calibration is not recommended when the number of calibration samples is too low ``(≪1000)`` since it then tends to overfit. .. versionchanged:: 1.8  Added option 'temperature'.",'sigmoid'
,"cv  cv: int, cross-validation generator, or iterable, default=None Determines the cross-validation splitting strategy. Possible inputs for cv are: - None, to use the default 5-fold cross-validation, - integer, to specify the number of folds. - :term:`CV splitter`, - An iterable yielding (train, test) splits as arrays of indices. For integer/None inputs, if ``y`` is binary or multiclass, :class:`~sklearn.model_selection.StratifiedKFold` is used. If ``y`` is neither binary nor multiclass, :class:`~sklearn.model_selection.KFold` is used. Refer to the :ref:`User Guide ` for the various cross-validation strategies that can be used here. .. versionchanged:: 0.22  ``cv`` default value if None changed from 3-fold to 5-fold.",
,"n_jobs  n_jobs: int, default=None Number of jobs to run in parallel. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. Base estimator clones are fitted in parallel across cross-validation iterations. See :term:`Glossary ` for more details. .. versionadded:: 0.24",
,"ensemble  ensemble: bool, or ""auto"", default=""auto"" Determines how the calibrator is fitted. ""auto"" will use `False` if the `estimator` is a :class:`~sklearn.frozen.FrozenEstimator`, and `True` otherwise. If `True`, the `estimator` is fitted using training data, and calibrated using testing data, for each `cv` fold. The final estimator is an ensemble of `n_cv` fitted classifier and calibrator pairs, where `n_cv` is the number of cross-validation folds. The output is the average predicted probabilities of all pairs. If `False`, `cv` is used to compute unbiased predictions, via :func:`~sklearn.model_selection.cross_val_predict`, which are then used for calibration. At prediction time, the classifier used is the `estimator` trained on all the data. Note that this method is also internally implemented in :mod:`sklearn.svm` estimators with the `probabilities=True` parameter. .. versionadded:: 0.24 .. versionchanged:: 1.6  `""auto""` option is added and is the default.",'auto'

0,1,2
,"penalty  penalty: {'l1', 'l2'}, default='l2' Specifies the norm used in the penalization. The 'l2' penalty is the standard used in SVC. The 'l1' leads to ``coef_`` vectors that are sparse.",'l2'
,"loss  loss: {'hinge', 'squared_hinge'}, default='squared_hinge' Specifies the loss function. 'hinge' is the standard SVM loss (used e.g. by the SVC class) while 'squared_hinge' is the square of the hinge loss. The combination of ``penalty='l1'`` and ``loss='hinge'`` is not supported.",'squared_hinge'
,"dual  dual: ""auto"" or bool, default=""auto"" Select the algorithm to either solve the dual or primal optimization problem. Prefer dual=False when n_samples > n_features. `dual=""auto""` will choose the value of the parameter automatically, based on the values of `n_samples`, `n_features`, `loss`, `multi_class` and `penalty`. If `n_samples` < `n_features` and optimizer supports chosen `loss`, `multi_class` and `penalty`, then dual will be set to True, otherwise it will be set to False. .. versionchanged:: 1.3  The `""auto""` option is added in version 1.3 and will be the default  in version 1.5.",'auto'
,"tol  tol: float, default=1e-4 Tolerance for stopping criteria.",0.0001
,"C  C: float, default=1.0 Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. For an intuitive visualization of the effects of scaling the regularization parameter C, see :ref:`sphx_glr_auto_examples_svm_plot_svm_scale_c.py`.",1.0
,"multi_class  multi_class: {'ovr', 'crammer_singer'}, default='ovr' Determines the multi-class strategy if `y` contains more than two classes. ``""ovr""`` trains n_classes one-vs-rest classifiers, while ``""crammer_singer""`` optimizes a joint objective over all classes. While `crammer_singer` is interesting from a theoretical perspective as it is consistent, it is seldom used in practice as it rarely leads to better accuracy and is more expensive to compute. If ``""crammer_singer""`` is chosen, the options loss, penalty and dual will be ignored.",'ovr'
,"fit_intercept  fit_intercept: bool, default=True Whether or not to fit an intercept. If set to True, the feature vector is extended to include an intercept term: `[x_1, ..., x_n, 1]`, where 1 corresponds to the intercept. If set to False, no intercept will be used in calculations (i.e. data is expected to be already centered).",True
,"intercept_scaling  intercept_scaling: float, default=1.0 When `fit_intercept` is True, the instance vector x becomes ``[x_1, ..., x_n, intercept_scaling]``, i.e. a ""synthetic"" feature with a constant value equal to `intercept_scaling` is appended to the instance vector. The intercept becomes intercept_scaling * synthetic feature weight. Note that liblinear internally penalizes the intercept, treating it like any other term in the feature vector. To reduce the impact of the regularization on the intercept, the `intercept_scaling` parameter can be set to a value greater than 1; the higher the value of `intercept_scaling`, the lower the impact of regularization on it. Then, the weights become `[w_x_1, ..., w_x_n, w_intercept*intercept_scaling]`, where `w_x_1, ..., w_x_n` represent the feature weights and the intercept weight is scaled by `intercept_scaling`. This scaling allows the intercept term to have a different regularization behavior compared to the other features.",1
,"class_weight  class_weight: dict or 'balanced', default=None Set the parameter C of class i to ``class_weight[i]*C`` for SVC. If not given, all classes are supposed to have weight one. The ""balanced"" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``.",'balanced'
,"verbose  verbose: int, default=0 Enable verbose output. Note that this setting takes advantage of a per-process runtime setting in liblinear that, if enabled, may not work properly in a multithreaded context.",0
