## Title
Models

### By:
Juan Gómez

### Date:
2024-05-18

### Description:

Train and evaluate text classification models using preprocessed features. Includes data split, cross-validation, performance metrics, learning curves, scalability plots, and feature importance analysis. The final section builds the model-dependent transformation (MDT) pipeline and the training/prediction pipeline.

## Import libraries

In [None]:
import json
import os
from pathlib import Path

import pandas as pd
from feature_engine.selection import (
    DropConstantFeatures,
    DropCorrelatedFeatures,
    SelectBySingleFeaturePerformance,
)

# import numpy as np
from loguru import logger
from sklearn.base import BaseEstimator, TransformerMixin

# from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import RidgeClassifier

# from sklearn.metrics import classification_report
# from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier

## Load data

In [2]:
# Notebook configuration. `Path` comes from the imports cell at the top.

# Show every column when a wide frame is displayed.
pd.set_option("display.max_columns", None)

# Two directory levels above the kernel's working directory.
# NOTE(review): this must run before any cell that changes the working directory,
# otherwise it resolves to a different folder — presumably the repository root; confirm.
BASE_DIR = Path.cwd().resolve().parents[1]

In [None]:
# Feature table consumed by the exploratory cells below.
df = pd.read_parquet(BASE_DIR.joinpath("data/04_feature/review_user_business_mit.parquet"))

In [34]:
# Peek at five random rows (unseeded, so the selection differs on every run).
df.sample(n=5)

Unnamed: 0,stars,useful,funny,cool,review_count,stars_business,review_count_business,is_open,elite_count,city_freq,state_freq,is_useful,is_funny,is_cool,review_count_level,useful_user_level,funny_user_level,cool_user_level,fans_level,text_length,word_count,has_exclamation,main_category_group,category_count,review_year,review_month,review_dayofweek,is_weekend,review_quarter,text_length_avg_by_main_category_group,text_length_std_by_main_category_group,text_length_relative_to_avg,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,embedding_9,embedding_10,embedding_11,embedding_12,embedding_13,embedding_14,embedding_15,embedding_16,embedding_17,embedding_18,embedding_19,embedding_20,embedding_21,embedding_22,embedding_23,embedding_24,embedding_25,embedding_26,embedding_27,embedding_28,embedding_29,embedding_30,embedding_31,embedding_32,embedding_33,embedding_34,embedding_35,embedding_36,embedding_37,embedding_38,embedding_39,embedding_40,embedding_41,embedding_42,embedding_43,embedding_44,embedding_45,embedding_46,embedding_47,embedding_48,embedding_49
906192,5,3,0,0,128,4.0,191,True,1,0.003982,0.183707,True,False,False,2,2,2,2,1,640,116,False,restaurant,4,2018,9,6,True,3,639.953857,532.740662,0.046122,0.074751,0.697482,0.311859,-0.521046,-0.194581,0.27639,1.111978,-0.147634,0.095341,0.155567,-0.119884,0.03259,-0.178728,0.004451,-0.036841,-0.095693,0.11006,-0.160406,0.525937,-0.69252,-0.190859,-0.493326,-0.428122,-0.642687,-0.268597,-0.08821,0.689469,-0.948733,-0.344062,-0.419753,-0.429531,-0.379099,0.349651,-0.350816,1.23889,-0.083041,0.483351,-0.161838,-0.053674,-0.307265,0.418604,-0.26805,0.342727,-0.73948,0.501753,0.179687,-0.268822,-0.393494,0.105226,-0.363393
969799,2,1,0,0,10,4.0,32,True,1,0.000827,0.055804,True,False,False,1,0,0,0,0,500,94,False,restaurant,9,2018,7,6,True,3,639.953857,532.740662,-139.953873,-0.309841,0.013298,-0.250818,-0.633023,-0.232362,-0.38545,-0.096385,-0.17621,0.351112,-0.429787,-0.885585,-0.095075,0.033912,-0.037212,-0.017246,-0.861292,0.838786,0.306541,-0.116945,-0.129314,0.615652,-0.030124,0.08685,-0.703382,-0.152619,-0.470992,0.209329,-0.168802,-0.195786,-0.126982,-0.49865,-0.387894,-0.086217,-0.441568,-0.275358,0.394184,0.47145,0.041419,-0.425344,-0.377018,-0.024423,-0.222443,0.771572,0.197692,-0.046282,0.164182,-0.172464,0.574928,-0.097091,0.159368
247608,4,2,0,1,303,5.0,17,False,4,0.015494,0.024722,True,False,True,3,3,3,3,1,582,105,False,restaurant,7,2020,11,6,True,4,639.953857,532.740662,-57.953876,0.271569,0.263023,0.357767,0.102872,-0.50956,-0.177065,0.356231,0.419663,0.179826,-0.436309,-0.249501,0.029521,0.357507,0.057207,0.428971,0.366414,-0.302327,-0.399496,0.056481,-0.275047,-0.211334,-0.502862,-0.251838,-0.165914,0.260661,0.021342,-0.383204,-0.035808,0.288589,0.187426,-0.404876,-0.140675,0.010602,-0.41205,0.420503,-0.099117,0.256436,0.006359,-0.174492,0.135159,0.304199,-0.062886,-0.103389,-0.0196,0.382529,0.111698,0.431777,-0.012114,0.278075,-0.338858
520898,5,1,0,1,67,4.5,440,True,3,0.075552,0.183707,True,False,True,2,2,2,2,1,937,189,True,restaurant,9,2019,10,6,True,4,639.953857,532.740662,297.046112,0.054394,-0.207005,-0.023298,-0.272494,0.064919,0.525586,0.410752,0.551149,-0.189903,-0.251472,0.019422,0.256978,-0.225388,0.400441,0.244852,-0.037545,0.016607,-0.098814,-0.460489,0.329163,-0.389842,-0.422076,0.164976,-0.166697,0.114561,0.398865,0.150982,0.093891,-0.292486,0.224881,0.266993,-0.765948,0.301625,-0.092466,-0.425389,-0.462293,-0.271333,-0.037687,-0.010957,0.489599,0.280509,-0.112499,0.582281,-0.548356,0.135585,-0.65742,0.177071,-0.132898,0.144698,-0.462859
935593,1,1,0,0,7,4.0,15,True,1,0.113044,0.200348,True,False,False,0,0,0,0,0,409,76,False,other,3,2018,8,3,False,3,745.191589,676.977234,-336.191589,-0.014251,-0.015865,0.354668,-0.333855,0.467439,-0.69459,-0.584475,-0.187144,-0.139864,0.032662,0.089488,-0.27128,0.075375,-0.064606,-0.450472,0.160301,0.441326,0.19478,-0.101216,-0.555794,0.022764,0.520888,-0.0001,0.284979,0.199642,0.565771,0.186059,0.527476,-0.003421,-0.176821,0.016447,0.160458,-0.041687,0.516288,-0.294658,0.355451,1.11964,-0.269397,-0.120379,0.361275,0.050111,0.136124,0.15197,-0.22647,-0.032981,-0.171146,-0.093916,-0.340528,-0.219574,-0.477737


In [6]:
# Count how many columns there are of each dtype.
df.dtypes.value_counts()

float32           53
int32             20
object             6
bool               6
float64            4
int64              3
datetime64[ns]     1
category           1
category           1
category           1
Name: count, dtype: int64

# Models

## Create target variable

- DropColumnsTransformer

In [7]:
class DropColumnsTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes a fixed set of columns from a frame.

    Parameters
    ----------
    columns
        Column label(s) handed to ``DataFrame.drop``. Labels that are not
        present in the frame are skipped rather than raising.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` without the configured columns."""
        to_remove = self.columns
        return X.drop(columns=to_remove, errors="ignore")

    def set_output(self, *, transform=None):
        """Accept and ignore the output-container request; return ``self``."""
        return self

- TargetFromStarsTransformer

In [8]:
class TargetFromStarsTransformer(BaseEstimator, TransformerMixin):
    """Replace a star-rating column with a three-class ``target`` column.

    Mapping: ``stars <= 2`` -> 0 (negative), ``stars == 3`` -> 1 (neutral),
    every other value -> 2 (positive).

    Parameters
    ----------
    column
        Label of the star-rating column; it is removed from the output.
    """

    def __init__(self, column="remainder__stars"):
        self.column = column

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    @staticmethod
    def _stars_to_class(stars):
        """Map one star rating to its class id (0, 1 or 2)."""
        if stars == 3:
            return 1  # neutral
        return 0 if stars <= 2 else 2  # negative / positive

    def transform(self, X):
        """Return a new frame with ``target`` added and the rating column dropped."""
        labels = X[self.column].apply(self._stars_to_class)
        return X.assign(target=labels).drop(columns=[self.column])

    def set_output(self, *, transform=None):
        """Accept and ignore the output-container request; return ``self``."""
        return self

In [9]:
# Derive ``target`` from the star rating, then drop the rating column.
# The second step is only a safety net: the first step already removes the
# column, and the dropper ignores labels that are absent.
create_target_pipe = Pipeline(
    steps=[
        ("create_target", TargetFromStarsTransformer("remainder__stars")),
        ("drop_stars", DropColumnsTransformer(["remainder__stars"])),
    ]
)

In [10]:
# df_target = create_target_pipe.fit_transform(df)

## Data Split

In [None]:
# def split_data(df, target_column="target", test_size=0.2, random_state=42):
#     y = df[target_column]
#     X = df.drop(columns=[target_column])

#     X_train, X_test, y_train, y_test = train_test_split(
#         X, y, test_size=test_size, stratify=y, random_state=random_state
#     )

#     return (
#         X_train.reset_index(drop=True),
#         X_test.reset_index(drop=True),
#         y_train.reset_index(drop=True),
#         y_test.reset_index(drop=True),
#     )

In [12]:
# X_train, X_test, y_train, y_test = split_data(df_target)

## Model-Dependent Transformation

### Encode categorical features

In [13]:
# def transform_with_names(pipeline, X, encode_cols):
#     X_enc = pipeline.transform(X)
#     onehot_cols = pipeline.named_transformers_["onehot"].get_feature_names_out(encode_cols)
#     bool_cols = pipeline.transformers_[1][2]
#     passthrough_cols = [col for col in X.columns if col not in encode_cols + bool_cols]

#     all_columns = list(onehot_cols) + bool_cols + passthrough_cols
#     return pd.DataFrame(X_enc, columns=all_columns, index=X.index)

In [14]:
# categorical_str_object_cols = X_train.select_dtypes(
#     include=["category", "string", "object"]
# ).columns.tolist()
# boolean_cols = X_train.select_dtypes(include=["bool"]).columns.tolist()

In [15]:
# # 1. Step
# bool_to_int_transformer = FunctionTransformer(lambda X: X.astype(np.int8), validate=False)

# # 2. Step
# encoding_mdt_pipe = ColumnTransformer(
#     transformers=[
#         (
#             "onehot",
#             OneHotEncoder(handle_unknown="ignore", sparse_output=False, drop="first"),
#             categorical_str_object_cols,
#         ),
#         ("bool_int", bool_to_int_transformer, boolean_cols),
#     ],
#     remainder="passthrough",
#     force_int_remainder_cols=False,
# )

- Encoding Test

In [16]:
# encoding_mdt_pipe.fit(X_train)
# X_train_encoded = transform_with_names(
#     encoding_mdt_pipe, X_train, categorical_str_object_cols
# )

### Impute missing values

In [17]:
class GroupMeanImputer(BaseEstimator, TransformerMixin):
    """Fill missing values with the mean of the row's target class.

    ``fit`` learns, for every configured column, the mean per class label and
    the overall column mean. ``transform`` fills missing cells with the class
    mean when the rows can be matched to the labels seen in ``fit``, and with
    the overall column mean otherwise.

    Rows are matched to the training labels by position, and only when ``X``
    has exactly as many rows as the data passed to ``fit`` (the
    ``fit_transform`` case). Any other input, such as a held-out split, has no
    labels available, so only the overall mean is used instead of borrowing
    labels from unrelated training rows.

    Parameters
    ----------
    columns
        Labels of the columns to impute.
    """

    def __init__(self, columns):
        self.columns = columns
        # Kept in __init__ for backward compatibility with code that reads
        # these attributes before fitting.
        self.group_means_ = {}
        self.global_means_ = {}
        self.y_ = None

    def fit(self, X, y):
        """Learn per-class and overall means for the configured columns.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not have the same number of rows.
        """
        # Both X and y get a fresh positional index so the groupby below pairs
        # row i with label i; without this, a non-default index on X silently
        # produced empty group means.
        X = pd.DataFrame(X).reset_index(drop=True)
        self.y_ = pd.Series(y).reset_index(drop=True)
        if len(self.y_) != len(X):
            raise ValueError(
                f"X and y have inconsistent lengths: {len(X)} != {len(self.y_)}"
            )

        self.group_means_ = {col: X[col].groupby(self.y_).mean().to_dict() for col in self.columns}
        self.global_means_ = {col: X[col].mean() for col in self.columns}
        return self

    def transform(self, X):
        """Return a copy of ``X`` (index reset) with the configured columns imputed.

        Raises
        ------
        KeyError
            If a configured column has no learned means (transformer not fitted).
        """
        X = pd.DataFrame(X).copy().reset_index(drop=True)

        # The stored labels describe the training rows only; use them just when
        # the row counts match, never for data of a different size.
        labels = self.y_ if self.y_ is not None and len(self.y_) == len(X) else None
        # getattr keeps instances created before ``global_means_`` existed usable.
        overall_means = getattr(self, "global_means_", {})

        for col in self.columns:
            class_means = self.group_means_[col]
            if labels is not None:
                X[col] = X[col].where(X[col].notna(), labels.map(class_means))
            if col in overall_means:
                # Covers rows without labels and classes whose mean is undefined.
                X[col] = X[col].fillna(overall_means[col])
        return X

    def set_output(self, *, transform=None):
        """Accept and ignore the output-container request; return ``self``."""
        return self

In [18]:
def get_columns_with_na(X):
    """Return the labels of the columns of ``X`` that hold at least one missing value."""
    has_missing = X.isna().any(axis=0)
    return X.columns[has_missing].tolist()

In [19]:
# Columns of the full feature table that contain at least one missing value.
columns_with_na = get_columns_with_na(df)

# Single-step pipeline wrapping the class-mean imputer for those columns.
impute_missing_mdt_pipe = Pipeline(
    steps=[("imputation", GroupMeanImputer(columns_with_na))]
)

- Impute missing values Test

In [20]:
# X_train_imputed = impute_missing_mdt_pipe.fit_transform(X_train_encoded, y_train)

### Scale or normalize features

In [21]:
# Single-step pipeline that applies a MinMaxScaler with default settings.
scaling_mdt_pipe = Pipeline(steps=[("minmax_scaler", MinMaxScaler())])

- Scale or normalize features Test

In [22]:
# X_train_scaled1 = scaling_mdt_pipe.fit_transform(X_train_imputed)
# X_train_scaled = pd.DataFrame(
#     X_train_scaled1, columns=X_train_imputed.columns, index=X_train_imputed.index
# )

### Dimensionality reduction

In [23]:
# Feature-selection chain, applied in this order:
#   1. drop constant features,
#   2. drop correlated features (threshold 0.9),
#   3. keep features by single-feature performance of a small random forest
#      (3-fold CV, weighted F1, threshold 0.01),
#   4. forward sequential selection of 50 features with a ridge classifier.
# NOTE(review): step 4 asks for 50 features; presumably more than 50 survive
# steps 1-3 — confirm, since the selector cannot pick more than it receives.
dimensionality_reduction_mdt_pipe = Pipeline(
    steps=[
        ("drop_constant", DropConstantFeatures()),
        ("drop_correlated", DropCorrelatedFeatures(threshold=0.9)),
        (
            "target_selector",
            SelectBySingleFeaturePerformance(
                estimator=RandomForestClassifier(n_estimators=50, n_jobs=-1, random_state=42),
                cv=3,
                scoring="f1_weighted",
                threshold=0.01,
            ),
        ),
        (
            "sequential_selector",
            SequentialFeatureSelector(
                estimator=RidgeClassifier(),
                direction="forward",
                n_features_to_select=50,
                n_jobs=1,
            ),
        ),
    ]
)

- Dimensionality reduction Test

In [24]:
# X_train_reduced = dimensionality_reduction_mdt_pipe.fit_transform(
#     X_train_scaled, y_train
# )

In [25]:
# X_train_reduced_df = pd.DataFrame(
#     X_train_reduced,
#     columns=dimensionality_reduction_mdt_pipe.named_steps[
#         "sequential_selector"
#     ].get_feature_names_out(),
#     index=X_train_scaled.index,
# )

### Pipeline

In [26]:
# mdt_preprocessor = Pipeline(
#     steps=[
#         ("encoding_mdt_pipe", encoding_mdt_pipe),
#         ("impute_missing_mdt_pipe", impute_missing_mdt_pipe),
#         ("scaling_mdt_pipe", scaling_mdt_pipe),
#         ("dimensionality_reduction_mdt_pipe", dimensionality_reduction_mdt_pipe),
#     ]
# )

## Training

In [27]:
# Load the stored model-input training matrix instead of recomputing it in this notebook.
X_train_reduced = pd.read_parquet(BASE_DIR.joinpath("data/05_model_input/X_train.parquet"))

### Base model + AutoML

In [28]:
# def summarize_classification(y_true, y_pred, model_name):
#     return {
#         "model": model_name,
#         "accuracy": accuracy_score(y_true, y_pred),
#         "precision_macro": precision_score(y_true, y_pred, average="macro"),
#         "recall_macro": recall_score(y_true, y_pred, average="macro"),
#         "f1_macro": f1_score(y_true, y_pred, average="macro"),
#     }

In [29]:
# df_models = X_train_reduced.copy()
# df_models["target"] = y_train.values

In [30]:
# X_train_cv, X_val_cv, y_train_cv, y_val_cv = split_data(df_models)

In [31]:
# model_candidates = {
#     "logistic": LogisticRegression(class_weight="balanced", max_iter=1000, random_state=42),
#     "decision_tree": DecisionTreeClassifier(class_weight="balanced", random_state=42),
#     "xgboost": XGBClassifier(
#         objective="multi:softmax",
#         num_class=3,
#         eval_metric="mlogloss",
#         use_label_encoder=False,
#         random_state=42,
#         verbosity=0,
#     )
# }

In [32]:
# results = []

In [33]:
# for name, model in model_candidates.items():
#     model.fit(X_train_cv, y_train_cv)
#     y_pred = model.predict(X_val_cv)
#     results.append(summarize_classification(y_val_cv, y_pred, model_name=name))

In [34]:
# automl = AutoML()
# automl.fit(
#     X_train=X_train_cv, y_train=y_train_cv, task="classification", time_budget=60
# )
# y_pred_automl = automl.predict(X_val_cv)
# results.append(summarize_classification(y_val_cv, y_pred_automl, model_name="flaml"))

In [35]:
# results_df = pd.DataFrame(results).set_index("model")
# display(results_df.sort_values("f1_macro", ascending=False))

### Hyperparameter tuning

In [36]:
# def fit_grid_search(model, param_grid, X, y, scoring="f1_macro", cv=5, verbose=1):
#     return GridSearchCV(
#         estimator=model,
#         param_grid=param_grid,
#         scoring=scoring,
#         cv=cv,
#         n_jobs=-1,
#         return_train_score=True,
#         verbose=verbose,
#     ).fit(X, y)

In [37]:
# def summarize_grid_search(grid, scoring="f1_macro"):
#     print(f"Best {scoring}: {grid.best_score_:.4f}")
#     print(f"Best params: {grid.best_params_}")

In [38]:
# def grid_search_to_df(grid):
#     return pd.DataFrame(grid.cv_results_).sort_values(
#         "mean_test_score", ascending=False
#     )

- Apply Grid Search

In [39]:
# model = XGBClassifier(
#     objective="multi:softmax",
#     num_class=3,
#     use_label_encoder=False,
#     eval_metric="mlogloss",
#     random_state=42,
#     verbosity=0,
# )

# param_grid = {
#     "max_depth": [3, 5, 7],
#     "learning_rate": [0.01, 0.1, 0.3],
#     "n_estimators": [50, 100],
#     "subsample": [0.8, 1.0],
# }

In [40]:
# # Step 1: training
# grid = fit_grid_search(model, param_grid, X_train_reduced, y_train)

In [41]:
# # Step 2: summary
# summarize_grid_search(grid)

In [42]:
# # Step 3: df results
# df_grid = grid_search_to_df(grid)
# df_grid.head(5)

In [43]:
# Location of the text file holding the chosen model's class name.
# The write below is disabled, so the file must already exist for the later
# cells that read it.
best_model_name_path = BASE_DIR / "data/06_models/best_model_name.txt"
# with open(best_model_name_path, "w") as f:
#     f.write("XGBClassifier")

In [44]:
# Location of the JSON file holding the tuned hyperparameters.
# The dump below is disabled, so the file must already exist for the later
# cells that read it.
best_params_path = BASE_DIR / "data/06_models/best_params.json"
# with open(best_params_path, "w") as f:
#     json.dump(grid.best_params_, f, indent=4)

### Pipeline final training

In [45]:
# Model name as stored on disk. Despite the variable name this is a plain
# string, not a callable.
classifier_fn = best_model_name_path.read_text().strip()
# Hyperparameters parsed from the JSON file.
best_params = json.loads(best_params_path.read_text())

In [46]:
class TrainModelTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that fits a classifier and passes the data through unchanged.

    Parameters
    ----------
    classifier_fn
        Callable (for example an estimator class) invoked with ``best_params``
        as keyword arguments to build the model.
    best_params
        Mapping of keyword arguments for ``classifier_fn``.

    Attributes
    ----------
    model_
        The estimator built and fitted by ``fit``.
    """

    def __init__(self, classifier_fn, best_params):
        self.classifier_fn = classifier_fn
        self.best_params = best_params

    def fit(self, X, y):
        """Build the model from the stored factory and parameters, then fit it."""
        build_model = self.classifier_fn
        self.model_ = build_model(**self.best_params)
        self.model_.fit(X, y)
        return self

    def transform(self, X):
        """Return ``X`` untouched; this step only exists to train ``model_``."""
        return X

    def set_output(self, *, transform=None):
        """Accept and ignore the output-container request; return ``self``."""
        return self

In [47]:
# Single-step pipeline that trains an XGBClassifier with the loaded hyperparameters.
# The class is passed directly; the model name read from disk is not used here.
training_preprocessor = Pipeline(
    steps=[
        ("train_model", TrainModelTransformer(XGBClassifier, best_params)),
    ]
)

- Pipeline final training Test

In [48]:
# # training_preprocessor.fit(X_train_reduced, y_train)
# model = training_preprocessor.named_steps["train_model"].model_

## Validation

### Pipeline

In [49]:
# class ValidateModelTransformer(BaseEstimator, TransformerMixin):
#     def __init__(self, model, y_true, thresholds=None):
#         self.model = model
#         self.y_true = y_true
#         self.thresholds = thresholds or {"f1_macro": 0.6}

#     def fit(self, X, y=None):
#         y_pred = self.model.predict(X)
#         self.report_ = classification_report(self.y_true, y_pred, output_dict=True)

#         for metric, threshold in self.thresholds.items():
#             score = self.report_["macro avg"].get(metric.replace("_macro", ""), None)
#             if score is not None and score < threshold:
#                 raise ValueError(f"[FAIL] {metric}={score:.4f} < threshold={threshold}")

#         return self

#     def transform(self, X):
#         return self.model

#     def set_output(self, *, transform=None):
#         return self

In [50]:
# validate_preprocessor = Pipeline(
#     [
#         (
#             "validate_model",
#             ValidateModelTransformer(
#                 model=model,
#                 y_true=y_test,
#                 thresholds={"f1_macro": 0.6, "recall_macro": 0.6},
#             ),
#         )
#     ]
# )

# Test Training

In [51]:
# The project-local ``src`` imports and the relative ``models/...`` output paths
# used below resolve against the working directory, so switch to the project
# folder. BASE_DIR replaces a hardcoded absolute path that only existed on one
# machine; `os` comes from the imports cell at the top.
# NOTE(review): assumes BASE_DIR is the repository root (the folder containing
# ``src`` and ``data``) — confirm.
os.chdir(BASE_DIR)
print(os.getcwd())

/Users/agomezj/Desktop/Juan-G/ml-message-classifier


In [52]:
# Feature table for the end-to-end training test (same file that was loaded into ``df``).
mit = pd.read_parquet(BASE_DIR.joinpath("data/04_feature/review_user_business_mit.parquet"))

In [54]:
from src.model.mdt import split_data, transform_stars_to_target
from src.model.training import TrainModelTransformer
from src.model.validation import evaluate_and_save_model
from src.pipelines.training_pipeline.training_pipeline import training_pipeline
from src.utils.io_utils import save_pipeline_if_needed

In [None]:
# Target
# Derive the target from the "stars" column of the feature table.
# NOTE(review): presumably returns the frame with a "target" column, since the
# result is later split on "target" — confirm in src.model.mdt.
target = transform_stars_to_target(mit, "stars")

[32m2025-05-21 00:17:42.566[0m | [1mINFO    [0m | [36msrc.model.mdt[0m:[36mtransform_stars_to_target[0m:[36m37[0m - [1mTransforming stars column into target column.[0m


In [None]:
# Split data
# Arguments: the frame produced by the target step, the label column name, and
# 0.2 (presumably the test fraction — confirm in src.model.mdt).
X_train, X_test, y_train, y_test = split_data(target, "target", 0.2)

[32m2025-05-21 00:17:43.418[0m | [1mINFO    [0m | [36msrc.model.mdt[0m:[36msplit_data[0m:[36m70[0m - [1mSplitting data into train and test sets.[0m


In [None]:
# MDT train
# Fit only the "mdt" step of the training pipeline on the training split and
# transform that split with it.
X_train_mdt = training_pipeline.named_steps["mdt"].fit_transform(X_train, y_train)

[32m2025-05-21 00:16:44.528[0m | [1mINFO    [0m | [36msrc.model.mdt[0m:[36mfit[0m:[36m415[0m - [1mFitting MDTYelpData...[0m
[32m2025-05-21 00:16:44.543[0m | [1mINFO    [0m | [36msrc.model.mdt[0m:[36mtransform[0m:[36m136[0m - [1mTransforming data with EncodingTransformer.[0m
[32m2025-05-21 00:16:45.046[0m | [1mINFO    [0m | [36msrc.model.mdt[0m:[36mfit[0m:[36m182[0m - [1mFitting GroupMeanImputer.[0m
[32m2025-05-21 00:16:45.610[0m | [1mINFO    [0m | [36msrc.model.mdt[0m:[36mtransform[0m:[36m201[0m - [1mTransforming data with GroupMeanImputer.[0m
[32m2025-05-21 00:16:45.867[0m | [1mINFO    [0m | [36msrc.model.mdt[0m:[36mfit[0m:[36m231[0m - [1mFitting ScalerTransformer.[0m
[32m2025-05-21 00:16:46.327[0m | [1mINFO    [0m | [36msrc.model.mdt[0m:[36mtransform[0m:[36m247[0m - [1mTransforming data with ScalerTransformer.[0m
[32m2025-05-21 00:16:48.724[0m | [1mINFO    [0m | [36msrc.model.mdt[0m:[36mfit[0m:[36m294[0

In [None]:
# MDT test
# Apply the "mdt" step fitted above to the test split (transform only, no refit).
X_test_mdt = training_pipeline.named_steps["mdt"].transform(X_test)

In [None]:
# Save pipe MDT
# Hand the "mdt" step to the project's save helper (presumably it persists the
# object to disk — confirm in src.utils.io_utils). The path is relative, so it
# resolves against the current working directory.
mdt = training_pipeline.named_steps["mdt"]
save_pipeline_if_needed(mdt, "models/training_mdt_transformer.pkl")

In [None]:
# Training

# 1. Read the stored model name (a string) and the tuned hyperparameters
classifier_fn = best_model_name_path.read_text().strip()
best_params = json.loads(best_params_path.read_text())

# 2. Build the trainer and fit it on the MDT-transformed training split
# NOTE(review): this uses the TrainModelTransformer imported from src.model.training,
# which presumably accepts the model name as a string — confirm.
trainer = TrainModelTransformer(classifier_fn=classifier_fn, best_params=best_params)
trainer.fit(X_train_mdt, y_train)

In [None]:
# Validator
# Evaluate the model fitted by ``trainer`` in the previous cell. The original
# read ``model_`` from the pipeline's "training" step, which this notebook never
# fits, so it either failed or scored a different model.
# NOTE(review): assumes the imported TrainModelTransformer exposes the fitted
# estimator as ``model_`` — confirm in src.model.training.
try:
    evaluate_and_save_model(
        model=trainer.model_,
        X_test=X_test_mdt,
        y_test=y_test,
        thresholds={"f1_macro": 0.6, "recall_macro": 0.6},
        output_path="models/best_model.pkl",
    )
except ValueError as e:
    # A ValueError is reported as a warning so the notebook keeps running.
    logger.warning(f"Model validation failed: {e}")