## Title
Models

### By:
Juan Gómez

### Date:
2024-05-18

### Description:

Train and evaluate text classification models using preprocessed features. Includes data split, cross-validation, performance metrics, learning curves, scalability plots, and feature importance analysis. The final section builds the model-dependent transformation (MDT) pipeline and the training/prediction pipeline.

## Import libraries

In [1]:
import json

import pandas as pd
from feature_engine.selection import (
    DropConstantFeatures,
    DropCorrelatedFeatures,
    SelectBySingleFeaturePerformance,
)

# import numpy as np
from loguru import logger
from sklearn.base import BaseEstimator, TransformerMixin

# from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import RidgeClassifier

# from sklearn.metrics import classification_report
# from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier

## Load data

In [2]:
from pathlib import Path

# Render every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

# Base directory for all data paths below: the grandparent of the resolved
# current working directory.
BASE_DIR = Path.cwd().resolve().parents[1]

In [3]:
# Load the feature table stored under data/04_feature (relative to BASE_DIR).
df = pd.read_parquet(BASE_DIR / "data/04_feature/review_user_business_mit.parquet")

In [4]:
# Preview five rows; the seed is fixed so the preview is identical on every
# Restart & Run All instead of changing with each execution.
df.sample(5, random_state=42)

Unnamed: 0,stars,useful,funny,cool,review_count,stars_business,review_count_business,is_open,elite_count,city_freq,state_freq,is_useful,is_funny,is_cool,review_count_level,useful_user_level,funny_user_level,cool_user_level,fans_level,text_length,word_count,has_exclamation,main_category_group,category_count,review_year,review_month,review_dayofweek,is_weekend,review_quarter,text_length_avg_by_main_category_group,text_length_std_by_main_category_group,text_length_relative_to_avg,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,embedding_9,embedding_10,embedding_11,embedding_12,embedding_13,embedding_14,embedding_15,embedding_16,embedding_17,embedding_18,embedding_19,embedding_20,embedding_21,embedding_22,embedding_23,embedding_24,embedding_25,embedding_26,embedding_27,embedding_28,embedding_29,embedding_30,embedding_31,embedding_32,embedding_33,embedding_34,embedding_35,embedding_36,embedding_37,embedding_38,embedding_39,embedding_40,embedding_41,embedding_42,embedding_43,embedding_44,embedding_45,embedding_46,embedding_47,embedding_48,embedding_49
276879,4,3,0,2,138,4.5,876,True,1,0.045804,0.104635,True,False,True,2,2,2,2,1,1259,244,False,restaurant,8,2011,8,6,True,3,787.809448,648.105408,471.190552,-0.086642,0.548738,-0.447316,-0.362061,-0.246907,-0.265511,0.251696,-0.286597,0.258653,-0.179034,-0.409119,-0.004234,0.083558,0.155845,0.312849,-0.035509,-1.055963,0.021025,-0.114997,-0.084123,-0.302519,-0.866283,0.088682,0.061511,0.036408,0.175465,-0.197008,-1.16387,-0.132697,-0.076242,-0.226712,-0.524034,0.067326,-0.140946,0.036013,-0.096838,0.060031,-0.073701,0.587753,-0.333731,0.13401,-0.669846,0.102267,-0.513776,0.857993,0.21737,0.237328,-0.034687,-0.31935,-0.077261
726737,4,1,0,0,9,4.0,38,False,1,0.090823,0.108327,True,False,False,0,0,1,1,1,440,84,False,other,6,2014,2,0,False,1,792.076538,669.525146,-352.076508,0.112549,-0.325783,0.265731,-0.138733,-0.529961,0.123427,0.188849,-0.129084,0.295666,-0.239363,-0.115312,0.111959,-0.261391,0.123911,0.313873,0.019381,-0.628802,-0.20843,0.015667,-0.26052,0.317312,-0.321672,-0.402087,-0.145037,-0.306901,0.076614,0.379208,0.059067,-0.042827,-0.6048,0.055504,-0.107125,0.353556,0.26401,0.417613,0.031565,0.350237,-0.31744,-0.007403,-0.056709,-0.141627,-0.286754,0.246001,-0.911082,-0.00533,-0.020096,0.053616,0.424937,-0.2332,-0.214634
102375,5,3,2,2,225,2.5,53,False,1,0.001716,0.303103,True,True,True,2,2,3,2,1,1016,183,True,restaurant,2,2010,1,0,False,1,787.809448,648.105408,228.190552,-0.197426,-0.007818,-0.080071,-0.212557,-0.345214,0.502109,0.512161,-0.150226,0.216049,-0.250523,-0.316351,0.06458,-0.071787,-0.154347,0.143049,0.030816,-0.439127,0.135091,0.306084,-0.545142,-0.199356,-0.583161,0.008489,-0.505401,0.433076,0.513912,0.639562,-0.200954,-0.360455,-0.326429,-0.624068,-0.175758,0.121777,-0.192222,0.000527,-0.411243,-0.078579,-0.262587,-0.144075,0.044677,0.19908,-0.099583,0.077884,-0.676069,-0.049472,-0.266725,0.132758,-0.269728,-0.219697,-0.267889
204415,4,1,3,1,12,3.0,31,True,1,0.006053,0.071378,True,True,True,0,1,2,1,1,1336,252,True,health,2,2011,2,4,False,1,816.443298,678.687256,519.556702,0.036013,0.496528,0.35714,-0.46172,-0.250481,0.075125,0.032469,0.384977,-0.022764,-0.248265,-0.025806,0.298655,0.155594,0.111343,-0.315583,-0.469604,-0.532354,0.128115,0.212272,-0.16147,-0.155819,-0.365325,-0.064302,0.294486,0.291706,0.272639,0.377103,0.089165,-0.137275,0.162457,-0.151432,-0.227157,0.158896,0.189598,0.75705,0.155915,0.811598,-0.02905,-0.013852,0.010679,0.312779,-0.229327,0.121547,-0.349078,0.531073,0.063483,-0.073898,-0.522627,-0.064847,-0.145941
279413,5,3,2,2,785,4.5,276,True,11,0.007711,0.053146,True,True,True,3,3,3,3,1,1789,327,True,other,3,2011,9,6,True,3,792.076538,669.525146,996.923462,-0.285691,0.029335,0.049437,-0.473171,-0.59982,0.139298,0.148277,0.038193,0.21369,-0.105916,-0.127871,0.1092,0.289235,0.157022,-0.169181,-0.371647,-0.778507,-0.193937,-0.109892,-0.488717,0.246053,0.04768,-0.373245,-0.782864,-0.209304,0.252734,0.776606,0.023087,-0.397724,-0.478292,-0.173921,-0.034099,0.944239,-0.087435,0.646218,-0.40641,0.251695,-0.087043,0.026902,-0.264051,0.46446,-0.213909,-0.026781,-0.119888,0.49942,-0.058216,0.388586,0.043326,0.080166,0.017668


In [5]:
# Count how many columns there are of each dtype.
df.dtypes.value_counts()

float32     53
int32       16
bool         6
int64        3
float64      3
category     1
Name: count, dtype: int64

# Models

## Create target variable

- DropColumnsTransformer

In [7]:
class DropColumnsTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes a fixed list of columns.

    Parameters
    ----------
    columns : list
        Column labels to remove. Labels that are not present in the input
        are skipped silently (``errors="ignore"``).
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Learn nothing; present only to satisfy the transformer API."""
        return self

    def transform(self, X):
        """Return ``X`` without the configured columns (``X`` itself is not modified)."""
        trimmed = X.drop(labels=self.columns, axis="columns", errors="ignore")
        return trimmed

    def set_output(self, *, transform=None):
        """Accept and ignore the output-container request; returns ``self``."""
        return self

- TargetFromStarsTransformer

In [None]:
class TargetFromStarsTransformer(BaseEstimator, TransformerMixin):
    """Replace a star-rating column with a three-class ``target`` column.

    Mapping applied to the values of ``column``:

    * ``stars <= NEGATIVE_THRESHOLD`` -> 0 (negative)
    * ``stars == NEUTRAL_VALUE``      -> 1 (neutral)
    * anything else                   -> 2 (positive)

    Any value that satisfies neither comparison (this includes missing
    ratings, for which both comparisons are False) is labelled 2.

    Parameters
    ----------
    column : str, default="remainder__stars"
        Name of the rating column to read. It is removed from the output.
    """

    NEGATIVE_THRESHOLD = 2
    NEUTRAL_VALUE = 3

    def __init__(self, column="remainder__stars"):
        self.column = column

    def fit(self, X, y=None):
        """Learn nothing; present only to satisfy the transformer API."""
        return self

    def transform(self, X):
        """Return a new frame with ``target`` added and ``column`` removed.

        The input frame is not modified. Raises ``KeyError`` if ``column`` is
        not present in ``X``.
        """
        stars = X[self.column]

        # Vectorised comparisons instead of a per-row Python callback; the
        # masks are plain boolean arrays so no index alignment is involved.
        is_neutral = (stars == self.NEUTRAL_VALUE).to_numpy()
        is_negative = (stars <= self.NEGATIVE_THRESHOLD).to_numpy()

        # Start from "positive" and overwrite the two explicit classes.
        target = (
            pd.Series(2, index=X.index, dtype="int64")
            .mask(is_neutral, 1)
            .mask(is_negative, 0)
            .astype("int64")
        )

        # assign() builds a new frame, so no in-place mutation is needed.
        return X.assign(target=target).drop(columns=[self.column])

    def set_output(self, *, transform=None):
        """Accept and ignore the output-container request; returns ``self``."""
        return self

In [9]:
# Build the sentiment target from the rating column of the feature table.
# The table loaded in this notebook exposes the rating as "stars" (there is no
# "remainder__stars" column), so the pipeline is configured with that name.
create_target_pipe = Pipeline(
    [
        ("create_target", TargetFromStarsTransformer(column="stars")),
        # The first step already removes the rating column; this step is kept
        # as a no-op safety net (missing columns are ignored).
        ("drop_stars", DropColumnsTransformer(columns=["stars"])),
    ]
)

In [10]:
# df_target = create_target_pipe.fit_transform(df)

## Data Split

In [None]:
# def split_data(df, target_column="target", test_size=0.2, random_state=42):
#     y = df[target_column]
#     X = df.drop(columns=[target_column])

#     X_train, X_test, y_train, y_test = train_test_split(
#         X, y, test_size=test_size, stratify=y, random_state=random_state
#     )

#     return (
#         X_train.reset_index(drop=True),
#         X_test.reset_index(drop=True),
#         y_train.reset_index(drop=True),
#         y_test.reset_index(drop=True),
#     )

In [12]:
# X_train, X_test, y_train, y_test = split_data(df_target)

## Model-Dependent Transformation

### Encode categorical features

In [13]:
# def transform_with_names(pipeline, X, encode_cols):
#     X_enc = pipeline.transform(X)
#     onehot_cols = pipeline.named_transformers_["onehot"].get_feature_names_out(encode_cols)
#     bool_cols = pipeline.transformers_[1][2]
#     passthrough_cols = [col for col in X.columns if col not in encode_cols + bool_cols]

#     all_columns = list(onehot_cols) + bool_cols + passthrough_cols
#     return pd.DataFrame(X_enc, columns=all_columns, index=X.index)

In [14]:
# categorical_str_object_cols = X_train.select_dtypes(
#     include=["category", "string", "object"]
# ).columns.tolist()
# boolean_cols = X_train.select_dtypes(include=["bool"]).columns.tolist()

In [15]:
# # 1. Step
# bool_to_int_transformer = FunctionTransformer(lambda X: X.astype(np.int8), validate=False)

# # 2. Step
# encoding_mdt_pipe = ColumnTransformer(
#     transformers=[
#         (
#             "onehot",
#             OneHotEncoder(handle_unknown="ignore", sparse_output=False, drop="first"),
#             categorical_str_object_cols,
#         ),
#         ("bool_int", bool_to_int_transformer, boolean_cols),
#     ],
#     remainder="passthrough",
#     force_int_remainder_cols=False,
# )

- Encoding Test

In [16]:
# encoding_mdt_pipe.fit(X_train)
# X_train_encoded = transform_with_names(
#     encoding_mdt_pipe, X_train, categorical_str_object_cols
# )

### Impute missing values

In [17]:
class GroupMeanImputer(BaseEstimator, TransformerMixin):
    """Impute missing values with column means learned from the training data.

    * ``fit`` learns, for every configured column, the mean per target class
      (``group_means_``) and the overall mean (``overall_means_``).
    * ``fit_transform`` (the path a ``Pipeline`` uses while fitting) fills each
      missing training value with the mean of that row's class, falling back
      to the overall mean when the class mean is itself missing.
    * ``transform`` is used on data whose labels are unknown, so it fills
      with the overall training mean. The training labels are deliberately
      not stored and never reused positionally on new data.

    Parameters
    ----------
    columns : list
        Columns to impute.

    Attributes
    ----------
    group_means_ : dict
        ``{column: {class_label: mean}}`` learned in ``fit``.
    overall_means_ : dict
        ``{column: mean}`` learned in ``fit``.
    """

    def __init__(self, columns):
        # Only constructor parameters are stored here; fitted state (trailing
        # underscore attributes) is created in fit().
        self.columns = columns

    @staticmethod
    def _as_aligned(X, y):
        """Return ``X`` as a fresh DataFrame and ``y`` as a Series on a shared 0..n-1 index."""
        if y is None:
            raise ValueError("GroupMeanImputer requires y to compute per-class means.")
        frame = pd.DataFrame(X).copy().reset_index(drop=True)
        labels = pd.Series(y).reset_index(drop=True)
        if len(frame) != len(labels):
            raise ValueError(
                f"X and y have inconsistent lengths: {len(frame)} != {len(labels)}."
            )
        return frame, labels

    def _fill_overall(self, series, col):
        """Fill what is still missing in ``series`` with the overall training mean of ``col``."""
        overall = self.overall_means_[col]
        # A column that was entirely missing during fit has no usable mean.
        return series if pd.isna(overall) else series.fillna(overall)

    def fit(self, X, y):
        """Learn per-class and overall means for every configured column.

        Raises ``ValueError`` when ``y`` is None or its length differs from ``X``.
        """
        # Both inputs are put on the same positional index so the groupby
        # pairs each row with its own label regardless of the caller's index.
        frame, labels = self._as_aligned(X, y)
        self.group_means_ = {
            col: frame[col].groupby(labels).mean().to_dict() for col in self.columns
        }
        self.overall_means_ = {col: frame[col].mean() for col in self.columns}
        return self

    def fit_transform(self, X, y=None, **fit_params):
        """Fit on ``(X, y)`` and return ``X`` imputed with class-conditional means.

        The returned frame has a fresh 0..n-1 index.
        """
        self.fit(X, y, **fit_params)
        frame, labels = self._as_aligned(X, y)
        for col in self.columns:
            class_fill = labels.map(self.group_means_[col])
            frame[col] = self._fill_overall(frame[col].fillna(class_fill), col)
        return frame

    def transform(self, X):
        """Return a copy of ``X`` (index reset) imputed with the overall training means."""
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, "overall_means_")
        frame = pd.DataFrame(X).copy().reset_index(drop=True)
        for col in self.columns:
            frame[col] = self._fill_overall(frame[col], col)
        return frame

    def set_output(self, *, transform=None):
        """Accept and ignore the output-container request; returns ``self``."""
        return self

In [18]:
def get_columns_with_na(X):
    """Return the labels of the columns of ``X`` that hold at least one missing value."""
    has_missing = X.isna().any(axis=0)
    return X.columns[has_missing.to_numpy()].tolist()

In [19]:
# Columns of the feature table that contain at least one missing value.
# NOTE(review): computed on `df`, i.e. on all rows before any train/test split —
# confirm this is intended.
columns_with_na = get_columns_with_na(df)

# Single-step pipeline wrapping the group-mean imputer for those columns.
impute_missing_mdt_pipe = Pipeline(
    [("imputation", GroupMeanImputer(columns=columns_with_na))]
)

- Impute missing values Test

In [20]:
# X_train_imputed = impute_missing_mdt_pipe.fit_transform(X_train_encoded, y_train)

### Scale or normalize features

In [21]:
# Single-step pipeline that applies MinMaxScaler with its default settings.
scaling_mdt_pipe = Pipeline([("minmax_scaler", MinMaxScaler())])

- Scale or normalize features Test

In [22]:
# X_train_scaled1 = scaling_mdt_pipe.fit_transform(X_train_imputed)
# X_train_scaled = pd.DataFrame(
#     X_train_scaled1, columns=X_train_imputed.columns, index=X_train_imputed.index
# )

### Dimensionality reduction

In [23]:
# Four feature-selection stages applied in order; each stage only sees the
# columns kept by the previous one.
dimensionality_reduction_mdt_pipe = Pipeline(
    [
        # 1. Remove constant features.
        ("drop_constant", DropConstantFeatures()),
        # 2. Remove correlated features, using a correlation threshold of 0.9.
        ("drop_correlated", DropCorrelatedFeatures(threshold=0.9)),
        # 3. Single-feature performance selection: random forest estimator,
        #    weighted F1 scoring, 3-fold CV, threshold 0.01.
        (
            "target_selector",
            SelectBySingleFeaturePerformance(
                estimator=RandomForestClassifier(
                    n_estimators=50, random_state=42, n_jobs=-1
                ),
                scoring="f1_weighted",
                cv=3,
                threshold=0.01,
            ),
        ),
        # 4. Forward sequential selection with a ridge classifier down to 50 features.
        #    NOTE(review): presumably more than 50 features must survive stages 1-3
        #    for this step to be valid — confirm against the fitted pipeline.
        (
            "sequential_selector",
            SequentialFeatureSelector(
                estimator=RidgeClassifier(),
                n_features_to_select=50,
                direction="forward",
                n_jobs=1,
            ),
        ),
    ]
)

- Dimensionality reduction Test

In [24]:
# X_train_reduced = dimensionality_reduction_mdt_pipe.fit_transform(
#     X_train_scaled, y_train
# )

In [25]:
# X_train_reduced_df = pd.DataFrame(
#     X_train_reduced,
#     columns=dimensionality_reduction_mdt_pipe.named_steps[
#         "sequential_selector"
#     ].get_feature_names_out(),
#     index=X_train_scaled.index,
# )

### Pipeline

In [26]:
# mdt_preprocessor = Pipeline(
#     steps=[
#         ("encoding_mdt_pipe", encoding_mdt_pipe),
#         ("impute_missing_mdt_pipe", impute_missing_mdt_pipe),
#         ("scaling_mdt_pipe", scaling_mdt_pipe),
#         ("dimensionality_reduction_mdt_pipe", dimensionality_reduction_mdt_pipe),
#     ]
# )

## Training

In [27]:
# Load the training matrix stored under data/05_model_input (relative to BASE_DIR).
X_train_reduced = pd.read_parquet(BASE_DIR / "data/05_model_input/X_train.parquet")

### Base model + AutoML

In [28]:
# def summarize_classification(y_true, y_pred, model_name):
#     return {
#         "model": model_name,
#         "accuracy": accuracy_score(y_true, y_pred),
#         "precision_macro": precision_score(y_true, y_pred, average="macro"),
#         "recall_macro": recall_score(y_true, y_pred, average="macro"),
#         "f1_macro": f1_score(y_true, y_pred, average="macro"),
#     }

In [29]:
# df_models = X_train_reduced.copy()
# df_models["target"] = y_train.values

In [30]:
# X_train_cv, X_val_cv, y_train_cv, y_val_cv = split_data(df_models)

In [31]:
# model_candidates = {
#     "logistic": LogisticRegression(class_weight="balanced", max_iter=1000, random_state=42),
#     "decision_tree": DecisionTreeClassifier(class_weight="balanced", random_state=42),
#     "xgboost": XGBClassifier(
#         objective="multi:softmax",
#         num_class=3,
#         eval_metric="mlogloss",
#         use_label_encoder=False,
#         random_state=42,
#         verbosity=0,
#     )
# }

In [32]:
# results = []

In [33]:
# for name, model in model_candidates.items():
#     model.fit(X_train_cv, y_train_cv)
#     y_pred = model.predict(X_val_cv)
#     results.append(summarize_classification(y_val_cv, y_pred, model_name=name))

In [34]:
# automl = AutoML()
# automl.fit(
#     X_train=X_train_cv, y_train=y_train_cv, task="classification", time_budget=60
# )
# y_pred_automl = automl.predict(X_val_cv)
# results.append(summarize_classification(y_val_cv, y_pred_automl, model_name="flaml"))

In [35]:
# results_df = pd.DataFrame(results).set_index("model")
# display(results_df.sort_values("f1_macro", ascending=False))

### Hyperparameter tuning

In [36]:
# def fit_grid_search(model, param_grid, X, y, scoring="f1_macro", cv=5, verbose=1):
#     return GridSearchCV(
#         estimator=model,
#         param_grid=param_grid,
#         scoring=scoring,
#         cv=cv,
#         n_jobs=-1,
#         return_train_score=True,
#         verbose=verbose,
#     ).fit(X, y)

In [37]:
# def summarize_grid_search(grid, scoring="f1_macro"):
#     print(f"Best {scoring}: {grid.best_score_:.4f}")
#     print(f"Best params: {grid.best_params_}")

In [38]:
# def grid_search_to_df(grid):
#     return pd.DataFrame(grid.cv_results_).sort_values(
#         "mean_test_score", ascending=False
#     )

- Apply Grid Search

In [39]:
# model = XGBClassifier(
#     objective="multi:softmax",
#     num_class=3,
#     use_label_encoder=False,
#     eval_metric="mlogloss",
#     random_state=42,
#     verbosity=0,
# )

# param_grid = {
#     "max_depth": [3, 5, 7],
#     "learning_rate": [0.01, 0.1, 0.3],
#     "n_estimators": [50, 100],
#     "subsample": [0.8, 1.0],
# }

In [40]:
# # Step 1: training
# grid = fit_grid_search(model, param_grid, X_train_reduced, y_train)

In [41]:
# # Step 2: summary
# summarize_grid_search(grid)

In [42]:
# # Step 3: df results
# df_grid = grid_search_to_df(grid)
# df_grid.head(5)

In [43]:
# Text file holding the name of the selected model class; it is read back in the
# "Pipeline final training" section. The write below is currently disabled.
best_model_name_path = BASE_DIR / "data/06_models/best_model_name.txt"
# with open(best_model_name_path, "w") as f:
#     f.write("XGBClassifier")

In [44]:
# JSON file holding the tuned hyperparameters; it is read back in the
# "Pipeline final training" section. The write below is currently disabled.
best_params_path = BASE_DIR / "data/06_models/best_params.json"
# with open(best_params_path, "w") as f:
#     json.dump(grid.best_params_, f, indent=4)

### Pipeline final training

In [45]:
# Model name as stored on disk: a plain string with surrounding whitespace removed.
classifier_fn = best_model_name_path.read_text().strip()

# Tuned hyperparameters, parsed from the JSON file.
with best_params_path.open() as f:
    best_params = json.loads(f.read())

In [46]:
class TrainModelTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that trains a classifier in ``fit`` and passes data through.

    Parameters
    ----------
    classifier_fn : callable
        Invoked as ``classifier_fn(**best_params)`` to build the estimator.
    best_params : dict
        Keyword arguments forwarded to ``classifier_fn``.

    Attributes
    ----------
    model_ : object
        The estimator built and trained by ``fit``.
    """

    def __init__(self, classifier_fn, best_params):
        self.best_params = best_params
        self.classifier_fn = classifier_fn

    def fit(self, X, y):
        """Build the estimator from the stored parameters and train it on ``(X, y)``."""
        build_estimator, hyperparameters = self.classifier_fn, self.best_params
        self.model_ = build_estimator(**hyperparameters)
        self.model_.fit(X, y)
        return self

    def transform(self, X):
        """Return ``X`` unchanged; this step exists only for its fitted model."""
        return X

    def set_output(self, *, transform=None):
        """Accept and ignore the output-container request; returns ``self``."""
        return self

In [47]:
# Single-step pipeline that trains an XGBClassifier with the loaded hyperparameters.
# The classifier class is passed directly; the `classifier_fn` string read from
# disk above is not used here.
training_preprocessor = Pipeline(
    [
        (
            "train_model",
            TrainModelTransformer(classifier_fn=XGBClassifier, best_params=best_params),
        )
    ]
)

- Pipeline final training Test

In [48]:
# # training_preprocessor.fit(X_train_reduced, y_train)
# model = training_preprocessor.named_steps["train_model"].model_

## Validation

### Pipeline

In [49]:
# class ValidateModelTransformer(BaseEstimator, TransformerMixin):
#     def __init__(self, model, y_true, thresholds=None):
#         self.model = model
#         self.y_true = y_true
#         self.thresholds = thresholds or {"f1_macro": 0.6}

#     def fit(self, X, y=None):
#         y_pred = self.model.predict(X)
#         self.report_ = classification_report(self.y_true, y_pred, output_dict=True)

#         for metric, threshold in self.thresholds.items():
#             score = self.report_["macro avg"].get(metric.replace("_macro", ""), None)
#             if score is not None and score < threshold:
#                 raise ValueError(f"[FAIL] {metric}={score:.4f} < threshold={threshold}")

#         return self

#     def transform(self, X):
#         return self.model

#     def set_output(self, *, transform=None):
#         return self

In [50]:
# validate_preprocessor = Pipeline(
#     [
#         (
#             "validate_model",
#             ValidateModelTransformer(
#                 model=model,
#                 y_true=y_test,
#                 thresholds={"f1_macro": 0.6, "recall_macro": 0.6},
#             ),
#         )
#     ]
# )

# Test Training

In [51]:
import os

# The cells below import from the `src` package and use relative paths such as
# "models/...", so the working directory is switched to BASE_DIR (defined near
# the top of the notebook) instead of a machine-specific absolute path.
os.chdir(BASE_DIR)
print(os.getcwd())

/Users/agomezj/Desktop/Juan-G/ml-message-classifier


In [52]:
# Reload the same parquet file that was read into `df` earlier, under the name
# used by the end-to-end run below.
mit = pd.read_parquet(BASE_DIR / "data/04_feature/review_user_business_mit.parquet")

In [54]:
from src.model.mdt import split_data, transform_stars_to_target
from src.model.training import TrainModelTransformer
from src.model.validation import evaluate_and_save_model
from src.pipelines.training_pipeline.training_pipeline import training_pipeline
from src.utils.io_utils import save_pipeline_if_needed

In [None]:
# Target: derive the target column from the "stars" column of `mit`
# (project helper from src.model.mdt).
target = transform_stars_to_target(mit, "stars")

[32m2025-05-21 00:17:42.566[0m | [1mINFO    [0m | [36msrc.model.mdt[0m:[36mtransform_stars_to_target[0m:[36m37[0m - [1mTransforming stars column into target column.[0m


In [None]:
# Split data into train and test sets (project helper from src.model.mdt).
# NOTE(review): the positional arguments are presumably the target column name
# and the test fraction — confirm against the helper's signature.
X_train, X_test, y_train, y_test = split_data(
    target,
    "target",
    0.2,
)

[32m2025-05-21 00:17:43.418[0m | [1mINFO    [0m | [36msrc.model.mdt[0m:[36msplit_data[0m:[36m70[0m - [1mSplitting data into train and test sets.[0m


In [None]:
# MDT train: fit only the "mdt" step of the imported pipeline on the training
# split and transform that split. No other pipeline step is fitted here.
X_train_mdt = training_pipeline.named_steps["mdt"].fit_transform(X_train, y_train)

[32m2025-05-21 00:16:44.528[0m | [1mINFO    [0m | [36msrc.model.mdt[0m:[36mfit[0m:[36m415[0m - [1mFitting MDTYelpData...[0m
[32m2025-05-21 00:16:44.543[0m | [1mINFO    [0m | [36msrc.model.mdt[0m:[36mtransform[0m:[36m136[0m - [1mTransforming data with EncodingTransformer.[0m
[32m2025-05-21 00:16:45.046[0m | [1mINFO    [0m | [36msrc.model.mdt[0m:[36mfit[0m:[36m182[0m - [1mFitting GroupMeanImputer.[0m
[32m2025-05-21 00:16:45.610[0m | [1mINFO    [0m | [36msrc.model.mdt[0m:[36mtransform[0m:[36m201[0m - [1mTransforming data with GroupMeanImputer.[0m
[32m2025-05-21 00:16:45.867[0m | [1mINFO    [0m | [36msrc.model.mdt[0m:[36mfit[0m:[36m231[0m - [1mFitting ScalerTransformer.[0m
[32m2025-05-21 00:16:46.327[0m | [1mINFO    [0m | [36msrc.model.mdt[0m:[36mtransform[0m:[36m247[0m - [1mTransforming data with ScalerTransformer.[0m
[32m2025-05-21 00:16:48.724[0m | [1mINFO    [0m | [36msrc.model.mdt[0m:[36mfit[0m:[36m294[0

In [None]:
# MDT test: apply the already-fitted "mdt" step to the test split (no refitting).
X_test_mdt = training_pipeline.named_steps["mdt"].transform(X_test)

In [None]:
# Save pipe MDT: hand the fitted "mdt" step to the project's save helper.
# The target path is relative to the current working directory.
mdt = training_pipeline.named_steps["mdt"]
save_pipeline_if_needed(mdt, "models/training_mdt_transformer.pkl")

In [None]:
# Training

# 1. Load best model and best params
# `classifier_fn` is a string here (the stripped content of the text file).
classifier_fn = best_model_name_path.read_text().strip()
with open(best_params_path) as f:
    best_params = json.load(f)

# 2. Trainer
# TrainModelTransformer now refers to the class imported from src.model.training,
# which rebinds the name defined earlier in this notebook.
# NOTE(review): the notebook-defined version calls classifier_fn(**best_params),
# which a string would not support — confirm the imported class accepts a model name.
trainer = TrainModelTransformer(
    classifier_fn=classifier_fn,
    best_params=best_params,
)
trainer.fit(X_train_mdt, y_train)

In [None]:
# Validator
# Evaluate the estimator fitted by `trainer` in the previous cell. The "training"
# step inside `training_pipeline` is never fitted in this notebook (only the "mdt"
# step is), so it does not hold the model trained here.
# NOTE(review): assumes the imported TrainModelTransformer exposes the fitted
# estimator as `model_`, the attribute this cell already relied on.
try:
    evaluate_and_save_model(
        model=trainer.model_,
        X_test=X_test_mdt,
        y_test=y_test,
        thresholds={"f1_macro": 0.6, "recall_macro": 0.6},
        output_path="models/best_model.pkl",
    )
except ValueError as e:
    # Raised when validation fails; logged as a warning so the notebook keeps running.
    logger.warning(f"Model validation failed: {e}")