## Title
Models

### By:
Juan Gómez

### Date:
2024-05-18

### Description:

Train and evaluate text classification models using preprocessed features. Includes data split, cross-validation, performance metrics, learning curves, scalability plots, and feature importance analysis. The final section builds the model-dependent transformation (MDT) pipeline and the training and prediction pipeline.

## Import libraries

In [None]:
import json

import numpy as np
import pandas as pd
from feature_engine.selection import (
    DropConstantFeatures,
    DropCorrelatedFeatures,
    SelectBySingleFeaturePerformance,
)
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, OneHotEncoder
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

## Load data

In [2]:
from pathlib import Path

# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option("display.max_columns", None)

# Base directory for all data paths below: the directory two levels above the
# current working directory (parents[0] is the parent, parents[1] its parent).
BASE_DIR = Path.cwd().resolve().parents[1]

In [None]:
# Load the feature table from the 04_feature data layer.
df = pd.read_parquet(BASE_DIR / "data/04_feature/review_user_business_mit.parquet")

In [4]:
# Display five randomly sampled rows as a sanity check.
df.sample(5)

Unnamed: 0,cat_agg__elite_count,cat_agg__city_freq,cat_agg__state_freq,num_agg__useful,num_agg__funny,num_agg__cool,num_agg__review_count,num_agg__is_useful,num_agg__is_funny,num_agg__is_cool,num_agg__review_count_level,num_agg__useful_user_level,num_agg__funny_user_level,num_agg__cool_user_level,num_agg__fans_level,str_agg__text_length,str_agg__word_count,str_agg__has_exclamation,str_agg__main_category_group,str_agg__category_count,date_agg__review_year,date_agg__review_month,date_agg__review_dayofweek,date_agg__is_weekend,date_agg__review_quarter,remainder__stars,remainder__is_open,str_agg__avg_text_length_per_category,str_agg__std_text_length_per_category,str_agg__relative_length,str_agg__text_clean_emb_0,str_agg__text_clean_emb_1,str_agg__text_clean_emb_2,str_agg__text_clean_emb_3,str_agg__text_clean_emb_4,str_agg__text_clean_emb_5,str_agg__text_clean_emb_6,str_agg__text_clean_emb_7,str_agg__text_clean_emb_8,str_agg__text_clean_emb_9,str_agg__text_clean_emb_10,str_agg__text_clean_emb_11,str_agg__text_clean_emb_12,str_agg__text_clean_emb_13,str_agg__text_clean_emb_14,str_agg__text_clean_emb_15,str_agg__text_clean_emb_16,str_agg__text_clean_emb_17,str_agg__text_clean_emb_18,str_agg__text_clean_emb_19,str_agg__text_clean_emb_20,str_agg__text_clean_emb_21,str_agg__text_clean_emb_22,str_agg__text_clean_emb_23,str_agg__text_clean_emb_24,str_agg__text_clean_emb_25,str_agg__text_clean_emb_26,str_agg__text_clean_emb_27,str_agg__text_clean_emb_28,str_agg__text_clean_emb_29,str_agg__text_clean_emb_30,str_agg__text_clean_emb_31,str_agg__text_clean_emb_32,str_agg__text_clean_emb_33,str_agg__text_clean_emb_34,str_agg__text_clean_emb_35,str_agg__text_clean_emb_36,str_agg__text_clean_emb_37,str_agg__text_clean_emb_38,str_agg__text_clean_emb_39,str_agg__text_clean_emb_40,str_agg__text_clean_emb_41,str_agg__text_clean_emb_42,str_agg__text_clean_emb_43,str_agg__text_clean_emb_44,str_agg__text_clean_emb_45,str_agg__text_clean_emb_46,str_agg__text_clean_emb_47,str_agg__text_clean_em
b_48,str_agg__text_clean_emb_49
282552,0,0.056547,0.07803,1,0,0,84,True,False,False,2,1,1,1,1,346,57,False,restaurant,4,2020,10,3,False,4,5,True,639.955566,532.742432,-293.955597,0.073381,0.401102,0.238601,0.061477,-0.044338,-0.154241,-0.01134,-0.01862,0.289003,-0.084832,-0.291393,0.558012,0.213976,0.229256,0.036858,-0.211612,-0.336396,0.123942,0.200076,-0.098444,0.363149,-1.003926,-0.279715,-0.598841,0.461337,0.230579,-0.297323,-0.789541,-0.111303,-0.123873,0.005592,-0.39823,0.52676,0.312817,-0.963278,-0.420191,-0.130453,-0.353203,-0.19151,-0.441183,0.451394,-0.734811,-0.147773,0.023153,0.362523,0.084307,-0.174078,-0.388001,0.231893,0.41041
471743,6,0.035001,0.045575,1,2,1,658,True,True,True,3,3,3,3,2,544,98,False,restaurant,7,2019,12,0,False,4,5,True,639.955566,532.742432,-95.955597,-0.079024,0.464177,0.021083,-0.682393,-0.10307,0.104121,0.175629,-0.089554,0.404134,-0.168858,-0.230274,0.070082,-0.170812,-0.149579,0.170631,-0.607221,-0.091658,0.069804,-0.23435,-0.30769,-0.113724,-0.780718,-0.481394,-0.428563,0.033801,0.02885,0.25776,-0.358517,-0.503118,-0.310148,-0.465773,-0.316208,0.260485,-0.313596,0.594308,-0.12124,-0.115578,-0.154527,0.324453,-0.011011,0.209795,-0.094523,-0.252922,0.043142,-0.008492,0.022023,-0.175689,-0.109226,0.000681,0.043367
651872,5,0.008482,0.183707,1,2,1,393,True,True,True,3,3,3,3,1,1060,201,True,restaurant,5,2019,6,0,False,2,2,True,639.955566,532.742432,420.044403,-0.374734,0.317017,0.019842,-0.302896,-0.473648,0.073524,0.299914,0.046909,0.394122,0.524828,-0.064547,-0.057209,-0.164429,-0.320499,-0.021354,0.667428,-0.059192,-0.457868,0.182565,-0.24559,0.007257,-0.366044,-0.026976,-0.241783,0.202669,0.663131,0.517704,-0.432478,-0.215998,-0.250849,-0.238095,0.55195,-0.105194,0.177197,0.845513,0.01641,0.287799,-0.399746,-0.206951,-0.188451,-0.096079,-0.133126,-0.203529,-0.724825,-0.013907,-0.230514,-0.345582,-0.48912,-0.399413,-0.149543
361024,0,0.113044,0.200348,1,0,1,4,True,False,True,0,0,0,0,0,325,52,True,restaurant,4,2020,5,3,False,2,5,True,639.955566,532.742432,-314.955597,-0.180681,0.023614,-0.10399,-0.731153,-0.026752,0.253059,0.671096,-0.216157,-0.03267,-0.314932,-0.146187,-0.293769,0.035029,0.140799,-0.205075,-0.139843,-0.68421,0.080507,0.571595,-0.308306,-0.120389,-0.626092,0.530713,-0.465187,0.035872,-0.364389,0.346865,-0.521696,0.127112,-0.541459,-0.210797,-0.133499,0.643843,0.047296,-0.004573,-0.37294,0.043371,-0.083215,0.096944,-0.1963,0.771172,-0.360444,-0.485713,-0.190156,0.134873,-0.076158,-0.3733,-0.131175,0.112931,-0.230301
512239,1,0.113044,0.200348,1,0,0,75,True,False,False,2,2,2,2,1,318,59,True,restaurant,13,2019,10,1,False,4,5,True,639.955566,532.742432,-321.955597,-0.013276,0.533177,0.338198,-0.579823,-0.264113,-0.436391,0.654914,-0.359485,0.230481,-0.031154,-0.43709,-0.2546,-0.178335,-0.164152,0.020317,-0.489766,-0.122146,0.214201,0.390073,-0.201521,0.059544,-0.515621,-0.167281,-0.341711,-0.107389,0.279075,0.346947,-0.890139,-0.21998,0.059269,-0.655289,0.081459,0.233596,-0.045654,0.701849,-0.106562,0.604701,-0.292542,-0.118461,-0.295983,0.169136,-0.140318,0.423756,-0.121397,0.242513,0.169317,-0.109272,-0.137999,0.103012,-0.100314


In [5]:
# (number of rows, number of columns)
df.shape

(1000066, 80)

In [6]:
# Number of columns per dtype.
df.dtypes.value_counts()

float32     53
int32       16
bool         6
float64      2
int64        2
category     1
Name: count, dtype: int64

# Models

## Create target variable

- DropColumnsTransformer

In [7]:
class DropColumnsTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes a fixed list of columns.

    Parameters
    ----------
    columns : list
        Column labels to remove. Labels that are not present in the input
        are skipped without raising.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` without the configured columns."""
        to_remove = self.columns
        return X.drop(to_remove, axis=1, errors="ignore")

    def set_output(self, *, transform=None):
        """Accept the ``set_output`` call and ignore it; returns ``self``."""
        return self

- TargetFromStarsTransformer

In [8]:
class TargetFromStarsTransformer(BaseEstimator, TransformerMixin):
    """Derive a three-class ``target`` column from a star-rating column.

    Mapping applied to every row:

    * ``stars <= 2``  -> ``0`` (negative)
    * ``stars == 3``  -> ``1`` (neutral)
    * anything else   -> ``2`` (positive)

    The source column is removed from the returned frame.

    Parameters
    ----------
    column : str, default="remainder__stars"
        Name of the column holding the star rating.
    """

    def __init__(self, column="remainder__stars"):
        self.column = column

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``target`` added and ``column`` dropped.

        The input frame is not modified.
        """
        X = X.copy()
        stars = X[self.column]

        # Vectorized equivalent of the row-wise if/elif/else: the first
        # matching condition wins and everything else (including NaN, for
        # which both comparisons are False) falls through to 2.
        X["target"] = np.select(
            [stars <= 2, stars == 3],
            [0, 1],  # negative, neutral
            default=2,  # positive
        ).astype("int64")

        X.drop(columns=[self.column], inplace=True)
        return X

    def set_output(self, *, transform=None):
        """Accept the ``set_output`` call and ignore it; returns ``self``."""
        return self

In [9]:
# Pipeline that turns the star rating into the classification target.
# NOTE: TargetFromStarsTransformer.transform already drops the star column,
# and DropColumnsTransformer ignores missing columns, so the second step
# leaves the data unchanged when both use the same column name.
create_target_pipe = Pipeline(
    [
        ("create_target", TargetFromStarsTransformer(column="remainder__stars")),
        ("drop_stars", DropColumnsTransformer(columns=["remainder__stars"])),
    ]
)

In [11]:
# Add the `target` column and remove `remainder__stars`; `df` itself is not
# modified because the target transformer works on a copy.
df_target = create_target_pipe.fit_transform(df)

## Data Split

In [12]:
def split_data(df, target_column="target", test_size=0.2, random_state=42):
    """Split a frame into stratified train/test features and labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the features and the target column.
    target_column : str, default="target"
        Name of the label column; it is removed from the feature frames.
    test_size : float, default=0.2
        Forwarded to ``train_test_split``.
    random_state : int, default=42
        Forwarded to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``, each with a fresh
        0..n-1 index.
    """
    features = df.drop(columns=[target_column])
    labels = df[target_column]

    # Stratify on the labels so both partitions keep the class proportions.
    parts = train_test_split(
        features,
        labels,
        test_size=test_size,
        stratify=labels,
        random_state=random_state,
    )

    return tuple(part.reset_index(drop=True) for part in parts)

In [13]:
# Stratified split with the split_data defaults (test_size=0.2, random_state=42).
X_train, X_test, y_train, y_test = split_data(df_target)

## Model-Dependent Transformation

### Encode categorical features

In [None]:
def transform_with_names(pipeline, X, encode_cols):
    """Apply a fitted column transformer and return a labelled DataFrame.

    Parameters
    ----------
    pipeline
        Fitted transformer exposing ``transform``, a ``named_transformers_``
        entry called ``"onehot"`` and a ``transformers_`` list whose second
        entry carries the boolean column list at position 2.
    X : pandas.DataFrame
        Data to transform; its index is reused for the result.
    encode_cols : list
        Columns that were one-hot encoded.

    Returns
    -------
    pandas.DataFrame
        Transformed values with columns ordered as: one-hot output names,
        boolean columns, then every remaining column of ``X`` in its
        original order.
    """
    encoded = pipeline.transform(X)

    encoder = pipeline.named_transformers_["onehot"]
    onehot_names = list(encoder.get_feature_names_out(encode_cols))

    # Second registered transformer -> its column list.
    bool_names = pipeline.transformers_[1][2]

    # Everything not explicitly handled was passed through untouched.
    handled = encode_cols + bool_names
    remainder_names = [name for name in X.columns if name not in handled]

    return pd.DataFrame(
        encoded,
        columns=onehot_names + bool_names + remainder_names,
        index=X.index,
    )

In [15]:
# Columns to one-hot encode: category, string and object dtypes.
categorical_str_object_cols = X_train.select_dtypes(
    include=["category", "string", "object"]
).columns.tolist()
# Boolean columns, cast to integers by the encoding step.
boolean_cols = X_train.select_dtypes(include=["bool"]).columns.tolist()

In [None]:
# Step 1: cast boolean columns to int8.
# A named function is used instead of a lambda so that the transformer (and
# any pipeline containing it) can be pickled; lambdas cannot.
def bool_to_int8(X):
    """Return ``X`` cast to ``np.int8``."""
    return X.astype(np.int8)


bool_to_int_transformer = FunctionTransformer(bool_to_int8, validate=False)

# Step 2: one-hot encode the categorical columns, convert the boolean columns,
# and pass every other column through unchanged.
encoding_mdt_pipe = ColumnTransformer(
    transformers=[
        (
            "onehot",
            OneHotEncoder(handle_unknown="ignore", sparse_output=False, drop="first"),
            categorical_str_object_cols,
        ),
        ("bool_int", bool_to_int_transformer, boolean_cols),
    ],
    remainder="passthrough",
    force_int_remainder_cols=False,
)

- Encoding Test

In [18]:
# encoding_mdt_pipe.fit(X_train)
# X_train_encoded = transform_with_names(
#     encoding_mdt_pipe, X_train, categorical_str_object_cols
# )

### Impute missing values

In [None]:
class GroupMeanImputer(BaseEstimator, TransformerMixin):
    """Impute missing values with class-conditional means.

    During ``fit`` the mean of every configured column is computed per target
    class, together with the overall column mean.

    * ``fit_transform(X, y)`` fills each missing value with the mean of the
      row's own class, using the ``y`` passed in.
    * ``transform(X)`` has no labels available, so it fills missing values
      with the overall training mean of the column. Training labels are never
      applied to other data: their row positions do not correspond to the
      rows being transformed.

    Parameters
    ----------
    columns : list
        Columns to impute.

    Attributes
    ----------
    group_means_ : dict
        ``{column: {class_label: mean}}`` learned in ``fit``.
    global_means_ : dict
        ``{column: overall_mean}`` learned in ``fit``.
    y_ : pandas.Series or None
        Training labels with a 0..n-1 index, as seen in the last ``fit``.
    """

    def __init__(self, columns):
        self.columns = columns
        self.group_means_ = {}
        self.global_means_ = {}
        self.y_ = None

    def fit(self, X, y):
        """Learn per-class and overall means for the configured columns."""
        # Put X and y on the same positional index so that groupby pairs
        # row i of X with label i of y regardless of the incoming indices.
        X = pd.DataFrame(X).reset_index(drop=True)
        self.y_ = pd.Series(y).reset_index(drop=True)

        self.group_means_ = {
            col: X[col].groupby(self.y_).mean().to_dict() for col in self.columns
        }
        self.global_means_ = {col: X[col].mean() for col in self.columns}
        return self

    def fit_transform(self, X, y=None, **fit_params):
        """Fit on ``(X, y)`` and impute ``X`` with the mean of each row's class.

        Returns a new DataFrame with a 0..n-1 index.
        """
        self.fit(X, y)
        X = pd.DataFrame(X).copy().reset_index(drop=True)

        for col in self.columns:
            class_fill = self.y_.map(self.group_means_[col])
            X[col] = X[col].where(X[col].notna(), class_fill)
        return X

    def transform(self, X):
        """Impute ``X`` with the overall training mean of each column.

        Returns a new DataFrame with a 0..n-1 index.
        """
        X = pd.DataFrame(X).copy().reset_index(drop=True)

        for col in self.columns:
            X[col] = X[col].where(X[col].notna(), self.global_means_[col])
        return X

    def set_output(self, *, transform=None):
        """Accept the ``set_output`` call and ignore it; returns ``self``."""
        return self

In [20]:
def get_columns_with_na(X):
    """Return the labels of the columns of ``X`` holding at least one missing value.

    Labels are returned as a list, in the column order of ``X``.
    """
    has_missing = X.isna().any()
    return [name for name, flag in zip(X.columns, has_missing) if flag]

In [None]:
# Columns of the full feature table that contain missing values.
# NOTE(review): the scan runs on the full, un-encoded `df` rather than on the
# encoded training matrix the imputer receives inside `mdt_preprocessor` —
# confirm these column names still exist at that point.
columns_with_na = get_columns_with_na(df)

# Single-step pipeline wrapping the class-conditional mean imputer.
impute_missing_mdt_pipe = Pipeline(
    [("imputation", GroupMeanImputer(columns=columns_with_na))]
)

- Impute missing values Test

In [23]:
# X_train_imputed = impute_missing_mdt_pipe.fit_transform(X_train_encoded, y_train)

### Scale or normalize features

In [24]:
# Single-step pipeline applying MinMaxScaler with its default settings.
scaling_mdt_pipe = Pipeline([("minmax_scaler", MinMaxScaler())])

- Scale or normalize features Test

In [26]:
# X_train_scaled1 = scaling_mdt_pipe.fit_transform(X_train_imputed)
# X_train_scaled = pd.DataFrame(
#     X_train_scaled1, columns=X_train_imputed.columns, index=X_train_imputed.index
# )

### Dimensionality reduction

In [None]:
# Feature-selection pipeline; the steps run in the order listed.
dimensionality_reduction_mdt_pipe = Pipeline(
    [
        # 1. Remove constant features.
        ("drop_constant", DropConstantFeatures()),
        # 2. Remove correlated features, using a 0.9 correlation threshold.
        ("drop_correlated", DropCorrelatedFeatures(threshold=0.9)),
        # 3. Single-feature performance filter: a 50-tree random forest,
        #    3-fold cross-validation, weighted F1 scoring, threshold 0.01.
        (
            "target_selector",
            SelectBySingleFeaturePerformance(
                estimator=RandomForestClassifier(
                    n_estimators=50, random_state=42, n_jobs=-1
                ),
                scoring="f1_weighted",
                cv=3,
                threshold=0.01,
            ),
        ),
        # 4. Forward sequential selection with a ridge classifier, keeping
        #    50 features.
        (
            "sequential_selector",
            SequentialFeatureSelector(
                estimator=RidgeClassifier(),
                n_features_to_select=50,
                direction="forward",
                n_jobs=1,
            ),
        ),
    ]
)

- Dimensionality reduction Test

In [29]:
# X_train_reduced = dimensionality_reduction_mdt_pipe.fit_transform(
#     X_train_scaled, y_train
# )

In [30]:
# X_train_reduced_df = pd.DataFrame(
#     X_train_reduced,
#     columns=dimensionality_reduction_mdt_pipe.named_steps[
#         "sequential_selector"
#     ].get_feature_names_out(),
#     index=X_train_scaled.index,
# )

### Pipeline

In [31]:
# Full model-dependent transformation pipeline, chaining the four
# sub-pipelines defined above: encoding -> imputation -> scaling ->
# feature selection.
mdt_preprocessor = Pipeline(
    steps=[
        ("encoding_mdt_pipe", encoding_mdt_pipe),
        ("impute_missing_mdt_pipe", impute_missing_mdt_pipe),
        ("scaling_mdt_pipe", scaling_mdt_pipe),
        ("dimensionality_reduction_mdt_pipe", dimensionality_reduction_mdt_pipe),
    ]
)

## Training

In [33]:
# Load the training matrix from the 05_model_input data layer instead of
# recomputing it with `mdt_preprocessor` in this notebook.
# NOTE(review): presumably the stored output of the MDT pipeline on X_train,
# row-aligned with y_train — confirm.
X_train_reduced = pd.read_parquet(BASE_DIR / "data/05_model_input/x_train.parquet")

### Base model + AutoML

In [34]:
# def summarize_classification(y_true, y_pred, model_name):
#     return {
#         "model": model_name,
#         "accuracy": accuracy_score(y_true, y_pred),
#         "precision_macro": precision_score(y_true, y_pred, average="macro"),
#         "recall_macro": recall_score(y_true, y_pred, average="macro"),
#         "f1_macro": f1_score(y_true, y_pred, average="macro"),
#     }

In [35]:
# df_models = X_train_reduced.copy()
# df_models["target"] = y_train.values

In [36]:
# X_train_cv, X_val_cv, y_train_cv, y_val_cv = split_data(df_models)

In [37]:
# model_candidates = {
#     "logistic": LogisticRegression(class_weight="balanced", max_iter=1000, random_state=42),
#     "decision_tree": DecisionTreeClassifier(class_weight="balanced", random_state=42),
#     "xgboost": XGBClassifier(
#         objective="multi:softmax",
#         num_class=3,
#         eval_metric="mlogloss",
#         use_label_encoder=False,
#         random_state=42,
#         verbosity=0,
#     )
# }

In [38]:
# results = []

In [39]:
# for name, model in model_candidates.items():
#     model.fit(X_train_cv, y_train_cv)
#     y_pred = model.predict(X_val_cv)
#     results.append(summarize_classification(y_val_cv, y_pred, model_name=name))

In [40]:
# automl = AutoML()
# automl.fit(
#     X_train=X_train_cv, y_train=y_train_cv, task="classification", time_budget=60
# )
# y_pred_automl = automl.predict(X_val_cv)
# results.append(summarize_classification(y_val_cv, y_pred_automl, model_name="flaml"))

In [41]:
# results_df = pd.DataFrame(results).set_index("model")
# display(results_df.sort_values("f1_macro", ascending=False))

### Hyperparameter tuning

In [42]:
# def fit_grid_search(model, param_grid, X, y, scoring="f1_macro", cv=5, verbose=1):
#     return GridSearchCV(
#         estimator=model,
#         param_grid=param_grid,
#         scoring=scoring,
#         cv=cv,
#         n_jobs=-1,
#         return_train_score=True,
#         verbose=verbose,
#     ).fit(X, y)

In [43]:
# def summarize_grid_search(grid, scoring="f1_macro"):
#     print(f"Best {scoring}: {grid.best_score_:.4f}")
#     print(f"Best params: {grid.best_params_}")

In [44]:
# def grid_search_to_df(grid):
#     return pd.DataFrame(grid.cv_results_).sort_values(
#         "mean_test_score", ascending=False
#     )

- Apply Grid Search

In [45]:
# model = XGBClassifier(
#     objective="multi:softmax",
#     num_class=3,
#     use_label_encoder=False,
#     eval_metric="mlogloss",
#     random_state=42,
#     verbosity=0,
# )

# param_grid = {
#     "max_depth": [3, 5, 7],
#     "learning_rate": [0.01, 0.1, 0.3],
#     "n_estimators": [50, 100],
#     "subsample": [0.8, 1.0],
# }

In [46]:
# # Step 1: training
# grid = fit_grid_search(model, param_grid, X_train_reduced, y_train)

In [47]:
# # Step 2: summary
# summarize_grid_search(grid)

In [48]:
# # Step 3: df results
# df_grid = grid_search_to_df(grid)
# df_grid.head(5)

In [None]:
# Location of the text file holding the name of the selected model class.
# The write below is disabled; the file is expected to exist already.
best_model_name_path = BASE_DIR / "data/06_models/best_model_name.txt"
# with open(best_model_name_path, "w") as f:
#     f.write("XGBClassifier")

In [None]:
# Location of the JSON file holding the best hyperparameters from the grid
# search. The write below is disabled; the file is expected to exist already.
best_params_path = BASE_DIR / "data/06_models/best_params.json"
# with open(best_params_path, "w") as f:
#     json.dump(grid.best_params_, f, indent=4)

### Pipeline final training

In [51]:
# Stored model name (a plain string) with surrounding whitespace removed.
classifier_fn = best_model_name_path.read_text().strip()
# Hyperparameters saved as JSON, parsed into a dict.
best_params = json.loads(best_params_path.read_text())

In [52]:
class TrainModelTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that builds and fits a classifier, passing data through.

    Parameters
    ----------
    classifier_fn : callable
        Called as ``classifier_fn(**best_params)`` to create the estimator.
    best_params : dict
        Keyword arguments for ``classifier_fn``.

    Attributes
    ----------
    model_ : object
        The estimator created and fitted by ``fit``.
    """

    def __init__(self, classifier_fn, best_params):
        self.classifier_fn = classifier_fn
        self.best_params = best_params

    def fit(self, X, y):
        """Instantiate the classifier, fit it on ``(X, y)`` and return ``self``."""
        build = self.classifier_fn
        self.model_ = build(**self.best_params)
        self.model_.fit(X, y)
        return self

    def transform(self, X):
        """Return ``X`` unchanged."""
        return X

    def set_output(self, *, transform=None):
        """Accept the ``set_output`` call and ignore it; returns ``self``."""
        return self

In [53]:
# Single-step training pipeline: fits an XGBClassifier with the loaded
# hyperparameters. The class is passed directly; the model name read from
# disk is not used to select it.
training_preprocessor = Pipeline(
    [
        (
            "train_model",
            TrainModelTransformer(classifier_fn=XGBClassifier, best_params=best_params),
        )
    ]
)

- Pipeline final training Test

In [None]:
# Fit the final model. `model_` is only created on the "train_model" step by
# its `fit` method, so the lookup below raises AttributeError if the pipeline
# has not been fitted first.
training_preprocessor.fit(X_train_reduced, y_train)
model = training_preprocessor.named_steps["train_model"].model_

## Validation

### Pipeline

In [56]:
class ValidateModelTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that gates a fitted model on macro-averaged metrics.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    y_true : array-like
        Ground-truth labels for the ``X`` later passed to ``fit``.
    thresholds : dict, optional
        ``{metric_name: minimum_score}``. Metric names are the entries of the
        ``"macro avg"`` section of ``classification_report`` with an optional
        ``"_macro"`` suffix, e.g. ``"precision_macro"``, ``"recall_macro"``,
        ``"f1_macro"``. Defaults to ``{"f1_macro": 0.6}`` when empty or None.

    Attributes
    ----------
    report_ : dict
        Output of ``classification_report(..., output_dict=True)``.
    """

    # The report names the F1 entry "f1-score", so the short name "f1"
    # (from "f1_macro") must be translated before the lookup.
    _REPORT_KEYS = {"f1": "f1-score"}

    def __init__(self, model, y_true, thresholds=None):
        self.model = model
        self.y_true = y_true
        self.thresholds = thresholds or {"f1_macro": 0.6}

    def fit(self, X, y=None):
        """Predict on ``X``, build the report and enforce every threshold.

        Raises
        ------
        ValueError
            If a metric falls below its threshold, or if a configured metric
            name has no entry in the macro-average section of the report.
        """
        y_pred = self.model.predict(X)
        self.report_ = classification_report(self.y_true, y_pred, output_dict=True)
        macro_scores = self.report_["macro avg"]

        for metric, threshold in self.thresholds.items():
            short_name = metric.replace("_macro", "")
            report_key = self._REPORT_KEYS.get(short_name, short_name)

            # A threshold that cannot be evaluated must not pass silently.
            if report_key not in macro_scores:
                raise ValueError(
                    f"Unknown metric {metric!r}; available macro metrics: "
                    f"{sorted(macro_scores)}"
                )

            score = macro_scores[report_key]
            if score < threshold:
                raise ValueError(f"[FAIL] {metric}={score:.4f} < threshold={threshold}")

        return self

    def transform(self, X):
        """Return the wrapped model (``X`` is ignored)."""
        return self.model

    def set_output(self, *, transform=None):
        """Accept the ``set_output`` call and ignore it; returns ``self``."""
        return self

In [57]:
# Single-step validation pipeline: checks the trained model against the
# held-out labels `y_test` with minimum macro F1 and macro recall of 0.6.
# NOTE(review): the X passed to `fit` must be the test features in the same
# row order as `y_test` and in the feature space the model was trained on —
# confirm at the call site.
validate_preprocessor = Pipeline(
    [
        (
            "validate_model",
            ValidateModelTransformer(
                model=model,
                y_true=y_test,
                thresholds={"f1_macro": 0.6, "recall_macro": 0.6},
            ),
        )
    ]
)