# Retrain Best ML Model on Combined Train+Eval Data

In this notebook, we retrain our best-performing ML model (CatBoost with combined tabular and text features) on the combined training and evaluation datasets. This lets the final model learn from all available labelled data, while the test set stays held out and is never used for fitting.

In [None]:
import os

import numpy as np
import pandas as pd
from dotenv import load_dotenv

from graildient_descent.model import Model
from graildient_descent.utils import set_random_seed

In [None]:
# Load environment variables from a local .env file, if one is present (python-dotenv).
load_dotenv()
# Project helper from graildient_descent.utils; presumably seeds the RNGs and
# returns the seed it used — TODO confirm against the helper's definition.
# NOTE(review): the returned value is not referenced again in the cells shown here.
random_state = set_random_seed()

## Load Train and Eval Data

In [None]:
# Read the train and eval splits of the 25k dataset
train_data = pd.read_csv("../data/splits/25k/train_25k.csv")
eval_data = pd.read_csv("../data/splits/25k/eval_25k.csv")

# Stack train on top of eval and renumber the rows from zero
combined_data = pd.concat((train_data, eval_data), ignore_index=True)

# Target: the sale price, plus its log(1 + y) transform used by the best configuration
y = combined_data["sold_price"]
y_log = np.log1p(y)

# Features: every column except the target, "id" and "parsing_date"
X = combined_data.drop(["sold_price", "id", "parsing_date"], axis=1)

## Initialize Best Model Configuration

In [None]:
# Best configuration found in the earlier experiments
model_configs = dict(
    model_name="catboost_v1",
    estimator_class="catboost",
    # Combine tabular and text features
    use_tab_features=True,
    use_text_features=True,
    transformer_params=dict(
        # Mid/high-cardinality features
        catboost_cols=["designer", "color", "size", "subcategory"],
        # Low-cardinality features
        ohe_cols=["department", "category"],
        # Ordinal feature
        oe_cols=["condition"],
    ),
    extractor_params=dict(
        vectorizer_class="tfidf",
        vectorizer_params=dict(ngram_range=(1, 3), min_df=5),
        reducer_class="pca",
        reducer_params=dict(n_components=100),
    ),
)

## Train and Save Final Model

In [None]:
# Single definition of the output directory, shared by makedirs and save_model
benchmarks_dir = "../models/benchmarks/"

# Build the model from the chosen configuration and fit it on X and y_log
model = Model(**model_configs)
model.fit(X, y_log)

# Make sure the output directory exists, then persist the fitted model
os.makedirs(benchmarks_dir, exist_ok=True)
model.save_model(benchmarks_dir)
print(f"Model saved to 'models/benchmarks/{model_configs['model_name']}.pkl'")