# CatBoost Regressor  - Master NB - One Block Code with Pipline

In [4]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import joblib
from catboost import CatBoostRegressor

import warnings # for ignoring unnecessary warnings
warnings.filterwarnings("ignore")
warnings.warn("this will not show")

# ------------------------ #

# 1) Load the dataset 
df = pd.read_excel('sample_data.xlsx') 
# Assume the target variable in the Excel file is named "Target"
# Other variable names are not important here.

# 2) Separate features and target variable from the dataset
X = df.drop(columns=["Target"])
y = df["Target"]

# Separate numeric and categorical columns
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# 3) Create transformers for data preprocessing (Only for numeric features)
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))])

# Since CatBoost handles categorical features internally, we only preprocess numeric features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features)])

# 4) Perform a train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5) Preprocess numerical features
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# CatBoost requires the categorical feature indices, so we pass those manually
cat_feature_indices = [X.columns.get_loc(col) for col in categorical_features]

# 6) Create and Train the Base Model using CatBoost
model = CatBoostRegressor(random_state=42, verbose=0)

model.fit(X_train, y_train, cat_features=cat_feature_indices)

# 7) Predictions and evaluation metrics
print("EVAL METRICS FOR BASE MODEL")
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

def evaluate_model(y_true, y_pred, data_type="Train"):
    print(f"Evaluation metrics for {data_type} data:")
    print(f"Mean Squared Error: {mean_squared_error(y_true, y_pred)}")
    print(f"R2 Score: {r2_score(y_true, y_pred)}")
    print("\n")

evaluate_model(y_train, y_train_pred, "Train")
evaluate_model(y_test, y_test_pred, "Test")

# 8) Hyperparameter Tuning
param_grid = {
    'iterations': [100, 200, 300],
    'depth': [4, 6, 8],
    'learning_rate': [0.01, 0.05, 0.1],
    'l2_leaf_reg': [1, 3, 5],
}

grid_search = GridSearchCV(CatBoostRegressor(cat_features=cat_feature_indices, random_state=42, verbose=0), 
                           param_grid, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

print(f"Best Hyperparameters: {grid_search.best_params_}")

# 9) Build and evaluate the Best Model
print("")
print("----EVAL METRICS FOR BEST MODEL----")
print("")

best_model = grid_search.best_estimator_

y_train_best_pred = best_model.predict(X_train)
y_test_best_pred = best_model.predict(X_test)

evaluate_model(y_train_best_pred, y_train, "Train (Best Model)")
evaluate_model(y_test_best_pred, y_test, "Test (Best Model)")

# 10) Train the Final Model on the entire dataset
final_model = best_model.fit(X, y, cat_features=cat_feature_indices)

# 11) Save the Final Model
joblib.dump(final_model, 'final_CatBoost_regression_model.pkl')

print(" ")
print("Final model saved as 'final_CatBoost_regression_model.pkl'")


EVAL METRICS FOR BASE MODEL
Evaluation metrics for Train data:
Mean Squared Error: 0.24150818376520844
R2 Score: 0.990864785543126


Evaluation metrics for Test data:
Mean Squared Error: 1.6369041094445242
R2 Score: 0.9289401965032533


Best Hyperparameters: {'depth': 4, 'iterations': 300, 'l2_leaf_reg': 5, 'learning_rate': 0.1}

----EVAL METRICS FOR BEST MODEL----

Evaluation metrics for Train (Best Model) data:
Mean Squared Error: 0.43474657592233223
R2 Score: 0.9827325709312299


Evaluation metrics for Test (Best Model) data:
Mean Squared Error: 1.5775654881806778
R2 Score: 0.92234398405204


 
Final model saved as 'final_CatBoost_regression_model.pkl'
