# CatBoost Classifier - Master NB - One Block Code with Pipeline

In [2]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from sklearn.utils.class_weight import compute_class_weight
import joblib
import warnings

# Suppress warnings
# NOTE: this silences every warning category for the rest of the session.
warnings.filterwarnings("ignore")

# 1) Load the data
df = pd.read_excel('sample_data.xlsx')

# 2) Separate features (X) and target variable (y)
X = df.drop(columns=["Target"])
y = df["Target"]

# Separate numeric and categorical columns
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# 3) Perform imputation for numeric data
numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='mean'))])

# Preprocessor is only necessary for numeric columns, because CatBoost automatically handles categorical data
preprocessor = ColumnTransformer(
    transformers=[('num', numeric_transformer, numeric_features)],
    remainder='passthrough'  # We leave the categorical columns as they are
)

# The ColumnTransformer output puts the 'num' columns first, followed by the
# passthrough columns in their original order, and it does not carry the
# original column names. The categorical features therefore have to be
# identified by their position in the transformed output, not by name.
numeric_feature_set = set(numeric_features)
passthrough_columns = [col for col in X.columns if col not in numeric_feature_set]
cat_feature_indices = [
    len(numeric_features) + passthrough_columns.index(col)
    for col in categorical_features
]

# Calculate class weights
# Key the weights by the actual class labels (not by their position), so the
# mapping stays correct when the labels are not exactly 0..n-1.
classes = np.unique(y)
class_weights = compute_class_weight('balanced', classes=classes, y=y)
class_weights_dict = dict(zip(classes.tolist(), class_weights.tolist()))

# 4) Create a pipeline with CatBoost Classifier
base_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', CatBoostClassifier(
        class_weights=class_weights_dict,
        cat_features=cat_feature_indices,
        random_state=42,
        silent=True,
    ))
])

# 5) Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# 6) Train and evaluate the base model
base_pipeline.fit(X_train, y_train)

# 7) Predictions and evaluation metrics
def evaluate_model(y_true, y_pred, data_type="Train"):
    """Print accuracy, weighted F1, classification report and confusion matrix.

    Args:
        y_true: Ground-truth labels.
        y_pred: Labels predicted by the model.
        data_type: Label used in the header line to name the evaluated split.

    Returns:
        None. All results are written to standard output.
    """
    print(f"Evaluation metrics for {data_type} data:")

    accuracy = accuracy_score(y_true, y_pred)
    print(f"Accuracy: {accuracy}")

    weighted_f1 = f1_score(y_true, y_pred, average='weighted')
    print(f"F1 Score: {weighted_f1}")

    # The two remaining sections share the same "heading, then result" layout.
    sections = (
        ("Classification Report:", classification_report),
        ("Confusion Matrix:", confusion_matrix),
    )
    for heading, metric in sections:
        print(heading)
        print(metric(y_true, y_pred))

    print("\n")

# Evaluate the training and testing results
y_train_pred = base_pipeline.predict(X_train)
y_test_pred = base_pipeline.predict(X_test)

evaluate_model(y_train, y_train_pred, "Train")
evaluate_model(y_test, y_test_pred, "Test")

# 8) Hyperparameter search (GridSearch)
param_grid = {
    'classifier__iterations': [100, 200, 500],
    'classifier__depth': [4, 6, 8, 10],
    'classifier__learning_rate': [0.01, 0.1, 0.3],
    'classifier__l2_leaf_reg': [1, 3, 5, 7]
}

# NOTE(review): 'accuracy' can favour the majority class on imbalanced
# targets; consider 'balanced_accuracy' or 'f1_macro' if the minority class
# matters more.
grid_search = GridSearchCV(base_pipeline, param_grid, cv=5, n_jobs=-1, scoring='accuracy')
grid_search.fit(X_train, y_train)

print(f"Best Hyperparameters: {grid_search.best_params_}")

# 9) Evaluate the best model
print("")
print("----EVAL METRICS FOR BEST MODEL----")
print("")

best_pipeline = grid_search.best_estimator_

y_train_best_pred = best_pipeline.predict(X_train)
y_test_best_pred = best_pipeline.predict(X_test)

# evaluate_model expects (y_true, y_pred): the ground truth goes first so the
# confusion matrix and the per-class precision/recall are not transposed.
evaluate_model(y_train, y_train_best_pred, "Train (Best Model)")
evaluate_model(y_test, y_test_best_pred, "Test (Best Model)")

# 10) Train the final model on the entire dataset
# Pipeline.fit refits in place and returns the same pipeline object, so
# final_model and best_pipeline refer to one estimator after this line.
final_model = best_pipeline.fit(X, y)

# 11) Save the final model
joblib.dump(final_model, 'final_catboost_model.pkl')

print(" ")
print("Final model saved as 'final_catboost_model.pkl'")

Evaluation metrics for Train data:
Accuracy: 0.9875
F1 Score: 0.9878153665102932
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       359
           1       0.89      1.00      0.94        41

    accuracy                           0.99       400
   macro avg       0.95      0.99      0.97       400
weighted avg       0.99      0.99      0.99       400

Confusion Matrix:
[[354   5]
 [  0  41]]


Evaluation metrics for Test data:
Accuracy: 0.79
F1 Score: 0.8019159911569638
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.87      0.88        90
           1       0.08      0.10      0.09        10

    accuracy                           0.79       100
   macro avg       0.49      0.48      0.48       100
weighted avg       0.81      0.79      0.80       100

Confusion Matrix:
[[78 12]
 [ 9  1]]


Best Hyperparameters: {'classifier__depth': 8, 'classifier_