In [None]:
import os
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import numpy as np
import pandas as pd

from catboost import CatBoostRegressor
from dotenv import load_dotenv
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split



# Raise pandas' display limits (output width, max rows, max columns) so that
# large frames are rendered in the notebook with less truncation.
for _option, _limit in (
    ('display.width', 1000),
    ('display.max_rows', 500),
    ('display.max_columns', 500),
):
    pd.set_option(_option, _limit)

In [None]:
def read_data(path):
    """Load a CSV into a DataFrame.

    Args:
        path: Anything ``pd.read_csv`` accepts as its first argument
            (file path, URL, or file-like object).

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv`` with default options.
    """
    frame = pd.read_csv(path)
    return frame
# Load variables from a local .env file (if one exists) into the process environment.
load_dotenv()

# The training CSV location is supplied through the TRAINING_DATA environment variable
# so that no machine-specific path is hard-coded in the notebook.
data_path = os.getenv("TRAINING_DATA")
if not data_path:
    # Fail here with an actionable message instead of letting pd.read_csv
    # receive None / an empty string and raise a less obvious error.
    raise ValueError(
        "TRAINING_DATA is not set; define it in the environment or in a .env file "
        "so it points at the training CSV."
    )
df = read_data(data_path)

In [None]:
"""
Split the dataset into features and target, then divide it into training and testing sets.

- X: feature matrix (all columns except the target)
- y: target variable ('HATSURESI')
- 90% of the data is used for training, 10% for testing
- The random_state ensures reproducible results
"""

# Separate features (X) and target variable (y)
X = df.drop(["HATSURESI","Unnamed: 0", "Unnamed: 0.1"], axis=1)  # Drop target column to create feature set
y = df["HATSURESI"]                 # Target variable to predict

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shape of the resulting splits
X_train.shape, X_test.shape, y_train.shape, y_test.shape

## CatBoost

In [None]:
"""
Train and evaluate a CatBoost regression model on the test set.

The model is assessed using:
- R² Score (coefficient of determination)
- RMSE (Root Mean Squared Error)
- MAE (Mean Absolute Error)
- MAPE (Mean Absolute Percentage Error)
"""

# Initialize CatBoost regressor with silent output and fixed random seed
cat_model = CatBoostRegressor(verbose=0, random_state=42)

# Train the model on training data
cat_model.fit(X_train, y_train)

# Predict on the test set
y_pred_cat = cat_model.predict(X_test)

# Evaluate model performance using common regression metrics
r2_cat = r2_score(y_test, y_pred_cat)  # Coefficient of determination
rmse_cat = mean_squared_error(y_test, y_pred_cat) ** 0.5  # Root Mean Squared Error
mae_cat = mean_absolute_error(y_test, y_pred_cat)  # Mean Absolute Error
mape_cat = np.mean(np.abs((y_test - y_pred_cat) / y_test.replace(0, 1e-10))) * 100  # MAPE (with safe division)

# Print results
print("\n🐱 CatBoost Results:")
print(f"R² Score: {r2_cat:.4f}")
print(f"RMSE: {rmse_cat:.2f}")
print(f"MAE: {mae_cat:.2f}")
print(f"MAPE: {mape_cat:.2f}%")


In [None]:
"""
Hyperparameter tuning and evaluation for CatBoost using RandomizedSearchCV.

This script:
- Defines a parameter distribution for tuning
- Performs randomized search with cross-validation
- Fits the best CatBoost model found
- Evaluates performance on the test set using key regression metrics
"""

# Define parameter distribution for random search
param_dist = {
    'depth': [4, 6, 8, 10],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'iterations': [200, 300, 500],
    'l2_leaf_reg': [1, 3, 5, 7, 9],
    'border_count': [32, 64, 128]
}

# Initialize base CatBoost regressor (silent mode)
cat_model = CatBoostRegressor(verbose=0, random_state=42)

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=cat_model,
    param_distributions=param_dist,
    n_iter=30,  # Number of random combinations to try
    scoring='neg_mean_squared_error',
    cv=3,
    random_state=42,
    n_jobs=-1,  # Use all CPU cores
    verbose=1
)

# Fit the model with randomized search
random_search.fit(X_train, y_train)

# Retrieve the best model
best_cat_model = random_search.best_estimator_

# Make predictions on the test set
y_pred_cat = best_cat_model.predict(X_test)

# Calculate evaluation metrics
r2_cat = r2_score(y_test, y_pred_cat)  # R² Score
rmse_cat = mean_squared_error(y_test, y_pred_cat) ** 0.5  # RMSE
mae_cat = mean_absolute_error(y_test, y_pred_cat)  # MAE
mape_cat = np.mean(np.abs((y_test - y_pred_cat) / y_test.replace(0, 1e-10))) * 100  # MAPE (safe division)

# Print the results
print("\n🐱 Optimized CatBoost Results:")
print(f"Best Parameters: {random_search.best_params_}")
print(f"R² Score: {r2_cat:.4f}")
print(f"RMSE: {rmse_cat:.2f}")
print(f"MAE: {mae_cat:.2f}")
print(f"MAPE: {mape_cat:.2f}%")
