# Hyperparameter Tuning with GridSearchCV

## 1. Import Dependencies

In [None]:
import joblib
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score

# Silence every warning for the remainder of the session.
# NOTE(review): this blanket filter also hides any warnings raised by the
# libraries used below — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

## 2. Load Processed Data

In [None]:
def _load_array(path):
    """Return the ``arr_0`` array stored in the ``.npz`` archive at *path*.

    ``np.load`` on an ``.npz`` file returns a lazy ``NpzFile`` that keeps the
    underlying file handle open. The ``with`` block closes that handle once
    the array has been read into memory, so no descriptors are leaked.
    """
    with np.load(path) as archive:
        return archive['arr_0']


# Feature matrices and label vectors are read from the 'artifacts' directory.
X_train = _load_array('artifacts/X_train.npz')
Y_train = _load_array('artifacts/Y_train.npz')
X_test = _load_array('artifacts/X_test.npz')
Y_test = _load_array('artifacts/Y_test.npz')

print("Data loaded successfully from artifacts.")

## 3. Hyperparameter Tuning for RandomForestClassifier

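We will use **GridSearchCV** to evaluate every combination in the parameter grid below with stratified 5-fold cross-validation, and keep the `RandomForestClassifier` configuration with the best F1 score.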

In [None]:
# Candidate values for each RandomForest hyperparameter; the full grid is
# 3 * 4 * 3 * 3 = 108 combinations.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Base estimator; the fixed seed keeps each candidate forest reproducible.
rf = RandomForestClassifier(random_state=42)

# Shuffled, stratified 5-fold splitter with a fixed seed, so the folds are
# the same on every run.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive search over the grid, scored by F1, run on all available cores
# (n_jobs=-1) with progress logging enabled (verbose=2).
grid_search = GridSearchCV(
    rf,
    param_grid,
    scoring='f1',
    n_jobs=-1,
    cv=cv,
    verbose=2,
)

print("Starting GridSearchCV for RandomForestClassifier...")
grid_search.fit(X_train, Y_train)
print("GridSearchCV complete.")

## 4. Best Model and Parameters

In [None]:
# Keep a handle on the estimator fitted with the winning parameter combination.
best_rf_model = grid_search.best_estimator_

# Report the cross-validated F1 of that combination, then the parameters
# themselves on the following line.
print(f"Best F1 Score from GridSearch: {grid_search.best_score_:.4f}")
print("Best Parameters found:", grid_search.best_params_, sep="\n")

## 5. Evaluate Best Model on Test Set

In [None]:
# Predict on the test split; X_test was not passed to the grid search above.
Y_hat_test = best_rf_model.predict(X_test)

# Compare predictions against the true test labels.
test_accuracy = accuracy_score(Y_test, Y_hat_test)
test_f1 = f1_score(Y_test, Y_hat_test)  # f1_score defaults (binary averaging)

# Plain string: the header has no placeholders, so it needs no f-prefix.
print("--- Performance of Tuned RandomForest on Test Set ---")
print(f"Accuracy: {test_accuracy:.4f}")
print(f"F1 Score: {test_f1:.4f}")

### Confusion Matrix

In [None]:
# Rows are actual labels and columns are predicted labels, matching the
# axis titles set below.
cm = confusion_matrix(Y_test, Y_hat_test)

class_names = ['Not Churned', 'Churned']

# Draw the annotated heatmap on an explicitly created 8x6 inch axes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Greens',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('Actual Label')
ax.set_title('Confusion Matrix for Tuned RandomForest')
plt.show()

## 6. Save the Tuned Model

In [None]:
from pathlib import Path

# Destination for the serialized estimator; defined once so the dump call and
# the confirmation message cannot drift apart.
model_path = 'model/tuned_random_forest_model.pkl'

# joblib.dump does not create missing directories, so make sure the target
# folder exists before writing (no-op when it is already there).
Path(model_path).parent.mkdir(parents=True, exist_ok=True)

joblib.dump(best_rf_model, model_path)
print(f"Tuned RandomForest model saved to '{model_path}'.")