In [1]:
# Import libraries
import pandas as pd
import joblib
import numpy as np

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score


**Explanation:**  
Imported required libraries for hyperparameter tuning:
- `RandomizedSearchCV` for efficient tuning
- `RandomForestClassifier` as the main model
- Evaluation metrics to compare tuned vs baseline model


In [2]:
# Read the processed churn dataset from disk.
df = pd.read_csv("../data/processed_data.csv")

# customerID is an identifier, not a feature. errors="ignore" turns the drop
# into a no-op when the column is already absent.
df = df.drop(columns=["customerID"], errors="ignore")

# Target vector first, then the feature matrix (every column except the target).
y = df["Churn"]
X = df.drop(columns=["Churn"])


**Explanation:**  
Loaded the processed dataset and ensured no identifier columns (`customerID`) are present to avoid feature mismatch errors.


In [3]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


**Explanation:**  
Split the data while preserving churn distribution using `stratify=y`.


In [4]:
# Base estimator for the search: class 1 is weighted twice as heavily as
# class 0, and the seed is fixed so repeated runs build the same forest.
rf = RandomForestClassifier(class_weight={0: 1, 1: 2}, random_state=42)


**Explanation:**  
Initialized Random Forest with class weighting to prioritize churn detection.


In [5]:
# Candidate values for the randomized search, keyed by the
# RandomForestClassifier constructor argument each list applies to.
param_dist = dict(
    n_estimators=[200, 300, 400, 500],
    max_depth=[None, 10, 20, 30, 40],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=["sqrt", "log2"],
)


**Explanation:**  
Defined the hyperparameter search space, focusing on:
- Model complexity (`n_estimators`, `max_depth`)
- Overfitting control (`min_samples_split`, `min_samples_leaf`)
- Tree diversity (`max_features`)


In [6]:
# Randomized hyperparameter search over param_dist.
# Candidates are ranked by recall, so the winner is the configuration that
# recovers the most positive-class (churn) rows under 5-fold cross-validation.
random_search = RandomizedSearchCV(
    rf,
    param_dist,
    n_iter=25,           # number of parameter combinations to sample
    cv=5,
    scoring="recall",
    random_state=42,     # fixes which combinations get sampled
    n_jobs=-1,           # run the fits in parallel on all available cores
    verbose=2,
)

random_search.fit(X_train, y_train)


Fitting 5 folds for each of 25 candidates, totalling 125 fits


0,1,2
,estimator,RandomForestC...ndom_state=42)
,param_distributions,"{'max_depth': [None, 10, ...], 'max_features': ['sqrt', 'log2'], 'min_samples_leaf': [1, 2, ...], 'min_samples_split': [2, 5, ...], ...}"
,n_iter,25
,scoring,'recall'
,n_jobs,-1
,refit,True
,cv,5
,verbose,2
,pre_dispatch,'2*n_jobs'
,random_state,42

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,10
,min_samples_split,10
,min_samples_leaf,4
,min_weight_fraction_leaf,0.0
,max_features,'log2'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


**Explanation:**  
Performed RandomizedSearchCV:
- Optimized for **recall** (churn focus)
- 5-fold cross-validation
- More efficient than GridSearch for large spaces: only 25 of the 360 possible parameter combinations are sampled


In [7]:
# Report the winning hyperparameters, then keep the estimator that the search
# refit with them (the search ran with refit=True).
print("Best Parameters:\n", random_search.best_params_)
best_model = random_search.best_estimator_


Best Parameters:
 {'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 10}


**Explanation:**  
Extracted the best-performing Random Forest model based on churn recall.


In [None]:
# Score the tuned model on the held-out test split.
# Column 1 of predict_proba is used as the positive-class probability.
y_prob = best_model.predict_proba(X_test)[:, 1]
y_pred = best_model.predict(X_test)

print("Classification Report (Tuned Model):\n")
print(classification_report(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_prob))


**Explanation:**  
Evaluated tuned model on unseen test data using:
- Precision, Recall, F1-score
- ROC-AUC for overall discrimination ability


In [None]:
# Re-label the test predictions with a custom probability cutoff:
# rows whose probability is strictly above the cutoff become 1, all others 0.
threshold = 0.35
y_pred_threshold = np.where(y_prob > threshold, 1, 0)

print(f"Classification Report (Threshold = {threshold}):\n")
print(classification_report(y_test, y_pred_threshold))


**Explanation:**  
Lowered the decision threshold from the default 0.5 to 0.35 to increase churn recall.
Lower threshold → more churn detected at the cost of precision.


In [9]:
# Persist the tuned estimator with joblib, then confirm where it was written.
joblib.dump(value=best_model, filename="../data/customer_churn_model_tuned.pkl")
print("Tuned model saved to ../data/customer_churn_model_tuned.pkl")


Tuned model saved to ../data/customer_churn_model_tuned.pkl


**Explanation:**  
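Saved the final tuned churn prediction model for deployment or further evaluation. Note that the saved model's `predict` still uses the default 0.5 cutoff; to reproduce the 0.35 threshold, apply it to the `predict_proba` output after loading.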
