<a href="https://colab.research.google.com/github/hibabtl/Customer_churn_detection/blob/main/RandomForestClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**In this notebook, we apply a Random Forest classifier to predict customer churn using the following steps:**

1. Data prep (load, clean, encode)

2. Train-test split

3. SMOTETomek (balancing)

4. Train Random Forest

5. Evaluate performance

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from imblearn.combine import SMOTETomek
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the Telco customer-churn CSV into a DataFrame.
# The /content/ location is where this notebook expects the file to be (Colab session storage).
DATA_PATH = '/content/WA_Fn-UseC_-Telco-Customer-Churn.csv'
df = pd.read_csv(DATA_PATH)

In [None]:

# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [None]:
# Count missing (NaN/None) values in every column.
df.isna().sum()

Unnamed: 0,0
customerID,0
gender,0
SeniorCitizen,0
Partner,0
Dependents,0
tenure,0
PhoneService,0
MultipleLines,0
InternetService,0
OnlineSecurity,0


In [None]:
# Clean and encode the raw frame.
# This cell is written to be re-runnable: executing it twice leaves `df` unchanged
# (the original in-place version raised KeyError on the second drop and would have
# turned the already-encoded Churn column into NaN).

# Columns removed before modelling (the identifier plus features left out of this model).
COLUMNS_TO_DROP = ['customerID', 'MultipleLines', 'PaymentMethod', 'Partner', 'Dependents', 'gender']

df = (
    df
    # Convert TotalCharges to numeric; values that cannot be parsed become NaN
    .assign(TotalCharges=pd.to_numeric(df['TotalCharges'], errors='coerce'))
    # Drop rows with missing TotalCharges
    .dropna(subset=['TotalCharges'])
    # Drop the excluded columns if present (errors='ignore' skips any that are already gone)
    .drop(columns=COLUMNS_TO_DROP, errors='ignore')
)

# Encode target: only map while it still holds the raw 'Yes'/'No' labels
if not pd.api.types.is_numeric_dtype(df['Churn']):
    df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})

# .map() silently yields NaN for any label other than 'Yes'/'No'; fail loudly instead
assert df['Churn'].notna().all(), "Churn contains values other than 'Yes'/'No'"

# Encode categorical features (one-hot, dropping the first level of each)
df = pd.get_dummies(df, drop_first=True)


In [None]:
# Separate the target column from the feature matrix
y = df['Churn']
X = df.drop(columns=['Churn'])





In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the churn ratio
# the same in both partitions, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)


In [None]:
# Feature scaling + SMOTE-Tomek resampling

# 1. Learn the scaling statistics from the training split only, then apply the
#    same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 2. Rebalance only the scaled training data (SMOTE oversampling combined with
#    Tomek-link cleaning); the test split is left untouched.
smote_tomek = SMOTETomek(random_state=42)
X_train_res, y_train_res = smote_tomek.fit_resample(X_train_scaled, y_train)




In [None]:
from sklearn.model_selection import GridSearchCV

# Train the random forest via an exhaustive grid search.
# Tree count, depth and split/leaf sizes are pinned to a single value each; the
# search varies max_features, criterion and class_weight (2 x 2 x 2 = 8 candidates),
# each scored with 5-fold cross-validation using the estimator's default scorer.
# NOTE(review): the folds are cut from the already-resampled training data, so
# validation folds contain synthetic samples -- CV scores may be optimistic.
param_grid = dict(
    n_estimators=[300],
    max_depth=[10],
    min_samples_split=[10],
    min_samples_leaf=[4],
    max_features=['sqrt', 'log2'],
    criterion=['gini', 'entropy'],
    class_weight=[None, 'balanced'],
)

base_forest = RandomForestClassifier(random_state=42)
grid = GridSearchCV(estimator=base_forest, param_grid=param_grid, cv=5, n_jobs=-1)
grid.fit(X_train_res, y_train_res)

# Best candidate, refit on the whole resampled training set
best_rf = grid.best_estimator_
print(best_rf)

RandomForestClassifier(max_depth=10, min_samples_leaf=4, min_samples_split=10,
                       n_estimators=300, random_state=42)


In [None]:
# Evaluate the tuned model on the held-out (scaled, non-resampled) test split
y_pred = grid.predict(X_test_scaled)

test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
test_accuracy = accuracy_score(y_test, y_pred)

print("Confusion Matrix:\n", test_confusion)
print("\nClassification Report:\n", test_report)
print("\nAccuracy:", test_accuracy)


Confusion Matrix:
 [[757 276]
 [ 84 290]]

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.73      0.81      1033
           1       0.51      0.78      0.62       374

    accuracy                           0.74      1407
   macro avg       0.71      0.75      0.71      1407
weighted avg       0.80      0.74      0.76      1407


Accuracy: 0.744136460554371


In [None]:
# Re-evaluate the tuned model with a custom decision threshold instead of the default 0.5.

# Predicted churn probability for every test row. predict_proba orders its columns
# like best_rf.classes_ ([0, 1], since Churn was encoded as No=0 / Yes=1), so column 1
# is P(churn). Previously y_churn_proba was never defined in the notebook, so this
# cell raised NameError on a fresh kernel.
# NOTE(review): assumes the probabilities were meant to come from best_rf -- confirm.
y_churn_proba = best_rf.predict_proba(X_test_scaled)[:, 1]

# Choose your optimal threshold
# NOTE(review): if 0.2 was picked by looking at test-set metrics, the scores below are
# optimistic; prefer selecting the threshold on a validation split.
optimal_threshold = 0.2

# Use this new threshold to make predictions (1 = churn when probability exceeds it)
y_pred_optimal = (y_churn_proba > optimal_threshold).astype(int)

# Print the new classification report and confusion matrix
# (the metric functions are already imported in the notebook's import cell)
print("Confusion Matrix with optimal threshold ({}):\n".format(optimal_threshold), confusion_matrix(y_test, y_pred_optimal))
print("\nClassification Report with optimal threshold ({}):\n".format(optimal_threshold), classification_report(y_test, y_pred_optimal))
print("\nAccuracy with optimal threshold ({}):".format(optimal_threshold), accuracy_score(y_test, y_pred_optimal))

Confusion Matrix with optimal threshold (0.2):
 [[519 514]
 [ 27 347]]

Classification Report with optimal threshold (0.2):
               precision    recall  f1-score   support

           0       0.95      0.50      0.66      1033
           1       0.40      0.93      0.56       374

    accuracy                           0.62      1407
   macro avg       0.68      0.72      0.61      1407
weighted avg       0.81      0.62      0.63      1407


Accuracy with optimal threshold (0.2): 0.6154939587775409
