# Saiket System
## Task 4: Churn Prediction Model

### Description:
#### Choose suitable machine learning algorithms (e.g., logistic regression, decision trees) for churn prediction. Split data into training and testing sets, train and evaluate multiple models using metrics like accuracy, precision, recall, and F1-score. Perform feature selection and hyperparameter tuning for optimal performance.

In [18]:
# Importing Libraries

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    confusion_matrix
)

In [19]:
# 1. Load and Preprocess Data

# Load dataset
df = pd.read_csv("/content/Telco_Customer_Churn_Dataset.csv")

# Fix TotalCharges (common issue in Telco dataset): blank strings are turned
# into NaN, the column is parsed as numeric, and the remaining gaps are filled
# with the column median.
# NOTE(review): the median is taken over every row, before the train/test
# split made later in the notebook, so held-out rows influence the imputed
# value. Confirm this is acceptable or move the imputation after the split.
df["TotalCharges"] = pd.to_numeric(
    df["TotalCharges"].replace(" ", np.nan), errors="coerce"
)
df["TotalCharges"] = df["TotalCharges"].fillna(df["TotalCharges"].median())

# Target variable: "No" -> 0, "Yes" -> 1.
y = df["Churn"].map({"No": 0, "Yes": 1})

# .map() yields NaN for any label missing from the mapping; fail here with a
# clear message instead of letting NaN targets reach the split / model fit.
if y.isna().any():
    raise ValueError(
        "Unexpected values in 'Churn' column: "
        f"{df.loc[y.isna(), 'Churn'].unique().tolist()}"
    )

# Drop ID and target from features
X = df.drop(["customerID", "Churn"], axis=1)

# One hot encode categorical features (first level of each is dropped)
X = pd.get_dummies(X, drop_first=True)

# Guard against missing values that survived preprocessing.
cols_with_nan = X.columns[X.isna().any()].tolist()
if cols_with_nan:
    raise ValueError(f"Features still contain missing values: {cols_with_nan}")

print("Shape of features:", X.shape)
print("Shape of target:", y.shape)

Shape of features: (7043, 30)
Shape of target: (7043,)


In [20]:
# 2. Train Test Split

# Hold out 20% of the rows for testing. The split is stratified on the target
# and the seed is pinned so re-running the cell reproduces the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

for label, frame in (("Train shape:", X_train), ("Test shape:", X_test)):
    print(label, frame.shape)

Train shape: (5634, 30)
Test shape: (1409, 30)


In [21]:
# 3. Feature Scaling

# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [22]:
# 4. Train Multiple Models

# Candidate classifiers, keyed by the display name used in the printed reports.
models = {}
models["Logistic Regression"] = LogisticRegression(max_iter=1000)
models["Decision Tree"] = DecisionTreeClassifier(random_state=42)
models["Random Forest"] = RandomForestClassifier(random_state=42)

def evaluate_model(name, model, X_tr, X_te, y_tr, y_te):
    """Fit ``model`` on the training data, print its test metrics, and return them.

    Parameters
    ----------
    name : label printed in the report header.
    model : estimator exposing ``fit`` and ``predict``; it is fitted in place.
    X_tr, y_tr : training features and labels passed to ``model.fit``.
    X_te, y_te : test features and true labels used for scoring.

    Returns
    -------
    tuple
        ``(model, accuracy, precision, recall, f1)`` where ``model`` is the
        same (now fitted) object that was passed in.
    """
    model.fit(X_tr, y_tr)
    y_pred = model.predict(X_te)

    # Keyed by the exact label printed in front of each score.
    scores = {
        "Accuracy :": accuracy_score(y_te, y_pred),
        "Precision:": precision_score(y_te, y_pred),
        "Recall   :": recall_score(y_te, y_pred),
        "F1-score :": f1_score(y_te, y_pred),
    }

    print(f"\nModel: {name}")
    for label, value in scores.items():
        print(label, round(value, 4))
    print("\nClassification Report:\n", classification_report(y_te, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_te, y_pred))

    acc, prec, rec, f1 = scores.values()
    return model, acc, prec, rec, f1

# Collects (accuracy, precision, recall, f1) per model name for the summary.
results = dict()

In [23]:
# Logistic Regression on scaled data
lr_model, lr_acc, lr_prec, lr_rec, lr_f1 = evaluate_model(
    name="Logistic Regression",
    model=models["Logistic Regression"],
    X_tr=X_train_scaled,
    X_te=X_test_scaled,
    y_tr=y_train,
    y_te=y_test,
)

# Keep only the four scores (not the fitted model) for the final summary.
results["Logistic Regression"] = (lr_acc, lr_prec, lr_rec, lr_f1)


Model: Logistic Regression
Accuracy : 0.807
Precision: 0.6584
Recall   : 0.5668
F1-score : 0.6092

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.89      0.87      1035
           1       0.66      0.57      0.61       374

    accuracy                           0.81      1409
   macro avg       0.75      0.73      0.74      1409
weighted avg       0.80      0.81      0.80      1409

Confusion Matrix:
 [[925 110]
 [162 212]]


In [24]:
# Decision Tree on original data (the unscaled feature frames are passed in)
dt_model, dt_acc, dt_prec, dt_rec, dt_f1 = evaluate_model(
    name="Decision Tree",
    model=models["Decision Tree"],
    X_tr=X_train,
    X_te=X_test,
    y_tr=y_train,
    y_te=y_test,
)

# Keep only the four scores (not the fitted model) for the final summary.
results["Decision Tree"] = (dt_acc, dt_prec, dt_rec, dt_f1)


Model: Decision Tree
Accuracy : 0.7417
Precision: 0.5139
Recall   : 0.4947
F1-score : 0.5041

Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.83      0.83      1035
           1       0.51      0.49      0.50       374

    accuracy                           0.74      1409
   macro avg       0.67      0.66      0.66      1409
weighted avg       0.74      0.74      0.74      1409

Confusion Matrix:
 [[860 175]
 [189 185]]


In [25]:
# Random Forest on original data (the unscaled feature frames are passed in)
rf_model, rf_acc, rf_prec, rf_rec, rf_f1 = evaluate_model(
    name="Random Forest",
    model=models["Random Forest"],
    X_tr=X_train,
    X_te=X_test,
    y_tr=y_train,
    y_te=y_test,
)

# Keep only the four scores (not the fitted model) for the summary below.
results["Random Forest"] = (rf_acc, rf_prec, rf_rec, rf_f1)

# One line per model recorded in `results`, scores rounded to 4 decimals.
print("\nSummary of model performance (Accuracy, Precision, Recall, F1):")
for name, metrics in results.items():
    print(name, ":", list(map(lambda m: round(m, 4), metrics)))


Model: Random Forest
Accuracy : 0.7864
Precision: 0.6237
Recall   : 0.492
F1-score : 0.5501

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.89      0.86      1035
           1       0.62      0.49      0.55       374

    accuracy                           0.79      1409
   macro avg       0.73      0.69      0.71      1409
weighted avg       0.77      0.79      0.78      1409

Confusion Matrix:
 [[924 111]
 [190 184]]

Summary of model performance (Accuracy, Precision, Recall, F1):
Logistic Regression : [0.807, 0.6584, 0.5668, 0.6092]
Decision Tree : [0.7417, 0.5139, 0.4947, 0.5041]
Random Forest : [0.7864, 0.6237, 0.492, 0.5501]


In [26]:
# 5. Feature Importance (Random Forest)

# Pair each importance score from the fitted forest with its feature column
# name, then rank from most to least important.
feature_importances = pd.Series(
    data=rf_model.feature_importances_,
    index=X_train.columns,
)
feature_importances = feature_importances.sort_values(ascending=False)

print("\nTop 10 Important Features for Churn (Random Forest):")
print(feature_importances.iloc[:10])



Top 10 Important Features for Churn (Random Forest):
TotalCharges                      0.192096
tenure                            0.174733
MonthlyCharges                    0.168413
PaymentMethod_Electronic check    0.038771
InternetService_Fiber optic       0.038641
Contract_Two year                 0.030176
gender_Male                       0.028321
OnlineSecurity_Yes                0.028191
PaperlessBilling_Yes              0.025617
Partner_Yes                       0.023326
dtype: float64


In [17]:
# 6. Hyperparameter Tuning (Random Forest)
# NOTE(review): the saved execution count of this cell is lower than that of
# the import cell, so its stored output comes from an earlier kernel pass.
# Re-run the notebook top to bottom before relying on the printed numbers.

# Search space: 2 * 3 * 2 * 2 = 24 parameter combinations.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

# 3-fold cross-validated grid search on the training split, ranked by F1,
# using all available cores.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    scoring="f1",
    cv=3,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train)

print("\nBest parameters from GridSearchCV:")
print(grid_search.best_params_)

# Score the best estimator found by the search on the held-out test split.
best_rf = grid_search.best_estimator_
y_pred_best = best_rf.predict(X_test)

print("\nPerformance of Tuned Random Forest:")
for label, metric in (
    ("Accuracy :", accuracy_score),
    ("Precision:", precision_score),
    ("Recall   :", recall_score),
    ("F1-score :", f1_score),
):
    print(label, round(metric(y_test, y_pred_best), 4))


Best parameters from GridSearchCV:
{'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}

Performance of Tuned Random Forest:
Accuracy : 0.8034
Precision: 0.6611
Recall   : 0.5321
F1-score : 0.5896
