In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_curve,
    auc,
    classification_report
)

In [21]:
# Load the raw customer table from the working directory.
df = pd.read_csv("Data.csv")

# TotalCharges is parsed with errors="coerce", so any entry that cannot be
# read as a number becomes NaN; those gaps are then filled with the column
# median.
# NOTE(review): the median is taken over every row, i.e. before any
# train/test split -- consider imputing from the training rows only.
total_charges = pd.to_numeric(df["TotalCharges"], errors="coerce")
df = df.assign(
    TotalCharges=total_charges.fillna(total_charges.median())
).drop(columns="customerID")  # customerID is removed before encoding

# Dummy-encode the remaining categorical columns; drop_first=True omits the
# first level of each one.
df_encoded = pd.get_dummies(df, drop_first=True)

print("Preprocessing completed.")
print("Final dataset shape:", df_encoded.shape)


Preprocessing completed.
Final dataset shape: (7043, 31)


In [22]:
# Pull the target column out of the encoded frame; everything else is a
# predictor.
TARGET_COLUMN = "Churn_Yes"
y = df_encoded[TARGET_COLUMN]
X = df_encoded.drop(columns=TARGET_COLUMN)

print("Features shape:", X.shape)
print("Target shape:", y.shape)

Features shape: (7043, 30)
Target shape: (7043,)


In [23]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# partition reproducible across runs.
# NOTE(review): the split is not stratified on y -- confirm that is intended
# given the class imbalance visible in the evaluation report.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Learn the scaling statistics from the training rows only, then apply the
# same transform to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Train-test split completed")


Train-test split completed


In [24]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the standardized training features.
# max_iter is set to 1000; fit() returns the estimator itself, so the fitted
# model can be bound in one step.
best_model = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)

print("Model training completed ")

Model training completed 


In [25]:
# Hard class labels for the held-out rows.
y_pred = best_model.predict(X_test_scaled)

# predict_proba yields one column per class; column index 1 is kept as the
# score used for the ROC analysis.
test_probabilities = best_model.predict_proba(X_test_scaled)
y_prob = test_probabilities[:, 1]

print("Predictions generated.")

Predictions generated.


In [26]:
# Headline test-set metrics, printed in a fixed order: label -> scorer.
headline_metrics = {
    "Accuracy:": accuracy_score,
    "Precision:": precision_score,
    "Recall:": recall_score,
    "F1 Score:": f1_score,
}
for metric_label, scorer in headline_metrics.items():
    print(metric_label, scorer(y_test, y_pred))

# Per-class breakdown of the same predictions.
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.8197303051809794
Precision: 0.683076923076923
Recall: 0.5951742627345844
F1 Score: 0.6361031518624641

Classification Report:
              precision    recall  f1-score   support

       False       0.86      0.90      0.88      1036
        True       0.68      0.60      0.64       373

    accuracy                           0.82      1409
   macro avg       0.77      0.75      0.76      1409
weighted avg       0.81      0.82      0.82      1409



In [27]:
# Pair each fitted coefficient with its feature name and order them from the
# largest value to the smallest. The model was fitted on standardized
# features, so the coefficients are expressed per standard deviation.
weights_by_feature = pd.Series(data=best_model.coef_[0], index=X.columns)
coefficients = weights_by_feature.sort_values(ascending=False)

# First ten entries of the descending ranking (largest coefficients).
print("Top 10 features increasing churn risk:")
print(coefficients.iloc[:10])

# Last ten entries of the descending ranking (smallest coefficients).
print("\nTop 10 features reducing churn risk:")
print(coefficients.iloc[-10:])


Top 10 features increasing churn risk:
TotalCharges                      0.627137
InternetService_Fiber optic       0.622950
StreamingMovies_Yes               0.230364
StreamingTV_Yes                   0.180316
MultipleLines_Yes                 0.168662
PaperlessBilling_Yes              0.163692
PaymentMethod_Electronic check    0.150779
SeniorCitizen                     0.058427
MultipleLines_No phone service    0.034680
DeviceProtection_Yes              0.028529
dtype: float64

Top 10 features reducing churn risk:
StreamingTV_No internet service       -0.075657
StreamingMovies_No internet service   -0.075657
OnlineBackup_No internet service      -0.075657
OnlineSecurity_No internet service    -0.075657
TechSupport_Yes                       -0.120727
OnlineSecurity_Yes                    -0.158133
Contract_One year                     -0.268977
Contract_Two year                     -0.617473
MonthlyCharges                        -0.629389
tenure                                -1.32676

In [28]:
# ROC operating points for the test-set scores, followed by the area under
# that curve.
roc_points = roc_curve(y_test, y_prob)
fpr, tpr, thresholds = roc_points
roc_auc = auc(x=fpr, y=tpr)

print("AUC Score:", roc_auc)

AUC Score: 0.8620040473257631


In [29]:
# Visualise the ROC curve.
# This cell previously repeated the ROC/AUC computation of the preceding cell
# verbatim. It now reuses that cell's fpr, tpr and roc_auc and draws the curve
# instead, using the matplotlib import from the top of the notebook.
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=f"Logistic regression (AUC = {roc_auc:.3f})")
# Diagonal reference line: a classifier with no discrimination ability.
ax.plot([0, 1], [0, 1], linestyle="--", color="grey", label="Chance")
ax.set(
    xlabel="False positive rate",
    ylabel="True positive rate",
    title="ROC curve (test set)",
)
ax.legend(loc="lower right")
plt.show()

# Kept so the cell still reports the same score alongside the figure.
print("AUC Score:", roc_auc)

AUC Score: 0.8620040473257631


In [30]:
# Summary takeaways, emitted one bullet per line.
# NOTE(review): the tenure coefficient printed earlier in this notebook is
# negative, so confirm that the "Long-tenure" bullet is a deliberate business
# recommendation rather than a statement about churn risk.
takeaways = (
    "- The model shows strong discrimination ability based on AUC.",
    "- Contract type, tenure, and monthly charges are major churn drivers.",
    "- Customers with month-to-month contracts are at highest risk.",
    "- Long-tenure customers with high charges deserve proactive retention.",
    "- Model can be used to flag high-risk customers in advance.",
)
for takeaway in takeaways:
    print(takeaway)


- The model shows strong discrimination ability based on AUC.
- Contract type, tenure, and monthly charges are major churn drivers.
- Customers with month-to-month contracts are at highest risk.
- Long-tenure customers with high charges deserve proactive retention.
- Model can be used to flag high-risk customers in advance.
