<a href="https://colab.research.google.com/github/khietvuarong/ML-Basics-Exercise/blob/main/Part_2_Predict_Customer_Churn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Data Source:
# Telco Customer Churn Dataset
# https://www.kaggle.com/datasets/blastchar/telco-customer-churn
# Dataset contains 7,000+ customer records

# Improvements:
# - Trains on a real telecom churn dataset with 7,000+ customers.
# - Evaluates with accuracy, ROC-AUC, and a classification report.
# - Outputs proper churn probabilities from a production-style pipeline.

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# Load dataset
df = pd.read_csv("Telco-Customer-Churn.csv")

# Columns used for modelling. Missing-value filtering below is limited to
# these, so a gap in a column the model never reads does not discard a row.
FEATURE_COLUMNS = ['tenure', 'MonthlyCharges', 'TotalCharges', 'Contract']
TARGET_COLUMN = 'Churn'
CHURN_LABELS = {'Yes': 1, 'No': 0}

# Clean data
# errors='coerce' turns any TotalCharges entry that cannot be parsed as a
# number into NaN; those rows are removed by the single dropna that follows.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df = df.dropna(subset=FEATURE_COLUMNS + [TARGET_COLUMN])

# Convert target variable to binary (Yes=1, No=0)
# Series.map() yields NaN for any label that is not a key of the mapping, so
# unexpected labels are rejected here instead of surfacing later as NaN in y.
unexpected_labels = set(df[TARGET_COLUMN].unique()) - set(CHURN_LABELS)
if unexpected_labels:
    raise ValueError(
        f"Unexpected values in '{TARGET_COLUMN}': "
        f"{sorted(unexpected_labels, key=str)}"
    )
df[TARGET_COLUMN] = df[TARGET_COLUMN].map(CHURN_LABELS)

# Select relevant features
X = df[FEATURE_COLUMNS]
y = df[TARGET_COLUMN]

# Define preprocessing
# Numeric columns go through StandardScaler; the categorical column is
# one-hot encoded with drop='first'.
numeric_features = ['tenure', 'MonthlyCharges', 'TotalCharges']
categorical_features = ['Contract']

numeric_step = ('num', StandardScaler(), numeric_features)
categorical_step = ('cat', OneHotEncoder(drop='first'), categorical_features)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Create pipeline
# Preprocessing and the classifier are chained so that fit/predict on `model`
# always applies the same transformations before the logistic regression.
classifier = LogisticRegression(max_iter=1000, random_state=42)
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', classifier),
])

# Split data
# 80/20 hold-out split with a fixed seed.
# NOTE(review): the split is not stratified on y; consider stratify=y if the
# class ratio of the test set should be guaranteed to match the full data.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Train model
model.fit(X_train, y_train)

# Evaluate model
# Column 1 of predict_proba is taken as the churn probability (the target is
# encoded as 0/1 before training); predict() supplies the hard labels.
y_prob = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_prob)
report = classification_report(y_test, y_pred)

print("Model Accuracy:", accuracy)
print("ROC-AUC Score:", roc_auc)
print("\nClassification Report:\n", report)

# Predict churn probability for a new customer
# A one-row frame with the same four columns the pipeline was trained on.
new_customer = pd.DataFrame([
    {
        'tenure': 12,
        'MonthlyCharges': 75,
        'TotalCharges': 900,
        'Contract': 'Month-to-month',
    }
])

# Row 0 is the only customer; column 1 is taken as the churn probability.
churn_probability = model.predict_proba(new_customer)[0, 1]

# Probabilities strictly above the threshold are labelled as churn.
threshold = 0.5
churn_prediction = int(churn_probability > threshold)

print(f"\nChurn Probability: {churn_probability:.2f}")
print(f"Churn Prediction (1 = churn, 0 = no churn): {churn_prediction}")

# Display model coefficients
# Pull the fitted steps out of the pipeline, then pair each transformed
# feature name with the matching logistic-regression weight.
fitted_preprocessor = model.named_steps['preprocessor']
fitted_classifier = model.named_steps['classifier']

feature_names = fitted_preprocessor.get_feature_names_out()
coefficients = fitted_classifier.coef_[0]

print("\nModel Coefficients:")
for name, weight in zip(feature_names, coefficients):
    print(f"{name}: {weight:.4f}")


Model Accuracy: 0.7803837953091685
ROC-AUC Score: 0.8148298657666005

Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.89      0.86      1033
           1       0.61      0.48      0.54       374

    accuracy                           0.78      1407
   macro avg       0.72      0.68      0.70      1407
weighted avg       0.77      0.78      0.77      1407


Churn Probability: 0.52
Churn Prediction (1 = churn, 0 = no churn): 1

Model Coefficients:
num__tenure: -1.3255
num__MonthlyCharges: 0.7668
num__TotalCharges: 0.4985
cat__Contract_One year: -1.1412
cat__Contract_Two year: -1.9652
