In [24]:
import pandas as pd
import numpy as np
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import warnings

warnings.filterwarnings('ignore')

In [25]:
df = pd.read_csv('../data/WA_Fn-UseC_-Telco-Customer-Churn.csv')
df.shape

(7043, 21)

In [26]:
print("Converting TotalCharges to numeric...")
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
nan_count = df['TotalCharges'].isnull().sum()
print(f"   Found {nan_count} missing values")


# Fill missing values with median
df['TotalCharges'].fillna(df['TotalCharges'].median(), inplace=True)
print(f"   Filled with median: {df['TotalCharges'].median():.2f}")
df = df.drop('customerID', axis=1)
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})
print("   Encoded target: Yes→1, No→0")
X = df.drop('Churn', axis=1)
y = df['Churn']


Converting TotalCharges to numeric...
   Found 11 missing values
   Filled with median: 1397.47
   Encoded target: Yes→1, No→0


In [27]:
# Convert categorical variables to numeric codes (simple approach)
for col in X.select_dtypes(include=['object']).columns:
    X[col] = X[col].astype('category').cat.codes

print(f"Features shape: {X.shape}")
print(f"Target distribution: {y.value_counts().to_dict()}")

Features shape: (7043, 19)
Target distribution: {0: 5174, 1: 1869}


In [28]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
print(f"Features: {X_train.shape[1]}")
model_dt = DecisionTreeClassifier(criterion='gini', random_state=100, max_depth=6, min_samples_leaf=8)
model_dt.fit(X_train, y_train)
y_pred = model_dt.predict(X_test)
y_pred
print(f"\nModel accuracy: {accuracy_score(y_test, y_pred):.4f}")
model_dt.score(X_test, y_test)
print(classification_report(y_test, y_pred, labels=[0, 1]))

Training set: 5634 samples
Test set: 1409 samples
Features: 19

Model accuracy: 0.7821
              precision    recall  f1-score   support

           0       0.84      0.87      0.85      1035
           1       0.60      0.55      0.57       374

    accuracy                           0.78      1409
   macro avg       0.72      0.71      0.71      1409
weighted avg       0.78      0.78      0.78      1409



In [29]:
# ============ ADD THIS: BASELINE WITH HYPERPARAMETER TUNING ============
print("\n" + "="*50)
print("BASELINE MODEL - HYPERPARAMETER TUNING")
print("="*50)

from sklearn.model_selection import GridSearchCV

# Define parameter grid
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
    'min_samples_leaf': [1, 2, 4, 6, 8, 10],
    'min_samples_split': [2, 5, 10]
}

# Create and fit tuned model
baseline_tuned = GridSearchCV(
    DecisionTreeClassifier(random_state=100),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

baseline_tuned.fit(X_train, y_train)
y_pred_tuned = baseline_tuned.predict(X_test)

print(f"Best Parameters: {baseline_tuned.best_params_}")
print(f"Tuned Accuracy: {accuracy_score(y_test, y_pred_tuned):.4f}")


BASELINE MODEL - HYPERPARAMETER TUNING
Best Parameters: {'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 2}
Tuned Accuracy: 0.7850


In [37]:
from imblearn.combine import SMOTEENN
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# First split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTEENN ONLY on the training data
sm = SMOTEENN(random_state=42)
X_train_resampled, y_train_resampled = sm.fit_resample(X_train, y_train)

Xr_train, Xr_val, yr_train, yr_val = train_test_split(
    X_train_resampled,
    y_train_resampled,
    test_size=0.2,
    random_state=42
)

# Train on resampled training data
model_dt_smote = DecisionTreeClassifier(
    criterion='gini',
    random_state=100,
    max_depth=6,
    min_samples_leaf=8
)
model_dt_smote.fit(Xr_train, yr_train)

# Predict on validation set
yr_pred = model_dt_smote.predict(Xr_val)

print(f"Validation Accuracy: {accuracy_score(yr_val, yr_pred):.4f}")
print(classification_report(yr_val, yr_pred))
print(confusion_matrix(yr_val, yr_pred))

Validation Accuracy: 0.9139
              precision    recall  f1-score   support

           0       0.93      0.89      0.91       431
           1       0.90      0.94      0.92       487

    accuracy                           0.91       918
   macro avg       0.92      0.91      0.91       918
weighted avg       0.91      0.91      0.91       918

[[382  49]
 [ 30 457]]


In [38]:
# ============ HYPERPARAMETER TUNING ============
print("\n" + "="*50)
print("SMOTEENN MODEL - HYPERPARAMETER TUNING")
print("="*50)

# Define parameter grid (make sure this matches your original param_grid)
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [4, 6, 8, 10, 12],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4, 8],
    'max_features': ['auto', 'sqrt', 'log2']
}

# IMPORTANT: Create a new DecisionTreeClassifier for tuning
# Use ALL resampled training data (X_train_resampled, y_train_resampled) for GridSearchCV
smote_tuned = GridSearchCV(
    DecisionTreeClassifier(random_state=100),
    param_grid,
    cv=5,  # 5-fold cross-validation on the resampled training data
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

# Fit on ALL resampled training data
smote_tuned.fit(X_train_resampled, y_train_resampled)

# ============ FINAL EVALUATION ============
print("\n" + "="*50)
print("FINAL EVALUATION ON ORIGINAL TEST SET")
print("="*50)


SMOTEENN MODEL - HYPERPARAMETER TUNING
Fitting 5 folds for each of 360 candidates, totalling 1800 fits

FINAL EVALUATION ON ORIGINAL TEST SET


In [32]:
# ============ FINAL COMPARISON (UPDATE THIS) ============
print("\n" + "="*60)
print("FINAL COMPARISON OF ALL 4 MODELS")
print("="*60)

models = [
    ("Baseline (Fixed)", y_test, y_pred),
    ("Baseline (Tuned)", y_test, y_pred_tuned),
    ("SMOTEENN (Fixed)", yr_test, yr_pred),
    ("SMOTEENN (Tuned)", yr_test, yr_pred_tuned)
]

for name, y_true, y_pred in models:
    acc = accuracy_score(y_true, y_pred)
    prec = metrics.precision_score(y_true, y_pred, pos_label=1)
    rec = metrics.recall_score(y_true, y_pred, pos_label=1)
    f1 = metrics.f1_score(y_true, y_pred, pos_label=1)

    print(f"\n{name}:")
    print(f"  Accuracy:  {acc:.4f}")
    print(f"  Precision: {prec:.4f}")
    print(f"  Recall:    {rec:.4f}")
    print(f"  F1-Score:  {f1:.4f}")

# Find best model
best_acc = max([accuracy_score(y_true, y_pred) for _, y_true, y_pred in models])
for name, y_true, y_pred in models:
    if accuracy_score(y_true, y_pred) == best_acc:
        print(f"\n BEST MODEL: {name} ")
        break


FINAL COMPARISON OF ALL 4 MODELS

Baseline (Fixed):
  Accuracy:  0.7821
  Precision: 0.5971
  Recall:    0.5508
  F1-Score:  0.5730

Baseline (Tuned):
  Accuracy:  0.7850
  Precision: 0.6035
  Recall:    0.5535
  F1-Score:  0.5774

SMOTEENN (Fixed):
  Accuracy:  0.9292
  Precision: 0.9286
  Recall:    0.9469
  F1-Score:  0.9376

SMOTEENN (Tuned):
  Accuracy:  0.9377
  Precision: 0.9413
  Recall:    0.9484
  F1-Score:  0.9448

✨ BEST MODEL: SMOTEENN (Tuned) ✨
