In [4]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [25]:
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, EditedNearestNeighbours, ClusterCentroids
from imblearn.over_sampling import RandomOverSampler, SMOTE, BorderlineSMOTE, ADASYN
from imblearn.combine import SMOTETomek
from collections import Counter

In [7]:
from scipy.stats.mstats import winsorize

In [9]:
from google.colab import files
uploaded = files.upload()

Saving Churn_Modelling.csv to Churn_Modelling.csv


In [10]:
# Step 2: Load Data

df = pd.read_csv("Churn_Modelling.csv")
print(df.head())


   RowNumber  CustomerId   Surname  CreditScore Geography  Gender   Age  \
0          1    15634602  Hargrave        619.0    France  Female  42.0   
1          2    15647311      Hill        608.0     Spain  Female  41.0   
2          3    15619304      Onio        502.0    France  Female  42.0   
3          4    15701354      Boni        699.0    France  Female  39.0   
4          5    15737888  Mitchell        850.0     Spain  Female  43.0   

   Tenure    Balance  NumOfProducts  HasCrCard  IsActiveMember  \
0     2.0       0.00            1.0        1.0             1.0   
1     1.0   83807.86            1.0        0.0             1.0   
2     8.0  159660.80            3.0        1.0             0.0   
3     1.0       0.00            2.0        0.0             0.0   
4     2.0  125510.82            1.0        NaN             1.0   

   EstimatedSalary  Exited  
0        101348.88       1  
1        112542.58       0  
2        113931.57       1  
3         93826.63       0  
4      

In [11]:
print(df['Exited'].value_counts(normalize=True))  # checking imbalance

Exited
0    0.796241
1    0.203759
Name: proportion, dtype: float64


In [12]:
# Step 3: Select Features & Target

# Drop non-useful columns
df = df.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1)





In [13]:
# Encode categorical variables (simple get_dummies)
df = pd.get_dummies(df, drop_first=True)
df.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain,Gender_Male
0,619.0,42.0,2.0,0.0,1.0,1.0,1.0,101348.88,1,False,False,False
1,608.0,41.0,1.0,83807.86,1.0,0.0,1.0,112542.58,0,False,True,False
2,502.0,42.0,8.0,159660.8,3.0,1.0,0.0,113931.57,1,False,False,False
3,699.0,39.0,1.0,0.0,2.0,0.0,0.0,93826.63,0,False,False,False
4,850.0,43.0,2.0,125510.82,1.0,,1.0,79084.1,0,False,True,False


In [14]:
X = df.drop("Exited", axis=1)
y = df["Exited"]

print("Feature shape:", X.shape)

Feature shape: (10002, 11)


Winsorizing

* Problem it solves:

  Sometimes outliers are too extreme and may harm the   model even after scaling.

  Example: a few customers with unrealistic Balance   values (like 10× larger than typical).

* What it does:

  Instead of removing rows, caps extreme values at  certain percentiles.

  Example: 1st percentile → values below this are set   to that percentile value.

  99th percentile → values above this are capped at   that level.

* When to use:

  When you don’t want to drop records but still need  to reduce outlier impact.
  
  Safer than deleting rows since you keep all data.

In [15]:
# Step 4: Winsorizing

# Applying winsorization to selected numeric columns
print("Before winsorization:")
print(X[['CreditScore', 'Balance', 'EstimatedSalary']].describe())

# Fix winsorize output (convert masked array -> normal array)
from scipy.stats.mstats import winsorize

for col in ['CreditScore', 'Balance', 'EstimatedSalary']:
    X[col] = winsorize(X[col], limits=[0.01, 0.01]).data  # use .data to avoid NaN

# After all transformations, check for NaNs
print("NaN count before filling:", X.isna().sum().sum())

# Fill any remaining NaNs (safe fallback)
X = X.fillna(0)


print("After winsorization:")
print(X[['CreditScore', 'Balance', 'EstimatedSalary']].describe())


Before winsorization:
       CreditScore        Balance  EstimatedSalary
count  9998.000000   10000.000000     10002.000000
mean    650.522404   76481.519210    100083.331145
std      96.647651   62393.568682     57508.117802
min     350.000000       0.000000        11.580000
25%     584.000000       0.000000     50983.750000
50%     652.000000   97198.540000    100185.240000
75%     718.000000  127644.240000    149383.652500
max     850.000000  250898.090000    199992.480000
NaN count before filling: 19
After winsorization:
        CreditScore        Balance  EstimatedSalary
count  10002.000000   10002.000000     10002.000000
mean     650.816537   76391.117164    100082.672288
std       96.172932   62188.054457     57475.059449
min      432.000000       0.000000      1843.240000
25%      584.000000       0.000000     50983.750000
50%      652.000000   97221.520000    100185.240000
75%      718.000000  127653.825000    149383.652500
max      850.000000  186347.970000    198069.710000


 Robust Scaling

* Problem it solves:

  Features like Balance or Salary can have extreme   values (outliers).
  
  Standard scaling (z-score) and MinMax scaling get    heavily affected by outliers → pulling the scale   too much.
  
* What it does:

  Instead of using mean and standard deviation  (sensitive to outliers), RobustScaler uses:

  Median (central point)

  Interquartile range (IQR = Q3 − Q1) (spread of  middle 50%)

  So outliers don’t distort the scaling.

* Robust Scaling Formula

  For a feature \( x \):

  $
  x' = \frac{x - \text{median}(x)}{\text{IQR}(x)}
  $

  where:

  - $ \text{median}(x) $ = 50th percentile (middle value)  
  - $ \text{IQR}(x) = Q_3 - Q_1 $ = difference between 75th percentile (Q3) and 25th percentile (Q1)

* When to use:

  When dataset has skewed distributions or many  outliers.

  E.g., in churn data, one customer may have an   extremely high balance compared to others.

In [16]:
# Step 5: Robust Scaling

scaler = RobustScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

In [17]:
print("Scaled data sample:")
print(X_scaled.head())

Scaled data sample:
   CreditScore       Age  Tenure   Balance  NumOfProducts  HasCrCard  \
0    -0.246269  0.416667   -0.75 -0.761603            0.0        0.0   
1    -0.328358  0.333333   -1.00 -0.105078            0.0       -1.0   
2    -1.119403  0.416667    0.75  0.489130            2.0        0.0   
3     0.350746  0.166667   -1.00 -0.761603            1.0       -1.0   
4     1.477612  0.500000   -0.75  0.221609            0.0       -1.0   

   IsActiveMember  EstimatedSalary  Geography_Germany  Geography_Spain  \
0             0.0         0.011826                0.0              0.0   
1             0.0         0.125583                0.0              1.0   
2            -1.0         0.139699                0.0              0.0   
3            -1.0        -0.064620                0.0              0.0   
4             0.0        -0.214443                0.0              1.0   

   Gender_Male  
0         -1.0  
1         -1.0  
2         -1.0  
3         -1.0  
4         -1.0  


In [18]:
import numpy as np
from sklearn.preprocessing import RobustScaler

data = np.array([[1], [2], [3], [4], [100]])

scaler = RobustScaler()
scaled_data = scaler.fit_transform(data)

print("Original:\n", data.flatten())
print("Robust Scaled:\n", scaled_data.flatten())

Original:
 [  1   2   3   4 100]
Robust Scaled:
 [-1.  -0.5  0.   0.5 48.5]


In [22]:
# Step 6: Train-Test Split

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.3, random_state=42, stratify=y
)





Class distribution [5574 1427]


In [31]:
# Step 7: Baseline Model (Without Resampling)
print("Class distribution", np.bincount(y_train))
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("Baseline Results:")
print(classification_report(y_test, y_pred))

Class distribution [5574 1427]
Baseline Results:
              precision    recall  f1-score   support

           0       0.82      0.97      0.89      2390
           1       0.61      0.19      0.29       611

    accuracy                           0.81      3001
   macro avg       0.71      0.58      0.59      3001
weighted avg       0.78      0.81      0.77      3001



**Handling Class Imbalance (Undersampling / OverSampling)**

* Problem it solves:

  In churn prediction (like our dataset), usually:
  
  80–90% customers stay (Exited=0)
  
  10–20% churn (Exited=1)
  
  A model trained on this will just predict “No   churn” most of the time → high accuracy, but  useless for detecting churners.




* **Undersampling**: Reduce majority class (e.g., keep only 10k “No churn” customers to match 2k “Churn” ones).

  Fast, simple
  
  Risk of losing important majority data

In [21]:
# Random Undersampling

undersample = RandomUnderSampler(random_state=42)
X_train_under, y_train_under = undersample.fit_resample(X_train, y_train)

print("Class distribution after random undersampling:", np.bincount(y_train_under))

clf.fit(X_train_under, y_train_under)
y_pred_under = clf.predict(X_test)

print("Undersampling Results:")
print(classification_report(y_test, y_pred_under))


Class distribution after undersampling: [1427 1427]
Undersampling Results:
              precision    recall  f1-score   support

           0       0.91      0.71      0.79      2390
           1       0.38      0.72      0.50       611

    accuracy                           0.71      3001
   macro avg       0.65      0.71      0.65      3001
weighted avg       0.80      0.71      0.73      3001



### Tomek Links (Undersampling)

- A **Tomek Link** is a pair of samples:
  - They belong to **different classes** (e.g., churned vs not churned).
  - They are **each other’s nearest neighbor**.

- These pairs usually occur at the **class boundary** where overlap or confusion exists.

- **How it works:**
  - Identify Tomek Link pairs.
  - Remove the **majority class sample** from each pair.
  - This cleans the boundary and reduces class overlap.

Benefit: Unlike random undersampling, Tomek Links remove only the **problematic majority samples** that confuse the classifier.


In [29]:
# TomekLinks Undersampling

TL_undersample = TomekLinks(sampling_strategy="auto")
X_train_TL, y_train_TL = TL_undersample.fit_resample(X_train, y_train)

print("Class distribution after TomekLinks undersampling:", np.bincount(y_train_TL))

clf.fit(X_train_TL, y_train_TL)
y_pred_TL = clf.predict(X_test)

print("Undersampling Results:")
print(classification_report(y_test, y_pred_TL))


Class distribution after TomekLinks undersampling: [1427 1427]
Undersampling Results:
              precision    recall  f1-score   support

           0       0.91      0.71      0.79      2390
           1       0.38      0.72      0.50       611

    accuracy                           0.71      3001
   macro avg       0.65      0.71      0.65      3001
weighted avg       0.80      0.71      0.73      3001



### Edited Nearest Neighbors (ENN) Undersampling

- **Idea:** ENN removes **noisy samples** that don’t agree with their neighbors.  
- For each data point, check its *k nearest neighbors* (commonly k=3).  
- If the point’s class label is **different from the majority of its neighbors**, it is considered noise and removed.  

**Benefit:**  
- Cleans the dataset by removing mislabeled or out-of-place points.  
- Helps classifiers build clearer decision boundaries.  

**Note:**  
- ENN can remove **both majority and minority samples**, unlike Tomek Links which usually remove only majority samples.


In [30]:
# ENN Undersampling

ENN_undersample = EditedNearestNeighbours()
X_train_ENN, y_train_ENN = ENN_undersample.fit_resample(X_train, y_train)

print("Class distribution after ENN undersampling:", np.bincount(y_train_ENN))

clf.fit(X_train_ENN, y_train_ENN)
y_pred_ENN = clf.predict(X_test)

print("Undersampling Results:")
print(classification_report(y_test, y_pred_ENN))

Class distribution after ENN undersampling: [3970 1427]
Undersampling Results:
              precision    recall  f1-score   support

           0       0.87      0.89      0.88      2390
           1       0.52      0.47      0.49       611

    accuracy                           0.80      3001
   macro avg       0.69      0.68      0.69      3001
weighted avg       0.80      0.80      0.80      3001



**Random Over Sampler:** Random oversampling randomly chooses minority samples (with replacement) and adds duplicates until class counts match.

In [28]:
ROS = RandomOverSampler(random_state=42)
X_train_ros, y_train_ros = ROS.fit_resample(X_train, y_train)

print("Class distribution after Random Over Sampler:", np.bincount(y_train_ros))

clf.fit(X_train_ros, y_train_ros)
y_pred_ros = clf.predict(X_test)

print("Random Over Sampler Results:")
print(classification_report(y_test, y_pred_ros))

Class distribution after Random Over Sampler: [5574 5574]
Random Over Sampler Results:
              precision    recall  f1-score   support

           0       0.91      0.71      0.79      2390
           1       0.38      0.71      0.50       611

    accuracy                           0.71      3001
   macro avg       0.64      0.71      0.65      3001
weighted avg       0.80      0.71      0.73      3001



### SMOTE (Synthetic Minority Oversampling Technique)

- **Idea:** Instead of duplicating minority samples (like Random Oversampling), SMOTE creates **synthetic samples**.
- **How it works:**
  1. For each minority sample, find its *k nearest minority neighbors* (default k=5).
  2. Randomly choose one neighbor.
  3. Create a synthetic sample **between the two points** by interpolation.
- **Result:** The minority class grows with *new, artificial points* that are not exact copies.

 **Benefit:** Reduces overfitting (compared to simple duplication).  
 **Limitation:** Can generate samples in regions where classes overlap → may introduce noise.

---

In [23]:
# SMOTE Oversampling

smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print("Class distribution after SMOTE:", np.bincount(y_train_smote))

clf.fit(X_train_smote, y_train_smote)
y_pred_smote = clf.predict(X_test)

print("SMOTE Results:")
print(classification_report(y_test, y_pred_smote))


Class distribution after SMOTE: [5574 5574]
SMOTE Results:
              precision    recall  f1-score   support

           0       0.91      0.71      0.79      2390
           1       0.38      0.71      0.50       611

    accuracy                           0.71      3001
   macro avg       0.64      0.71      0.65      3001
weighted avg       0.80      0.71      0.73      3001



### Borderline-SMOTE

- **Idea:** A smarter version of SMOTE that only generates synthetic samples for **minority points near the decision boundary**.
- **How it works:**
  1. Identify minority samples whose neighbors are **mostly majority class** → these are "borderline" points.
  2. Generate synthetic samples **around these borderline cases**.
- **Result:** Focuses oversampling where it matters most: the **class boundary**.

**Benefit:** Strengthens the classifier in the hardest-to-learn region (the boundary).  
**Limitation:** May overfit borderline noise if the boundary is very fuzzy.

In [32]:
#BorderlineSMOTE

BLsmote = BorderlineSMOTE(random_state=42)
X_train_blsmote, y_train_blsmote = BLsmote.fit_resample(X_train, y_train)

print("Class distribution after BL SMOTE:", np.bincount(y_train_blsmote))

clf.fit(X_train_blsmote, y_train_blsmote)
y_pred_blsmote = clf.predict(X_test)

print("BL SMOTE Results:")
print(classification_report(y_test, y_pred_blsmote))


Class distribution after BL SMOTE: [5574 5574]
BL SMOTE Results:
              precision    recall  f1-score   support

           0       0.91      0.68      0.78      2390
           1       0.37      0.73      0.49       611

    accuracy                           0.69      3001
   macro avg       0.64      0.71      0.64      3001
weighted avg       0.80      0.69      0.72      3001



* When to use:

  Whenever target variable distribution is skewed   (imbalanced).
  
  Critical in classification tasks like fraud   detection, churn prediction, medical diagnosis.