# Telco Customer Churn: Feature Engineering

This notebook performs feature engineering on the cleaned Telco Customer Churn dataset.

## Table of Contents 
1. [Feature Selection](#1-feature-selection)  
2. [Train-Test Split](#2-train-test-split)  
3. [Feature Encoding](#3-feature-encoding) 
4. [Data Export](#4-data-export)  

## Configure Settings and Data Loading

In [200]:
# Import necessary libraries
import json
import os
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from category_encoders import TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Configure settings
# NOTE: this silences every warning for the whole notebook.
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
# Fix the float display format here, before any table is rendered, so that a
# fresh "Restart & Run All" shows three decimals in every output below rather
# than only in cells executed after the IV table cell sets the same option.
pd.set_option('display.float_format', '{:.3f}'.format)
RANDOM_STATE = 42

In [176]:
# Load the train/test splits from the EDA artifacts directory.
# The paths are relative to the notebook's working directory, consistent with
# the relative 'artifacts/feature_engineering/...' paths used when exporting,
# so the notebook does not depend on one user's absolute local path.
EDA_DIR = "artifacts/eda"
test_df = pd.read_csv(f"{EDA_DIR}/test_df.csv")
train_df = pd.read_csv(f"{EDA_DIR}/train_df.csv")
print(f"Train dataset shape: {train_df.shape}")
print(f"Test dataset shape: {test_df.shape}")

Train dataset shape: (5634, 29)
Test dataset shape: (1409, 29)


In [177]:
# Preview the first 3 rows of the training split
train_df.head(3)

Unnamed: 0,Country,State,City,Zip Code,Latitude,Longitude,Gender,Senior Citizen,Partner,Dependents,Tenure Months,Phone Service,Multiple Lines,Internet Service,Online Security,Online Backup,Device Protection,Tech Support,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn Label,Churn Value,Churn Score,CLTV
0,United States,California,San Diego,92119,32.803,-117.027,Male,No,Yes,No,71,Yes,Yes,Fiber optic,No,No,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),98.65,6962,No,0,28,5101
1,United States,California,Jamestown,95327,37.848,-120.487,Female,No,No,No,16,Yes,No,Fiber optic,No,Yes,Yes,No,No,Yes,Month-to-month,Yes,Credit card (automatic),93.2,1573,Yes,1,75,5497
2,United States,California,Littlerock,93543,34.505,-117.955,Female,No,No,No,32,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),104.05,3416,No,0,60,2335


## 1. Feature Selection

### 1.1 Categorical Features

In [178]:
# Every object/category-typed column is a candidate categorical feature
cat_cols = list(train_df.select_dtypes(include=["object", "category"]).columns)
cat_cols

['Country',
 'State',
 'City',
 'Gender',
 'Senior Citizen',
 'Partner',
 'Dependents',
 'Phone Service',
 'Multiple Lines',
 'Internet Service',
 'Online Security',
 'Online Backup',
 'Device Protection',
 'Tech Support',
 'Streaming TV',
 'Streaming Movies',
 'Contract',
 'Paperless Billing',
 'Payment Method',
 'Churn Label']

**Calculate WoE and IV**

In [179]:
def calculate_woe_iv(df, feature, target):
    """
    Compute Weight of Evidence (WoE) per category and the total Information Value (IV).

    Parameters:
        df: Input dataframe
        feature: Name of the column to group by
        target: Name of the numeric target column; its per-group sum is
            reported as 'good' and (group size - sum) as 'bad'

    Returns:
        Tuple of (per-category table with columns total, good, bad, good_pct,
        bad_pct, woe, iv; sum of the 'iv' column)
    """
    counts = df.groupby(feature)[target].agg(['count', 'sum'])
    stats = counts.rename(columns={'count': 'total', 'sum': 'good'})
    stats['bad'] = stats['total'] - stats['good']

    # Share of all 'good' / 'bad' rows that falls into each category
    stats['good_pct'] = stats['good'] / stats['good'].sum()
    stats['bad_pct'] = stats['bad'] / stats['bad'].sum()

    ratio = stats['good_pct'] / stats['bad_pct']
    stats['woe'] = np.log(ratio)
    stats['iv'] = (stats['good_pct'] - stats['bad_pct']) * stats['woe']

    # A category with no 'good' or no 'bad' rows yields +/-inf; such entries
    # are zeroed so they do not contribute to the total IV.
    for col in ('woe', 'iv'):
        stats[col] = stats[col].replace([np.inf, -np.inf], 0)

    return stats, stats['iv'].sum()

# Try the helper on a single feature and inspect the per-category WoE/IV table
grouped, total_iv = calculate_woe_iv(train_df, 'Multiple Lines', 'Churn Value')
grouped

Unnamed: 0_level_0,total,good,bad,good_pct,bad_pct,woe,iv
Multiple Lines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
No,2730,667,2063,0.454,0.495,-0.087,0.004
No phone service,559,134,425,0.091,0.102,-0.112,0.001
Yes,2345,668,1677,0.455,0.403,0.122,0.006


In [180]:
# Total IV of each candidate categorical column against 'Churn Value'
iv_values = {
    feature: calculate_woe_iv(train_df, feature, 'Churn Value')[1]
    for feature in cat_cols
}

iv_values

{'Country': np.float64(0.0),
 'State': np.float64(0.0),
 'City': np.float64(0.4318949446462),
 'Gender': np.float64(9.290598750511442e-05),
 'Senior Citizen': np.float64(0.10673362185847429),
 'Partner': np.float64(0.12062557806691623),
 'Dependents': np.float64(0.4388605628147141),
 'Phone Service': np.float64(0.0013429938939725418),
 'Multiple Lines': np.float64(0.011140627144188186),
 'Internet Service': np.float64(0.6142879801128276),
 'Online Security': np.float64(0.7224550380700087),
 'Online Backup': np.float64(0.5231415596776513),
 'Device Protection': np.float64(0.498951761594428),
 'Tech Support': np.float64(0.7046888556475246),
 'Streaming TV': np.float64(0.3903356117744413),
 'Streaming Movies': np.float64(0.3921919499129765),
 'Contract': np.float64(1.2735144473378586),
 'Paperless Billing': np.float64(0.21602074020212217),
 'Payment Method': np.float64(0.45859847400214265),
 'Churn Label': np.float64(0.0)}

In [181]:
pd.set_option('display.float_format', lambda x: '{:.3f}'.format(x))

# Tabulate IV per feature, highest IV first
iv_df = (
    pd.DataFrame(list(iv_values.items()), columns=['Feature', 'IV'])
    .sort_values(by='IV', ascending=False)
)
iv_df

Unnamed: 0,Feature,IV
16,Contract,1.274
10,Online Security,0.722
13,Tech Support,0.705
9,Internet Service,0.614
11,Online Backup,0.523
12,Device Protection,0.499
18,Payment Method,0.459
6,Dependents,0.439
2,City,0.432
15,Streaming Movies,0.392


In [182]:
# Keep only the features whose IV is greater than 0.02
selected_cat_features_iv = [name for name in iv_values if iv_values[name] > 0.02]
selected_cat_features_iv

['City',
 'Senior Citizen',
 'Partner',
 'Dependents',
 'Internet Service',
 'Online Security',
 'Online Backup',
 'Device Protection',
 'Tech Support',
 'Streaming TV',
 'Streaming Movies',
 'Contract',
 'Paperless Billing',
 'Payment Method']

### 1.2 Numerical Columns

In [183]:
# Numerical candidate features (hand-picked list)
num_cols = [
    'Zip Code',
    'Latitude',
    'Longitude',
    'Tenure Months',
    'Monthly Charges',
    'Total Charges',
    'CLTV',
]
num_cols

['Zip Code',
 'Latitude',
 'Longitude',
 'Tenure Months',
 'Monthly Charges',
 'Total Charges',
 'CLTV']

**Scaling**

In [184]:
# Learn the min/max of each numerical column from the training split only
scaler = MinMaxScaler()
scaler.fit(train_df[num_cols])

# Apply the train-fitted scaler to both splits, keeping column names and row index
train_scaled_values = scaler.transform(train_df[num_cols])
train_df_scaled = pd.DataFrame(train_scaled_values, columns=num_cols, index=train_df.index)

test_scaled_values = scaler.transform(test_df[num_cols])
test_df_scaled = pd.DataFrame(test_scaled_values, columns=num_cols, index=test_df.index)

train_df_scaled.head()


Unnamed: 0,Zip Code,Latitude,Longitude,Tenure Months,Monthly Charges,Total Charges,CLTV
0,0.344,0.026,0.72,0.986,0.801,0.802,0.689
1,0.865,0.563,0.377,0.222,0.747,0.181,0.777
2,0.575,0.207,0.628,0.444,0.855,0.393,0.074
3,0.281,0.162,0.619,0.333,0.223,0.107,0.409
4,0.007,0.152,0.59,0.278,0.018,0.047,0.155


**Calculate VIF for Multicollinearity**

In [185]:
def calculate_vif(data):
    """
    Compute the Variance Inflation Factor of every column in `data`.

    Parameters:
        data: Dataframe whose columns are the features to evaluate

    Returns:
        Dataframe with one row per input column: 'Column' (name) and 'VIF'

    NOTE(review): no intercept column is added to `data` before it is passed
    to variance_inflation_factor - confirm that this is intended.
    """
    design = data.values
    scores = [variance_inflation_factor(design, idx) for idx in range(data.shape[1])]
    return pd.DataFrame({'Column': data.columns, 'VIF': scores})

def reduce_vif(df, threshold=10.0):
    """
    Drop the highest-VIF feature repeatedly until no VIF exceeds `threshold`.

    Parameters:
        df: Input dataframe with numeric features (left unmodified; a copy is reduced)
        threshold: Maximum allowed VIF

    Returns:
        Tuple of (list of remaining column names, VIF table of the remaining
        columns sorted by VIF in descending order)
    """
    remaining = df.copy()

    def ranked_vif(frame):
        # VIF table sorted so that row 0 always holds the worst offender
        return (
            calculate_vif(frame)
            .sort_values(by="VIF", ascending=False)
            .reset_index(drop=True)
        )

    vif_df = ranked_vif(remaining)
    while vif_df.loc[0, "VIF"] > threshold:
        worst_col, worst_vif = vif_df.loc[0, "Column"], vif_df.loc[0, "VIF"]
        print(f"Dropping '{worst_col}' with VIF={worst_vif:.2f}")
        remaining = remaining.drop(columns=[worst_col])
        # VIFs depend on the remaining columns, so recompute after every drop
        vif_df = ranked_vif(remaining)

    return list(remaining.columns), vif_df

In [186]:
# Confirm the scaled test frame carries the same numerical columns
test_df_scaled.columns

Index(['Zip Code', 'Latitude', 'Longitude', 'Tenure Months', 'Monthly Charges',
       'Total Charges', 'CLTV'],
      dtype='object')

In [187]:
# VIF of every scaled numerical feature before any column is removed
# (the frame holds only the numerical features, so the target is not included)
print("Initial VIF values:")
initial_vif = calculate_vif(train_df_scaled)
initial_vif.sort_values(by='VIF', ascending=False)

Initial VIF values:


Unnamed: 0,Column,VIF
0,Zip Code,22.509
1,Latitude,20.417
5,Total Charges,17.785
3,Tenure Months,15.084
4,Monthly Charges,9.863
6,CLTV,5.784
2,Longitude,5.362


In [188]:
# Apply VIF reduction to remove multicollinearity:
# columns are dropped one at a time until every remaining VIF is <= 10
final_num_cols, final_vif = reduce_vif(train_df_scaled, threshold=10.0)
print(final_num_cols)
print(final_vif)

Dropping 'Zip Code' with VIF=22.51
Dropping 'Total Charges' with VIF=17.77
['Latitude', 'Longitude', 'Tenure Months', 'Monthly Charges', 'CLTV']
            Column   VIF
0             CLTV 5.784
1        Longitude 3.624
2  Monthly Charges 3.479
3    Tenure Months 3.422
4         Latitude 2.685


In [189]:
# Display the VIF table of the columns that remain after reduction
print("Final VIF values (all <= 10):")
final_vif

Final VIF values (all <= 10):


Unnamed: 0,Column,VIF
0,CLTV,5.784
1,Longitude,3.624
2,Monthly Charges,3.479
3,Tenure Months,3.422
4,Latitude,2.685


## 2. Train Test Split

In [190]:
# Report the columns that survived feature selection
print(f"Finalized numerical cols: {final_num_cols}")
print(f"Finalized categorical cols: {selected_cat_features_iv}")

# NOTE(review): the IV screening above was computed against 'Churn Value',
# while the target selected here is 'Churn Score' - confirm this is intended.
target_feature = ['Churn Score']

# Numerical features first, then categorical features, then the target
final_features = [*final_num_cols, *selected_cat_features_iv, *target_feature]

# Restrict both splits to the finalized columns
train_df, test_df = train_df[final_features], test_df[final_features]

train_df.head()

Finalized numerical cols: ['Latitude', 'Longitude', 'Tenure Months', 'Monthly Charges', 'CLTV']
Finalized categorical cols: ['City', 'Senior Citizen', 'Partner', 'Dependents', 'Internet Service', 'Online Security', 'Online Backup', 'Device Protection', 'Tech Support', 'Streaming TV', 'Streaming Movies', 'Contract', 'Paperless Billing', 'Payment Method']


Unnamed: 0,Latitude,Longitude,Tenure Months,Monthly Charges,CLTV,City,Senior Citizen,Partner,Dependents,Internet Service,Online Security,Online Backup,Device Protection,Tech Support,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Churn Score
0,32.803,-117.027,71,98.65,5101,San Diego,No,Yes,No,Fiber optic,No,No,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),28
1,37.848,-120.487,16,93.2,5497,Jamestown,No,No,No,Fiber optic,No,Yes,Yes,No,No,Yes,Month-to-month,Yes,Credit card (automatic),75
2,34.505,-117.955,32,104.05,2335,Littlerock,No,No,No,Fiber optic,No,Yes,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),60
3,34.08,-118.047,24,40.65,3843,El Monte,Yes,Yes,No,DSL,No,No,Yes,No,No,Yes,Month-to-month,Yes,Credit card (automatic),85
4,33.989,-118.334,20,20.05,2698,Los Angeles,No,Yes,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Month-to-month,No,Mailed check,43


In [191]:
# Separate predictors (X) from the target (y) for each split.
# DataFrame.drop returns a new frame, so the later in-place encoding of
# X_train / X_test does not touch train_df / test_df.
X_train = train_df.drop(columns=target_feature)
y_train = train_df[target_feature[0]]  # single column label -> Series

X_test = test_df.drop(columns=target_feature)
y_test = test_df[target_feature[0]]  # single column label -> Series

print("Train-Test Split completed:")
print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
# The target is the column named in target_feature ('Churn Score'), so its
# mean is a mean score, not a churn rate; label it by the actual column name.
print(f"Train mean {target_feature[0]}: {y_train.mean():.3f}")
print(f"Test mean {target_feature[0]}: {y_test.mean():.3f}")

Train-Test Split completed:
Training set: 5634 samples
Test set: 1409 samples
Train churn rate: 58.408
Test churn rate: 59.865


## 3. Feature Encoding

In [192]:
# Report the cardinality of every feature; low-cardinality columns (<= 5
# distinct values) additionally list the values themselves
print("Unique values analysis:")
for col in X_train.columns:
    n_unique = X_train[col].nunique()
    summary = f"{col} -> {n_unique}"
    if n_unique <= 5:
        summary += f" -> {X_train[col].unique()}"
    print(summary)

Unique values analysis:
Latitude -> 1652
Longitude -> 1651
Tenure Months -> 73
Monthly Charges -> 1500
CLTV -> 3117
City -> 1129
Senior Citizen -> 2 -> ['No' 'Yes']
Partner -> 2 -> ['Yes' 'No']
Dependents -> 2 -> ['No' 'Yes']
Internet Service -> 3 -> ['Fiber optic' 'DSL' 'No']
Online Security -> 3 -> ['No' 'No internet service' 'Yes']
Online Backup -> 3 -> ['No' 'Yes' 'No internet service']
Device Protection -> 3 -> ['Yes' 'No internet service' 'No']
Tech Support -> 3 -> ['No' 'No internet service' 'Yes']
Streaming TV -> 3 -> ['Yes' 'No' 'No internet service']
Streaming Movies -> 3 -> ['Yes' 'No internet service' 'No']
Contract -> 3 -> ['One year' 'Month-to-month' 'Two year']
Paperless Billing -> 2 -> ['Yes' 'No']
Payment Method -> 4 -> ['Credit card (automatic)' 'Mailed check' 'Electronic check'
 'Bank transfer (automatic)']


**Insights**
- For numerical features, we can use Min-Max scaling.
- Features with 2 unique values can be encoded using a Label Encoder.
- Features with more than 2 unique values can be encoded using Target Encoding.

### 3.1 Label Encoder

In [193]:
cols_to_label_encode = ['Senior Citizen', 'Partner', 'Dependents', 'Paperless Billing']

# Binary Yes/No columns become 1/0; Series.map turns any other value into NaN
yes_no_codes = {'Yes': 1, 'No': 0}
for split in (X_train, X_test):
    for col in cols_to_label_encode:
        split[col] = split[col].map(yes_no_codes)

### 3.2 Min-Max Scaling

In [194]:
cols_to_scale = ['CLTV', 'Tenure Months', 'Monthly Charges', 'Longitude', 'Latitude']

# Fit the scaler on the training split only, then reuse those train-derived
# min/max values for the test split. Calling fit_transform on the test split
# would scale it by its own range (information leakage, inconsistent feature
# scales between splits) and would leave `scaler` - which is persisted in the
# export cell - fitted on test data instead of training data.
X_train[cols_to_scale] = scaler.fit_transform(X_train[cols_to_scale])
X_test[cols_to_scale] = scaler.transform(X_test[cols_to_scale])
X_train.head()


Unnamed: 0,Latitude,Longitude,Tenure Months,Monthly Charges,CLTV,City,Senior Citizen,Partner,Dependents,Internet Service,Online Security,Online Backup,Device Protection,Tech Support,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method
0,0.026,0.72,0.986,0.801,0.689,San Diego,0,1,0,Fiber optic,No,No,Yes,No,Yes,Yes,One year,1,Credit card (automatic)
1,0.563,0.377,0.222,0.747,0.777,Jamestown,0,0,0,Fiber optic,No,Yes,Yes,No,No,Yes,Month-to-month,1,Credit card (automatic)
2,0.207,0.628,0.444,0.855,0.074,Littlerock,0,0,0,Fiber optic,No,Yes,Yes,No,Yes,Yes,One year,1,Credit card (automatic)
3,0.162,0.619,0.333,0.223,0.409,El Monte,1,1,0,DSL,No,No,Yes,No,No,Yes,Month-to-month,1,Credit card (automatic)
4,0.152,0.59,0.278,0.018,0.155,Los Angeles,0,1,1,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Month-to-month,0,Mailed check


### 3.3 Target Encoding

In [195]:
# Multi-category columns are replaced by a target-based numeric encoding
# (TargetEncoder is imported in the notebook's import cell).
cols_to_target_encode = ['City', 'Internet Service', 'Online Security', 'Online Backup', 'Device Protection', 
                  'Tech Support', 'Streaming TV', 'Streaming Movies', 'Contract', 'Payment Method']

encoder = TargetEncoder(cols=cols_to_target_encode)

# Learn the encoding from the training split and its target only
X_train_encoded = encoder.fit_transform(X_train, y_train)
# The test split is transformed without its target: the encoding must come
# solely from what was learned on the training data.
X_test_encoded = encoder.transform(X_test)

X_train_encoded.head()

Unnamed: 0,Latitude,Longitude,Tenure Months,Monthly Charges,CLTV,City,Senior Citizen,Partner,Dependents,Internet Service,Online Security,Online Backup,Device Protection,Tech Support,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method
0,0.026,0.72,0.986,0.801,0.689,59.675,0,1,0,63.511,63.52,62.488,56.952,63.296,59.549,59.769,53.821,1,55.133
1,0.563,0.377,0.222,0.747,0.777,60.567,0,0,0,63.511,63.52,57.263,56.952,63.296,60.79,59.769,63.687,1,55.133
2,0.207,0.628,0.444,0.855,0.074,58.757,0,0,0,63.511,63.52,57.263,56.952,63.296,59.549,59.769,53.821,1,55.133
3,0.162,0.619,0.333,0.223,0.409,60.137,1,1,0,55.963,63.52,62.488,56.952,63.296,60.79,59.769,63.687,1,55.133
4,0.152,0.59,0.278,0.018,0.155,60.786,0,1,1,52.152,52.152,52.152,52.152,52.152,52.152,52.152,63.687,0,55.995


**Correlation**

In [196]:
# Put the encoded features and the target side by side so one corr() call covers both
df_corr = pd.concat([X_train_encoded, y_train], axis=1)

# Correlation of every feature with the target (the target's own entry is removed)
target_name = y_train.name
corr_with_target = df_corr.corr()[target_name].drop(target_name)

# Order features by correlation magnitude, strongest first
by_magnitude = corr_with_target.abs().sort_values(ascending=False).index
corr_with_target = corr_with_target.reindex(by_magnitude)

print("Feature correlations with Churn Score:")
print("-" * 50)
for feature, corr in corr_with_target.items():
    print(f"{feature:<40} | {corr:>7.4f}")

Feature correlations with Churn Score:
--------------------------------------------------
City                                     |  0.4043
Contract                                 |  0.2717
Online Security                          |  0.2364
Tech Support                             |  0.2246
Tenure Months                            | -0.2237
Internet Service                         |  0.2171
Device Protection                        |  0.1939
Payment Method                           |  0.1938
Online Backup                            |  0.1870
Dependents                               | -0.1701
Streaming TV                             |  0.1562
Streaming Movies                         |  0.1550
Paperless Billing                        |  0.1418
Monthly Charges                          |  0.1376
Partner                                  | -0.1097
Senior Citizen                           |  0.1091
CLTV                                     | -0.0791
Longitude                                | 

In [197]:
# Minimum absolute correlation with the target for a feature to be kept
threshold = 0.1

# Keep the features whose |correlation| reaches the threshold
corr_filtered = corr_with_target[corr_with_target.abs() >= threshold]

# Get the column names
selected_features = corr_filtered.index.tolist()
print("Selected features:", selected_features)

# Select from the *encoded* frames. The correlations above were computed on
# X_train_encoded; taking the columns from X_train / X_test instead would
# discard the target encoding and export raw string categories.
X_train_selected = X_train_encoded[selected_features]
X_test_selected = X_test_encoded[selected_features]

print("X_train_selected shape:", X_train_selected.shape)
print("X_test_selected shape:", X_test_selected.shape)


Selected features: ['City', 'Contract', 'Online Security', 'Tech Support', 'Tenure Months', 'Internet Service', 'Device Protection', 'Payment Method', 'Online Backup', 'Dependents', 'Streaming TV', 'Streaming Movies', 'Paperless Billing', 'Monthly Charges', 'Partner', 'Senior Citizen']
X_train_selected shape: (5634, 16)
X_test_selected shape: (1409, 16)


## 4. Data Export

In [202]:
# All artifacts of this notebook are written below this directory
OUTPUT_DIR = 'artifacts/feature_engineering'
# Create the directory up front: to_csv / open / joblib.dump do not create
# missing parent directories, so a fresh checkout would otherwise fail here.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Ensure the targets are Series (squeeze leaves a Series unchanged and turns a
# single-column frame into a Series)
y_train = y_train.squeeze()
y_test = y_test.squeeze()

# Save datasets
X_train_selected.to_csv(f'{OUTPUT_DIR}/X_train.csv', index=False)
X_test_selected.to_csv(f'{OUTPUT_DIR}/X_test.csv', index=False)
y_train.to_csv(f'{OUTPUT_DIR}/y_train.csv', index=False)
y_test.to_csv(f'{OUTPUT_DIR}/y_test.csv', index=False)

# Save the column lists used by each preprocessing step
feature_config = {
    "cols_to_target_encode": cols_to_target_encode,
    "cols_to_label_encode": cols_to_label_encode,
    "cols_to_scale": cols_to_scale,
    "target_feature": target_feature,
    "final_features": selected_features
}

with open(f"{OUTPUT_DIR}/feature_config.json", "w") as f:
    json.dump(feature_config, f, indent=4)

# Save the fitted scaler
joblib.dump(scaler, f'{OUTPUT_DIR}/scaler.joblib')

# Save the fitted target encoder
joblib.dump(encoder, f'{OUTPUT_DIR}/encoder.joblib')

print("Data saved successfully:")
print("Feature congig saved successfully:")
print("Scaler saved successfully:")
print("Encoder saved successfully:")

Data saved successfully:
Feature congig saved successfully:
Scaler saved successfully:
Encoder saved successfully:
