# Telco Customer Churn: Feature Engineering

This notebook performs feature engineering on the cleaned Telco Customer Churn dataset.

## Table of Contents 
1. [Feature Selection](#1-feature-selection)  
2. [Train-Test Split](#2-train-test-split)  
3. [Feature Encoding](#3-feature-encoding) 
4. [Data Export](#4-data-export)  

## Configure Settings and Data Loading

In [1]:
# Import necessary libraries
import json
import warnings
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from category_encoders import TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Configure settings
# Suppress every warning for the rest of the session. This keeps cell output
# short, but it also hides deprecation and numerical warnings.
warnings.filterwarnings('ignore')
# Show all columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# Seed constant (not referenced by the cells shown in this notebook —
# NOTE(review): confirm whether it is still needed).
RANDOM_STATE = 42

In [2]:
# Load dataset
# The paths are relative to the notebook's working directory, the same
# 'artifacts/...' root used by the Data Export section, so the notebook
# does not depend on one user's absolute Windows path.
EDA_DIR = Path("artifacts/eda")
test_df = pd.read_csv(EDA_DIR / "test_df.csv")
train_df = pd.read_csv(EDA_DIR / "train_df.csv")
print(f"Train dataset shape: {train_df.shape}")
print(f"Test dataset shape: {test_df.shape}")

Train dataset shape: (5587, 30)
Test dataset shape: (1397, 30)


In [3]:
# Display first 3 rows 
train_df.head(3)

Unnamed: 0,Country,State,City,Zip Code,Latitude,Longitude,Gender,Senior Citizen,Partner,Dependents,Tenure Months,Phone Service,Multiple Lines,Internet Service,Online Security,Online Backup,Device Protection,Tech Support,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn Label,Churn Value,Churn Score,CLTV,Churn Reason
0,United States,California,Carlsbad,92008,33.148116,-117.306043,Male,Yes,Yes,No,33,No,No phone service,DSL,No,Yes,Yes,Yes,Yes,No,Month-to-month,Yes,Electronic check,50.0,1750,No,0,65,5600,
1,United States,California,Stockton,95203,37.954089,-121.329761,Male,No,No,No,57,Yes,No,DSL,Yes,No,Yes,No,Yes,Yes,Two year,Yes,Bank transfer (automatic),74.35,4317,No,0,71,5548,
2,United States,California,Tarzana,91356,34.157137,-118.548511,Male,Yes,No,No,48,Yes,Yes,Fiber optic,No,Yes,No,No,Yes,Yes,One year,Yes,Electronic check,99.0,4744,No,0,78,3061,


## 1. Feature Selection

**Create new features**

In [4]:
cols_to_count = ["Online Security", "Online Backup", "Device Protection", "Tech Support"]

# Values that mean the customer does not have the service.
not_subscribed = ['No', 'No internet service', 'No phone service']

# additional_services = number of add-on columns whose value is anything other
# than one of the "not subscribed" values above. DataFrame.isin checks the
# whole block at once, replacing the slower row-by-row apply(axis=1).
for frame in (train_df, test_df):
    frame["additional_services"] = (
        (~frame[cols_to_count].isin(not_subscribed)).sum(axis=1).astype(int)
    )
train_df.head()

Unnamed: 0,Country,State,City,Zip Code,Latitude,Longitude,Gender,Senior Citizen,Partner,Dependents,Tenure Months,Phone Service,Multiple Lines,Internet Service,Online Security,Online Backup,Device Protection,Tech Support,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn Label,Churn Value,Churn Score,CLTV,Churn Reason,additional_services
0,United States,California,Carlsbad,92008,33.148116,-117.306043,Male,Yes,Yes,No,33,No,No phone service,DSL,No,Yes,Yes,Yes,Yes,No,Month-to-month,Yes,Electronic check,50.0,1750,No,0,65,5600,,3
1,United States,California,Stockton,95203,37.954089,-121.329761,Male,No,No,No,57,Yes,No,DSL,Yes,No,Yes,No,Yes,Yes,Two year,Yes,Bank transfer (automatic),74.35,4317,No,0,71,5548,,2
2,United States,California,Tarzana,91356,34.157137,-118.548511,Male,Yes,No,No,48,Yes,Yes,Fiber optic,No,Yes,No,No,Yes,Yes,One year,Yes,Electronic check,99.0,4744,No,0,78,3061,,1
3,United States,California,Madera,93638,37.004068,-119.930027,Female,Yes,Yes,No,68,Yes,Yes,Fiber optic,Yes,Yes,No,No,No,Yes,One year,Yes,Credit card (automatic),96.55,6581,Yes,1,86,5957,Attitude of service provider,2
4,United States,California,Hamilton City,95951,39.732767,-122.042298,Male,No,Yes,No,59,Yes,Yes,DSL,Yes,No,No,Yes,No,Yes,One year,Yes,Mailed check,69.1,4096,No,0,29,4765,,2


In [5]:
cols_to_count = [
    "Internet Service", "Online Security",
    "Online Backup", "Device Protection", "Tech Support",
    "Streaming TV", "Streaming Movies",
]

# Values that mean the customer does not have the service.
not_subscribed = ['No', 'No internet service', 'No phone service']

# total_services = number of the columns above whose value is anything other
# than a "not subscribed" value. 'Internet Service' therefore counts as one
# service unless its value is 'No'. DataFrame.isin checks the whole block at
# once, replacing the slower row-by-row apply(axis=1).
for frame in (train_df, test_df):
    frame["total_services"] = (
        (~frame[cols_to_count].isin(not_subscribed)).sum(axis=1).astype(int)
    )
train_df.head()

Unnamed: 0,Country,State,City,Zip Code,Latitude,Longitude,Gender,Senior Citizen,Partner,Dependents,Tenure Months,Phone Service,Multiple Lines,Internet Service,Online Security,Online Backup,Device Protection,Tech Support,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn Label,Churn Value,Churn Score,CLTV,Churn Reason,additional_services,total_services
0,United States,California,Carlsbad,92008,33.148116,-117.306043,Male,Yes,Yes,No,33,No,No phone service,DSL,No,Yes,Yes,Yes,Yes,No,Month-to-month,Yes,Electronic check,50.0,1750,No,0,65,5600,,3,5
1,United States,California,Stockton,95203,37.954089,-121.329761,Male,No,No,No,57,Yes,No,DSL,Yes,No,Yes,No,Yes,Yes,Two year,Yes,Bank transfer (automatic),74.35,4317,No,0,71,5548,,2,5
2,United States,California,Tarzana,91356,34.157137,-118.548511,Male,Yes,No,No,48,Yes,Yes,Fiber optic,No,Yes,No,No,Yes,Yes,One year,Yes,Electronic check,99.0,4744,No,0,78,3061,,1,4
3,United States,California,Madera,93638,37.004068,-119.930027,Female,Yes,Yes,No,68,Yes,Yes,Fiber optic,Yes,Yes,No,No,No,Yes,One year,Yes,Credit card (automatic),96.55,6581,Yes,1,86,5957,Attitude of service provider,2,4
4,United States,California,Hamilton City,95951,39.732767,-122.042298,Male,No,Yes,No,59,Yes,Yes,DSL,Yes,No,No,Yes,No,Yes,One year,Yes,Mailed check,69.1,4096,No,0,29,4765,,2,4


In [6]:
cols_to_count = ["Streaming TV", "Streaming Movies"]

# Values that mean the customer does not have the service.
not_subscribed = ['No', 'No internet service', 'No phone service']

# 3rd_party_services = number of streaming columns whose value is anything
# other than a "not subscribed" value. DataFrame.isin checks the whole block
# at once, replacing the slower row-by-row apply(axis=1).
for frame in (train_df, test_df):
    frame["3rd_party_services"] = (
        (~frame[cols_to_count].isin(not_subscribed)).sum(axis=1).astype(int)
    )
train_df.head()

Unnamed: 0,Country,State,City,Zip Code,Latitude,Longitude,Gender,Senior Citizen,Partner,Dependents,Tenure Months,Phone Service,Multiple Lines,Internet Service,Online Security,Online Backup,Device Protection,Tech Support,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn Label,Churn Value,Churn Score,CLTV,Churn Reason,additional_services,total_services,3rd_party_services
0,United States,California,Carlsbad,92008,33.148116,-117.306043,Male,Yes,Yes,No,33,No,No phone service,DSL,No,Yes,Yes,Yes,Yes,No,Month-to-month,Yes,Electronic check,50.0,1750,No,0,65,5600,,3,5,1
1,United States,California,Stockton,95203,37.954089,-121.329761,Male,No,No,No,57,Yes,No,DSL,Yes,No,Yes,No,Yes,Yes,Two year,Yes,Bank transfer (automatic),74.35,4317,No,0,71,5548,,2,5,2
2,United States,California,Tarzana,91356,34.157137,-118.548511,Male,Yes,No,No,48,Yes,Yes,Fiber optic,No,Yes,No,No,Yes,Yes,One year,Yes,Electronic check,99.0,4744,No,0,78,3061,,1,4,2
3,United States,California,Madera,93638,37.004068,-119.930027,Female,Yes,Yes,No,68,Yes,Yes,Fiber optic,Yes,Yes,No,No,No,Yes,One year,Yes,Credit card (automatic),96.55,6581,Yes,1,86,5957,Attitude of service provider,2,4,1
4,United States,California,Hamilton City,95951,39.732767,-122.042298,Male,No,Yes,No,59,Yes,Yes,DSL,Yes,No,No,Yes,No,Yes,One year,Yes,Mailed check,69.1,4096,No,0,29,4765,,2,4,1


In [7]:
# List the distinct payment methods present in the training data
train_df['Payment Method'].unique()

array(['Electronic check', 'Bank transfer (automatic)',
       'Credit card (automatic)', 'Mailed check'], dtype=object)

In [8]:
# Binary flag: 1 when 'Payment Method' contains 'automatic' (matched
# case-insensitively; missing values count as no match), otherwise 0
for frame in (train_df, test_df):
    auto_mask = frame['Payment Method'].str.contains('Automatic', case=False, na=False)
    frame['is_payment_automatic'] = auto_mask.astype(int)

train_df[['Payment Method', 'is_payment_automatic']].head()


Unnamed: 0,Payment Method,is_payment_automatic
0,Electronic check,0
1,Bank transfer (automatic),1
2,Electronic check,0
3,Credit card (automatic),1
4,Mailed check,0


In [9]:
# List the distinct contract types present in the training data
train_df['Contract'].unique()

array(['Month-to-month', 'Two year', 'One year'], dtype=object)

In [10]:
# Binary flag from the contract type: 1 for 'Month-to-month', 0 for the
# 'One year' and 'Two year' contracts
contract_flag = {"Month-to-month": 1, "One year": 0, "Two year": 0}
for frame in (train_df, test_df):
    frame['is_payment_recurring'] = frame['Contract'].map(contract_flag).astype(int)

In [11]:
# List the distinct internet service types present in the training data
train_df['Internet Service'].unique()

array(['DSL', 'Fiber optic', 'No'], dtype=object)

**Remove unwanted features**

In [12]:
# Columns removed from both splits before feature selection
cols_to_drop = [
    'Country', 'State', 'Zip Code', 'Latitude', 'Longitude',
    'Churn Label', 'Churn Score', 'Churn Reason', 'Gender',
]
train_df = train_df.drop(columns=cols_to_drop)
test_df = test_df.drop(columns=cols_to_drop)

### 1.1 Categorical Features

In [13]:
# Names of all object- or category-typed columns in the training frame
cat_cols = train_df.select_dtypes(include=["object", "category"]).columns.tolist()
cat_cols

['City',
 'Senior Citizen',
 'Partner',
 'Dependents',
 'Phone Service',
 'Multiple Lines',
 'Internet Service',
 'Online Security',
 'Online Backup',
 'Device Protection',
 'Tech Support',
 'Streaming TV',
 'Streaming Movies',
 'Contract',
 'Paperless Billing',
 'Payment Method']

**Calculate WoE and IV**

In [14]:
def calculate_woe_iv(df, feature, target):
    """
    Compute Weight of Evidence (WoE) and Information Value (IV) for one feature.

    Rows are grouped by `feature`. Within each group, 'good' is the sum of
    `target` and 'bad' is the group's row count minus 'good'. WoE is
    log(good_pct / bad_pct) and IV is (good_pct - bad_pct) * WoE, where the
    percentages are each group's share of the overall good / bad totals.
    Infinite WoE or IV values (a group with no 'good' or no 'bad' rows) are
    replaced by 0.

    Parameters:
        df: DataFrame containing both columns
        feature: Name of the column to group by
        target: Name of the column that is counted and summed per group

    Returns:
        (per-group table with columns total, good, bad, good_pct, bad_pct,
        woe, iv; sum of the per-group IV values)
    """
    stats = df.groupby(feature)[target].agg(total='count', good='sum')
    stats['bad'] = stats['total'] - stats['good']

    # Share of all 'good' / 'bad' rows that falls into each group
    good_share = stats['good'] / stats['good'].sum()
    bad_share = stats['bad'] / stats['bad'].sum()

    raw_woe = np.log(good_share / bad_share)
    raw_iv = (good_share - bad_share) * raw_woe

    stats['good_pct'] = good_share
    stats['bad_pct'] = bad_share
    # Groups with an empty class produce +/-inf; neutralise them to 0
    infinities = [np.inf, -np.inf]
    stats['woe'] = raw_woe.replace(infinities, 0)
    stats['iv'] = raw_iv.replace(infinities, 0)

    return stats, stats['iv'].sum()

# Example: per-category WoE/IV table for 'Contract' against 'Churn Value' on the training data
grouped, total_iv = calculate_woe_iv(train_df, 'Contract', 'Churn Value')
grouped

Unnamed: 0_level_0,total,good,bad,good_pct,bad_pct,woe,iv
Contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Month-to-month,3077,1271,1806,0.89193,0.433926,0.720513,0.329998
One year,1161,116,1045,0.081404,0.251081,-1.126358,0.191118
Two year,1349,38,1311,0.026667,0.314993,-2.469135,0.711916


In [15]:
# Total IV of every categorical column against 'Churn Value' (training data only)
iv_values = {
    col: calculate_woe_iv(train_df, col, 'Churn Value')[1]
    for col in cat_cols
}

iv_values

{'City': np.float64(0.4358123675269554),
 'Senior Citizen': np.float64(0.1234096506359256),
 'Partner': np.float64(0.10597127662123132),
 'Dependents': np.float64(0.47145806968662574),
 'Phone Service': np.float64(0.003044627587229731),
 'Multiple Lines': np.float64(0.011386114714116317),
 'Internet Service': np.float64(0.6210727082488896),
 'Online Security': np.float64(0.682229496267713),
 'Online Backup': np.float64(0.500367758238851),
 'Device Protection': np.float64(0.48984990516674803),
 'Tech Support': np.float64(0.7030849276475475),
 'Streaming TV': np.float64(0.36629361060035126),
 'Streaming Movies': np.float64(0.3662092826615379),
 'Contract': np.float64(1.2330320112372903),
 'Paperless Billing': np.float64(0.20442025017381943),
 'Payment Method': np.float64(0.44176684731338867)}

In [16]:
# From here on, display floats with three decimals
pd.set_option('display.float_format', '{:.3f}'.format)

# Tabulate the IV scores, highest first
iv_df = pd.DataFrame(
    {'Feature': list(iv_values.keys()), 'IV': list(iv_values.values())}
).sort_values(by='IV', ascending=False)
iv_df

Unnamed: 0,Feature,IV
13,Contract,1.233
10,Tech Support,0.703
7,Online Security,0.682
6,Internet Service,0.621
8,Online Backup,0.5
9,Device Protection,0.49
3,Dependents,0.471
15,Payment Method,0.442
0,City,0.436
11,Streaming TV,0.366


In [17]:
# Keep the features whose IV is greater than 0.02
selected_cat_features_iv = [name for name in iv_values if iv_values[name] > 0.02]
selected_cat_features_iv

['City',
 'Senior Citizen',
 'Partner',
 'Dependents',
 'Internet Service',
 'Online Security',
 'Online Backup',
 'Device Protection',
 'Tech Support',
 'Streaming TV',
 'Streaming Movies',
 'Contract',
 'Paperless Billing',
 'Payment Method']

### 1.2 Numerical Columns

In [18]:
# All numeric columns except the target. include='number' matches every
# integer/float width, so the engineered count/flag columns are still picked
# up on platforms where astype(int) produces int32 rather than int64.
num_cols = [
    col
    for col in train_df.select_dtypes(include='number').columns
    if col != 'Churn Value'
]
print("Numerical columns:", num_cols)

Numerical columns: ['Tenure Months', 'Monthly Charges', 'Total Charges', 'CLTV', 'additional_services', 'total_services', '3rd_party_services', 'is_payment_automatic', 'is_payment_recurring']


**Scaling**

In [19]:
# Learn the min/max of each numeric column from the training split only,
# then apply that same scaling to both splits
scaler = MinMaxScaler()
scaler.fit(train_df[num_cols])

X_train_scaled, X_test_scaled = (
    pd.DataFrame(scaler.transform(split[num_cols]), columns=num_cols, index=split.index)
    for split in (train_df, test_df)
)

X_train_scaled.head()


Unnamed: 0,Tenure Months,Monthly Charges,Total Charges,CLTV,additional_services,total_services,3rd_party_services,is_payment_automatic,is_payment_recurring
0,0.458,0.316,0.202,0.8,0.75,0.714,0.5,0.0,1.0
1,0.792,0.558,0.497,0.788,0.5,0.714,1.0,1.0,0.0
2,0.667,0.803,0.546,0.235,0.25,0.571,1.0,0.0,0.0
3,0.944,0.779,0.758,0.879,0.5,0.571,0.5,1.0,0.0
4,0.819,0.506,0.472,0.614,0.5,0.571,0.5,0.0,0.0


**Calculate VIF for Multicollinearity**

In [20]:
def calculate_vif(data):
    """
    Build a table with the Variance Inflation Factor of every column.

    Parameters:
        data: DataFrame of numeric features

    Returns:
        DataFrame with a 'Column' column (the feature names, in input order)
        and a 'VIF' column (the matching variance_inflation_factor result)

    NOTE(review): the feature matrix is passed as-is, with no intercept
    column added — confirm this is the intended VIF definition.
    """
    n_features = data.shape[1]
    scores = [variance_inflation_factor(data.values, pos) for pos in range(n_features)]
    return pd.DataFrame({'Column': list(data.columns), 'VIF': scores})

def reduce_vif(df, threshold=10.0):
    """
    Drop the highest-VIF feature repeatedly until no VIF exceeds `threshold`.

    On each pass the VIF table is recomputed for the remaining columns and
    sorted in descending order. If the largest VIF is above the threshold,
    that column is dropped (and reported via print) and the loop repeats.
    The input frame itself is not modified.

    Parameters:
        df: Input dataframe with numeric features
        threshold: Maximum allowed VIF

    Returns:
        (list of surviving column names, VIF table of the last pass sorted
        by VIF in descending order)
    """
    remaining = df.copy()

    while True:
        ranked = (
            calculate_vif(remaining)
            .sort_values(by="VIF", ascending=False)
            .reset_index(drop=True)
        )
        worst_col, worst_vif = ranked.loc[0, "Column"], ranked.loc[0, "VIF"]

        # Stop as soon as the largest VIF is no longer above the threshold
        if not worst_vif > threshold:
            return list(remaining.columns), ranked

        print(f"Dropping '{worst_col}' with VIF={worst_vif:.2f}")
        remaining = remaining.drop(columns=[worst_col])

In [21]:
# Columns of the scaled training frame that enter the VIF analysis
X_train_scaled.columns

Index(['Tenure Months', 'Monthly Charges', 'Total Charges', 'CLTV',
       'additional_services', 'total_services', '3rd_party_services',
       'is_payment_automatic', 'is_payment_recurring'],
      dtype='object')

In [22]:
# Calculate the initial VIF of every scaled numeric feature. X_train_scaled
# holds only num_cols, from which the target 'Churn Value' was removed.
initial_vif = calculate_vif(X_train_scaled)
print("Initial VIF values:")
# Show the table with the highest VIF first
initial_vif.sort_values(by='VIF', ascending=False)

Initial VIF values:


Unnamed: 0,Column,VIF
5,total_services,214.808
4,additional_services,74.084
6,3rd_party_services,25.159
1,Monthly Charges,20.585
2,Total Charges,16.682
0,Tenure Months,12.284
3,CLTV,5.01
8,is_payment_recurring,3.614
7,is_payment_automatic,2.108


In [23]:
# Apply VIF reduction to remove multicollinearity: features are dropped one at
# a time (highest VIF first) until every remaining VIF is at most the threshold
final_num_cols, final_vif = reduce_vif(X_train_scaled, threshold=10.0)
# Surviving numeric feature names
print(final_num_cols)
# VIF table of the surviving features
print(final_vif)

Dropping 'total_services' with VIF=214.81
Dropping 'Total Charges' with VIF=15.40
['Tenure Months', 'Monthly Charges', 'CLTV', 'additional_services', '3rd_party_services', 'is_payment_automatic', 'is_payment_recurring']
                 Column   VIF
0       Monthly Charges 9.436
1         Tenure Months 5.584
2                  CLTV 4.769
3    3rd_party_services 3.974
4   additional_services 3.786
5  is_payment_recurring 2.716
6  is_payment_automatic 2.078


In [24]:
# Display final VIF values. The "10" in the label mirrors the threshold=10.0
# passed to reduce_vif above; update both together if the threshold changes.
print("Final VIF values (all <= 10):")
final_vif

Final VIF values (all <= 10):


Unnamed: 0,Column,VIF
0,Monthly Charges,9.436
1,Tenure Months,5.584
2,CLTV,4.769
3,3rd_party_services,3.974
4,additional_services,3.786
5,is_payment_recurring,2.716
6,is_payment_automatic,2.078


In [25]:
target_feature = ['Churn Value']
# VIF-filtered numeric features + IV-filtered categorical features + target
final_features = [*final_num_cols, *selected_cat_features_iv, *target_feature]

# Restrict both splits to the selected columns
train_df, test_df = train_df[final_features], test_df[final_features]

## 2. Train Test Split

In [26]:
# Separate the predictors (DataFrame) from the target (Series) for each split
X_train, y_train = train_df.drop(columns='Churn Value'), train_df['Churn Value']
X_test, y_test = test_df.drop(columns='Churn Value'), test_df['Churn Value']

# Summarise split sizes and the share of churned customers in each split
print("Train-Test Split completed:")
print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
print(f"Train churn rate: {y_train.mean():.3f}")
print(f"Test churn rate: {y_test.mean():.3f}")

Train-Test Split completed:
Training set: 5587 samples
Test set: 1397 samples
Train churn rate: 0.255
Test churn rate: 0.276


## 3. Feature Encoding

In [27]:
# Report the number of distinct values per column; for columns with at most
# five distinct values, list the values as well
print("Unique values analysis:")
for col, n_unique in X_train.nunique().items():
    if n_unique > 5:
        print(f"{col} -> {n_unique}")
        continue
    print(f"{col} -> {n_unique} -> {X_train[col].unique()}")

Unique values analysis:
Tenure Months -> 73
Monthly Charges -> 1494
CLTV -> 3104
additional_services -> 5 -> [3 2 1 0 4]
3rd_party_services -> 3 -> [1 2 0]
is_payment_automatic -> 2 -> [0 1]
is_payment_recurring -> 2 -> [1 0]
City -> 1127
Senior Citizen -> 2 -> ['Yes' 'No']
Partner -> 2 -> ['Yes' 'No']
Dependents -> 2 -> ['No' 'Yes']
Internet Service -> 3 -> ['DSL' 'Fiber optic' 'No']
Online Security -> 3 -> ['No' 'Yes' 'No internet service']
Online Backup -> 3 -> ['Yes' 'No' 'No internet service']
Device Protection -> 3 -> ['Yes' 'No' 'No internet service']
Tech Support -> 3 -> ['Yes' 'No' 'No internet service']
Streaming TV -> 3 -> ['Yes' 'No' 'No internet service']
Streaming Movies -> 3 -> ['No' 'Yes' 'No internet service']
Contract -> 3 -> ['Month-to-month' 'Two year' 'One year']
Paperless Billing -> 2 -> ['Yes' 'No']
Payment Method -> 4 -> ['Electronic check' 'Bank transfer (automatic)' 'Credit card (automatic)'
 'Mailed check']


**Insights**
- For numerical features, we can use Min-Max scaling.
- Features with 2 unique values can be encoded using a Label Encoder.
- Features with more than 2 unique values can be encoded using Target Encoding.

### 3.1 Label Encoder

In [28]:
cols_to_label_encode = ['Senior Citizen', 'Partner', 'Dependents', 'Paperless Billing']

# Map the Yes/No columns to 1/0 in both splits
yes_no_codes = {'Yes': 1, 'No': 0}
for frame in (X_train, X_test):
    for col in cols_to_label_encode:
        frame[col] = frame[col].map(yes_no_codes)

### 3.2 Min-Max Scaling

In [29]:
cols_to_scale = ['CLTV', 'Tenure Months', 'Monthly Charges', 'additional_services', '3rd_party_services']

# Fit the scaler on the TRAINING split only and reuse its min/max for the test
# split. Re-fitting on X_test would scale the test features with their own
# statistics, and the scaler saved in the Data Export section would then be
# the one fitted on test data.
scaler = MinMaxScaler()
X_train[cols_to_scale] = scaler.fit_transform(X_train[cols_to_scale])
X_test[cols_to_scale] = scaler.transform(X_test[cols_to_scale])
X_train.head()


Unnamed: 0,Tenure Months,Monthly Charges,CLTV,additional_services,3rd_party_services,is_payment_automatic,is_payment_recurring,City,Senior Citizen,Partner,Dependents,Internet Service,Online Security,Online Backup,Device Protection,Tech Support,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method
0,0.458,0.316,0.8,0.75,0.5,0,1,Carlsbad,1,1,0,DSL,No,Yes,Yes,Yes,Yes,No,Month-to-month,1,Electronic check
1,0.792,0.558,0.788,0.5,1.0,1,0,Stockton,0,0,0,DSL,Yes,No,Yes,No,Yes,Yes,Two year,1,Bank transfer (automatic)
2,0.667,0.803,0.235,0.25,1.0,0,0,Tarzana,1,0,0,Fiber optic,No,Yes,No,No,Yes,Yes,One year,1,Electronic check
3,0.944,0.779,0.879,0.5,0.5,1,0,Madera,1,1,0,Fiber optic,Yes,Yes,No,No,No,Yes,One year,1,Credit card (automatic)
4,0.819,0.506,0.614,0.5,0.5,0,0,Hamilton City,0,1,0,DSL,Yes,No,No,Yes,No,Yes,One year,1,Mailed check


### 3.3 Target Encoding

In [30]:
cols_to_target_encode = ['City', 'Internet Service', 'Online Security', 'Online Backup', 'Device Protection',
                  'Tech Support', 'Streaming TV', 'Streaming Movies', 'Contract', 'Payment Method']

encoder = TargetEncoder(cols=cols_to_target_encode)

# Learn the category -> target statistics from the training split only.
X_train_encoded = encoder.fit_transform(X_train, y_train)
# Encode the held-out split with the fitted mapping. The test target is
# deliberately not passed: transforming unseen data must not use its labels.
X_test_encoded = encoder.transform(X_test)

X_train_encoded.head()

Unnamed: 0,Tenure Months,Monthly Charges,CLTV,additional_services,3rd_party_services,is_payment_automatic,is_payment_recurring,City,Senior Citizen,Partner,Dependents,Internet Service,Online Security,Online Backup,Device Protection,Tech Support,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method
0,0.458,0.316,0.8,0.75,0.5,0,1,0.302,1,1,0,0.177,0.399,0.21,0.213,0.139,0.288,0.323,0.413,1,0.438
1,0.792,0.558,0.788,0.5,1.0,1,0,0.251,0,0,0,0.177,0.142,0.382,0.213,0.404,0.288,0.288,0.028,1,0.154
2,0.667,0.803,0.235,0.25,1.0,0,0,0.254,1,0,0,0.409,0.399,0.21,0.378,0.404,0.288,0.288,0.1,1,0.438
3,0.944,0.779,0.879,0.5,0.5,1,0,0.271,1,1,0,0.409,0.142,0.21,0.378,0.404,0.323,0.288,0.1,1,0.154
4,0.819,0.506,0.614,0.5,0.5,0,0,0.267,0,1,0,0.177,0.142,0.382,0.378,0.139,0.323,0.288,0.1,1,0.186


**Correlation**

In [31]:
# Concatenate features and target temporarily
df_corr = pd.concat([X_train_encoded, y_train], axis=1)
target_name = y_train.name

# Compute correlation of all features with target
corr_with_target = df_corr.corr()[target_name].drop(target_name)

# Sort by absolute value (strongest relationship first, sign preserved)
corr_with_target = corr_with_target.reindex(corr_with_target.abs().sort_values(ascending=False).index)

# Label the report with the actual target column name instead of a hard-coded one
print(f"Feature correlations with {target_name}:")
print("-" * 50)
for feature, corr in corr_with_target.items():
    print(f"{feature:<40} | {corr:>7.4f}")

Feature correlations with Churn Score:
--------------------------------------------------
City                                     |  0.4068
Contract                                 |  0.4051
is_payment_recurring                     |  0.4014
Tenure Months                            | -0.3446
Tech Support                             |  0.3405
Online Security                          |  0.3345
Internet Service                         |  0.3213
Payment Method                           |  0.2955
Online Backup                            |  0.2806
Device Protection                        |  0.2766
Dependents                               | -0.2469
Streaming TV                             |  0.2237
Streaming Movies                         |  0.2236
is_payment_automatic                     | -0.2045
Monthly Charges                          |  0.1946
Paperless Billing                        |  0.1897
additional_services                      | -0.1704
Senior Citizen                           | 

In [32]:
# Minimum absolute correlation with the target that a feature needs to be kept
threshold = 0.1

# Features at or above the threshold, still ordered by |corr| descending
corr_filtered = corr_with_target[corr_with_target.abs() >= threshold]
selected_features = list(corr_filtered.index)
print("Selected features:", selected_features)

# Restrict both encoded splits to the selected columns
X_train_selected, X_test_selected = (
    X_train_encoded[selected_features],
    X_test_encoded[selected_features],
)

print("X_train_selected shape:", X_train_selected.shape)
print("X_test_selected shape:", X_test_selected.shape)


Selected features: ['City', 'Contract', 'is_payment_recurring', 'Tenure Months', 'Tech Support', 'Online Security', 'Internet Service', 'Payment Method', 'Online Backup', 'Device Protection', 'Dependents', 'Streaming TV', 'Streaming Movies', 'is_payment_automatic', 'Monthly Charges', 'Paperless Billing', 'additional_services', 'Senior Citizen', 'Partner', 'CLTV']
X_train_selected shape: (5587, 20)
X_test_selected shape: (1397, 20)


## 4. Data Export

In [33]:
# If y was saved as a single column, converting to Series
y_train = y_train.squeeze()
y_test = y_test.squeeze()

# Create the output directory up front so the writes below do not fail on a
# fresh checkout
output_dir = Path('artifacts/feature_engineering')
output_dir.mkdir(parents=True, exist_ok=True)

# Save datasets
X_train_selected.to_csv(output_dir / 'X_train.csv', index=False)
X_test_selected.to_csv(output_dir / 'X_test.csv', index=False)
y_train.to_csv(output_dir / 'y_train.csv', index=False)
y_test.to_csv(output_dir / 'y_test.csv', index=False)

# Save the column lists used by each preprocessing step
feature_config = {
    "cols_to_target_encode": cols_to_target_encode,
    "cols_to_label_encode": cols_to_label_encode,
    "cols_to_scale": cols_to_scale,
    "target_feature": target_feature,
    "final_features": selected_features
}

with open(output_dir / 'feature_config.json', "w") as f:
    json.dump(feature_config, f, indent=4)

# Save scaler
joblib.dump(scaler, output_dir / 'scaler.joblib')

# Save encoder
joblib.dump(encoder, output_dir / 'encoder.joblib')

['artifacts/feature_engineering/encoder.joblib']