# Telco Cusstomer Churn: Feature Engineering

In [125]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Configure settings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
RANDOM_STATE = 42

In [None]:
# Load dataset
df = pd.read_csv(r"C:\Users\linto\Code\churn-x\ml\notebooks\artifacts\eda\cleaned_df.csv")
print(f"Dataset shape: {df.shape}")

Dataset shape: (7043, 29)


In [128]:
# Display first 3 rows
df.head(3)

Unnamed: 0,Country,State,City,Zip Code,Latitude,Longitude,Gender,Senior Citizen,Partner,Dependents,Tenure Months,Phone Service,Multiple Lines,Internet Service,Online Security,Online Backup,Device Protection,Tech Support,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn Label,Churn Value,Churn Score,CLTV
0,United States,California,Los Angeles,90003,33.964131,-118.272783,Male,No,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108,Yes,1,86,3239
1,United States,California,Los Angeles,90005,34.059281,-118.30742,Female,No,No,Yes,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151,Yes,1,67,2701
2,United States,California,Los Angeles,90006,34.048013,-118.293953,Female,No,No,Yes,8,Yes,Yes,Fiber optic,No,No,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,99.65,820,Yes,1,86,5372


In [129]:
# Analyze unique values in each column
print("Unique values analysis:")
for col in df.columns:
    if df[col].nunique() <= 5:
        print(f"{col} -> {df[col].nunique()} -> {df[col].unique()}")
    else:
        print(f"{col} -> {df[col].nunique()}")

Unique values analysis:
Country -> 1 -> ['United States']
State -> 1 -> ['California']
City -> 1129
Zip Code -> 1652
Latitude -> 1652
Longitude -> 1651
Gender -> 2 -> ['Male' 'Female']
Senior Citizen -> 2 -> ['No' 'Yes']
Partner -> 2 -> ['No' 'Yes']
Dependents -> 2 -> ['No' 'Yes']
Tenure Months -> 73
Phone Service -> 2 -> ['Yes' 'No']
Multiple Lines -> 3 -> ['No' 'Yes' 'No phone service']
Internet Service -> 3 -> ['DSL' 'Fiber optic' 'No']
Online Security -> 3 -> ['Yes' 'No' 'No internet service']
Online Backup -> 3 -> ['Yes' 'No' 'No internet service']
Device Protection -> 3 -> ['No' 'Yes' 'No internet service']
Tech Support -> 3 -> ['No' 'Yes' 'No internet service']
Streaming TV -> 3 -> ['No' 'Yes' 'No internet service']
Streaming Movies -> 3 -> ['No' 'Yes' 'No internet service']
Contract -> 3 -> ['Month-to-month' 'Two year' 'One year']
Paperless Billing -> 2 -> ['Yes' 'No']
Payment Method -> 4 -> ['Mailed check' 'Electronic check' 'Bank transfer (automatic)'
 'Credit card (automatic

In [130]:
# Data Preprocessing - Remove redundant and irrelevant columns
# Based on analysis:
# - Country and State have only one unique value
# - Churn Label is redundant with Churn Value  
# - Latitude/Longitude already captured by City
# - Total Charges = Tenure Months * Monthly Charges
# - Remove Zip Code and Gender to reduce bias

columns_to_drop = ['Country', 'State', 'Churn Label', 'Latitude', 'Longitude', 
                   'Total Charges', 'Zip Code', 'Gender']
df2 = df.drop(columns_to_drop, axis=1)
print(f"New dataset shape: {df2.shape}")

New dataset shape: (7043, 21)


In [131]:
# Display cleaned dataset
df2.head(3)

Unnamed: 0,City,Senior Citizen,Partner,Dependents,Tenure Months,Phone Service,Multiple Lines,Internet Service,Online Security,Online Backup,Device Protection,Tech Support,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Churn Value,Churn Score,CLTV
0,Los Angeles,No,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,1,86,3239
1,Los Angeles,No,No,Yes,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,1,67,2701
2,Los Angeles,No,No,Yes,8,Yes,Yes,Fiber optic,No,No,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,99.65,1,86,5372


In [132]:
# Target Encoding for City Feature
# Apply target encoding to capture city-churn relationship
city_mean_churn = df2.groupby('City')['Churn Value'].mean()

# Map encoding back to dataframe
df2['city_encoded'] = df2['City'].map(city_mean_churn)

# Check correlation with target
correlation = df2['city_encoded'].corr(df2['Churn Value'])
print(f"Correlation between city (encoded) and churn: {correlation:.4f}")

# Display encoding statistics
print(f"\nCity encoding statistics:")
print(f"Min churn rate: {df2['city_encoded'].min():.4f}")
print(f"Max churn rate: {df2['city_encoded'].max():.4f}")
print(f"Mean churn rate: {df2['city_encoded'].mean():.4f}")

Correlation between city (encoded) and churn: 0.4185

City encoding statistics:
Min churn rate: 0.0000
Max churn rate: 1.0000
Mean churn rate: 0.2654


In [133]:
# Feature Encoding - One-Hot Encoding for Categorical Variables
nominal_cols = ['Senior Citizen', 'Partner', 'Dependents', 'Phone Service', 'Multiple Lines', 
                'Internet Service', 'Online Security', 'Online Backup', 'Device Protection', 
                'Tech Support', 'Streaming TV', 'Streaming Movies', 'Contract', 
                'Paperless Billing', 'Payment Method']

print(f"Applying one-hot encoding to {len(nominal_cols)} categorical columns")

Applying one-hot encoding to 15 categorical columns


In [134]:
# Drop City column and apply one-hot encoding
df2.drop('City', axis=1, inplace=True)
df3 = pd.get_dummies(df2, columns=nominal_cols, drop_first=True, dtype=int)
print(f"Shape after encoding: {df3.shape}")

Shape after encoding: (7043, 32)


In [135]:
# Display encoded dataset
df3.head(3)

Unnamed: 0,Tenure Months,Monthly Charges,Churn Value,Churn Score,CLTV,city_encoded,Senior Citizen_Yes,Partner_Yes,Dependents_Yes,Phone Service_Yes,Multiple Lines_No phone service,Multiple Lines_Yes,Internet Service_Fiber optic,Internet Service_No,Online Security_No internet service,Online Security_Yes,Online Backup_No internet service,Online Backup_Yes,Device Protection_No internet service,Device Protection_Yes,Tech Support_No internet service,Tech Support_Yes,Streaming TV_No internet service,Streaming TV_Yes,Streaming Movies_No internet service,Streaming Movies_Yes,Contract_One year,Contract_Two year,Paperless Billing_Yes,Payment Method_Credit card (automatic),Payment Method_Electronic check,Payment Method_Mailed check
0,2,53.85,1,86,3239,0.295082,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1
1,2,70.7,1,67,2701,0.295082,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0
2,8,99.65,1,86,5372,0.295082,0,0,1,1,0,1,1,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,1,0,1,0


In [136]:
# Feature Selection - VIF Analysis Functions
def calculate_vif(data):
    """
    Calculate Variance Inflation Factor for each feature
    """
    vif_df = pd.DataFrame()
    vif_df['Column'] = data.columns
    vif_df['VIF'] = [variance_inflation_factor(data.values, i) 
                     for i in range(data.shape[1])]
    return vif_df

def reduce_vif(df, threshold=10.0):
    """
    Iteratively removes features with VIF above threshold
    
    Parameters:
        df: Input dataframe with numeric features
        threshold: Maximum allowed VIF
    
    Returns:
        Reduced dataframe and final VIF table
    """
    df_clean = df.copy()
    
    while True:
        vif_df = calculate_vif(df_clean)
        vif_df = vif_df.sort_values(by="VIF", ascending=False).reset_index(drop=True)
        
        max_vif = vif_df.loc[0, "VIF"]
        if max_vif > threshold:
            drop_col = vif_df.loc[0, "Column"]
            print(f"Dropping '{drop_col}' with VIF={max_vif:.2f}")
            df_clean = df_clean.drop(columns=[drop_col])
        else:
            break
    
    return df_clean, vif_df

print("VIF analysis functions defined")

VIF analysis functions defined


In [137]:
# Calculate initial VIF (excluding target variable)
initial_features = df3.drop('Churn Value', axis=1)
initial_vif = calculate_vif(initial_features)
print("Initial VIF values:")
initial_vif.sort_values(by='VIF', ascending=False)

Initial VIF values:


Unnamed: 0,Column,VIF
13,Online Security_No internet service,inf
12,Internet Service_No,inf
23,Streaming Movies_No internet service,inf
19,Tech Support_No internet service,inf
21,Streaming TV_No internet service,inf
17,Device Protection_No internet service,inf
15,Online Backup_No internet service,inf
8,Phone Service_Yes,1781.227626
1,Monthly Charges,863.223174
11,Internet Service_Fiber optic,148.353119


In [138]:
# Apply VIF reduction to remove multicollinearity
df4, final_vif = reduce_vif(df3.drop('Churn Value', axis=1), threshold=10.0)
print(f"\nFinal dataset shape: {df4.shape}")

Dropping 'Online Security_No internet service' with VIF=inf
Dropping 'Internet Service_No' with VIF=inf
Dropping 'Streaming Movies_No internet service' with VIF=inf
Dropping 'Tech Support_No internet service' with VIF=inf
Dropping 'Streaming TV_No internet service' with VIF=inf
Dropping 'Device Protection_No internet service' with VIF=inf
Dropping 'Phone Service_Yes' with VIF=1781.23
Dropping 'Monthly Charges' with VIF=93.80
Dropping 'CLTV' with VIF=12.14

Final dataset shape: (7043, 22)


In [139]:
# Display final VIF values
print("Final VIF values (all <= 10):")
final_vif

Final VIF values (all <= 10):


Unnamed: 0,Column,VIF
0,Tenure Months,7.460829
1,Churn Score,7.038049
2,Internet Service_Fiber optic,3.441484
3,Contract_Two year,3.432528
4,city_encoded,3.301218
5,Online Backup_No internet service,2.869935
6,Paperless Billing_Yes,2.824613
7,Streaming Movies_Yes,2.672181
8,Streaming TV_Yes,2.641071
9,Partner_Yes,2.584543


In [140]:
# Add target variable back and analyze correlations
df4['Churn Value'] = df3['Churn Value']

# Compute correlations with target variable
corr_with_target = df4.corr()['Churn Value'].drop('Churn Value').sort_values(key=abs, ascending=False)

print("Feature correlations with Churn Value:")
print("-" * 50)
for feature, corr in corr_with_target.items():
    print(f"{feature:<40} | {corr:>7.4f}")

Feature correlations with Churn Value:
--------------------------------------------------
Churn Score                              |  0.6649
city_encoded                             |  0.4185
Tenure Months                            | -0.3522
Internet Service_Fiber optic             |  0.3080
Contract_Two year                        | -0.3023
Payment Method_Electronic check          |  0.3019
Dependents_Yes                           | -0.2485
Online Backup_No internet service        | -0.2279
Paperless Billing_Yes                    |  0.1918
Contract_One year                        | -0.1778
Online Security_Yes                      | -0.1712
Tech Support_Yes                         | -0.1647
Senior Citizen_Yes                       |  0.1509
Partner_Yes                              | -0.1504
Payment Method_Credit card (automatic)   | -0.1343
Payment Method_Mailed check              | -0.0917
Online Backup_Yes                        | -0.0823
Device Protection_Yes                    | 

In [141]:
# Data Scaling - Normalize numerical features
# Separate features and target
X = df4.drop('Churn Value', axis=1)
y = df4['Churn Value']

# Scale numerical features (Tenure Months and city_encoded)
scaler = MinMaxScaler()
numerical_cols = ['Tenure Months', 'city_encoded']

X_scaled = X.copy()
X_scaled[numerical_cols] = scaler.fit_transform(X[numerical_cols])

print("Data scaling completed")
print(f"Features shape: {X_scaled.shape}")
print(f"Target shape: {y.shape}")

Data scaling completed
Features shape: (7043, 22)
Target shape: (7043,)


In [142]:
# Display scaled dataset statistics
print("Scaled dataset statistics:")
X_scaled.describe()

Scaled dataset statistics:


Unnamed: 0,Tenure Months,Churn Score,city_encoded,Senior Citizen_Yes,Partner_Yes,Dependents_Yes,Multiple Lines_No phone service,Multiple Lines_Yes,Internet Service_Fiber optic,Online Security_Yes,Online Backup_No internet service,Online Backup_Yes,Device Protection_Yes,Tech Support_Yes,Streaming TV_Yes,Streaming Movies_Yes,Contract_One year,Contract_Two year,Paperless Billing_Yes,Payment Method_Credit card (automatic),Payment Method_Electronic check,Payment Method_Mailed check
count,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0
mean,0.449599,58.699418,0.26537,0.162147,0.483033,0.23101,0.096834,0.421837,0.439585,0.286668,0.216669,0.344881,0.343888,0.290217,0.384353,0.387903,0.209144,0.240664,0.592219,0.216101,0.335794,0.22888
std,0.341104,21.525131,0.184796,0.368612,0.499748,0.421508,0.295752,0.493888,0.496372,0.452237,0.412004,0.475363,0.475038,0.453895,0.486477,0.487307,0.406726,0.427517,0.491457,0.411613,0.472301,0.420141
min,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.125,40.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.402778,61.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
75%,0.763889,75.0,0.333333,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
max,1.0,100.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [143]:
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, 
    test_size=0.2, 
    random_state=RANDOM_STATE, 
    stratify=y
)

print("Train-Test Split completed:")
print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
print(f"Train churn rate: {y_train.mean():.3f}")
print(f"Test churn rate: {y_test.mean():.3f}")

Train-Test Split completed:
Training set: 5634 samples
Test set: 1409 samples
Train churn rate: 0.265
Test churn rate: 0.265


In [144]:
# Save processed datasets for modeling
import os

# Create directory if it doesn't exist
os.makedirs('telcoData', exist_ok=True)

# Save training data
train_data = X_train.copy()
train_data['Churn Value'] = y_train
train_data.to_csv('telcoData/train_data.csv', index=False)

# Save test data
test_data = X_test.copy()
test_data['Churn Value'] = y_test
test_data.to_csv('telcoData/test_data.csv', index=False)

# Save feature names for future reference
feature_names = X_scaled.columns.tolist()
pd.DataFrame({'features': feature_names}).to_csv('telcoData/feature_names.csv', index=False)

# Save scaler for future use
import joblib
joblib.dump(scaler, 'telcoData/scaler.pkl')

print("Data saved successfully:")
print("- telcoData/train_data.csv")
print("- telcoData/test_data.csv") 
print("- telcoData/feature_names.csv")
print("- telcoData/scaler.pkl")

Data saved successfully:
- telcoData/train_data.csv
- telcoData/test_data.csv
- telcoData/feature_names.csv
- telcoData/scaler.pkl


In [145]:
# Final dataset summary
print("=== FEATURE ENGINEERING SUMMARY ===")
print(f"Original dataset: {df.shape}")
print(f"After cleaning: {df2.shape}")
print(f"After encoding: {df3.shape}")
print(f"After VIF reduction: {df4.shape}")
print(f"Final features: {len(X_scaled.columns)}")
print(f"\nFinal feature list:")
for i, feature in enumerate(X_scaled.columns, 1):
    print(f"{i:2d}. {feature}")

=== FEATURE ENGINEERING SUMMARY ===
Original dataset: (7043, 29)
After cleaning: (7043, 21)
After encoding: (7043, 32)
After VIF reduction: (7043, 23)
Final features: 22

Final feature list:
 1. Tenure Months
 2. Churn Score
 3. city_encoded
 4. Senior Citizen_Yes
 5. Partner_Yes
 6. Dependents_Yes
 7. Multiple Lines_No phone service
 8. Multiple Lines_Yes
 9. Internet Service_Fiber optic
10. Online Security_Yes
11. Online Backup_No internet service
12. Online Backup_Yes
13. Device Protection_Yes
14. Tech Support_Yes
15. Streaming TV_Yes
16. Streaming Movies_Yes
17. Contract_One year
18. Contract_Two year
19. Paperless Billing_Yes
20. Payment Method_Credit card (automatic)
21. Payment Method_Electronic check
22. Payment Method_Mailed check
