# Customer Churn: Preprocessing and Baseline Modeling

## Preprocessing and Modeling Objective

This notebook prepares features for machine learning,
trains baseline models, and evaluates initial performance.

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

In [2]:
# Load the cleaned churn dataset produced by the earlier cleaning step.
df = pd.read_csv('../data/processed/churn_cleaned.csv')

# Binary-encode gender (Male -> 0, Female -> 1).
# Series.map() returns NaN for any value that is missing from the mapping, so
# validate first and fail loudly instead of letting NaNs reach model fitting.
gender_codes = {'Male': 0, 'Female': 1}
unexpected_gender = set(df['gender'].unique()) - set(gender_codes)
if unexpected_gender:
    raise ValueError(
        f"Unexpected values in 'gender': {sorted(unexpected_gender, key=str)}"
    )

df['gender'] = df['gender'].map(gender_codes)

In [3]:
# Continuous features (standardized in a later cell).
num_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']

# Multi-category columns that are expanded into indicator columns.
cat_features = [
    'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
    'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
    'Contract', 'PaymentMethod',
]

# One-hot encode; drop_first=True removes one level per feature so the
# indicator columns of a feature are not perfectly collinear.
df_final = pd.get_dummies(df, columns=cat_features, drop_first=True)

# Cast every boolean column to 0/1 integers so the whole frame is numeric.
bool_cols = df_final.select_dtypes(include='bool').columns
df_final = df_final.astype({col: int for col in bool_cols})

In [4]:
# Sanity check on the encoded frame: report its dimensions, then preview
# the first five rows.
print(f"New shape after encoding: {df_final.shape}")
df_final.head(n=5)

New shape after encoding: (7043, 25)


Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,Churn,...,OnlineBackup_Yes,DeviceProtection_Yes,TechSupport_Yes,StreamingTV_Yes,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,1,0,1,0,1,0,1,29.85,29.85,0,...,1,0,0,0,0,0,0,0,1,0
1,0,0,0,0,34,1,0,56.95,1889.5,0,...,0,1,0,0,0,1,0,0,0,1
2,0,0,0,0,2,1,1,53.85,108.15,1,...,1,0,0,0,0,0,0,0,0,1
3,0,0,0,0,45,0,0,42.3,1840.75,0,...,0,1,1,0,0,1,0,0,0,0
4,1,0,0,0,2,1,1,70.7,151.65,1,...,0,0,0,0,0,0,0,0,1,0


In [5]:
# Target is the Churn column; every other column is a feature.
y = df_final['Churn']
X = df_final.drop(columns='Churn')

# 80/20 hold-out split, stratified on the target so both partitions keep
# the same churn ratio; the seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(f"Training shape: {X_train.shape}")
print(f"Testing shape: {X_test.shape}")

Training shape: (5634, 24)
Testing shape: (1409, 24)


In [6]:
# Standardize the continuous features. The scaler is fitted on the training
# rows only, so no test-set statistics leak into the transform.
scaler = StandardScaler()

# Work on explicit copies of the split frames and always read the raw values
# back from X (the unscaled feature frame the split was made from). This keeps
# the cell idempotent: re-running it never refits the scaler on data that has
# already been standardized, so the scaler persisted later stays valid. The
# copies also avoid pandas' SettingWithCopyWarning on the assignments below.
X_train = X_train.copy()
X_test = X_test.copy()

X_train[num_cols] = scaler.fit_transform(X.loc[X_train.index, num_cols])
X_test[num_cols] = scaler.transform(X.loc[X_test.index, num_cols])

In [7]:
# Preview of the encoded (unscaled) frame.
# NOTE(review): the scaling cell only modifies X_train / X_test, so this view
# does not show the standardized values - confirm whether X_train.head() was
# the intended check here.
df_final.head(n=5)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,Churn,...,OnlineBackup_Yes,DeviceProtection_Yes,TechSupport_Yes,StreamingTV_Yes,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,1,0,1,0,1,0,1,29.85,29.85,0,...,1,0,0,0,0,0,0,0,1,0
1,0,0,0,0,34,1,0,56.95,1889.5,0,...,0,1,0,0,0,1,0,0,0,1
2,0,0,0,0,2,1,1,53.85,108.15,1,...,1,0,0,0,0,0,0,0,0,1
3,0,0,0,0,45,0,0,42.3,1840.75,0,...,0,1,1,0,0,1,0,0,0,0
4,1,0,0,0,2,1,1,70.7,151.65,1,...,0,0,0,0,0,0,0,0,1,0


## Class Imbalance Handling

In [8]:
from collections import Counter

# Label distribution in the training split; the imbalance seen here is the
# reason both models below are built with class_weight='balanced'.
train_class_counts = Counter(y_train)
train_class_counts

Counter({0: 4139, 1: 1495})

In [9]:
# Logistic regression baseline. class_weight='balanced' compensates for the
# minority churn class; max_iter=1000 gives the solver extra iterations to
# converge.
log_model = LogisticRegression(
    max_iter=1000,
    class_weight='balanced',
)

In [10]:
# Random forest baseline: 200 trees, balanced class weights, fixed seed for
# reproducible results.
rf_params = {
    'n_estimators': 200,
    'class_weight': 'balanced',
    'random_state': 42,
}
rf_model = RandomForestClassifier(**rf_params)

### Training Logistic Regression Model

In [11]:
# Fit on the training split (fit() returns the estimator itself), then
# predict the held-out test split.
y_pred = log_model.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the test split.
log_report = classification_report(y_test, y_pred)

print("--- Logistic Regression Performance ---")
print(log_report)

--- Logistic Regression Performance ---
              precision    recall  f1-score   support

           0       0.90      0.72      0.80      1035
           1       0.51      0.79      0.62       374

    accuracy                           0.74      1409
   macro avg       0.71      0.76      0.71      1409
weighted avg       0.80      0.74      0.75      1409



In [12]:
# 5-fold cross-validated ROC-AUC of the logistic regression, computed on the
# training split only; the cell displays the mean across folds.
cv_scores = cross_val_score(
    estimator=log_model,
    X=X_train,
    y=y_train,
    scoring='roc_auc',
    cv=5,
)

cv_scores.mean()

np.float64(0.8452659308217125)

### Training Random Forest Model

In [13]:
# Fit on the training split (fit() returns the estimator itself), then
# predict the held-out test split.
y_pred_rf = rf_model.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the test split.
rf_report = classification_report(y_test, y_pred_rf)

print("--- Random Forest Performance ---")
print(rf_report)

--- Random Forest Performance ---
              precision    recall  f1-score   support

           0       0.83      0.90      0.86      1035
           1       0.64      0.48      0.55       374

    accuracy                           0.79      1409
   macro avg       0.73      0.69      0.71      1409
weighted avg       0.78      0.79      0.78      1409



In [14]:
# 5-fold cross-validated ROC-AUC of the random forest, computed on the
# training split only; the cell displays the mean across folds.
# Note: this rebinds cv_scores, replacing the logistic-regression scores.
cv_scores = cross_val_score(
    estimator=rf_model,
    X=X_train,
    y=y_train,
    scoring='roc_auc',
    cv=5,
)

cv_scores.mean()

np.float64(0.8253824549099452)

In [15]:
# Importances of the fitted random forest, labelled with the training
# feature names and ranked from most to least important.
feature_names = X_train.columns
raw_importances = pd.Series(rf_model.feature_importances_, index=feature_names)
importances = raw_importances.sort_values(ascending=False)

# Show the ten highest-ranked features.
importances.head(10)

TotalCharges                      0.178501
MonthlyCharges                    0.166790
tenure                            0.161840
Contract_Two year                 0.060501
InternetService_Fiber optic       0.049248
PaymentMethod_Electronic check    0.035419
Contract_One year                 0.030656
InternetService_No                0.027255
OnlineSecurity_Yes                0.026307
gender                            0.026231
dtype: float64

In [17]:
from pathlib import Path

import joblib

# joblib.dump does not create missing parent directories, so make sure the
# output folder exists before writing (a fresh checkout may not have it).
Path("../models").mkdir(parents=True, exist_ok=True)

# Persist both fitted baselines together with the fitted scaler; the scaler is
# needed to apply the same standardization to new data before prediction.
joblib.dump(log_model, "../models/logistic_model.pkl")
joblib.dump(rf_model, "../models/rf_model.pkl")
joblib.dump(scaler, "../models/scaler.pkl")

['../models/scaler.pkl']

In [18]:
# Export the encoded dataset without the index column. df_final still holds
# the unscaled numeric columns: the scaling step only modified X_train / X_test.
df_final.to_csv(
    path_or_buf='../data/processed/telco_final_processed.csv',
    index=False,
)

## Modeling Summary

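The baseline models show promising performance: logistic regression reaches a
mean cross-validated ROC-AUC of about 0.85 with 0.79 recall on the churn class,
while the random forest reaches about 0.83 with 0.48 churn recall.
Further optimization and advanced models
will be explored in the next phase.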