In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, LabelEncoder
from imblearn.over_sampling import SMOTE
# from sklearn.ensemble import RandomForestClassifier
# from xgboost import XGBClassifier
import joblib



In [2]:
# Load data
df = pd.read_csv('../data/Telco-Customer-Churn.csv')



In [3]:
# 1. Handle missing values (null values)
print("Missing values:\n", df.isnull().sum())



Missing values:
 customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64


In [4]:
# Checking colums with missing values as spaces
space_counts = df.apply(lambda col: (col.astype(str).str.strip() == "").sum())
space_counts


customerID           0
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [5]:
# TotalCharges has some missing values (as spaces)
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'].fillna(df['TotalCharges'].median(), inplace=True)



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['TotalCharges'].fillna(df['TotalCharges'].median(), inplace=True)


In [6]:
# Getting the columns
df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [7]:
# 2. Drop irrelevant columns
df.drop(['customerID'], axis=1, inplace=True)



In [8]:
# Checking for dupplicates
# previous dimension (7043, 21)
df = df.drop_duplicates()
print(df.shape)

(7021, 20)


In [None]:
# 3. Encode target variable
# df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})



In [9]:
# 4. Separate features and target
X = df.drop('Churn', axis=1)
y = df['Churn']



In [10]:
print(X.sample(2))


      gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
4654  Female              0     Yes        Yes      42          Yes   
4698  Female              1     Yes         No      66          Yes   

     MultipleLines InternetService       OnlineSecurity         OnlineBackup  \
4654            No              No  No internet service  No internet service   
4698           Yes             DSL                   No                  Yes   

         DeviceProtection          TechSupport          StreamingTV  \
4654  No internet service  No internet service  No internet service   
4698                  Yes                   No                  Yes   

          StreamingMovies  Contract PaperlessBilling  \
4654  No internet service  One year              Yes   
4698                  Yes  Two year              Yes   

                  PaymentMethod  MonthlyCharges  TotalCharges  
4654           Electronic check           19.05        761.85  
4698  Bank transfer (automatic)   

In [11]:
print(y.sample(2))


4232    Yes
1772    Yes
Name: Churn, dtype: object


In [None]:
# 5. Encode categorical variables
# Get categorical columns
# categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
# print(f"Categorical columns: {categorical_cols}")



In [None]:
# from sklearn.preprocessing import OrdinalEncoder
# ordinal encode input variables
# ordinal_encoder = OrdinalEncoder()
# ordinal_encoder.fit(categorical_cols)
# x_encoded = ordinal_encoder.fit_transform(X[categorical_cols])
# print(x_encoded)

In [None]:
# One-hot encoding
# X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)
# print(X_encoded.head())


In [12]:
# 5. Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")



Training set: (5616, 19)
Test set: (1405, 19)


In [13]:
print(y_train.sample(2))
print(y_test.sample(2))

6874    No
4410    No
Name: Churn, dtype: object
5085    No
3309    No
Name: Churn, dtype: object


In [14]:
# 6. Encode categorical variables
# Get categorical columns
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
print(f"Categorical columns: {categorical_cols}")

Categorical columns: ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']


In [15]:
# ordinal encode categorical variables
ordinal_encoder = OrdinalEncoder()
# ordinal_encoder.fit(X_train[categorical_cols])
X_train[categorical_cols] = ordinal_encoder.fit_transform(X_train[categorical_cols]) 
X_test[categorical_cols] = ordinal_encoder.transform(X_test[categorical_cols])
# X_train = ordinal_encoder.transform(X_train)
# X_test = ordinal_encoder.transform(X_test)

In [16]:
# 7. Scale numerical variables
# Get numerical columns
numerical_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']



In [17]:
scaler = StandardScaler()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])



In [18]:
print(X_train.sample(1))

      gender  SeniorCitizen  Partner  Dependents    tenure  PhoneService  \
6604     1.0              0      1.0         1.0 -1.241331           1.0   

      MultipleLines  InternetService  OnlineSecurity  OnlineBackup  \
6604            0.0              0.0             0.0           0.0   

      DeviceProtection  TechSupport  StreamingTV  StreamingMovies  Contract  \
6604               0.0          0.0          0.0              0.0       0.0   

      PaperlessBilling  PaymentMethod  MonthlyCharges  TotalCharges  
6604               1.0            2.0       -0.678615     -0.968626  


In [19]:
# ordinal encode target variable
label_encoder = LabelEncoder()
# label_encoder.fit(y_train)
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

In [20]:
# Check the unique encoded classes
print(label_encoder.classes_)


['No' 'Yes']


In [21]:
print(y_train)

[1 0 0 ... 1 0 0]


In [22]:
print(X_train.shape)
print(y_train.shape)


(5616, 19)
(5616,)


In [None]:
# Apply SMOTE ONLY on training data
# sm = SMOTE(random_state=42) 
# X_train, y_train = sm.fit_resample(X_train, y_train)

In [23]:
# Save scaler
joblib.dump(scaler, '../models/scaler.pkl')



['../models/scaler.pkl']

In [24]:
# Save preprocessed data
X_train.to_csv('../data/X_train.csv', index=False)
X_test.to_csv('../data/X_test.csv', index=False)


In [25]:
# Converting the target to Pandas series before loading
y_train = pd.Series(y_train, name="Churn")
y_test = pd.Series(y_test, name="Churn")

y_train.to_csv('../data/y_train.csv', index=False)
y_test.to_csv('../data/y_test.csv', index=False)


print("✅ Preprocessing complete!")

✅ Preprocessing complete!


In [26]:
print(X_train.shape)
print(y_train.shape)

(5616, 19)
(5616,)
