# Setup & Load Data

In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Load the dataset
df = pd.read_csv("../data/raw/churn_data.csv")

# Quick sanity check
df.head()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customer_id       10000 non-null  int64  
 1   credit_score      10000 non-null  int64  
 2   country           10000 non-null  object 
 3   gender            10000 non-null  object 
 4   age               10000 non-null  int64  
 5   tenure            10000 non-null  int64  
 6   balance           10000 non-null  float64
 7   products_number   10000 non-null  int64  
 8   credit_card       10000 non-null  int64  
 9   active_member     10000 non-null  int64  
 10  estimated_salary  10000 non-null  float64
 11  churn             10000 non-null  int64  
dtypes: float64(2), int64(8), object(2)
memory usage: 937.6+ KB


# Drop Irrelevant Columns

In [13]:
df = df.drop(columns=['customer_id'])

# Check for Missing or Duplicated Data

In [14]:
# Missing values
print(df.isnull().sum())

# Duplicates
print(f"Duplicate rows: {df.duplicated().sum()}")

# Remove duplicates if any
df = df.drop_duplicates()


credit_score        0
country             0
gender              0
age                 0
tenure              0
balance             0
products_number     0
credit_card         0
active_member       0
estimated_salary    0
churn               0
dtype: int64
Duplicate rows: 0


# Encode Categorical Variables

We have two categorical columns:
- country (France, Spain, Germany)
- gender (Male, Female)


In [15]:
# One-hot encode 'country'
df = pd.get_dummies(df, columns=['country'], drop_first=True)

# Binary encode 'gender'
df['gender'] = df['gender'].map({'Male': 1, 'Female': 0})

In [16]:
df.head()

Unnamed: 0,credit_score,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn,country_Germany,country_Spain
0,619,0,42,2,0.0,1,1,1,101348.88,1,False,False
1,608,0,41,1,83807.86,1,0,1,112542.58,0,False,True
2,502,0,42,8,159660.8,3,1,0,113931.57,1,False,False
3,699,0,39,1,0.0,2,0,0,93826.63,0,False,False
4,850,0,43,2,125510.82,1,1,1,79084.1,0,False,True


# Split into Features and Target

In [17]:
X = df.drop('churn', axis=1)
y = df['churn']

# Train-Test Split

In [18]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
) 

# Stratify ensures that the train and test sets have the same proportion of samples from each class as the original dataset (y). This is crucial for classification problems to prevent one set from having an imbalanced class distribution.


# Handle Skewness or Scale Differences (NO leakage)

In [19]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

num_features = ['credit_score', 'age', 'tenure', 'balance',
                'products_number', 'estimated_salary']

# Transform train
X_train[num_features] = scaler.fit_transform(X_train[num_features])

# Transform test

X_test[num_features] = scaler.transform(X_test[num_features])

In [25]:
X_test.head(100)

Unnamed: 0,credit_score,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,country_Germany,country_Spain
5702,-0.680735,1,-0.279932,0.684723,-1.226059,0.808830,1,0,-0.095021,False,False
3667,-1.301915,1,-0.564935,-0.350971,0.877113,0.808830,0,0,-0.778941,True,False
1617,-0.970619,0,0.100072,-0.350971,-1.226059,0.808830,0,1,0.099469,False,True
5673,-0.121674,1,-0.469934,-0.005739,1.011458,0.808830,0,0,-1.147374,False,True
4272,-0.111321,0,-0.469934,-0.696202,0.023204,-0.910256,1,1,1.200283,False,True
...,...,...,...,...,...,...,...,...,...,...,...
958,-1.239797,1,-1.134940,1.029954,0.902034,-0.910256,0,0,-1.613560,False,True
1303,-0.597911,0,-0.659935,1.375185,0.926437,0.808830,1,0,-0.798858,True,False
2798,-0.742853,1,-1.039939,0.339492,1.187000,-0.910256,1,0,0.799344,True,False
2415,0.571977,1,-0.659935,0.339492,0.290623,-0.910256,1,1,0.823612,False,False


In [28]:
X_train.to_csv('../data/processed/X_train_processed.csv', index=False)
X_test.to_csv('../data/processed/X_test_processed.csv', index=False)
y_train.to_csv('../data/processed/y_train.csv', index=False)
y_test.to_csv('../data/processed/y_test.csv', index=False)

## Save Scalar

In [33]:
import pickle

with open('../models/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)