In [1]:
!pip install -r requirements.txt --quiet



In [2]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

%matplotlib inline

In [3]:
from dotenv import load_dotenv

# Read environment variables from the .env file one directory above the notebook.
# NOTE(review): presumably this supplies the Kaggle credentials used by the
# dataset download further down - confirm against the .env contents.
load_dotenv(dotenv_path="../.env")

True

In [40]:
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Load the churn CSV from Kaggle straight into a pandas DataFrame.
# `kagglehub.load_dataset` is deprecated in favour of `kagglehub.dataset_load`
# (same adapter / handle / path arguments); the fragment left in this cell's
# recorded output appears to be the echo of that deprecation warning.
df = kagglehub.dataset_load(
    KaggleDatasetAdapter.PANDAS,
    "shrutimechlearn/churn-modelling",
    path="Churn_Modelling.csv",
)

# Discard the RowNumber column and keep a clean 0..n-1 index. This is what the
# former set_index('RowNumber').reset_index(drop=True) round trip achieved.
df = df.drop(columns="RowNumber").reset_index(drop=True)

  df = kagglehub.load_dataset(


## Split X and y

In [62]:
# Target: the churn flag. Features: every other column except the free-text surname.
# NOTE(review): CustomerId is still part of X at this point - confirm that an
# identifier column is really meant to be used as a model feature.
y = df.loc[:, 'Exited']
X = df.drop(['Exited', 'Surname'], axis="columns")

In [63]:
# Sanity check: features and target must describe the same number of samples.
assert len(X) == len(y), "X and y do not have same amount of rows"


## Encode categorical variables

In [64]:
# Show each feature's dtype and non-null count; the object-typed columns are
# the ones that still have to be encoded before modelling.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CustomerId       10000 non-null  int64  
 1   CreditScore      10000 non-null  int64  
 2   Geography        10000 non-null  object 
 3   Gender           10000 non-null  object 
 4   Age              10000 non-null  int64  
 5   Tenure           10000 non-null  int64  
 6   Balance          10000 non-null  float64
 7   NumOfProducts    10000 non-null  int64  
 8   HasCrCard        10000 non-null  int64  
 9   IsActiveMember   10000 non-null  int64  
 10  EstimatedSalary  10000 non-null  float64
dtypes: float64(2), int64(7), object(2)
memory usage: 859.5+ KB


In [65]:
# Map the Gender strings to integer codes (classes are ordered alphabetically
# by LabelEncoder, so the mapping is stable across runs).
# NOTE(review): scikit-learn documents LabelEncoder as a target (y) encoder;
# OrdinalEncoder is the feature-side counterpart - consider switching.
label_encoder = LabelEncoder().fit(X['Gender'])
X['Gender'] = label_encoder.transform(X['Gender'])

#### One-hot encode the Geography feature

In [66]:
# One-hot encode Geography and pass every other column through untouched.
# The encoded columns are placed first in the result, followed by the
# passthrough columns in their original order.
# After this cell X is no longer a DataFrame, so the cell cannot simply be
# re-run without re-creating X from df first.
geography_step = ('encoder', OneHotEncoder(), ['Geography'])
column_transformer = ColumnTransformer(
    transformers=[geography_step],
    remainder='passthrough',
)
X = column_transformer.fit_transform(X)

In [67]:
# Preview the first five encoded rows.
X[:5]

array([[1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 1.5634602e+07,
        6.1900000e+02, 0.0000000e+00, 4.2000000e+01, 2.0000000e+00,
        0.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00,
        1.0134888e+05],
       [0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.5647311e+07,
        6.0800000e+02, 0.0000000e+00, 4.1000000e+01, 1.0000000e+00,
        8.3807860e+04, 1.0000000e+00, 0.0000000e+00, 1.0000000e+00,
        1.1254258e+05],
       [1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 1.5619304e+07,
        5.0200000e+02, 0.0000000e+00, 4.2000000e+01, 8.0000000e+00,
        1.5966080e+05, 3.0000000e+00, 1.0000000e+00, 0.0000000e+00,
        1.1393157e+05],
       [1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 1.5701354e+07,
        6.9900000e+02, 0.0000000e+00, 3.9000000e+01, 1.0000000e+00,
        0.0000000e+00, 2.0000000e+00, 0.0000000e+00, 0.0000000e+00,
        9.3826630e+04],
       [0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.5737888e+07,
        8.5000000e+0

### Split Train, Test, Validation

In [73]:
# Two-stage split with identical settings for both stages: first hold out the
# test set, then carve the validation set out of what remains.
# NOTE(review): neither split is stratified on y - confirm that is intended
# for a churn target.
split_options = dict(test_size=0.2, random_state=4134)

X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, **split_options)
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, **split_options)

print(
    "Size of Train, Validation and Test set",
    f"Train: {X_train.shape}, Validation: {X_val.shape}, Test: {X_test.shape}",
    sep="\n",
)

Size of Train, Validation and Test set
Train: (6400, 13), Validation: (1600, 13), Test: (2000, 13)


### Data Normalization

In [74]:
# Value range of the unscaled training features (motivates the scaling below).
# The labels now match the statistic they precede; they used to be swapped,
# so the recorded output under this cell shows the old labels until re-run.
print(f"Min: {round(X_train.min(), 4)}, Max: {round(X_train.max(), 4)}")

Max: 0.0, Min: 15815690.0


In [76]:
# Learn the per-column mean / standard deviation from the training split only,
# then standardise that split with the fitted scaler.
# NOTE(review): every column is scaled, including the one-hot and binary
# columns - confirm that this is desired.
standard_scaler = StandardScaler().fit(X_train)
X_train = standard_scaler.transform(X_train)

In [81]:

# Value range of the standardised training features.
# The labels now match the statistic they precede; they used to be swapped,
# so the recorded output under this cell shows the old labels until re-run.
print(f"Min X_train: {round(X_train.min(), 4)}, Max X_train: {round(X_train.max(), 4)}")

Max X_train: -3.1547, Min X_train: 5.08


In [78]:
# Apply the scaler fitted on the training split to the held-out splits
# (transform only - the scaler is not re-fitted here).
X_val, X_test = (standard_scaler.transform(held_out) for held_out in (X_val, X_test))

In [82]:
# Value ranges of the standardised validation and test features.
# The labels now match the statistic they precede; they used to be swapped,
# so the recorded output under this cell shows the old labels until re-run.
print(f"Min X_val: {round(X_val.min(), 4)}, Max X_val: {round(X_val.max(), 4)}")
print(f"Min X_test: {round(X_test.min(), 4)}, Max X_test: {round(X_test.max(), 4)}")


Max X_val: -3.1443, Min X_val: 4.4104
Max X_test: -3.1547, Min X_test: 5.08


### Save out data for model

In [83]:

# Output locations for the three splits, relative to the notebook's working
# directory. No file extension is given here; the paths are handed to
# np.savez in the following cell.
train_data_path = "data/train_binary_data"
validation_data_path = "data/validation_binary_data"
test_data_path = "data/test_binary_data"

In [85]:
# Write each split to its own .npz archive with the features under key "x" and
# the labels under key "y".
# np.savez does not create missing directories, so the parent folder is
# created first; otherwise a fresh checkout without data/ fails with
# FileNotFoundError.
for archive_path, features, labels in (
    (train_data_path, X_train, y_train),
    (validation_data_path, X_val, y_val),
    (test_data_path, X_test, y_test),
):
    Path(archive_path).parent.mkdir(parents=True, exist_ok=True)
    np.savez(archive_path, x=features, y=labels)