In [40]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
import pickle
from datetime import datetime
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
import os


In [None]:
# Load the dataset
# The CSV path is relative to the notebook's working directory (one level up, in data/).
# NOTE(review): the saved traceback under this cell mentions 'data/Churn_Modelling.csv',
# which differs from the path used here -- the output looks stale; re-run to refresh it.
data = pd.read_csv('../data/Churn_Modelling.csv')
data.head()

FileNotFoundError: [Errno 2] No such file or directory: 'data/Churn_Modelling.csv'

In [None]:
# Print column names, non-null counts and dtypes of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [None]:
## Pre process the data
### Drop irrelevant features (row counter, customer id and surname)

columns_to_drop = ["RowNumber", "CustomerId", "Surname"]
data = data.drop(columns=columns_to_drop)
data.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [None]:
## Encode categorical variable
# Learn the Gender categories first, then replace the column with integer codes.
# The fitted encoder is kept in label_encoder_gender so it can be pickled later.

label_encoder_gender = LabelEncoder()
label_encoder_gender.fit(data['Gender'])
data['Gender'] = label_encoder_gender.transform(data['Gender'])
data.head()


Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,0,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,0,41,1,83807.86,1,0,1,112542.58,0
2,502,France,0,42,8,159660.8,3,1,0,113931.57,1
3,699,France,0,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,0,43,2,125510.82,1,1,1,79084.1,0


In [None]:
# Initialize OneHotEncoder with sparse_output=False so it returns a dense array
on_hot_encoder_geography = OneHotEncoder(sparse_output=False, dtype=int)

# Fit and transform the data (directly returns a dense array)
encoded_array = on_hot_encoder_geography.fit_transform(data[['Geography']])
# Get feature names (one column per category, prefixed with "Geography_")
feature_names = on_hot_encoder_geography.get_feature_names_out(['Geography'])

# Convert to DataFrame. Reuse data's index: the column-wise pd.concat that merges
# these columns back aligns on index labels, so a fresh RangeIndex here would
# misalign rows (and introduce NaNs) if data ever had a non-default index.
geography_encoded_df = pd.DataFrame(
    encoded_array,
    columns=feature_names,
    index=data.index,
)

# Display results
print(geography_encoded_df)


      Geography_France  Geography_Germany  Geography_Spain
0                    1                  0                0
1                    0                  0                1
2                    1                  0                0
3                    1                  0                0
4                    0                  0                1
...                ...                ...              ...
9995                 1                  0                0
9996                 1                  0                0
9997                 1                  0                0
9998                 0                  1                0
9999                 1                  0                0

[10000 rows x 3 columns]


In [None]:
## Merge encoded Df to main DF
# Swap the raw Geography column for its one-hot encoded columns.

data_without_geography = data.drop(columns=['Geography'])
data = pd.concat([data_without_geography, geography_encoded_df], axis=1)
data.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,619,0,42,2,0.0,1,1,1,101348.88,1,1,0,0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0,0,1
2,502,0,42,8,159660.8,3,1,0,113931.57,1,1,0,0
3,699,0,39,1,0.0,2,0,0,93826.63,0,1,0,0
4,850,0,43,2,125510.82,1,1,1,79084.1,0,0,0,1


In [None]:
## Save encoders
pickle_path = "../preprocessors/"

# open(..., 'wb') does not create missing parent directories, so make sure the
# target folder exists before writing (no-op when it is already there).
os.makedirs(pickle_path, exist_ok=True)

# Persist the fitted Gender label encoder.
with open(pickle_path+'label_encoder_gender.pkl', 'wb') as file:
    pickle.dump(label_encoder_gender, file)

# Persist the fitted Geography one-hot encoder.
with open(pickle_path+'on_hot_encoder_geography.pkl', 'wb') as file:
    pickle.dump(on_hot_encoder_geography, file)

In [None]:
## Separate the target column ('Exited') from the feature columns
y = data[['Exited']]
X = data.drop(columns=['Exited'])
print(f"Dimention of X: {len(X)}, y: {len(y)}")

## Hold out 33% of the rows for testing (fixed random_state for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)
print(f"Dimention of X_train: {len(X_train)}, X_test: {len(X_test)}, y_train: {len(y_train)}, y_test:{len(y_test)}")

# Scale features: statistics are learned from the training split only and then
# applied unchanged to both splits.
scalar = StandardScaler().fit(X_train)
X_train_scaled = scalar.transform(X_train)
X_test_scaled = scalar.transform(X_test)


Dimention of X: 10000, y: 10000
Dimention of X_train: 6700, X_test: 3300, y_train: 6700, y_test:3300


In [None]:
# Persist the fitted StandardScaler (variable `scalar`) next to the encoders
scalar_file = pickle_path + 'scalar.pkl'
with open(scalar_file, 'wb') as handle:
    pickle.dump(scalar, handle)

## ANN Implementation


In [None]:
## Build our ANN model
# Seed Python, NumPy and TensorFlow RNGs before any layer is created so that
# weight initialisation (and shuffling during fit) is repeatable across
# Restart & Run All.
tf.keras.utils.set_random_seed(42)

model = Sequential([
    # HL1 connected to input layer; input width = number of feature columns in X_train
    Dense(64, activation ='relu', input_shape = (X_train.shape[1], )),
    Dense(32, activation ='relu'), # HL2
    Dense(1, activation = 'sigmoid') # OL: single sigmoid unit for the binary target
])




In [None]:
# Print the layer-by-layer output shapes and parameter counts of the model.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 64)                832       
                                                                 
 dense_1 (Dense)             (None, 32)                2080      
                                                                 
 dense_2 (Dense)             (None, 1)                 33        
                                                                 
Total params: 2945 (11.50 KB)
Trainable params: 2945 (11.50 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
## Compile the model
# TensorFlow is already imported as `tf` in the first cell, so no extra import is needed here.
# To choose your own learning rate, build the optimizer explicitly instead:
#opt = tf.keras.optimizers.Adam(learning_rate=0.01)
#model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])
# Default Adam optimizer, binary cross-entropy loss, accuracy reported per epoch.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])




In [None]:
## Set up the tensorboard
# `datetime` here is the class imported in the first cell (from datetime import datetime).
# Re-importing the module at this point would rebind the name and break any
# later `datetime.now()` call, so the existing import is used as-is.
# Each run logs into its own timestamped sub-directory of ../logs/fit/.
log_dir = "../logs/fit/" + datetime.now().strftime("%Y%m%d-%H%M%S")
tensorflow_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

In [None]:
## Set up Early Stopping
# Watch validation loss; stop after 10 epochs without improvement and
# roll the model back to the best weights seen.
early_stopping_callback = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

In [None]:
## Train model
# Up to 100 epochs; early stopping may end the run sooner, and TensorBoard logs each epoch.
# The scaled test split doubles as the validation set here.
training_callbacks = [early_stopping_callback, tensorflow_callback]
history = model.fit(
    x=X_train_scaled,
    y=y_train,
    epochs=100,
    validation_data=(X_test_scaled, y_test),
    callbacks=training_callbacks,
)

Epoch 1/100


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100


In [None]:
# Make sure the output directory exists; saving does not create missing parent folders.
os.makedirs('../models', exist_ok=True)
model.save('../models/model.h5') # H5 is compatible with Keras

  saving_api.save_model(


In [None]:
%load_ext tensorboard
%tensorboard --logdir logs/fit/
