In [142]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,LabelBinarizer, LabelEncoder
import pickle

In [143]:
## Load the raw churn dataset from the CSV file and preview its first five rows.
data = pd.read_csv(filepath_or_buffer="Churn_Modelling.csv")
data.head(5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [144]:
## Preprocess the data
### Drop the columns that are not used as features

## RowNumber, CustomerId and Surname are removed by name via the `columns` keyword.
columns_to_drop = ['RowNumber', 'CustomerId', 'Surname']
data = data.drop(columns=columns_to_drop)
data

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...
9995,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


In [145]:
## 'Geography' and 'Gender' hold text labels rather than numbers, so they must be encoded before modelling.
## Gender has only two values, so a LabelEncoder (one integer code per label) is sufficient.
##   fit       -> learns the distinct labels present in the column
##   transform -> replaces each label with its integer code (here Female -> 0, Male -> 1)

label_encoder_gender = LabelEncoder()
label_encoder_gender.fit(data['Gender'])
## The original text column is overwritten with its integer codes.
data['Gender'] = label_encoder_gender.transform(data['Gender'])
data



Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,0,42,2,0.00,1,1,1,101348.88,1
1,608,Spain,0,41,1,83807.86,1,0,1,112542.58,0
2,502,France,0,42,8,159660.80,3,1,0,113931.57,1
3,699,France,0,39,1,0.00,2,0,0,93826.63,0
4,850,Spain,0,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...
9995,771,France,1,39,5,0.00,2,1,0,96270.64,0
9996,516,France,1,35,10,57369.61,1,1,1,101699.77,0
9997,709,France,0,36,7,0.00,1,0,1,42085.58,1
9998,772,Germany,1,42,3,75075.31,2,1,0,92888.52,1


In [146]:
## One-hot encode the Geography column.
## Plain integer label encoding is avoided here: with more than two countries, codes such as
## Germany -> 0, Spain -> 1, France -> 2 would suggest an ordering between countries that does not exist.

from sklearn.preprocessing import OneHotEncoder
onehot_encoder_geo = OneHotEncoder()
geography_column = data[['Geography']]   # double brackets keep a 2-D, single-column frame
onehot_encoder_geo.fit(geography_column)
geo_encoder = onehot_encoder_geo.transform(geography_column)
geo_encoder

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 10000 stored elements and shape (10000, 3)>

In [147]:
## Convert the sparse one-hot matrix into a dense array so its rows can be inspected.
geo_encoder.toarray()


array([[1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       ...,
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.]])

In [148]:
## Column names produced by the fitted encoder: one per Geography category
## (Geography_France, Geography_Germany, Geography_Spain for this dataset).
onehot_encoder_geo.get_feature_names_out(['Geography'])


array(['Geography_France', 'Geography_Germany', 'Geography_Spain'],
      dtype=object)

In [149]:
## Wrap the dense one-hot array in a DataFrame whose columns carry the encoder's feature names.
geo_feature_names = onehot_encoder_geo.get_feature_names_out(['Geography'])
geo_dense_values = geo_encoder.toarray()
geo_encoder_df = pd.DataFrame(data=geo_dense_values, columns=geo_feature_names)
geo_encoder_df

Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain
0,1.0,0.0,0.0
1,0.0,0.0,1.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,0.0,0.0,1.0
...,...,...,...
9995,1.0,0.0,0.0
9996,1.0,0.0,0.0
9997,1.0,0.0,0.0
9998,0.0,1.0,0.0


In [150]:
## Combine the one-hot columns with the original data.
## The text 'Geography' column is dropped first because its three indicator columns replace it.
data_without_geography = data.drop(columns='Geography')
data = pd.concat([data_without_geography, geo_encoder_df], axis=1)   # axis=1 -> place frames side by side
data

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,619,0,42,2,0.00,1,1,1,101348.88,1,1.0,0.0,0.0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0
2,502,0,42,8,159660.80,3,1,0,113931.57,1,1.0,0.0,0.0
3,699,0,39,1,0.00,2,0,0,93826.63,0,1.0,0.0,0.0
4,850,0,43,2,125510.82,1,1,1,79084.10,0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,771,1,39,5,0.00,2,1,0,96270.64,0,1.0,0.0,0.0
9996,516,1,35,10,57369.61,1,1,1,101699.77,0,1.0,0.0,0.0
9997,709,0,36,7,0.00,1,0,1,42085.58,1,1.0,0.0,0.0
9998,772,1,42,3,75075.31,2,1,0,92888.52,1,0.0,1.0,0.0


In [151]:
## Persist both fitted encoders in pickle format so the same mappings can be reloaded later:
##   label_encoder_gender -> LabelEncoder fitted on 'Gender'
##   onehot_encoder_geo   -> OneHotEncoder fitted on 'Geography'

encoders_to_save = {
    'label_encoder_gender.pkl': label_encoder_gender,
    'onehot_encoder_geo.pkl': onehot_encoder_geo,
}
for pickle_path, fitted_encoder in encoders_to_save.items():
    with open(pickle_path,'wb') as file:
        pickle.dump(fitted_encoder,file)


In [152]:
## Display the frame after encoding: Gender is numeric and Geography is replaced by its one-hot columns.
data

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,619,0,42,2,0.00,1,1,1,101348.88,1,1.0,0.0,0.0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0
2,502,0,42,8,159660.80,3,1,0,113931.57,1,1.0,0.0,0.0
3,699,0,39,1,0.00,2,0,0,93826.63,0,1.0,0.0,0.0
4,850,0,43,2,125510.82,1,1,1,79084.10,0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,771,1,39,5,0.00,2,1,0,96270.64,0,1.0,0.0,0.0
9996,516,1,35,10,57369.61,1,1,1,101699.77,0,1.0,0.0,0.0
9997,709,0,36,7,0.00,1,0,1,42085.58,1,1.0,0.0,0.0
9998,772,1,42,3,75075.31,2,1,0,92888.52,1,0.0,1.0,0.0


In [153]:
## 'Exited' is the target (dependent variable); every other column is an input feature.

y = data['Exited']                 ## y is the dependent feature
X = data.drop(columns='Exited')    ## X holds the independent features

## Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## Standardise the features: the scaler is fitted on the training rows only,
## and that same fitted scaler is then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [154]:
## Inspect the scaled training features (a NumPy array after StandardScaler, no longer a DataFrame).
X_train

array([[ 0.35649971,  0.91324755, -0.6557859 , ...,  1.00150113,
        -0.57946723, -0.57638802],
       [-0.20389777,  0.91324755,  0.29493847, ..., -0.99850112,
         1.72572313, -0.57638802],
       [-0.96147213,  0.91324755, -1.41636539, ..., -0.99850112,
        -0.57946723,  1.73494238],
       ...,
       [ 0.86500853, -1.09499335, -0.08535128, ...,  1.00150113,
        -0.57946723, -0.57638802],
       [ 0.15932282,  0.91324755,  0.3900109 , ...,  1.00150113,
        -0.57946723, -0.57638802],
       [ 0.47065475,  0.91324755,  1.15059039, ..., -0.99850112,
         1.72572313, -0.57638802]])

In [155]:
## Persist the fitted StandardScaler in pickle format so the same scaling can be reapplied later.
with open('scaler.pkl', mode='wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

ANN IMPLEMENTATION

In [156]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping,TensorBoard
import datetime


In [157]:
## Build the ANN model: a feed-forward binary classifier assembled as a Sequential stack.
## The input width is declared with an explicit keras.Input object rather than an
## `input_shape` argument on the first Dense layer, which recent Keras versions warn
## against for Sequential models.
model = Sequential([
    keras.Input(shape=(X_train.shape[1],)),   # one input per scaled feature column
    Dense(64,activation='relu'),              # first hidden layer
    Dense(32,activation='relu'),              # second hidden layer
    Dense(1,activation='sigmoid')             # single sigmoid output in (0, 1)
])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [158]:
## Number of feature columns. Only the last expression of a cell is echoed,
## so this bare value is evaluated but not displayed.
X_train.shape[1]
## The same count as a one-element tuple, i.e. the input shape used when the model is built.
(X_train.shape[1],)

(12,)

In [159]:
## Print the layer-by-layer summary of the model.
model.summary()

In [160]:
## Compile the model before training: Adam optimizer, binary cross-entropy loss
## (matching the single sigmoid output), and accuracy reported as a metric.
model.compile(optimizer="adam",loss="binary_crossentropy",metrics=['accuracy'])

In [None]:
## Configure TensorBoard logging for the training run
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
## Each execution of this cell gets its own timestamped sub-directory under logs/fit/.
run_timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = "logs/fit/" + run_timestamp
tensorflow_callback = TensorBoard(histogram_freq=1, log_dir=log_dir)


In [162]:
## Set up early stopping: monitor validation loss, allow 10 epochs without improvement
## before stopping, and restore the weights from the best epoch.
early_stopping_callbacks=EarlyStopping(monitor='val_loss', patience=10,restore_best_weights=True)

In [163]:
### Train the model
## Runs for at most 100 epochs; the EarlyStopping callback may end training sooner,
## and the TensorBoard callback writes logs for the run.
## NOTE(review): the held-out test split is also used as validation data here, so early
## stopping selects the restored weights using the test set.
training_callbacks = [tensorflow_callback, early_stopping_callbacks]
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=100,
    validation_data=(X_test, y_test),
    callbacks=training_callbacks,
)

Epoch 1/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 9ms/step - accuracy: 0.8007 - loss: 0.4868 - val_accuracy: 0.8305 - val_loss: 0.3899
Epoch 2/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.8458 - loss: 0.3836 - val_accuracy: 0.8525 - val_loss: 0.3606
Epoch 3/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.8552 - loss: 0.3612 - val_accuracy: 0.8565 - val_loss: 0.3504
Epoch 4/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8542 - loss: 0.3486 - val_accuracy: 0.8515 - val_loss: 0.3513
Epoch 5/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8609 - loss: 0.3408 - val_accuracy: 0.8560 - val_loss: 0.3420
Epoch 6/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8582 - loss: 0.3483 - val_accuracy: 0.8575 - val_loss: 0.3386
Epoch 7/100
[1m250/25

In [164]:
## Save the trained model to disk.
## NOTE(review): the .h5 suffix selects the legacy HDF5 format; newer Keras versions
## recommend the .keras format. Confirm what downstream loaders expect before changing it.
model.save('model.h5')



In [165]:
## Load the TensorBoard notebook extension so the %tensorboard magic is available
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [166]:
%tensorboard --logdir logs/fit20241113-012440/train/

Reusing TensorBoard on port 6007 (pid 18908), started 0:03:12 ago. (Use '!kill 18908' to kill it.)