In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,LabelEncoder

In [2]:
## Read the churn CSV into a DataFrame and preview its first rows
DATA_PATH = 'Churn_Modelling.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [3]:
## Preprocess the data
## Drop irrelevant features: row number, customer id and surname are identifiers, not predictors
ID_COLUMNS = ['RowNumber', 'CustomerId', 'Surname']
df = df.drop(columns=ID_COLUMNS)
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [4]:

## Count how many rows fall into each Geography category
df['Geography'].value_counts()

Geography
France     5014
Germany    2509
Spain      2477
Name: count, dtype: int64

## Next, we encode the categorical variables. There are two of them:
1. Geography
2. Gender

In [5]:
## Label-encode the Gender column; the fitted encoder stays available as `lb`
lb = LabelEncoder()
lb.fit(df['Gender'])
df['Gender'] = lb.transform(df['Gender'])
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,0,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,0,41,1,83807.86,1,0,1,112542.58,0
2,502,France,0,42,8,159660.8,3,1,0,113931.57,1
3,699,France,0,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,0,43,2,125510.82,1,1,1,79084.1,0


For Geography, label encoding would map the three categories to 0, 1 and 2. Because these are numbers, the model could interpret them as ordered (2 > 1 > 0) and give one particular geography more importance purely because of its numeric code. To prevent this, we use another encoding technique called one-hot encoding, which gives each category its own column containing only 0s and 1s.

In [6]:
## One-hot encode Geography (the encoded matrix is converted with .toarray() further down)
from sklearn.preprocessing import OneHotEncoder
oh = OneHotEncoder()
oh.fit(df[['Geography']])
geo_encoder = oh.transform(df[['Geography']])

In [7]:
## Column names the fitted encoder generates for the Geography categories
oh.get_feature_names_out(['Geography'])

array(['Geography_France', 'Geography_Germany', 'Geography_Spain'],
      dtype=object)

In [8]:
## Wrap the encoded matrix in a DataFrame with readable column names.
## The frame reuses df's index so that the later column-wise concat lines the
## encoded rows up with their source rows even if df's index is not 0..n-1.
geo_encoded_df = pd.DataFrame(
    geo_encoder.toarray(),
    columns=oh.get_feature_names_out(['Geography']),
    index=df.index,
)
geo_encoded_df.head()

Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain
0,1.0,0.0,0.0
1,0.0,0.0,1.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,0.0,0.0,1.0


In [9]:
### Replace the raw Geography column with its one-hot encoded columns
df_without_geo = df.drop(columns='Geography')
df = pd.concat([df_without_geo, geo_encoded_df], axis=1)
df.head()


Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,619,0,42,2,0.0,1,1,1,101348.88,1,1.0,0.0,0.0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0
2,502,0,42,8,159660.8,3,1,0,113931.57,1,1.0,0.0,0.0
3,699,0,39,1,0.0,2,0,0,93826.63,0,1.0,0.0,0.0
4,850,0,43,2,125510.82,1,1,1,79084.1,0,0.0,0.0,1.0


In [10]:
## Persist the fitted encoders to disk (the scaler is pickled in a later cell, once it is fitted)
import pickle
for pkl_path, encoder in (('lb.pkl', lb), ('oh.pkl', oh)):
    with open(pkl_path, 'wb') as fh:
        pickle.dump(encoder, fh)

Divide the dataset into Independent and Dependent Features

In [11]:
## Separate the target column from the predictor columns
TARGET = 'Exited'
y = df[TARGET]
X = df.drop(columns=TARGET)

Split the data into train and test data

In [12]:
## Hold out 20% of the rows as the test split; random_state=42 makes the split reproducible
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [13]:
## Preview the training features before scaling
X_train.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain
9254,686,1,32,6,0.0,2,1,1,179093.26,1.0,0.0,0.0
1561,632,1,42,4,119624.6,2,1,1,195978.86,0.0,1.0,0.0
1670,559,1,24,3,114739.92,1,1,0,85891.02,0.0,0.0,1.0
6087,561,0,27,9,135637.0,1,1,0,153080.4,1.0,0.0,0.0
6669,517,1,56,9,142147.32,1,0,0,39488.04,1.0,0.0,0.0


Scale the features, which lie in very different ranges, onto a common scale using StandardScaler

In [14]:
## Fit the scaler on the training split only, then apply the same statistics to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [15]:
## Inspect the scaled training features (X_train now holds the scaler's output, not the original DataFrame)
X_train

array([[ 0.35649971,  0.91324755, -0.6557859 , ...,  1.00150113,
        -0.57946723, -0.57638802],
       [-0.20389777,  0.91324755,  0.29493847, ..., -0.99850112,
         1.72572313, -0.57638802],
       [-0.96147213,  0.91324755, -1.41636539, ..., -0.99850112,
        -0.57946723,  1.73494238],
       ...,
       [ 0.86500853, -1.09499335, -0.08535128, ...,  1.00150113,
        -0.57946723, -0.57638802],
       [ 0.15932282,  0.91324755,  0.3900109 , ...,  1.00150113,
        -0.57946723, -0.57638802],
       [ 0.47065475,  0.91324755,  1.15059039, ..., -0.99850112,
         1.72572313, -0.57638802]])

In [16]:
## Persist the fitted scaler next to the encoder pickles
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

## ANN Implementation

In [17]:
import tensorflow as tf



### Building blocks of the ANN
1. Sequential network
2. Dense layers
3. Activation function
4. Optimizer
5. Loss function
6. Metrics
7. Training data ---> logs ---> folder ---> TensorBoard ---> visualization

In [21]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense 
from tensorflow.keras.callbacks import EarlyStopping,TensorBoard

## Build our ANN model

In [24]:
## Feed-forward binary classifier: input -> 64 -> 32 -> 1 (sigmoid).
## The input is declared with an explicit Input object instead of `input_shape=` on the
## first Dense layer, which Keras warns against for Sequential models.
model = Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),  # one input per feature column of X_train
    Dense(64, activation='relu'),    # hidden layer 1, connected to the input layer
    Dense(32, activation='relu'),    # hidden layer 2
    Dense(1, activation='sigmoid'),  # output layer
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [25]:
## Show the layer-by-layer architecture of the model
model.summary()

Now that the layers are defined, we need to compile the model before it can be trained with forward and backward propagation.

In [None]:
## An optimizer (or a loss) can be given to compile() in two ways:
## by its string name, or as an explicitly constructed object as done here,
## which lets us choose the learning rate ourselves.
import tensorflow
LEARNING_RATE = 0.01
opt = tensorflow.keras.optimizers.Adam(learning_rate=LEARNING_RATE)

In [35]:
## Compile the model with the optimizer object, a binary cross-entropy loss and an accuracy metric
model.compile(
    loss="binary_crossentropy",
    optimizer=opt,
    metrics=['accuracy'],
)

In [41]:
## Set up the TensorBoard log directory
import datetime

# Each run logs into its own timestamped folder under logs/fit/
now = datetime.datetime.now()
run_stamp = now.strftime("%Y%m%d-%H%M%S")
log_directory = "logs/fit/" + run_stamp
print(log_directory)

logs/fit/20250820-153500


In [42]:
## TensorBoard callback writing to this run's log directory;
## histogram_freq=1 asks for weight histograms every epoch
tensorflow_callback=TensorBoard(log_dir=log_directory,histogram_freq=1)

In [45]:
## Set up early stopping: monitor val_loss, allow 10 epochs without improvement (patience=10),
## and restore the weights from the best epoch when training stops
early_stopping_callback=EarlyStopping(monitor='val_loss',patience=10,restore_best_weights=True) 

In [46]:
## Train for at most 100 epochs; the early-stopping callback may end the run sooner.
## NOTE(review): the test split is passed as validation data, so val_loss (which drives
## early stopping) is measured on the held-out test set - confirm this is intended.
training_callbacks = [tensorflow_callback, early_stopping_callback]
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=100,
    callbacks=training_callbacks,
)

Epoch 1/100


[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8665 - loss: 0.3271 - val_accuracy: 0.8555 - val_loss: 0.3486
Epoch 2/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8650 - loss: 0.3249 - val_accuracy: 0.8585 - val_loss: 0.3407
Epoch 3/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8648 - loss: 0.3225 - val_accuracy: 0.8585 - val_loss: 0.3358
Epoch 4/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8658 - loss: 0.3171 - val_accuracy: 0.8605 - val_loss: 0.3553
Epoch 5/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8685 - loss: 0.3197 - val_accuracy: 0.8565 - val_loss: 0.3526
Epoch 6/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8679 - loss: 0.3155 - val_accuracy: 0.8555 - val_loss: 0.3453
Epoch 7/100
[1m250/250[0m [32m━

In [47]:
## Save the trained model to 'model.h5' in the working directory
model.save('model.h5')



In [50]:
## Load the TensorBoard notebook extension so the %tensorboard magic is available
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [51]:
## Open TensorBoard on logs/fit, the parent folder of the timestamped run directories
%tensorboard --logdir logs/fit

Reusing TensorBoard on port 6006 (pid 11712), started 0:01:00 ago. (Use '!kill 11712' to kill it.)

In [None]:
### Load the pickle file 
