In [2]:
import pandas as pd;
import numpy as np;
from sklearn.model_selection import train_test_split;
from sklearn.preprocessing import StandardScaler,LabelEncoder;
import pickle

## Load the dataset


In [3]:
# Read the raw churn dataset; the path is relative to the notebook's working directory.
data=pd.read_csv('Churn_Modelling.csv')

In [4]:
# Preview the first two rows of the raw frame.
data.head(2)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0


In [5]:
## Data preprocessing

# Remove the row counter, customer id and surname columns before modelling.
# Reassigning (rather than mutating in place) leaves the same `data` binding.
data = data.drop(columns=['RowNumber', 'CustomerId', 'Surname'])

In [6]:
# Confirm the identifier columns are gone.
data.head(1)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1


In [7]:
## Encode the categorical Gender column as integer labels
gender_encoder = LabelEncoder()
gender_codes = gender_encoder.fit_transform(data['Gender'])
# Overwrites the string column, so this cell must not be re-run on already-encoded data.
data['Gender'] = gender_codes

## Resulting mapping: Female -> 0, Male -> 1

In [8]:
# Count rows per country to see which categories the one-hot encoder will produce.
data['Geography'].value_counts()

Geography
France     5014
Germany    2509
Spain      2477
Name: count, dtype: int64

In [9]:
from sklearn.preprocessing import OneHotEncoder;

In [10]:
## One-hot encode Geography so that no country is given an implicit order or
## priority (which a single integer label column would introduce).

geo_onehot = OneHotEncoder(sparse_output=False)  # dense ndarray output instead of a sparse matrix

# Fit and transform the single Geography column; the double brackets keep the input 2D.
geo_encoded_array = geo_onehot.fit_transform(data[['Geography']])

# Wrap the encoded array in a DataFrame with the encoder's column names.
# Reusing data.index keeps the rows aligned with `data` when the two frames are
# concatenated column-wise later, even if `data` does not have a default
# 0..n-1 index (a mismatched index would silently produce NaN rows).
geo_encoded_df = pd.DataFrame(
    geo_encoded_array,
    columns=geo_onehot.get_feature_names_out(['Geography']),
    index=data.index,
)


In [11]:
# Preview the one-hot encoded geography columns.
geo_encoded_df.head(2)

Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain
0,1.0,0.0,0.0
1,0.0,0.0,1.0


In [12]:
## Replace the raw Geography column with its one-hot encoded columns
# Re-running this cell fails, because Geography no longer exists after the first run.
data = pd.concat(
    [data.drop(columns=['Geography']), geo_encoded_df],
    axis=1,
)

In [13]:
# Confirm Geography has been replaced by the one-hot columns.
data.head(2)

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,619,0,42,2,0.0,1,1,1,101348.88,1,1.0,0.0,0.0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0


In [14]:
# List the generated one-hot column names in the encoder's category order.
geo_onehot.get_feature_names_out(['Geography'])

array(['Geography_France', 'Geography_Germany', 'Geography_Spain'],
      dtype=object)

In [15]:
## Persist the fitted encoders so inference can apply the same mappings
encoder_artifacts = (
    ('label_encoder_gender.pkl', gender_encoder),
    ('onehot_encoder_geo.pkl', geo_onehot),
)
for artifact_path, fitted_encoder in encoder_artifacts:
    with open(artifact_path, 'wb') as f:
        pickle.dump(fitted_encoder, f)

In [16]:
# Check dtypes and non-null counts for every column.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   CreditScore        10000 non-null  int64  
 1   Gender             10000 non-null  int32  
 2   Age                10000 non-null  int64  
 3   Tenure             10000 non-null  int64  
 4   Balance            10000 non-null  float64
 5   NumOfProducts      10000 non-null  int64  
 6   HasCrCard          10000 non-null  int64  
 7   IsActiveMember     10000 non-null  int64  
 8   EstimatedSalary    10000 non-null  float64
 9   Exited             10000 non-null  int64  
 10  Geography_France   10000 non-null  float64
 11  Geography_Germany  10000 non-null  float64
 12  Geography_Spain    10000 non-null  float64
dtypes: float64(5), int32(1), int64(7)
memory usage: 976.7 KB


In [17]:
# Count missing values per column.
data.isnull().sum()

CreditScore          0
Gender               0
Age                  0
Tenure               0
Balance              0
NumOfProducts        0
HasCrCard            0
IsActiveMember       0
EstimatedSalary      0
Exited               0
Geography_France     0
Geography_Germany    0
Geography_Spain      0
dtype: int64

In [18]:
# Summary statistics for every column (all are numeric after encoding).
data.describe()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,650.5288,0.5457,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037,0.5014,0.2509,0.2477
std,96.653299,0.497932,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769,0.500023,0.433553,0.431698
min,350.0,0.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0,0.0,0.0,0.0
25%,584.0,0.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0,0.0,0.0,0.0
50%,652.0,1.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0,1.0,0.0,0.0
75%,718.0,1.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0,1.0,1.0,0.0
max,850.0,1.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0,1.0,1.0,1.0


In [19]:
# List the column names after encoding.
data.columns

Index(['CreditScore', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts',
       'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Exited',
       'Geography_France', 'Geography_Germany', 'Geography_Spain'],
      dtype='object')

## Divide the dataset into features and labels

In [20]:
# Target vector: the Exited column.
y = data['Exited']
# Feature matrix: every remaining column, in its existing order.
X = data.drop(columns=['Exited'])

In [21]:
# Confirm the target column is not among the features.
X.columns

Index(['CreditScore', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts',
       'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Geography_France',
       'Geography_Germany', 'Geography_Spain'],
      dtype='object')

## Split the data into test train 


In [22]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [23]:
# Print full / train / test shapes, first for the features and then for the target.
for whole, train_part, test_part in ((X, X_train, X_test), (y, y_train, y_test)):
    print(whole.shape, train_part.shape, test_part.shape)


(10000, 12) (8000, 12) (2000, 12)
(10000,) (8000,) (2000,)


## Scale the data 

In [24]:
# Learn the scaling statistics from the training rows only, then apply the
# same transform to both splits. X_train / X_test are rebound to NumPy arrays.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [25]:
# Class balance of the target.
y.value_counts()

Exited
0    7963
1    2037
Name: count, dtype: int64

In [26]:
# Inspect the scaled arrays (NumPy abbreviates the repr of large arrays).
X_train,X_test

(array([[ 0.35649971,  0.91324755, -0.6557859 , ...,  1.00150113,
         -0.57946723, -0.57638802],
        [-0.20389777,  0.91324755,  0.29493847, ..., -0.99850112,
          1.72572313, -0.57638802],
        [-0.96147213,  0.91324755, -1.41636539, ..., -0.99850112,
         -0.57946723,  1.73494238],
        ...,
        [ 0.86500853, -1.09499335, -0.08535128, ...,  1.00150113,
         -0.57946723, -0.57638802],
        [ 0.15932282,  0.91324755,  0.3900109 , ...,  1.00150113,
         -0.57946723, -0.57638802],
        [ 0.47065475,  0.91324755,  1.15059039, ..., -0.99850112,
          1.72572313, -0.57638802]]),
 array([[-0.57749609,  0.91324755, -0.6557859 , ..., -0.99850112,
          1.72572313, -0.57638802],
        [-0.29729735,  0.91324755,  0.3900109 , ...,  1.00150113,
         -0.57946723, -0.57638802],
        [-0.52560743, -1.09499335,  0.48508334, ..., -0.99850112,
         -0.57946723,  1.73494238],
        ...,
        [ 0.81311987, -1.09499335,  0.77030065, ...,  

## save the scaler for later use 


In [27]:
with open('scaler.pkl','wb') as f:
    pickle.dump(scaler, f)

# ANN Implementation

In [28]:
# !pip install tensorflow


In [29]:
import tensorflow as tf

#### Steps 

1. `Dense` from Keras creates a fully connected layer of neurons (used here for the hidden and output layers).

2. `Dense(50)` means the layer contains 50 neurons.
3. The activation function can be sigmoid, ReLU, LeakyReLU, ELU, PReLU, etc.
4. Optimizers -> used during backpropagation to update the weights.
5. Loss function -> the quantity that training tries to minimize.
6. Metrics -> used to monitor performance, e.g. accuracy, MSE, MAE.




In [30]:
from tensorflow.keras.layers import Dense  # to create hidden neurons
from tensorflow.keras.models import Sequential # to create sequence 
from tensorflow.keras.optimizers import Adam # use for optimizers 

import datetime 

The number of inputs to the network equals the number of feature columns in `X_train`.

In [31]:
# Shape of one input sample: a 1-tuple holding the number of feature columns.
(X_train.shape[1],) ## Single dimension has 12 inputs

(12,)

In [None]:
## Build our ANN model
import warnings
# NOTE(review): this silences every warning for the rest of the session, which
# can hide real problems in later cells; consider removing or narrowing it.
warnings.filterwarnings("ignore")
model=Sequential(
    [
        ## Explicit input layer: one value per feature column.
        ## This replaces the deprecated `input_shape=` argument on the first Dense layer.
        tf.keras.Input(shape=(X_train.shape[1],)),
        ## First hidden layer with 64 neurons
        Dense(64,activation='relu'),
        ## Hidden layer 2 with 32 neurons
        Dense(32,activation='relu'),
        ## Output layer: a single sigmoid unit, since this is a binary classification problem
        Dense(1,activation='sigmoid'),
    ]
)

In [33]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

In [None]:
import tensorflow
## Optimizer: build Adam explicitly so a custom learning rate can be set
## (passing the string 'adam' at compile time would use the default rate).
opt = Adam(learning_rate=0.01)
## Loss: binary cross-entropy, created as an object
losses = tf.keras.losses.BinaryCrossentropy()

In [35]:
## Compile the model: custom optimizer, binary cross-entropy loss, accuracy metric
model.compile(
    optimizer=opt,
    loss="binary_crossentropy",
    metrics=['accuracy'],
)

In [None]:
## Set up the TensorBoard callback
from tensorflow.keras.callbacks import EarlyStopping,TensorBoard

# Each run logs into its own timestamped subdirectory under log/fit/.
run_stamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
log_dir = "log/fit/" + run_stamp
tensorflow_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

- epochs -> number of full passes over the training data
- Set up early stopping
- While the loss keeps decreasing, it is worth running more epochs
- If the loss is no longer decreasing much, further epochs can be skipped
- Early stopping automates this decision



In [38]:

# Stop training once validation loss has gone 10 epochs without improving.
early_stopping_callbacks = EarlyStopping(
    restore_best_weights=True,  # roll back to the weights from the epoch with the best validation loss
    patience=10,
    monitor='val_loss',
)

- `monitor` selects which metric to watch (here `val_loss`).
- `patience` is the number of consecutive epochs without improvement in the monitored metric that are tolerated before training stops.
- Without `restore_best_weights=True`, the model retains the weights from the last epoch before stopping, which might not be the best-performing ones.


In [40]:
## Train the model
# NOTE(review): the held-out test split doubles as the validation set, so early
# stopping selects weights using the same rows a final evaluation would use.
# Consider a separate validation split.
training_callbacks = [tensorflow_callback, early_stopping_callbacks]
hist = model.fit(
    X_train,
    y_train,
    epochs=100,
    validation_data=(X_test, y_test),
    callbacks=training_callbacks,
)


Epoch 1/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.8102 - loss: 0.4459 - val_accuracy: 0.8595 - val_loss: 0.3481
Epoch 2/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8621 - loss: 0.3461 - val_accuracy: 0.8485 - val_loss: 0.3566
Epoch 3/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8606 - loss: 0.3422 - val_accuracy: 0.8630 - val_loss: 0.3536
Epoch 4/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8602 - loss: 0.3418 - val_accuracy: 0.8585 - val_loss: 0.3433
Epoch 5/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8572 - loss: 0.3459 - val_accuracy: 0.8545 - val_loss: 0.3567
Epoch 6/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8529 - loss: 0.3547 - val_accuracy: 0.8630 - val_loss: 0.3419
Epoch 7/100
[1m250/25

In [41]:
# Persist the trained model to 'model.h5'; the load cell further down reads this same path.
# NOTE(review): the '.h5' suffix selects the legacy HDF5 format; newer Keras releases
# recommend '.keras'. Confirm before changing, since loaders depend on this filename.
model.save('model.h5')



In [42]:
## Load tensorboard Extension
# %load_ext tensorboard

In [43]:
# %tensorboard --logdir log/fit

## Load the trained model , scaler , encoders


In [None]:
from tensorflow.keras.models import load_model

# Reload the trained network saved earlier in this notebook.
model = load_model('model.h5')

# Reload the fitted preprocessing objects. `with` blocks close each file handle
# promptly instead of leaving it open until garbage collection.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook produced.
with open('scaler.pkl', 'rb') as f:
    scaler = pickle.load(f)
with open('label_encoder_gender.pkl', 'rb') as f:
    gender_encoders = pickle.load(f)
with open('onehot_encoder_geo.pkl', 'rb') as f:
    geo_encoders = pickle.load(f)





### Create input data for predictions

In [None]:

# One customer's raw attributes, in the column order of the original dataset.
# Key order matters: it becomes the DataFrame column order that is later passed
# to the scaler.
input_data = {
    'CreditScore': 619,
    'Geography': 'France',
    'Gender': 'Female',
    'Age': 42,
    'Tenure': 2,
    'Balance': 0,
    'NumOfProducts': 1,
    'HasCrCard': 1,
    'IsActiveMember': 1,
    'EstimatedSalary': 101348.88,
}


In [46]:
# Map the gender string to its integer label with the saved encoder.
# transform() expects a sequence, so the single value is wrapped in a list.
# This overwrites the string in place, so the cell cannot be re-run as-is.
encoded_gender = gender_encoders.transform([input_data['Gender']])
input_data['Gender'] = encoded_gender[0]


In [47]:
# Show the encoded gender value.
input_data['Gender']

0

In [48]:
# One-hot encode the customer's geography with the saved encoder.
# The one-row frame names its column 'Geography' to match the column the encoder
# was fitted on; an unnamed column (label 0) triggers scikit-learn's
# feature-name mismatch warning.
geo_input = pd.DataFrame({'Geography': [input_data['Geography']]})
coder = geo_encoders.transform(geo_input)
df = pd.DataFrame(coder, columns=geo_encoders.get_feature_names_out(['Geography']))
df

Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain
0,1.0,0.0,0.0


In [49]:
# Wrap the single-customer dict in a list so it becomes a one-row DataFrame.
input_df=pd.DataFrame([input_data])

In [50]:
# Swap the raw Geography column for its one-hot columns, giving the same column
# layout as the training features. `input_data` is rebound from a dict to a DataFrame.
input_data = pd.concat(
    [input_df.drop(columns=['Geography']), df],
    axis=1,
)
input_data

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain
0,619,0,42,2,0,1,1,1,101348.88,1.0,0.0,0.0


In [51]:
# Apply the scaler loaded from 'scaler.pkl'; this rebinds input_data from a DataFrame to a NumPy array.
input_data=scaler.transform(input_data)

In [52]:
# Inspect the scaled feature vector that will be fed to the model.
input_data


array([[-0.33880827, -1.09499335,  0.29493847, -1.04241787, -1.21847056,
        -0.91668767,  0.64920267,  0.97481699,  0.01595384,  1.00150113,
        -0.57946723, -0.57638802]])

In [54]:
# predict() returns one row per sample with a single sigmoid output; take that value.
prediction = model.predict(input_data)[0][0]
print("Probabilty of customer to churn ", prediction)

# Classify with a 0.5 decision threshold.
verdict = (
    "The person is likely to Churn"
    if prediction > 0.5
    else "The person is unlikely to Churn"
)
print(verdict)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
Probabilty of customer to churn  0.22818014
The person is unlikely to Churn
