In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler,LabelEncoder
import pickle

In [3]:
from sklearn.model_selection import train_test_split


In [4]:
# Read the raw churn dataset from disk and preview its first five rows.
data = pd.read_csv('Churn_Modelling.csv')
data.head(n=5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [5]:
# Preprocessing: remove the RowNumber, CustomerId and Surname columns.
# Only columns that are still present are dropped, so the cell can be re-run
# without raising a KeyError.
identifier_cols = ['RowNumber', 'CustomerId', 'Surname']
cols_to_drop = [name for name in identifier_cols if name in data.columns]
data.drop(columns=cols_to_drop, inplace=True)
data

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...
9995,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


In [6]:
# Encode the categorical Gender column as integer labels.
# The encoder is fitted first and then applied, so the fitted object stays
# available under `label_encoder_gender`.
label_encoder_gender = LabelEncoder()
label_encoder_gender.fit(data['Gender'])
data['Gender'] = label_encoder_gender.transform(data['Gender'])
data

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,0,42,2,0.00,1,1,1,101348.88,1
1,608,Spain,0,41,1,83807.86,1,0,1,112542.58,0
2,502,France,0,42,8,159660.80,3,1,0,113931.57,1
3,699,France,0,39,1,0.00,2,0,0,93826.63,0
4,850,Spain,0,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...
9995,771,France,1,39,5,0.00,2,1,0,96270.64,0
9996,516,France,1,35,10,57369.61,1,1,1,101699.77,0
9997,709,France,0,36,7,0.00,1,0,1,42085.58,1
9998,772,Germany,1,42,3,75075.31,2,1,0,92888.52,1


In [7]:
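# Earlier approach, kept for reference only: one-hot encode Geography with pandas.
# Disabled in favour of the sklearn OneHotEncoder used in the next cell, whose
# fitted encoder is pickled so it can be reused later.
# data = pd.get_dummies(data, columns=['Geography'], drop_first=True)
# data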

In [8]:
# Rebuild the feature table from the raw CSV so that 'Geography' is available
# again, encode the two categorical columns, and persist both fitted encoders.
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Reload the data to restore 'Geography'
data = pd.read_csv('Churn_Modelling.csv')

# Drop unnecessary columns (rebinding instead of mutating in place)
data = data.drop(columns=['RowNumber', 'CustomerId', 'Surname'])

# Label encode Gender
label_encoder_gender = LabelEncoder()
data['Gender'] = label_encoder_gender.fit_transform(data['Gender'])

# One hot encode Geography using sklearn
onehotencoder = OneHotEncoder()
geo_encoder = onehotencoder.fit_transform(data[['Geography']])
feature_names = onehotencoder.get_feature_names_out(['Geography'])
print(feature_names)
# Reuse data's index so the column-wise concat below lines rows up by label
# even if `data` ever stops having a default 0..n-1 index.
geo_encoded_df = pd.DataFrame(
    geo_encoder.toarray(), columns=feature_names, index=data.index
)

# Swap the raw Geography column for its one-hot indicator columns.
data = pd.concat([data.drop(columns=['Geography']), geo_encoded_df], axis=1)

# Save the label encoder and one hot encoder
with open('label_encoder_gender.pkl', 'wb') as f:
    pickle.dump(label_encoder_gender, f)
with open('onehotencoder_geography.pkl', 'wb') as f:
    pickle.dump(onehotencoder, f)

# Preview the encoded table. This must be the last expression of the cell,
# otherwise the notebook does not display it.
data.head()

['Geography_France' 'Geography_Germany' 'Geography_Spain']


In [9]:
# Separate the target column ('Exited') from the predictor columns.
y = data['Exited']
x = data.drop(columns=['Exited'])

# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Feature scaling: fit the scaler on the training rows only, then apply the
# same fitted transform to both the training and the test rows.
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# Save the fitted scaler to disk.
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)




In [10]:
#ANN Implementation
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
import datetime

# Build the ANN: two ReLU hidden layers followed by a single sigmoid output
# unit. The input width is taken from the number of columns in x_train.
# The input is declared with an explicit Input layer rather than passing
# `input_shape=` to the first Dense layer, which newer Keras versions warn
# about.
model = Sequential([
    tf.keras.Input(shape=(x_train.shape[1],)),  # input layer
    Dense(units=32, activation='relu'),         # first hidden layer
    Dense(units=16, activation='relu'),         # second hidden layer
    Dense(units=1, activation='sigmoid'),       # output layer
])
model

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


<Sequential name=sequential, built=True>

In [11]:
# Print the summary of the model built in the previous cell.
model.summary()

In [12]:
import tensorflow

# Compile the model: Adam optimizer with a learning rate of 0.001,
# binary cross-entropy as the loss, and accuracy as the tracked metric.
optimizer = tensorflow.keras.optimizers.Adam(learning_rate=0.001)
model.compile(
    loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

In [13]:
# TensorBoard setup: every run logs into its own timestamped directory under
# logs/fit/, so separate runs do not overwrite each other.
run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = "logs/fit/" + run_stamp
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

In [14]:
# Early stopping: monitor the validation loss with a patience of 5 epochs and
# restore the best weights when training stops.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

In [15]:
# Train the model for up to 100 epochs in batches of 32, holding back 20% of
# the training rows for validation. Both callbacks defined above are attached.
fit_callbacks = [early_stopping, tensorboard_callback]
history = model.fit(
    x_train,
    y_train,
    validation_split=0.2,
    epochs=100,
    batch_size=32,
    callbacks=fit_callbacks,
)

Epoch 1/100
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.7472 - loss: 0.5311 - val_accuracy: 0.8112 - val_loss: 0.4388
Epoch 2/100
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.8102 - loss: 0.4328 - val_accuracy: 0.8331 - val_loss: 0.4102
Epoch 3/100
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.8258 - loss: 0.4092 - val_accuracy: 0.8406 - val_loss: 0.3948
Epoch 4/100
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.8397 - loss: 0.3866 - val_accuracy: 0.8462 - val_loss: 0.3744
Epoch 5/100
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.8459 - loss: 0.3667 - val_accuracy: 0.8494 - val_loss: 0.3635
Epoch 6/100
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.8544 - loss: 0.3546 - val_accuracy: 0.8500 - val_loss: 0.3567
Epoch 7/100
[1m200/

In [16]:
# Save the trained model to 'ann_model.h5' in the working directory.
# NOTE(review): the '.h5' suffix presumably selects Keras' legacy HDF5 format;
# if the file name is ever changed (e.g. to '.keras'), the load_model() call
# that reads 'ann_model.h5' must be updated to match.
model.save('ann_model.h5')



In [2]:
# Load the TensorBoard notebook extension; this must run before the
# tensorboard magic in the following cell can be used.
%load_ext tensorboard

In [5]:
tensorboard --logdir logs/fit --port 6007

In [10]:
from tensorflow.keras.models import load_model
import pickle


def _load_pickle(path):
    """Return the object unpickled from the binary file at ``path``.

    pickle.load can execute arbitrary code, so only use this on files that
    come from a trusted source.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Load the trained model
model = load_model('ann_model.h5')

# Load the fitted preprocessing objects: the scaler, the label encoder for
# gender, and the one hot encoder for geography.
scaler = _load_pickle('scaler.pkl')
label_encoder_gender = _load_pickle('label_encoder_gender.pkl')
onehotencoder_geography = _load_pickle('onehotencoder_geography.pkl')



In [13]:
import pandas as pd

# One example customer, described with the same raw columns the model's
# preprocessing expects (Geography and Gender are still plain text here).
customer_record = {
    'CreditScore': 600,
    'Geography': 'France',
    'Gender': 'Male',
    'Age': 40,
    'Tenure': 3,
    'Balance': 60000,
    'NumOfProducts': 2,
    'HasCrCard': 1,
    'IsActiveMember': 1,
    'EstimatedSalary': 50000,
}

# Wrap the record in a one-row DataFrame and display it.
input_data = pd.DataFrame([customer_record])
input_data

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,600,France,Male,40,3,60000,2,1,1,50000


In [14]:
# One-hot encode Geography with the loaded encoder and append the resulting
# indicator columns to the input row. The raw Geography column is still
# present after this step.
geo_matrix = onehotencoder_geography.transform(input_data[['Geography']]).toarray()
geo_columns = onehotencoder_geography.get_feature_names_out(['Geography'])
geo_frame = pd.DataFrame(geo_matrix, columns=geo_columns)
input_data = pd.concat([input_data, geo_frame], axis=1)
input_data

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain
0,600,France,Male,40,3,60000,2,1,1,50000,1.0,0.0,0.0


In [15]:
# Encode the Gender text value with the loaded label encoder, replacing the
# column with the encoder's integer codes.
gender_codes = label_encoder_gender.transform(input_data['Gender'])
input_data['Gender'] = gender_codes
input_data

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain
0,600,France,1,40,3,60000,2,1,1,50000,1.0,0.0,0.0


In [16]:
# Remove the raw Geography text column from the input row now that it has
# been one-hot encoded.
input_data.drop(columns=['Geography'], inplace=True)
input_data

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain
0,600,1,40,3,60000,2,1,1,50000,1.0,0.0,0.0


In [17]:
# Scale the input row with the scaler loaded from 'scaler.pkl'.
# Note: this rebinds `input_data` to whatever scaler.transform returns (the
# recorded output shows an array), so it is no longer a DataFrame afterwards.
input_data = scaler.transform(input_data)
input_data

array([[-0.53598516,  0.91324755,  0.10479359, -0.69539349, -0.25781119,
         0.80843615,  0.64920267,  0.97481699, -0.87683221,  1.00150113,
        -0.57946723, -0.57638802]])

In [None]:
# Predictions: run the model on the scaled input row and print its raw output
# for that row.
prediction_proba = model.predict(input_data)
print(prediction_proba[0])

# Threshold the model output at 0.5 to obtain a 0/1 integer label.
prediction = (prediction_proba > 0.5).astype(int)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[0]
