In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import pickle

In [None]:
# Read the churn dataset from the CSV file in the working directory and preview the first rows
data = pd.read_csv("Churn_Modelling.csv")
data.head()

In [None]:
# Preprocessing: drop the RowNumber, CustomerId and Surname columns before modelling
id_columns = ['RowNumber', 'CustomerId', 'Surname']
data = data.drop(columns=id_columns)
data.head()

In [None]:
# Encode the categorical 'Gender' column as integer labels.
# The fitted encoder is kept in `label_encoder_gender` so it can be pickled later.
label_encoder_gender = LabelEncoder()
data['Gender'] = label_encoder_gender.fit_transform(data['Gender'])
# Preview only the first rows rather than rendering the whole frame
data.head()

In [5]:
# 'Geography' has more than two categories. A label encoder would assign the
# labels an artificial ordering, so one-hot encode the column instead.
from sklearn.preprocessing import OneHotEncoder

ohe_geo = OneHotEncoder()
ohe_geo.fit(data[['Geography']])
geo_encoder = ohe_geo.transform(data[['Geography']])

In [None]:
# Column names the fitted encoder generates for the 'Geography' feature
ohe_geo.get_feature_names_out(['Geography'])

In [None]:
# Densify the encoder output and preview only the first rows
geo_encoder.toarray()[:5]

In [8]:
# Wrap the one-hot output in a DataFrame with named columns.
# Reusing data's index keeps the rows aligned when the two frames are later
# concatenated column-wise (pd.concat with axis=1 aligns on index labels).
encoded_df = pd.DataFrame(
    geo_encoder.toarray(),
    columns=ohe_geo.get_feature_names_out(['Geography']),
    index=data.index,
)

In [None]:
# Preview the first rows of the one-hot encoded frame
encoded_df.head()

In [10]:
## Combine one hot encoded columns with the original data
# Guarded so the cell can be re-run safely: once 'Geography' has been replaced
# by its one-hot columns, running the cell again is a no-op instead of raising
# a KeyError on the already-dropped column.
if 'Geography' in data.columns:
    data = pd.concat([data.drop('Geography', axis=1), encoded_df], axis=1)

In [None]:
# Preview the frame after the one-hot columns have been appended
data.head()

In [12]:
## Save the fitted encoders
# Each encoder is pickled to its own file in the working directory.
for pkl_path, fitted_encoder in (
    ('label_encoder_gender.pkl', label_encoder_gender),
    ('ohe_geo.pkl', ohe_geo),
):
    with open(pkl_path, 'wb') as file:
        pickle.dump(fitted_encoder, file)

In [13]:
# Separate the dependent feature ('Exited') from the independent features
y = data['Exited']
X = data.drop(columns='Exited')

# Split into training and testing sets: 20% held out, fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Scale the features: fit the scaler on the training split only, then apply
# that same fitted transformation to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [14]:
# Pickle the fitted scaler to 'scaler.pkl' in the working directory
with open('scaler.pkl', mode='wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

### ANN Implementation

0. To initialise an ANN, we first define a Sequential network.
1. Each hidden layer is created as a `Dense` (fully connected) layer.
2. Activation function -> Sigmoid, ReLU, Leaky ReLU, tanh
3. Optimizer -> used during backpropagation; responsible for updating the weights
4. Loss function -> the quantity that training tries to minimise
5. Metrics -> accuracy, precision, F1 for classification; MSE, RMSLE for regression

In [15]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
import datetime

In [None]:
# Build the ANN model
# The network is a Sequential stack of Dense layers. The first Dense layer is
# told the input shape, which is the number of independent features (columns)
# in the training data.
n_features = X_train.shape[1]
model = Sequential([
    Dense(64, activation='relu', input_shape=(n_features,)),  # hidden layer 1, connected to the input
    Dense(32, activation='relu'),  # hidden layer 2
    Dense(1, activation='sigmoid'),  # output layer
])

In [None]:
# Print the layer-by-layer summary of the model, including parameter counts
model.summary()

##### Parameters are the weights and biases of each layer

In [None]:
# Build explicit optimizer and loss objects through the tensorflow.keras API.
# NOTE(review): the compile cell passes the string identifiers "adam" and
# "binary_crossentropy", so `opt` (learning_rate=0.1) and `loss` created here
# are not used by it — confirm whether that is intended.
import tensorflow
opt = tensorflow.keras.optimizers.Adam(learning_rate=0.1)
loss = tensorflow.keras.losses.BinaryCrossentropy()
loss

In [25]:
## Compile the model
# The target is binary, hence binary cross-entropy; for multi-class
# classification use sparse categorical cross-entropy instead.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=['accuracy'],
)

In [28]:
## Set up TensorBoard
# TensorBoard provides the measurements and visualizations needed during the
# machine learning workflow: tracking experiment metrics such as loss and
# accuracy, visualizing the model graph, projecting embeddings to a lower
# dimensional space, and more.
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard

# Each run logs into its own timestamped sub-directory of logs/fit/.
# The trailing slash matters: without it the runs are written to sibling
# directories named "logs/fit<timestamp>" rather than inside logs/fit/.
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorflow_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

In [32]:
# Set up EarlyStopping.
# A network can be trained for many epochs, but once the monitored quantity
# (here the validation loss) stops improving, further epochs only cost time.
# The callback is configured with a patience of 10 epochs and
# restore_best_weights=True.
early_stopping_callback = EarlyStopping(
    monitor="val_loss",
    patience=10,
    restore_best_weights=True,
)

### Training the Model

In [None]:
# Train for up to 100 epochs, validating on the held-out split each epoch.
# Both the TensorBoard and the early-stopping callbacks are attached.
training_callbacks = [tensorflow_callback, early_stopping_callback]
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=100,
    callbacks=training_callbacks,
)

In [None]:
# Save the trained model to 'model.h5'; the .h5 (HDF5) format is compatible with Keras
model.save('model.h5')

In [37]:
# Load the TensorBoard notebook extension into this IPython session
%load_ext tensorboard