In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,LabelEncoder
import pickle

In [3]:
data = pd.read_csv(r'P:\NLP\ANN_project_implementation\Churn_Modelling.csv')
data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [4]:
data = data.drop(['RowNumber','CustomerId','Surname'],axis=1)
data.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [5]:
label_encoder_gender = LabelEncoder()
data['Gender'] = label_encoder_gender.fit_transform(data['Gender'])
data.head(10)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,0,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,0,41,1,83807.86,1,0,1,112542.58,0
2,502,France,0,42,8,159660.8,3,1,0,113931.57,1
3,699,France,0,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,0,43,2,125510.82,1,1,1,79084.1,0
5,645,Spain,1,44,8,113755.78,2,1,0,149756.71,1
6,822,France,1,50,7,0.0,2,1,1,10062.8,0
7,376,Germany,0,29,4,115046.74,4,1,0,119346.88,1
8,501,France,1,44,4,142051.07,2,0,1,74940.5,0
9,684,France,1,27,2,134603.88,1,1,1,71725.73,0


In [6]:
data.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CreditScore      10000 non-null  int64  
 1   Geography        10000 non-null  object 
 2   Gender           10000 non-null  int64  
 3   Age              10000 non-null  int64  
 4   Tenure           10000 non-null  int64  
 5   Balance          10000 non-null  float64
 6   NumOfProducts    10000 non-null  int64  
 7   HasCrCard        10000 non-null  int64  
 8   IsActiveMember   10000 non-null  int64  
 9   EstimatedSalary  10000 non-null  float64
 10  Exited           10000 non-null  int64  
dtypes: float64(2), int64(8), object(1)
memory usage: 859.5+ KB


In [7]:
### Using onehot encoding for 'Geography' column because it has more than two categories and no ordinal relationship
from sklearn.preprocessing import OneHotEncoder

onehot_encoder_geo = OneHotEncoder()
geo_encoded = onehot_encoder_geo.fit_transform(data[['Geography']]).toarray()
geo_encoded_df = pd.DataFrame(geo_encoded, columns=onehot_encoder_geo.get_feature_names_out(['Geography']))
data = pd.concat([data.drop('Geography', axis=1), geo_encoded_df], axis=1)

In [8]:
data.head(10)

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,619,0,42,2,0.0,1,1,1,101348.88,1,1.0,0.0,0.0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0
2,502,0,42,8,159660.8,3,1,0,113931.57,1,1.0,0.0,0.0
3,699,0,39,1,0.0,2,0,0,93826.63,0,1.0,0.0,0.0
4,850,0,43,2,125510.82,1,1,1,79084.1,0,0.0,0.0,1.0
5,645,1,44,8,113755.78,2,1,0,149756.71,1,0.0,0.0,1.0
6,822,1,50,7,0.0,2,1,1,10062.8,0,1.0,0.0,0.0
7,376,0,29,4,115046.74,4,1,0,119346.88,1,0.0,1.0,0.0
8,501,1,44,4,142051.07,2,0,1,74940.5,0,1.0,0.0,0.0
9,684,1,27,2,134603.88,1,1,1,71725.73,0,1.0,0.0,0.0


In [9]:
with open('label_encoder_gender.pkl', 'wb') as f:
    pickle.dump(label_encoder_gender, f)

with open('onehot_encoder_geo.pkl', 'wb') as f:
    pickle.dump(onehot_encoder_geo, f)

In [10]:
x = data.drop('Exited', axis=1)
y = data['Exited']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)


In [11]:
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

In [12]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping,TensorBoard    
import datetime

  if not hasattr(np, "object"):


In [13]:
model = Sequential([
    Dense(64, activation='relu', input_shape=(x_train.shape[1],)), ## first hidden layer connected to input layer
    Dense(32, activation='relu'), ## second hidden layer
    Dense(1, activation='sigmoid') ## output layer
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [14]:
model.summary()

In [15]:
opt = tf.keras.optimizers.Adam(learning_rate=0.01)

In [16]:
model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])

In [17]:
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

In [18]:
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
history = model.fit(
    x_train, y_train,
    validation_data=(x_test, y_test),
    epochs=100,
    callbacks=[early_stopping_callback, tensorboard_callback]
)

Epoch 1/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8334 - loss: 0.3987 - val_accuracy: 0.8550 - val_loss: 0.3494
Epoch 2/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8568 - loss: 0.3561 - val_accuracy: 0.8600 - val_loss: 0.3383
Epoch 3/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8596 - loss: 0.3459 - val_accuracy: 0.8575 - val_loss: 0.3467
Epoch 4/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8599 - loss: 0.3429 - val_accuracy: 0.8580 - val_loss: 0.3387
Epoch 5/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8627 - loss: 0.3363 - val_accuracy: 0.8600 - val_loss: 0.3506
Epoch 6/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8609 - loss: 0.3353 - val_accuracy: 0.8635 - val_loss: 0.3402
Epoch 7/100
[1m250/25

In [19]:
model.save('model.h5')



In [20]:
%load_ext tensorboard
%tensorboard --logdir logs/fit/

Reusing TensorBoard on port 6006 (pid 26580), started 12:48:53 ago. (Use '!kill 26580' to kill it.)

In [21]:
prediction = model.predict(x_test)

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 968us/step


In [22]:
prediction

array([[0.0719779 ],
       [0.02185553],
       [0.11537045],
       ...,
       [0.54948336],
       [0.08078443],
       [0.26425865]], shape=(2000, 1), dtype=float32)

In [23]:
out_df = pd.DataFrame(x_test,columns=x.columns)
out_df['predicted_exited'] = prediction
out_df['churn/not churn'] = ['churn' if pred > 0.5 else 'not churn' for pred in prediction]
out_df.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain,predicted_exited,churn/not churn
0,-0.577496,0.913248,-0.655786,-0.695393,0.329937,0.808436,-1.540351,-1.025834,-1.019605,-0.998501,1.725723,-0.576388,0.071978,not churn
1,-0.297297,0.913248,0.390011,-1.389442,-1.218471,0.808436,0.649203,0.974817,0.798883,1.001501,-0.579467,-0.576388,0.021856,not churn
2,-0.525607,-1.094993,0.485083,-0.348369,-1.218471,0.808436,0.649203,-1.025834,-0.72798,-0.998501,-0.579467,1.734942,0.11537,not churn
3,-1.511492,0.913248,1.91117,1.039728,0.689272,0.808436,0.649203,0.974817,1.221387,-0.998501,1.725723,-0.576388,0.255642,not churn
4,-0.951094,-1.094993,-1.131148,0.692704,0.782839,-0.916688,0.649203,0.974817,0.24756,-0.998501,-0.579467,1.734942,0.073889,not churn


In [24]:
x_test_df = pd.DataFrame(x_test, columns=x.columns)
x_test_df.to_csv('x_test.csv', index=False)

In [26]:
from sklearn.metrics import classification_report
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred.round()))

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 661us/step
              precision    recall  f1-score   support

           0       0.88      0.95      0.92      1607
           1       0.72      0.47      0.57       393

    accuracy                           0.86      2000
   macro avg       0.80      0.71      0.74      2000
weighted avg       0.85      0.86      0.85      2000



In [27]:
from tensorflow.keras.models import load_model

model = load_model("model.h5")
model.summary()   # shows input/output shapes and layer details

