### Salary Regression from Churn Modelling Data Set . Estimated Salary will be our Dependent Variable.

In [55]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
import pickle

In [56]:
df_main = pd.read_csv("Churn_Modelling.csv")
df_main.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [57]:
df = df_main.copy()
df = df.drop(columns=['RowNumber', 'CustomerId', 'Surname'], axis=1)
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [58]:
label_encoder_gender = LabelEncoder()
df['Gender'] = label_encoder_gender.fit_transform(df['Gender'])
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,0,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,0,41,1,83807.86,1,0,1,112542.58,0
2,502,France,0,42,8,159660.8,3,1,0,113931.57,1
3,699,France,0,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,0,43,2,125510.82,1,1,1,79084.1,0


In [59]:
# Create a OneHotEncoder object that will return a dense array (not sparse matrix)
ohe_geography = OneHotEncoder(sparse_output=False)
# Transform the 'Geography' column into one-hot encoded format
geo_encoder = ohe_geography.fit_transform(df[['Geography']])
# Display the one-hot encoded representation of the Geography column
geo_encoder

array([[1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       ...,
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.]])

In [60]:
# Create a DataFrame from the one-hot encoded geography features
# - geo_encoder contains the encoded values
# - ohe_geography.get_feature_names_out() provides the column names for each encoded feature
# - Setting the index to match the original dataframe's index to maintain alignment
geo_encoded_df = pd.DataFrame(geo_encoder, columns=ohe_geography.get_feature_names_out(), index=df.index)
geo_encoded_df  # Display the resulting DataFrame with one-hot encoded geography features

Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain
0,1.0,0.0,0.0
1,0.0,0.0,1.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,0.0,0.0,1.0
...,...,...,...
9995,1.0,0.0,0.0
9996,1.0,0.0,0.0
9997,1.0,0.0,0.0
9998,0.0,1.0,0.0


In [61]:
# Concatenate the original dataframe (excluding 'Geography' column) with the one-hot encoded geography dataframe
# This effectively replaces the categorical 'Geography' column with its one-hot encoded version
df = pd.concat([df.drop(columns=['Geography']), geo_encoded_df], axis=1)
df.head()  # Display the first 5 rows of the modified dataframe to verify the changes

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,619,0,42,2,0.0,1,1,1,101348.88,1,1.0,0.0,0.0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0
2,502,0,42,8,159660.8,3,1,0,113931.57,1,1.0,0.0,0.0
3,699,0,39,1,0.0,2,0,0,93826.63,0,1.0,0.0,0.0
4,850,0,43,2,125510.82,1,1,1,79084.1,0,0.0,0.0,1.0


In [62]:
# Split the data into features and target variable , EstimatedSalary is target variable
X = df.drop(columns=['EstimatedSalary'], axis=1)
y = df['EstimatedSalary']

In [63]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the StandardScaler
scaler = StandardScaler()
# Fit the scaler on the training data and transform both training and testing data
# scale these features
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test) 

In [64]:
# Save the scaler to a file for later use
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)
# Save the label encoder to a file for later use
# Save the label encoder and one-hot encoder to pickle files for later use
# This allows us to apply the same transformations to new data in the future
# This serializes the encoder so it can be loaded later to transform new data consistently
# The 'wb' mode opens the file for writing in binary format, required for pickle serialization
with open('label_encoder_gender.pkl', 'wb') as file:
    pickle.dump(label_encoder_gender, file)
with open('ohe_geography.pkl', 'wb') as file:
    pickle.dump(ohe_geography, file)

### ANN Regression Implementation

In [65]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

In [66]:
#Build the model
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [67]:
#Compile the Model: Add a compilation step to specify the loss function, optimizer, and evaluation metrics:
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])
#Train the Model: Train the model using the training data. You can adjust the number of epochs and batch size as needed.
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)
#Evaluate the Model: Evaluate the model's performance on the test data.
test_loss, test_mae = model.evaluate(X_test, y_test)
print(f"Test Loss: {test_loss}, Test MAE: {test_mae}")

Epoch 1/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 13597616128.0000 - mae: 101491.3984 - val_loss: 13697416192.0000 - val_mae: 102035.8281
Epoch 2/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 13407715328.0000 - mae: 100561.9922 - val_loss: 13555293184.0000 - val_mae: 101339.8828
Epoch 3/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 13023755264.0000 - mae: 98634.2422 - val_loss: 13172127744.0000 - val_mae: 99467.4688
Epoch 4/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 12707638272.0000 - mae: 97137.4062 - val_loss: 12484041728.0000 - val_mae: 96090.1328
Epoch 5/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 11914359808.0000 - mae: 92927.9531 - val_loss: 11476941824.0000 - val_mae: 91121.3516
Epoch 6/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 10

In [68]:
# Save the model to a file for later use
model.save('model_regression.keras')

In [69]:
# Set up TensorBoard for logging training metrics and visualizations
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard

# Directory where TensorBoard logs will be saved
log_dir = 'logs/fit'
# Enable histogram logging for visualizing layer activations
tensorflow_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

In [70]:
## Load tensorboard extension
%load_ext tensorboard
%tensorboard --logdir logs/fit --port 6007

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6007 (pid 24120), started 0:01:27 ago. (Use '!kill 24120' to kill it.)