In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
data=pd.read_csv('../data/Churn_Modelling.csv')
data = data.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1)
data.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [3]:
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder

label_encoder_gender = LabelEncoder()
data['Gender'] = label_encoder_gender.fit_transform(data['Gender'])

onehot_encoder_geo = OneHotEncoder(handle_unknown='ignore')
geo_encoded = onehot_encoder_geo.fit_transform(data[['Geography']]).toarray()
geo_encoded_df = pd.DataFrame(geo_encoded, columns=onehot_encoder_geo.get_feature_names_out(['Geography']))

data = pd.concat([data.drop('Geography', axis=1), geo_encoded_df], axis=1)

X = data.drop('EstimatedSalary', axis=1)
y = data['EstimatedSalary']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [4]:
import pickle

# Save the encoders and scaler for later use
with open('../model/label_encoder_gender.pkl', 'wb') as file:
    pickle.dump(label_encoder_gender, file)

with open('../model/onehot_encoder_geo.pkl', 'wb') as file:
    pickle.dump(onehot_encoder_geo, file)

with open('../model/scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)

#### ANN Regression Problem statement

In [5]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Build the model
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1)  # Output layer for regression
])

In [6]:
model.summary()

In [7]:
## compile the model
model.compile(optimizer='adam',loss='mean_absolute_error',metrics=['mae'])

In [8]:
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
import datetime

# Set up TensorBoard and EarlyStopping callbacks
log_dir = "../regressionlogs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

In [9]:
# Train the model
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=100,
    callbacks=[early_stopping_callback, tensorboard_callback]
)

Epoch 1/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 100572.0469 - mae: 100572.0469 - val_loss: 98476.7422 - val_mae: 98476.7422
Epoch 2/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 99392.5703 - mae: 99392.5703 - val_loss: 96597.3359 - val_mae: 96597.3359
Epoch 3/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 97088.6172 - mae: 97088.6172 - val_loss: 91850.4766 - val_mae: 91850.4766
Epoch 4/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 92555.2578 - mae: 92555.2578 - val_loss: 84158.1953 - val_mae: 84158.1953
Epoch 5/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 83702.3906 - mae: 83702.3906 - val_loss: 74652.6875 - val_mae: 74652.6875
Epoch 6/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 72307.5391 - mae: 72307.5391 - val_loss: 65320.8594 - val_

In [57]:
# Save the model
model.save('../model/regression_model.h5')



In [16]:
## Evaluate model on the test data
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

y_pred_test = model.predict(X_test)
y_pred_train = model.predict(X_train)

# test metrics
mse_test = mean_squared_error(y_test, y_pred_test)
mae_test = mean_absolute_error(y_test, y_pred_test)
r2_test = r2_score(y_test, y_pred_test)
print(f"Test Mean Squared Error: {mse_test:.2f}")
print(f"Test Mean Absolute Error: {mae_test:.2f}")
print(f"Test R² Score: {r2_test:.2f}")

# train metrics
mse_train = mean_squared_error(y_train, y_pred_train)
mae_train = mean_absolute_error(y_train, y_pred_train)
r2_train = r2_score(y_train, y_pred_train)
print(f"Train Mean Squared Error: {mse_train:.2f}")
print(f"Train Mean Absolute Error: {mae_train:.2f}")
print(f"Train R² Score: {r2_train:.2f}")

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 644us/step
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 445us/step
Test Mean Squared Error: 3388818170.63
Test Mean Absolute Error: 50256.61
Test R² Score: -0.03
Train Mean Squared Error: 3322781832.01
Train Mean Absolute Error: 49530.16
Train R² Score: -0.00


### Interestingly, while the ANN regression model shows no signs of overfitting—as indicated by similar training and test MAE—the overall performance remains poor, with a high mae (~50,256) and a negative R² score (-0.03). This suggests that the model is consistently underperforming, not due to model complexity, but likely due to the lack of meaningful correlation between input features and the target variable (EstimatedSalary). The result highlights that even powerful models like ANN cannot compensate for weak or irrelevant features.

In [58]:
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [60]:
%tensorboard --logdir ../regressionlogs/fit/20250706-154736  # use your own log directory here

Reusing TensorBoard on port 6007 (pid 3608), started 0:00:03 ago. (Use '!kill 3608' to kill it.)