# Exploratory Data Analysis

In [56]:
# Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import GradientBoostingClassifier

import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
import numpy as np

In [57]:
!pip freeze > requirements.txt

In [19]:
# Pulling in Data
df = pd.read_csv('../data/train.csv')

df.head()

Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,0,42,technician,married,secondary,no,7,no,no,cellular,25,aug,117,3,-1,0,unknown,0
1,1,38,blue-collar,married,secondary,no,514,no,no,unknown,18,jun,185,1,-1,0,unknown,0
2,2,36,blue-collar,married,secondary,no,602,yes,no,unknown,14,may,111,2,-1,0,unknown,0
3,3,27,student,single,secondary,no,34,yes,no,unknown,28,may,10,2,-1,0,unknown,0
4,4,26,technician,married,secondary,no,889,yes,no,cellular,3,feb,902,1,-1,0,unknown,1


In [20]:
# Getting unique values of each column to start building feature pipeline
for col in df.columns:
    if col not in ['id', 'Unnamed: 0']:
        print(f"For Column: {col}")
        print(col, df[col].unique())
        print("\n")

For Column: age
age [42 38 36 27 26 24 39 50 46 32 37 57 33 47 35 49 77 40 54 56 51 30 67 58
 59 45 43 29 48 31 28 34 60 52 41 81 23 62 53 72 55 25 44 61 63 69 75 71
 64 22 83 70 80 21 65 74 68 19 73 66 79 20 84 86 76 18 82 78 87 85 88 94
 93 89 92 95 90 91]


For Column: job
job ['technician' 'blue-collar' 'student' 'admin.' 'management' 'entrepreneur'
 'self-employed' 'unknown' 'services' 'retired' 'housemaid' 'unemployed']


For Column: marital
marital ['married' 'single' 'divorced']


For Column: education
education ['secondary' 'primary' 'tertiary' 'unknown']


For Column: default
default ['no' 'yes']


For Column: balance
balance [    7   514   602 ... 13023  4003  8218]


For Column: housing
housing ['no' 'yes']


For Column: loan
loan ['no' 'yes']


For Column: contact
contact ['cellular' 'unknown' 'telephone']


For Column: day
day [25 18 14 28  3 20 21 31  4  8  5 24 11 12 26 30  6 10 19 17  2 13 29 23
 27  7  9 16  1 22 15]


For Column: month
month ['aug' 'jun' 'may' 'feb' 

In [24]:
# Getting dummy vairables
categical_features = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']

df_dummies = pd.get_dummies(df, columns=categical_features)

df_dummies.columns

Index(['id', 'age', 'balance', 'day', 'duration', 'campaign', 'pdays',
       'previous', 'y', 'job_admin.', 'job_blue-collar', 'job_entrepreneur',
       'job_housemaid', 'job_management', 'job_retired', 'job_self-employed',
       'job_services', 'job_student', 'job_technician', 'job_unemployed',
       'job_unknown', 'marital_divorced', 'marital_married', 'marital_single',
       'education_primary', 'education_secondary', 'education_tertiary',
       'education_unknown', 'default_no', 'default_yes', 'housing_no',
       'housing_yes', 'loan_no', 'loan_yes', 'contact_cellular',
       'contact_telephone', 'contact_unknown', 'month_apr', 'month_aug',
       'month_dec', 'month_feb', 'month_jan', 'month_jul', 'month_jun',
       'month_mar', 'month_may', 'month_nov', 'month_oct', 'month_sep',
       'poutcome_failure', 'poutcome_other', 'poutcome_success',
       'poutcome_unknown'],
      dtype='object')

In [35]:
# Removing unnecessary columns for prediction
df_clean = df_dummies.drop(columns=['id'])

# Splitting out the x and y values
x_train = df_clean.drop(columns=['y'])
y_train = df_clean['y']

# Splitting out the training and testing data
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

In [38]:
x_train

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,job_admin.,job_blue-collar,job_entrepreneur,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
453635,28,5090,12,1297,2,-1,0,False,True,False,...,False,False,True,False,False,False,False,False,False,True
11651,51,1295,27,119,9,-1,0,False,False,False,...,False,False,False,False,False,False,False,False,False,True
431999,57,0,29,87,1,-1,0,False,False,False,...,False,False,False,False,False,False,False,False,False,True
529211,48,1323,15,83,5,-1,0,False,True,False,...,False,False,True,False,False,False,False,False,False,True
110925,38,659,28,534,4,-1,0,True,False,False,...,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259178,31,577,28,122,4,181,2,False,False,False,...,False,False,False,False,False,False,True,False,False,False
365838,31,0,28,88,3,-1,0,False,False,False,...,False,False,False,False,False,False,False,False,False,True
131932,30,1218,24,48,1,-1,0,False,False,False,...,False,False,False,False,False,False,False,False,False,True
671155,44,473,31,77,2,-1,0,False,False,False,...,False,False,False,False,False,False,False,False,False,True


In [41]:
# Logit model
logit_model = LogisticRegression(random_state=42)

# Fitting the model
logit_model.fit(x_train, y_train)

# Predicting the model
y_pred = logit_model.predict(x_test)

# Evaluating the model
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.97      0.95    131795
           1       0.66      0.44      0.53     18205

    accuracy                           0.90    150000
   macro avg       0.79      0.71      0.74    150000
weighted avg       0.89      0.90      0.90    150000



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [53]:
# Create and train the model
gb_model = GradientBoostingClassifier(
    random_state=42,
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8
)

gb_model.fit(x_train, y_train)

# Make predictions
y_pred = gb_model.predict(x_test)

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.9331


In [61]:
# 1. Define the neural network architecture
class BankDepositNN(nn.Module):
    def __init__(self, input_size):
        super(BankDepositNN, self).__init__()
        self.layer1 = nn.Linear(input_size, 128)
        self.layer2 = nn.Linear(128, 64)
        self.layer3 = nn.Linear(64, 32)
        self.output = nn.Linear(32, 1)
        self.dropout = nn.Dropout(0.3)
        self.relu = nn.ReLU()
        
    def forward(self, x):
        x = self.dropout(self.relu(self.layer1(x)))
        x = self.dropout(self.relu(self.layer2(x)))
        x = self.dropout(self.relu(self.layer3(x)))
        x = torch.sigmoid(self.output(x))
        return x

# 2. Prepare data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(x_train)
X_test_scaled = scaler.transform(x_test)

# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train_scaled)
y_train_tensor = torch.FloatTensor(y_train.values).reshape(-1, 1)
X_test_tensor = torch.FloatTensor(X_test_scaled)

# 3. Initialize model, loss function, and optimizer
model = BankDepositNN(X_train_scaled.shape[1])
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 4. Training loop
epochs = 1000
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    
    # Forward pass
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    
    # Backward pass
    loss.backward()
    optimizer.step()
    
    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

# 5. Make predictions
model.eval()
with torch.no_grad():
    y_pred_proba = model(X_test_tensor)
    y_pred = (y_pred_proba > 0.5).float().numpy().flatten()

# 6. Evaluate
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

Epoch [10/1000], Loss: 0.6085
Epoch [20/1000], Loss: 0.4364
Epoch [30/1000], Loss: 0.3577
Epoch [40/1000], Loss: 0.3113
Epoch [50/1000], Loss: 0.2865
Epoch [60/1000], Loss: 0.2646
Epoch [70/1000], Loss: 0.2437
Epoch [80/1000], Loss: 0.2256
Epoch [90/1000], Loss: 0.2122
Epoch [100/1000], Loss: 0.2031
Epoch [110/1000], Loss: 0.1985
Epoch [120/1000], Loss: 0.1959
Epoch [130/1000], Loss: 0.1930
Epoch [140/1000], Loss: 0.1914
Epoch [150/1000], Loss: 0.1899
Epoch [160/1000], Loss: 0.1884
Epoch [170/1000], Loss: 0.1874
Epoch [180/1000], Loss: 0.1864
Epoch [190/1000], Loss: 0.1852
Epoch [200/1000], Loss: 0.1848
Epoch [210/1000], Loss: 0.1840
Epoch [220/1000], Loss: 0.1831
Epoch [230/1000], Loss: 0.1825
Epoch [240/1000], Loss: 0.1820
Epoch [250/1000], Loss: 0.1815
Epoch [260/1000], Loss: 0.1807
Epoch [270/1000], Loss: 0.1803
Epoch [280/1000], Loss: 0.1798
Epoch [290/1000], Loss: 0.1793
Epoch [300/1000], Loss: 0.1790
Epoch [310/1000], Loss: 0.1786
Epoch [320/1000], Loss: 0.1778
Epoch [330/1000],