In [10]:
# Import required libraries
import numpy as np
import pandas as pd
import urllib.request
import zipfile
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression

In [3]:
# Step 1: Download and Extract Dataset
dataset_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip'
dataset_path = 'bank-additional.zip'
extracted_folder = 'bank-additional'

# Download and Extract Dataset
print("Downloading dataset...")
urllib.request.urlretrieve(dataset_url, dataset_path)
print("Extracting dataset...")
with zipfile.ZipFile(dataset_path, 'r') as zip_ref:
    zip_ref.extractall(extracted_folder)

# Load the CSV file
data_file_path = os.path.join(extracted_folder, 'bank-additional', 'bank-additional-full.csv')
data = pd.read_csv(data_file_path, sep=';')

print("Dataset loaded successfully.")
print("First few rows of the dataset:")
print(data.head())

Downloading dataset...
Extracting dataset...
Dataset loaded successfully.
First few rows of the dataset:
   age        job  marital    education  default housing loan    contact  \
0   56  housemaid  married     basic.4y       no      no   no  telephone   
1   57   services  married  high.school  unknown      no   no  telephone   
2   37   services  married  high.school       no     yes   no  telephone   
3   40     admin.  married     basic.6y       no      no   no  telephone   
4   56   services  married  high.school       no      no  yes  telephone   

  month day_of_week  ...  campaign  pdays  previous     poutcome emp.var.rate  \
0   may         mon  ...         1    999         0  nonexistent          1.1   
1   may         mon  ...         1    999         0  nonexistent          1.1   
2   may         mon  ...         1    999         0  nonexistent          1.1   
3   may         mon  ...         1    999         0  nonexistent          1.1   
4   may         mon  ...         

In [4]:
# Step 2: Data Preprocessing
# Convert target column to binary
data['y'] = data['y'].apply(lambda x: 1 if x == 'yes' else 0)

# One-hot encoding for categorical features
data = pd.get_dummies(data, drop_first=True)

# Define features and target
X = data.drop('y', axis=1)
y = data['y']

In [5]:
# Step 3: Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [6]:
# Step 4: Handle Class Imbalance Using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)


In [7]:
# Step 5: Feature Scaling
scaler = StandardScaler()
X_resampled = scaler.fit_transform(X_resampled)
X_test = scaler.transform(X_test)

In [8]:
# Step 6: Apply PCA for Dimensionality Reduction
pca = PCA(n_components=0.95)  # Retain 95% variance
X_resampled_pca = pca.fit_transform(X_resampled)
X_test_pca = pca.transform(X_test)

In [9]:
# Step 7: Hyperparameter Tuning for Random Forest
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

# Define Stratified K-Fold with 5 splits
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

rf = RandomForestClassifier(class_weight='balanced', random_state=42)
random_search_rf = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid_rf,
    n_iter=30,
    cv=stratified_kfold,
    n_jobs=-1,
    verbose=2,
    random_state=42
)
random_search_rf.fit(X_resampled_pca, y_resampled)
best_rf = random_search_rf.best_estimator_

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 1.0min
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 1.0min
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 1.0min
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 1.0min
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 1.0min
[CV] END max_depth=None, max_features=log2, min_samples_leaf=4, min_samples_split=5, n_estimators=300; total time= 1.2min
[CV] END max_depth=None, max_features=log2, min_samples_leaf=4, min_samples_split=5, n_estimators=300; total time= 1.2min
[CV] END max_depth=None, max_features=log2, min_samples_leaf=4, min_samples_split=5,

In [12]:
#Hyperparameter Tuning for logisitc regression

param_grid_lr = {
    'C': [0.1, 1, 10, 100],  # Regularization strength
    'solver': ['lbfgs', 'liblinear'],  # Common solvers for small/medium datasets
    'penalty': ['l2'],  # Regularization type
    'max_iter': [100, 200, 500]  # Number of iterations
}

# Define Stratified K-Fold
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize Logistic Regression
lr = LogisticRegression(random_state=42)

# Perform Randomized Search for Logistic Regression
random_search_lr = RandomizedSearchCV(
    estimator=lr,
    param_distributions=param_grid_lr,
    n_iter=20,  # Number of hyperparameter combinations to try
    cv=stratified_kfold,  # Use Stratified K-Fold
    n_jobs=-1,  # Use all available cores
    verbose=2,
    random_state=42
)

# Fit Logistic Regression on resampled data
random_search_lr.fit(X_resampled_pca, y_resampled)

# Get the best estimator
best_lr = random_search_lr.best_estimator_


Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] END ........C=1, max_iter=200, penalty=l2, solver=lbfgs; total time=   0.1s
[CV] END ........C=1, max_iter=200, penalty=l2, solver=lbfgs; total time=   0.1s
[CV] END ........C=1, max_iter=200, penalty=l2, solver=lbfgs; total time=   0.1s
[CV] END ........C=1, max_iter=200, penalty=l2, solver=lbfgs; total time=   0.1s
[CV] END ........C=1, max_iter=200, penalty=l2, solver=lbfgs; total time=   0.1s
[CV] END .......C=10, max_iter=500, penalty=l2, solver=lbfgs; total time=   0.1s
[CV] END .......C=10, max_iter=500, penalty=l2, solver=lbfgs; total time=   0.1s
[CV] END .......C=10, max_iter=500, penalty=l2, solver=lbfgs; total time=   0.0s
[CV] END .......C=10, max_iter=500, penalty=l2, solver=lbfgs; total time=   0.1s
[CV] END ......C=0.1, max_iter=100, penalty=l2, solver=lbfgs; total time=   0.0s
[CV] END ......C=0.1, max_iter=100, penalty=l2, solver=lbfgs; total time=   0.0s
[CV] END ......C=0.1, max_iter=100, penalty=l2,

In [22]:
# # # Step 8: Hyperparameter Tuning for SVM
# # param_grid_svm = {
# #     'C': [0.1, 1, 10],
# #     'kernel': ['linear', 'rbf', 'poly'],
# #     'gamma': ['scale', 'auto']
# # }

# # # Define Stratified K-Fold with 5 splits
# # stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# # svm = SVC(probability=True, random_state=42)
# # random_search_svm = RandomizedSearchCV(
# #     estimator=svm,
# #     param_distributions=param_grid_svm,
# #     n_iter=20,
# #     cv=stratified_kfold,
# #     n_jobs=-1,
# #     verbose=2,
# #     random_state=42
# # )
# # random_search_svm.fit(X_resampled_pca, y_resampled)
# # best_svm = random_search_svm.best_estimator_

# # Step 1: Initialize the SVC model
# 

param_grid_svm = {
    'C': np.logspace(-1, 1, 3),
    'kernel': ['rbf', 'poly'],
    'gamma': ['scale', 'auto']
}
svm = SVC(probability=False, random_state=42)
random_search_svm = RandomizedSearchCV(
    estimator=svm,
    param_distributions=param_grid_svm,
    n_iter=20,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_jobs=-1,
    verbose=2,
    random_state=42
)
random_search_svm.fit(X_resampled, y_resampled)
best_svm = random_search_svm.best_estimator_



Fitting 5 folds for each of 12 candidates, totalling 60 fits




[CV] END ....................C=0.1, gamma=scale, kernel=poly; total time=  41.8s
[CV] END ....................C=0.1, gamma=scale, kernel=poly; total time=  42.0s
[CV] END ....................C=0.1, gamma=scale, kernel=poly; total time=  42.6s
[CV] END ....................C=0.1, gamma=scale, kernel=poly; total time=  42.9s
[CV] END ....................C=0.1, gamma=scale, kernel=poly; total time=  43.1s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=  43.8s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=  43.9s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=  44.6s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=  44.7s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=  44.7s
[CV] END .....................C=0.1, gamma=auto, kernel=poly; total time=  40.6s
[CV] END .....................C=0.1, gamma=auto, kernel=poly; total time=  40.2s
[CV] END ...................

In [34]:
# Step 9: Ensemble Voting Classifier with Logistic Regression
voting_clf = VotingClassifier(estimators=[
    ('rf', best_rf),  # Random Forest
    ('svm', best_svm),  # SVM
    ('lr', best_lr)  # Logistic Regression
], voting='soft')  # 'soft' voting uses predicted probabilities

# Train the Voting Classifier
voting_clf.fit(X_resampled_pca, y_resampled)


In [11]:

# Step 10: Model Evaluation on Test Set
# Random Forest Evaluation
y_pred_rf = best_rf.predict(X_test_pca)
y_pred_rf_proba = best_rf.predict_proba(X_test_pca)[:, 1]

accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)

print("Random Forest Test Set Evaluation:")
print("Accuracy:", accuracy_rf)
print("Precision:", precision_rf)
print("Recall:", recall_rf)
print("F1 Score:", f1_rf)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))

Random Forest Test Set Evaluation:
Accuracy: 0.8930161042324188
Precision: 0.5223214285714286
Recall: 0.5883620689655172
F1 Score: 0.5533783783783783
Confusion Matrix:
 [[10216   749]
 [  573   819]]


In [None]:
#Logistic Regression

# Apply PCA transformation to the test data
X_test_pca = pca.transform(X_test)

# Predictions using Logistic Regression
y_pred_lr = best_lr.predict(X_test_pca)

# Evaluate Metrics
accuracy_lr = accuracy_score(y_test, y_pred_lr)
precision_lr = precision_score(y_test, y_pred_lr)
recall_lr = recall_score(y_test, y_pred_lr)
f1_lr = f1_score(y_test, y_pred_lr)

# Print Evaluation Results
print("Logistic Regression Test Set Evaluation:")
print("Accuracy:", accuracy_lr)
print("Precision:", precision_lr)
print("Recall:", recall_lr)
print("F1 Score:", f1_lr)


Logistic Regression Test Set Evaluation:
Accuracy: 0.9011896091284293
Precision: 0.5540113708149084
Recall: 0.6300287356321839
F1 Score: 0.5895798319327731


In [32]:
# PCA Transformation on Training Data
X_train_pca = pca.fit_transform(X_resampled)  # Fit PCA on the training data
best_svm.fit(X_train_pca, y_resampled)  # Train SVM on PCA-transformed training data

# Apply PCA Transformation on Test Data
X_test_pca = pca.transform(X_test)  # Use the same PCA object fitted on training data

# Check feature dimensions before evaluation
print("Shape of PCA-transformed test data:", X_test_pca.shape)
print("Number of features expected by SVM:", best_svm.n_features_in_)
assert X_test_pca.shape[1] == best_svm.n_features_in_, "Feature dimension mismatch!"

# SVM Evaluation
y_pred_svm = best_svm.predict(X_test_pca)

# Evaluate SVM Performance
accuracy_svm = accuracy_score(y_test, y_pred_svm)
precision_svm = precision_score(y_test, y_pred_svm)
recall_svm = recall_score(y_test, y_pred_svm)
f1_svm = f1_score(y_test, y_pred_svm)

print("SVM Test Set Evaluation:")
print("Accuracy:", accuracy_svm)
print("Precision:", precision_svm)
print("Recall:", recall_svm)
print("F1 Score:", f1_svm)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))


Shape of PCA-transformed test data: (12357, 40)
Number of features expected by SVM: 40
SVM Test Set Evaluation:
Accuracy: 0.9043457149793639
Precision: 0.5830696202531646
Recall: 0.5294540229885057
F1 Score: 0.5549698795180723
Confusion Matrix:
 [[10438   527]
 [  655   737]]


In [33]:
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

# Perform Cross-Validation for Random Forest
cross_val_rf = cross_val_score(best_rf, X_resampled_pca, y_resampled, cv=skf)
print("Random Forest Cross-Validation Scores:", cross_val_rf)
print("Random Forest Mean Cross-Validation Score:", cross_val_rf.mean())

# Perform Cross-Validation for SVM
cross_val_svm = cross_val_score(best_svm, X_resampled_pca, y_resampled, cv=skf)
print("SVM Cross-Validation Scores:", cross_val_svm)
print("SVM Mean Cross-Validation Score:", cross_val_svm.mean())

# Step 8: Train Logistic Regression
log_reg = LogisticRegression(random_state=42, max_iter=1000)
log_reg.fit(X_resampled_pca, y_resampled)

# Perform Cross-Validation for Logistic Regression
cross_val_lr = cross_val_score(log_reg, X_resampled_pca, y_resampled, cv=skf)
print("Logistic Regression Cross-Validation Scores:", cross_val_lr)
print("Logistic Regression Mean Cross-Validation Score:", cross_val_lr.mean())


Random Forest Cross-Validation Scores: [0.9377565  0.9334506  0.93696863 0.94205023 0.93579595]
Random Forest Mean Cross-Validation Score: 0.9372043827519214
SVM Cross-Validation Scores: [0.94352159 0.94439558 0.94224568 0.94889084 0.94586143]
SVM Mean Cross-Validation Score: 0.9449830250836572
Logistic Regression Cross-Validation Scores: [0.93062341 0.92778266 0.92905306 0.93237565 0.92563276]
Logistic Regression Mean Cross-Validation Score: 0.9290935087772472


In [35]:
# Voting Classifier Evaluation
y_pred_voting = voting_clf.predict(X_test_pca)

accuracy_voting = accuracy_score(y_test, y_pred_voting)
precision_voting = precision_score(y_test, y_pred_voting)
recall_voting = recall_score(y_test, y_pred_voting)
f1_voting = f1_score(y_test, y_pred_voting)

print("Voting Classifier Test Set Evaluation:")
print("Accuracy:", accuracy_voting)
print("Precision:", precision_voting)
print("Recall:", recall_voting)
print("F1 Score:", f1_voting)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_voting))

AttributeError: This 'SVC' has no attribute 'predict_proba'

In [36]:
# Step 11: Threshold Optimization for Random Forest
thresholds = np.arange(0.1, 0.9, 0.1)
for threshold in thresholds:
    y_pred_new_rf = (y_pred_rf_proba >= threshold).astype(int)
    precision = precision_score(y_test, y_pred_new_rf)
    recall = recall_score(y_test, y_pred_new_rf)
    f1 = f1_score(y_test, y_pred_new_rf)
    print(f"Threshold: {threshold}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}")

Threshold: 0.1, Precision: 0.2547709923664122, Recall: 0.959051724137931, F1 Score: 0.4025934861278649
Threshold: 0.2, Precision: 0.349957971420566, Recall: 0.8972701149425287, F1 Score: 0.5035275146139891
Threshold: 0.30000000000000004, Precision: 0.4265313791807591, Recall: 0.8153735632183908, F1 Score: 0.5600789538613373
Threshold: 0.4, Precision: 0.47539417104634496, Recall: 0.7147988505747126, F1 Score: 0.5710186513629842
Threshold: 0.5, Precision: 0.5194479297365119, Recall: 0.5948275862068966, F1 Score: 0.5545880776959142
Threshold: 0.6000000000000001, Precision: 0.5860113421550095, Recall: 0.4454022988505747, F1 Score: 0.5061224489795918
Threshold: 0.7000000000000001, Precision: 0.6368159203980099, Recall: 0.27586206896551724, F1 Score: 0.3849624060150376
Threshold: 0.8, Precision: 0.7083333333333334, Recall: 0.1221264367816092, F1 Score: 0.20833333333333334
