Importing Required Libraries

In [45]:
import numpy as np
import pandas as pd
import urllib.request
import zipfile
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from imblearn.over_sampling import SMOTE



Downloading and Extracting Data Set

In [46]:
# Step 1: Download and Extract Dataset
dataset_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip'
dataset_path = 'bank-additional.zip'
extracted_folder = 'bank-additional'

# Download and Extract Dataset
print("Downloading dataset...")
urllib.request.urlretrieve(dataset_url, dataset_path)
print("Extracting dataset...")
with zipfile.ZipFile(dataset_path, 'r') as zip_ref:
    zip_ref.extractall(extracted_folder)

# Load the CSV file
data_file_path = os.path.join(extracted_folder, 'bank-additional', 'bank-additional-full.csv')
data = pd.read_csv(data_file_path, sep=';')

print("Dataset loaded successfully.")
print("First few rows of the dataset:")
print(data.head())



Downloading dataset...
Extracting dataset...
Dataset loaded successfully.
First few rows of the dataset:
   age        job  marital    education  default housing loan    contact  \
0   56  housemaid  married     basic.4y       no      no   no  telephone   
1   57   services  married  high.school  unknown      no   no  telephone   
2   37   services  married  high.school       no     yes   no  telephone   
3   40     admin.  married     basic.6y       no      no   no  telephone   
4   56   services  married  high.school       no      no  yes  telephone   

  month day_of_week  ...  campaign  pdays  previous     poutcome emp.var.rate  \
0   may         mon  ...         1    999         0  nonexistent          1.1   
1   may         mon  ...         1    999         0  nonexistent          1.1   
2   may         mon  ...         1    999         0  nonexistent          1.1   
3   may         mon  ...         1    999         0  nonexistent          1.1   
4   may         mon  ...         

Step 2 : Preprocessing

In [47]:
# Convert target column to binary
data['y'] = data['y'].apply(lambda x: 1 if x == 'yes' else 0)

# One-hot encoding for categorical features
data = pd.get_dummies(data, drop_first=True)

# Define features and target
X = data.drop('y', axis=1)
y = data['y']


Step 3: Splitting the Data into Training and Test sets

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

STEP 4: Handling Class Balance using SMOT

In [49]:
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)



Step 5: Feature Scaling

In [50]:
scaler = StandardScaler()
X_resampled = scaler.fit_transform(X_resampled)
X_test = scaler.transform(X_test)

Step 4: Feature Engineering - Dimensonality Redcution using PCA

In [51]:

pca = PCA(n_components=0.95)  # Retain 95% variance
X_resampled_pca = pca.fit_transform(X_resampled)
X_test_pca = pca.transform(X_test)



Step 5 - Hyperparameter Tuning for Random Forest


In [52]:
param_grid_rf = {
    'n_estimators': [100, 200, 300],  # Reduced the number of trees to make tuning faster
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

rf = RandomForestClassifier(class_weight='balanced', random_state=42)
random_search_rf = RandomizedSearchCV(
    estimator=rf, 
    param_distributions=param_grid_rf, 
    n_iter=20,  
    cv=3,  
    n_jobs=-1, 
    verbose=2,
    random_state=42
)
random_search_rf.fit(X_resampled_pca, y_resampled)
best_rf = random_search_rf.best_estimator_



Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] END max_depth=None, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=  20.0s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=  20.2s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=  20.3s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=  20.5s
[CV] END max_depth=30, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=  20.7s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=  21.0s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=  21.2s
[CV] END max_depth=30, max_features=log2, min_samples_leaf=1, min_samples_split=5, n

Step 6 - SVM Model Training with Hyperparameter Tuning

In [53]:
param_grid_svm = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

svm = SVC(probability=True, random_state=42)
random_search_svm = RandomizedSearchCV(
    estimator=svm, 
    param_distributions=param_grid_svm, 
    n_iter=12, 
    cv=3, 
    n_jobs=-1, 
    verbose=2,
    random_state=42
)
random_search_svm.fit(X_resampled_pca, y_resampled)
best_svm = random_search_svm.best_estimator_


Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time= 1.1min
[CV] END ...................C=0.1, gamma=auto, kernel=linear; total time= 1.1min
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time= 1.8min
[CV] END ...................C=0.1, gamma=auto, kernel=linear; total time= 1.8min
[CV] END ...................C=0.1, gamma=auto, kernel=linear; total time= 1.8min
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time= 1.9min
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time= 2.7min
[CV] END ......................C=0.1, gamma=auto, kernel=rbf; total time= 2.8min
[CV] END .......................C=1, gamma=scale, kernel=rbf; total time= 1.1min
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time= 3.6min
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time= 3.6min
[CV] END ....................C=1, gamma=scale, k

Step 7 - Cross Validation using Stratified K-Fold


In [54]:
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
cross_val_rf = cross_val_score(best_rf, X_resampled_pca, y_resampled, cv=skf)
cross_val_svm = cross_val_score(best_svm, X_resampled_pca, y_resampled, cv=skf)

print("Random Forest Cross-Validation Scores:", cross_val_rf)
print("Random Forest Mean Cross-Validation Score:", cross_val_rf.mean())
print("SVM Cross-Validation Scores:", cross_val_svm)
print("SVM Mean Cross-Validation Score:", cross_val_svm.mean())

Random Forest Cross-Validation Scores: [0.93863592 0.93305971 0.93775042 0.94126845 0.93540506]
Random Forest Mean Cross-Validation Score: 0.9372239101745434
SVM Cross-Validation Scores: [0.94420559 0.93960715 0.94068211 0.94732727 0.94312518]
SVM Mean Cross-Validation Score: 0.9429894614367397


Model Evaluation Test Set Using Random Forest Evlaution

In [55]:
y_pred_rf = best_rf.predict(X_test_pca)
y_pred_rf_proba = best_rf.predict_proba(X_test_pca)[:, 1]

accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)

print("Random Forest Test Set Evaluation:")
print("Accuracy:", accuracy_rf)
print("Precision:", precision_rf)
print("Recall:", recall_rf)
print("F1 Score:", f1_rf)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))


Random Forest Test Set Evaluation:
Accuracy: 0.8945536942623614
Precision: 0.5285073670723895
Recall: 0.5926724137931034
F1 Score: 0.558753809685066
Confusion Matrix:
 [[10229   736]
 [  567   825]]


Model Evaluation Test Set Using SVM Method

In [56]:
y_pred_svm = best_svm.predict(X_test_pca)

accuracy_svm = accuracy_score(y_test, y_pred_svm)
precision_svm = precision_score(y_test, y_pred_svm)
recall_svm = recall_score(y_test, y_pred_svm)
f1_svm = f1_score(y_test, y_pred_svm)

print("SVM Test Set Evaluation:")
print("Accuracy:", accuracy_svm)
print("Precision:", precision_svm)
print("Recall:", recall_svm)
print("F1 Score:", f1_svm)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))


SVM Test Set Evaluation:
Accuracy: 0.9058833050093065
Precision: 0.5809187279151944
Recall: 0.5905172413793104
F1 Score: 0.5856786604916281
Confusion Matrix:
 [[10372   593]
 [  570   822]]


Adjusting Decision Threshold for Random Forest

In [57]:
# y_pred_new_rf = (y_pred_rf_proba >= 0.3).astype(int)
# precision_rf_new = precision_score(y_test, y_pred_new_rf)
# recall_rf_new = recall_score(y_test, y_pred_new_rf)
# f1_rf_new = f1_score(y_test, y_pred_new_rf)

# print("Random Forest with Adjusted Threshold (0.3):")
# print("Precision:", precision_rf_new)
# print("Recall:", recall_rf_new)
# print("F1 Score:", f1_rf_new)

thresholds = np.arange(0.1, 0.9, 0.1)
for threshold in thresholds:
    y_pred_new_rf = (y_pred_rf_proba >= threshold).astype(int)
    precision = precision_score(y_test, y_pred_new_rf)
    recall = recall_score(y_test, y_pred_new_rf)
    f1 = f1_score(y_test, y_pred_new_rf)
    print(f"Threshold: {threshold}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}")

Threshold: 0.1, Precision: 0.24645683784281244, Recall: 0.9619252873563219, F1 Score: 0.3923809523809524
Threshold: 0.2, Precision: 0.34455172413793106, Recall: 0.8972701149425287, F1 Score: 0.49790711580625874
Threshold: 0.30000000000000004, Precision: 0.42626034612490593, Recall: 0.8139367816091954, F1 Score: 0.5595061728395062
Threshold: 0.4, Precision: 0.4717607973421927, Recall: 0.7140804597701149, F1 Score: 0.5681623320948842
Threshold: 0.5, Precision: 0.526117054751416, Recall: 0.6005747126436781, F1 Score: 0.5608856088560885
Threshold: 0.6000000000000001, Precision: 0.5799043062200957, Recall: 0.4353448275862069, F1 Score: 0.49733278621255644
Threshold: 0.7000000000000001, Precision: 0.6412478336221837, Recall: 0.26580459770114945, F1 Score: 0.37582529202640935
Threshold: 0.8, Precision: 0.7030567685589519, Recall: 0.11566091954022989, F1 Score: 0.1986428130783467


Final recommendation Based on Evaluation

In [58]:
# Step 10: Final Recommendations based on Evaluation
if f1_rf_new > f1_svm:
    print("Random Forest with adjusted threshold is recommended based on improved F1 score.")
elif f1_svm > f1_rf_new:
    print("SVM is recommended based on better F1 score.")
else:
    print("Both models perform similarly; further tuning might be necessary.")
    

SVM is recommended based on better F1 score.
