Importing Required Libraries

In [23]:
#Imporing Required libraries

import numpy as np
import pandas as pd
import urllib.request
import zipfile
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from imblearn.over_sampling import SMOTE

Downloading and Extracting Data Set

In [24]:
# URL for bank-additional dataset
dataset_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip'
dataset_path = 'bank-additional.zip'
extracted_folder = 'bank-additional'

print("Downloading dataset...")
urllib.request.urlretrieve(dataset_url, dataset_path)

# Extract the dataset
print("Extracting dataset...")
with zipfile.ZipFile(dataset_path, 'r') as zip_ref:
    zip_ref.extractall(extracted_folder)

# Load the CSV file
data_file_path = os.path.join(extracted_folder, 'bank-additional', 'bank-additional-full.csv')
data = pd.read_csv(data_file_path, sep=';')

print("Dataset loaded successfully.")
print("First few rows of the dataset:")
print(data.head())



Downloading dataset...
Extracting dataset...
Dataset loaded successfully.
First few rows of the dataset:
   age        job  marital    education  default housing loan    contact  \
0   56  housemaid  married     basic.4y       no      no   no  telephone   
1   57   services  married  high.school  unknown      no   no  telephone   
2   37   services  married  high.school       no     yes   no  telephone   
3   40     admin.  married     basic.6y       no      no   no  telephone   
4   56   services  married  high.school       no      no  yes  telephone   

  month day_of_week  ...  campaign  pdays  previous     poutcome emp.var.rate  \
0   may         mon  ...         1    999         0  nonexistent          1.1   
1   may         mon  ...         1    999         0  nonexistent          1.1   
2   may         mon  ...         1    999         0  nonexistent          1.1   
3   may         mon  ...         1    999         0  nonexistent          1.1   
4   may         mon  ...         

Step 2 : Preprocessing

In [25]:
# Handle missing values for numeric and categorical columns
numeric_cols = data.select_dtypes(include=['number']).columns
categorical_cols = data.select_dtypes(include=['object']).columns

# Fill missing values in numeric columns with the mean
data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].mean())

# Fill missing values in categorical columns with the mode (if there are any missing values)
for col in categorical_cols:
    if data[col].isnull().any():
        data[col] = data[col].fillna(data[col].mode().iloc[0])

# Separate the target variable before one-hot encoding
y = data['y'].apply(lambda x: 1 if x == 'yes' else 0)  # Target: Convert 'yes' to 1 and 'no' to 0

# Drop the target variable from the features
X = data.drop('y', axis=1)

# Encode categorical variables using one-hot encoding on the features only
X = pd.get_dummies(X, drop_first=True)

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)



STEP 3: Handling Class Balance using SMOT

In [26]:
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)



Step 4: Feature Engineering - Dimensonality Redcution using PCA

In [27]:
pca = PCA(n_components=0.95)  # Retain 95% variance
X_resampled_pca = pca.fit_transform(X_resampled)
X_test_pca = pca.transform(X_test)


Step 5 - Hyperparameter Tuning for Random Forest


In [29]:
param_grid_rf = {
    'n_estimators': [50, 100, 200],  # Reduced the number of trees to make tuning faster
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2']
}

rf = RandomForestClassifier(class_weight='balanced', random_state=42)
random_search_rf = RandomizedSearchCV(
    estimator=rf, 
    param_distributions=param_grid_rf, 
    n_iter=50,  
    cv=3,  
    n_jobs=-1, 
    verbose=2,
    random_state=42
)
random_search_rf.fit(X_resampled_pca, y_resampled)
best_rf = random_search_rf.best_estimator_



Fitting 3 folds for each of 50 candidates, totalling 150 fits
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=   6.7s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=   6.8s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=   6.9s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   8.1s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   7.7s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   7.7s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=  15.7s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; tot

Step 6 - SVM Model Training with Hyperparameter Tuning

In [31]:
param_grid_svm = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

svm = SVC(probability=True, random_state=42)
random_search_svm = RandomizedSearchCV(
    estimator=svm, 
    param_distributions=param_grid_svm, 
    n_iter=12, 
    cv=3, 
    n_jobs=-1, 
    verbose=2,
    random_state=42
)
random_search_svm.fit(X_resampled_pca, y_resampled)
best_svm = random_search_svm.best_estimator_


Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] END ...................C=0.1, gamma=auto, kernel=linear; total time= 2.5min
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time= 2.5min
[CV] END ...................C=0.1, gamma=auto, kernel=linear; total time= 2.6min
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time= 2.7min
[CV] END ...................C=0.1, gamma=auto, kernel=linear; total time= 2.9min
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time= 2.9min
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time= 4.0min
[CV] END ......................C=0.1, gamma=auto, kernel=rbf; total time= 4.3min
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time= 4.4min
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time= 4.8min
[CV] END .......................C=1, gamma=scale, kernel=rbf; total time= 2.8min
[CV] END ......................C=0.1, gamma=auto

Step 7 - Cross Validation using Stratified K-Fold


In [32]:
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
cross_val_rf = cross_val_score(best_rf, X_resampled_pca, y_resampled, cv=skf)
cross_val_svm = cross_val_score(best_svm, X_resampled_pca, y_resampled, cv=skf)

print("Random Forest Cross-Validation Scores:", cross_val_rf)
print("Random Forest Mean Cross-Validation Score:", cross_val_rf.mean())
print("SVM Cross-Validation Scores:", cross_val_svm)
print("SVM Mean Cross-Validation Score:", cross_val_svm.mean())

Random Forest Cross-Validation Scores: [0.94401016 0.93960715 0.94762044 0.94947718 0.94693638]
Random Forest Mean Cross-Validation Score: 0.9455302646308569
SVM Cross-Validation Scores: [0.94557358 0.94722955 0.95055214 0.95348383 0.95006352]
SVM Mean Cross-Validation Score: 0.9493805223575048


Model Evaluation Test Set Using Random Forest Evlaution

In [33]:
y_pred_rf = best_rf.predict(X_test_pca)
y_pred_rf_proba = best_rf.predict_proba(X_test_pca)[:, 1]

accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)

print("Random Forest Test Set Evaluation:")
print("Accuracy:", accuracy_rf)
print("Precision:", precision_rf)
print("Recall:", recall_rf)
print("F1 Score:", f1_rf)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))


Random Forest Test Set Evaluation:
Accuracy: 0.8928542526503197
Precision: 0.5180084745762712
Recall: 0.7025862068965517
F1 Score: 0.5963414634146341
Confusion Matrix:
 [[10055   910]
 [  414   978]]


Model Evaluation Test Set Using SVM Method

In [34]:
y_pred_svm = best_svm.predict(X_test_pca)

accuracy_svm = accuracy_score(y_test, y_pred_svm)
precision_svm = precision_score(y_test, y_pred_svm)
recall_svm = recall_score(y_test, y_pred_svm)
f1_svm = f1_score(y_test, y_pred_svm)

print("SVM Test Set Evaluation:")
print("Accuracy:", accuracy_svm)
print("Precision:", precision_svm)
print("Recall:", recall_svm)
print("F1 Score:", f1_svm)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))


SVM Test Set Evaluation:
Accuracy: 0.8803916808286801
Precision: 0.4753722794959908
Recall: 0.5962643678160919
F1 Score: 0.5289993626513703
Confusion Matrix:
 [[10049   916]
 [  562   830]]


Adjusting Decision Threshold for Random Forest

In [35]:
y_pred_new_rf = (y_pred_rf_proba >= 0.3).astype(int)
precision_rf_new = precision_score(y_test, y_pred_new_rf)
recall_rf_new = recall_score(y_test, y_pred_new_rf)
f1_rf_new = f1_score(y_test, y_pred_new_rf)

print("Random Forest with Adjusted Threshold (0.3):")
print("Precision:", precision_rf_new)
print("Recall:", recall_rf_new)
print("F1 Score:", f1_rf_new)


Random Forest with Adjusted Threshold (0.3):
Precision: 0.40774193548387094
Recall: 0.9080459770114943
F1 Score: 0.5627782724844167


Final recommendation Based on Evaluation

In [36]:
# Step 10: Final Recommendations based on Evaluation
if f1_rf_new > f1_svm:
    print("Random Forest with adjusted threshold is recommended based on improved F1 score.")
elif f1_svm > f1_rf_new:
    print("SVM is recommended based on better F1 score.")
else:
    print("Both models perform similarly; further tuning might be necessary.")
    

Random Forest with adjusted threshold is recommended based on improved F1 score.
