## Model Training

#### 1.1 Import Data and Required Packages
##### Importing Pandas, Numpy, Matplotlib, Seaborn and Warings Library.

In [1]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import RidgeClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import warnings


#### Import the CSV Data as Pandas DataFrame

In [2]:
df = pd.read_csv('E:\\Bank_cicd\\notebook\\data\\bank_marketing_deposit_prediction.csv')

In [3]:
df.columns


Index(['age', 'job', 'marital', 'education', 'balance', 'housing', 'loan',
       'duration', 'pdays', 'previous', 'poutcome', 'deposit'],
      dtype='object')

#### Show Top 5 Records

In [4]:
df.head()

Unnamed: 0,age,job,marital,education,balance,housing,loan,duration,pdays,previous,poutcome,deposit
0,59,admin,married,secondary,2343,yes,no,1042,-1,0,unknown,yes
1,56,admin,married,secondary,45,no,no,1467,-1,0,unknown,yes
2,41,technician,married,secondary,1270,yes,no,1389,-1,0,unknown,yes
3,55,services,married,secondary,2476,yes,no,579,-1,0,unknown,yes
4,54,admin,married,tertiary,184,no,no,673,-1,0,unknown,yes


In [5]:
df.columns

Index(['age', 'job', 'marital', 'education', 'balance', 'housing', 'loan',
       'duration', 'pdays', 'previous', 'poutcome', 'deposit'],
      dtype='object')

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11162 entries, 0 to 11161
Data columns (total 12 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        11162 non-null  int64 
 1   job        11162 non-null  object
 2   marital    11162 non-null  object
 3   education  11162 non-null  object
 4   balance    11162 non-null  int64 
 5   housing    11162 non-null  object
 6   loan       11162 non-null  object
 7   duration   11162 non-null  int64 
 8   pdays      11162 non-null  int64 
 9   previous   11162 non-null  int64 
 10  poutcome   11162 non-null  object
 11  deposit    11162 non-null  object
dtypes: int64(5), object(7)
memory usage: 1.0+ MB


In [7]:
df

Unnamed: 0,age,job,marital,education,balance,housing,loan,duration,pdays,previous,poutcome,deposit
0,59,admin,married,secondary,2343,yes,no,1042,-1,0,unknown,yes
1,56,admin,married,secondary,45,no,no,1467,-1,0,unknown,yes
2,41,technician,married,secondary,1270,yes,no,1389,-1,0,unknown,yes
3,55,services,married,secondary,2476,yes,no,579,-1,0,unknown,yes
4,54,admin,married,tertiary,184,no,no,673,-1,0,unknown,yes
...,...,...,...,...,...,...,...,...,...,...,...,...
11157,33,blue_collar,single,primary,1,yes,no,257,-1,0,unknown,no
11158,39,services,married,secondary,733,no,no,83,-1,0,unknown,no
11159,32,technician,single,secondary,29,no,no,156,-1,0,unknown,no
11160,43,technician,married,secondary,0,no,yes,9,172,5,failure,no


In [8]:
import pandas as pd

# Assuming your DataFrame is named df

# Get list of numerical columns
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Get list of categorical columns
categorical_columns = df.select_dtypes(include=['object']).columns.tolist()

# Display the lists
print("Numerical Columns:")
print(numerical_columns)
print("\nCategorical Columns:")
print(categorical_columns)


Numerical Columns:
['age', 'balance', 'duration', 'pdays', 'previous']

Categorical Columns:
['job', 'marital', 'education', 'housing', 'loan', 'poutcome', 'deposit']


In [90]:
df['job'].unique()

array(['admin.', 'technician', 'services', 'management', 'retired',
       'blue-collar', 'unemployed', 'entrepreneur', 'housemaid',
       'unknown', 'self-employed', 'student'], dtype=object)

In [101]:
# Find unique categories in each categorical column
for column in categorical_columns:
    unique_categories = df[column].unique()
    print(f"Unique categories in '{column}':")
    print(unique_categories)
    print()


Unique categories in 'job':
['admin' 'technician' 'services' 'management' 'retired' 'blue_collar'
 'unemployed' 'entrepreneur' 'housemaid' 'unknown' 'self_employed'
 'student']

Unique categories in 'marital':
['married' 'single' 'divorced']

Unique categories in 'education':
['secondary' 'tertiary' 'primary' 'unknown']

Unique categories in 'housing':
['yes' 'no']

Unique categories in 'loan':
['no' 'yes']

Unique categories in 'poutcome':
['unknown' 'other' 'failure' 'success']

Unique categories in 'deposit':
['yes' 'no']



In [76]:
X = df.drop(columns=['deposit'],axis=1)   # iv  


In [77]:
y = df['deposit']

In [78]:
# Define which columns should be ordinal-encoded and which should be scaled
categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(exclude='object').columns

In [79]:
numerical_cols

Index(['age', 'balance', 'duration', 'pdays', 'previous'], dtype='object')

In [80]:
categorical_cols

Index(['job', 'marital', 'education', 'housing', 'loan', 'poutcome'], dtype='object')

In [81]:
from sklearn.impute import SimpleImputer ## HAndling Missing Values
from sklearn.preprocessing import StandardScaler # HAndling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder # Ordinal Encoding
## pipelines
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [82]:
# Create Column Transformer with 3 types of transformers
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
         ("StandardScaler", numeric_transformer, num_features),        
    ]
)

In [57]:
X = preprocessor.fit_transform(X)

In [58]:
# separate dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((8929, 32), (2233, 32))

#### here we do label encoding WRT to dependent variable 

In [63]:
from sklearn.preprocessing import LabelEncoder

# Encode target variable y
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test= label_encoder.transform(y_test)

In [None]:
# Applying one-hot encoding to the dependent variable (target variable) is not standard practice for binary classification tasks. , if more han 2 we can use it 

# from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# # Perform one-hot encoding on encoded target variable
# onehot_encoder = OneHotEncoder(sparse=False)
# y_train_onehot = onehot_encoder.fit_transform(y_train_encoded.reshape(-1, 1))
# y_test_onehot = onehot_encoder.transform(y_test_encoded.reshape(-1, 1))

In [64]:
def evaluate_model(true, predicted):
    # Calculate classification accuracy
    accuracy = accuracy_score(true, predicted)
    
    # Calculate precision, recall, and F1-score for each class
    precision = precision_score(true, predicted, average='macro')
    recall = recall_score(true, predicted, average='macro')
    f1 = f1_score(true, predicted, average='macro')
    
    # Calculate confusion matrix
    cm = confusion_matrix(true, predicted)
    
    return accuracy, precision, recall, f1, cm 

In [65]:
models = {
    "LogisticRegression":  LogisticRegression(),
    "SVC": SVC(),
    "RidgeClassifier": RidgeClassifier(),
    "KNeighborsClassifier":  KNeighborsClassifier(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "XGBClassifier": XGBClassifier(), 
    "CatBoostClassifier": CatBoostClassifier(verbose=False),
    "AdaBoostClassifier": AdaBoostClassifier()
}
model_list = []
accuracy_list =[]
precision_list = []
recall_list = []
f1_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    model_train_accuracy, model_train_precision, model_train_recall, model_train_f1, model_train_cm = evaluate_model(y_train, y_train_pred)

    model_test_accuracy, model_test_precision, model_test_recall, model_test_f1, model_test_cm = evaluate_model(y_test, y_test_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model performance for Training set')
    print("- Accuracy: {:.4f}".format(model_train_accuracy))
    print("- Precision: {:.4f}".format(model_train_precision))
    print("- Recall: {:.4f}".format(model_train_recall))
    print("- F1 Score: {:.4f}".format(model_train_f1))
    print("- Confusion Matrix:\n", model_train_cm)

    print('----------------------------------')

    print('Model performance for Test set')
    print("- Accuracy: {:.4f}".format(model_test_accuracy))
    print("- Precision: {:.4f}".format(model_test_precision))
    print("- Recall: {:.4f}".format(model_test_recall))
    print("- F1 Score: {:.4f}".format(model_test_f1))
    print("- Confusion Matrix:\n", model_test_cm)

    accuracy_list.append(model_test_accuracy)

    print('='*35)
    print('\n')

LogisticRegression
Model performance for Training set
- Accuracy: 0.7972
- Precision: 0.7996
- Recall: 0.7941
- F1 Score: 0.7951
- Confusion Matrix:
 [[4006  701]
 [1110 3112]]
----------------------------------
Model performance for Test set
- Accuracy: 0.7904
- Precision: 0.7920
- Recall: 0.7883
- F1 Score: 0.7890
- Confusion Matrix:
 [[975 191]
 [277 790]]


SVC
Model performance for Training set
- Accuracy: 0.8283
- Precision: 0.8278
- Recall: 0.8283
- F1 Score: 0.8280
- Confusion Matrix:
 [[3897  810]
 [ 723 3499]]
----------------------------------
Model performance for Test set
- Accuracy: 0.8003
- Precision: 0.8001
- Recall: 0.8006
- F1 Score: 0.8001
- Confusion Matrix:
 [[924 242]
 [204 863]]


RidgeClassifier
Model performance for Training set
- Accuracy: 0.7851
- Precision: 0.7928
- Recall: 0.7801
- F1 Score: 0.7811
- Confusion Matrix:
 [[4106  601]
 [1318 2904]]
----------------------------------
Model performance for Test set
- Accuracy: 0.7873
- Precision: 0.7929
- Recall

In [66]:

pd.DataFrame(list(zip(model_list, accuracy_list)), columns=['Model Name', 'accuracy']).sort_values(by=["accuracy"],ascending=False)

Unnamed: 0,Model Name,accuracy
6,CatBoostClassifier,0.807882
7,AdaBoostClassifier,0.802508
1,SVC,0.800269
5,XGBClassifier,0.799373
0,LogisticRegression,0.790416
2,RidgeClassifier,0.787282
3,KNeighborsClassifier,0.772056
4,DecisionTreeClassifier,0.745634


In [33]:
pd.DataFrame(list(zip(model_list, accuracy_list)), columns=['Model Name', 'accuracy']).sort_values(by=["accuracy"],ascending=False)

Unnamed: 0,Model Name,R2_Score
7,CatBoosting Regressor,0.999148
5,Random Forest Regressor,0.999075
4,Decision Tree,0.998727
0,Linear Regression,0.99801
2,Ridge,0.997843
6,XGBRegressor,0.997056
3,K-Neighbors Regressor,0.982223
8,AdaBoost Regressor,0.960397
1,Lasso,0.954502


In [86]:
lin_model = CatBoostClassifier()
lin_model = lin_model.fit(X_train, y_train)
y_pred = lin_model.predict(X_test)
score = accuracy_score(y_test, y_pred)*100
print(f"accuracy score is {score}")
# print(f"Accuracy of the model is "{score})

Learning rate set to 0.026238
0:	learn: 0.6784618	total: 22.8ms	remaining: 22.8s
1:	learn: 0.6643583	total: 40.1ms	remaining: 20s
2:	learn: 0.6520527	total: 56.8ms	remaining: 18.9s
3:	learn: 0.6395129	total: 73.4ms	remaining: 18.3s
4:	learn: 0.6281348	total: 90.5ms	remaining: 18s
5:	learn: 0.6169051	total: 107ms	remaining: 17.7s
6:	learn: 0.6064095	total: 123ms	remaining: 17.5s
7:	learn: 0.5971567	total: 140ms	remaining: 17.3s
8:	learn: 0.5886650	total: 156ms	remaining: 17.1s
9:	learn: 0.5801941	total: 173ms	remaining: 17.1s
10:	learn: 0.5723886	total: 188ms	remaining: 16.9s
11:	learn: 0.5650425	total: 197ms	remaining: 16.2s
12:	learn: 0.5592419	total: 206ms	remaining: 15.6s
13:	learn: 0.5520845	total: 216ms	remaining: 15.2s
14:	learn: 0.5459474	total: 224ms	remaining: 14.7s
15:	learn: 0.5394877	total: 231ms	remaining: 14.2s
16:	learn: 0.5340215	total: 238ms	remaining: 13.7s
17:	learn: 0.5283212	total: 243ms	remaining: 13.3s
18:	learn: 0.5238212	total: 248ms	remaining: 12.8s
19:	learn:

In [89]:
pred_df=pd.DataFrame({'Actual Value':y_test,'Predicted Value':y_pred,'Difference':y_test-y_pred})
pred_df

Unnamed: 0,Actual Value,Predicted Value,Difference
0,0,1,-1
1,1,1,0
2,1,0,1
3,1,1,0
4,0,0,0
...,...,...,...
2228,1,1,0
2229,0,0,0
2230,0,0,0
2231,1,1,0


In [104]:
import os

# Define the path where you want to save the file
folder_path = "E:/BANK_CICD/notebook/data"
file_name = "bank_marketing_deposit_prediction.csv"

# Concatenate the folder path and file name
file_path = os.path.join(folder_path, file_name)

# Save the DataFrame to CSV without the index
df.to_csv(file_path, index=False)

print(f"File saved successfully at: {file_path}")


File saved successfully at: E:/BANK_CICD/notebook/data\bank_marketing_deposit_prediction.csv
