In [None]:
#Step 1: import libraries and dataset

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the semicolon-delimited bank marketing CSV into `df`.
# The original machine-specific path is kept as the default so existing runs are
# unaffected; set the BANK_DATA_PATH environment variable to read the file from
# another location without editing the notebook.
import os

DATA_PATH = os.environ.get("BANK_DATA_PATH", r"C:\Users\samue\Downloads\bank-additional-full.csv")
df = pd.read_csv(DATA_PATH, sep=';')

In [None]:
# Step 2: Preliminary view of the data
# Display the first rows of the loaded frame as a quick sanity check.

df.head()

In [None]:
# Print the frame summary (columns, dtypes, non-null counts).
df.info()

In [None]:
# The histogram below shows that the target value is imbalanced, so a sampling technique needs to be applied in the model design.
sns.histplot(data=df, x='y')

   # bank client data:
   1 - age (numeric)
   2 - job : type of job (categorical: "admin.","blue-collar","entrepreneur","housemaid","management","retired","self-employed","services","student","technician","unemployed","unknown")
   3 - marital : marital status (categorical: "divorced","married","single","unknown"; note: "divorced" means divorced or widowed)
   4 - education (categorical: "basic.4y","basic.6y","basic.9y","high.school","illiterate","professional.course","university.degree","unknown")
   5 - default: has credit in default? (categorical: "no","yes","unknown")
   6 - housing: has housing loan? (categorical: "no","yes","unknown")
   7 - loan: has personal loan? (categorical: "no","yes","unknown")
   8 - contact: contact communication type (categorical: "cellular","telephone") 
   9 - month: last contact month of year (categorical: "jan", "feb", "mar", ..., "nov", "dec")
  10 - day_of_week: last contact day of the week (categorical: "mon","tue","wed","thu","fri")
  11 - duration: last contact duration, in seconds (numeric).
  12 - campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)
  13 - pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric; 999 means client was not previously contacted)
  14 - previous: number of contacts performed before this campaign and for this client (numeric)
  15 - poutcome: outcome of the previous marketing campaign (categorical: "failure","nonexistent","success")
  16 - emp.var.rate: employment variation rate - quarterly indicator (numeric)
  17 - cons.price.idx: consumer price index - monthly indicator (numeric)     
  18 - cons.conf.idx: consumer confidence index - monthly indicator (numeric)     
  19 - euribor3m: euribor 3 month rate - daily indicator (numeric)
  20 - nr.employed: number of employees - quarterly indicator (numeric)

In [None]:
# Check for fully duplicated rows and report how many there are.
# A bare `df[df.duplicated()]` in the middle of a cell is evaluated and then
# discarded (only the last expression of a cell is displayed), so the result
# is printed explicitly here.
duplicate_rows = df[df.duplicated()]
print(f"Duplicated rows: {len(duplicate_rows)} of {len(df)} "
      f"({len(duplicate_rows) / max(len(df), 1):.2%})")

# Proceed to drop them: the number of affected records is immaterial (<1%).
df.drop_duplicates(inplace=True)

In [None]:
# Feature groups used during exploration, plus the name of the label column.
categorical = [
    'job', 'marital', 'education', 'default', 'housing',
    'loan', 'contact', 'month', 'day_of_week', 'poutcome',
]
numerical = [
    'age', 'duration', 'campaign', 'pdays', 'previous',
    'emp.var.rate', 'cons.conf.idx', 'euribor3m', 'nr.employed',
]
target = 'y'

# For every categorical feature, print its name followed by the number of
# rows whose value is the literal string 'unknown'.
for col in categorical:
    unknown_rows = df[col] == 'unknown'
    print(col)
    print(int(unknown_rows.sum()))

In [None]:
# Check whether rows with an 'unknown' job carry many positive target values
sns.histplot(data=df, x='job', hue='y')
# Rotate the x tick labels so the category names stay readable
plt.xticks(rotation=90)

In [None]:
# Check whether rows with an 'unknown' marital status carry many positive target values
sns.histplot(data=df, x='marital', hue='y')
# Rotate the x tick labels so the category names stay readable
plt.xticks(rotation=90)

In [None]:
# Handling of 'unknown' values:
# - job / marital: the affected rows are removed (their number is immaterial).
# - default / housing / loan: 'unknown' is imputed as 'no' for now. The default
#   column is intended to be dropped later because default cases are very rare.
# - education: left as its own 'unknown' category in this cell.

unknown_job_or_marital = df['job'].eq('unknown') | df['marital'].eq('unknown')
df.drop(index=df.loc[unknown_job_or_marital].index, inplace=True)

for col in ('default', 'housing', 'loan'):
    # Replace only the literal 'unknown'; every other value is kept as-is.
    df[col] = df[col].mask(df[col] == 'unknown', 'no')

In [None]:
# Imputation for education: replace 'unknown' with the most frequent *known*
# education level among clients with the same job.
#
# The mode is computed on rows whose education is known. If 'unknown' took part
# in the mode, a job whose most common education is 'unknown' would be imputed
# with 'unknown' again, i.e. not imputed at all.
known_education = df.loc[df['education'] != 'unknown']
job_mapping = known_education.groupby('job')['education'].agg(lambda s: s.mode().iloc[0])
# transform the summary to a dictionary {job: most frequent education}
job_mapping_dict = job_mapping.to_dict()

# Apply the mapping to the unknown values only (vectorised instead of a row-wise apply).
# A job with no known education at all has no entry in the mapping; those rows
# keep the 'unknown' label rather than becoming NaN.
is_unknown = df['education'] == 'unknown'
df.loc[is_unknown, 'education'] = (
    df.loc[is_unknown, 'job'].map(job_mapping_dict).fillna('unknown')
)

In [None]:
# Check the education distribution (split by target) after the imputation
sns.histplot(data=df, x='education', hue='y')
# Rotate the x tick labels so the category names stay readable
plt.xticks(rotation=90)

In [None]:
# Review the summary statistics of the numerical features
df[numerical].describe()

In [None]:
# Number of rows with more than 10 contacts in the current campaign
int((df['campaign'] > 10).sum())

In [None]:
# Distribution of pdays, split by target
sns.histplot(data=df, x='pdays', hue='y')

In [None]:
# Distribution of `previous` for each target class (one horizontal box per class)
sns.boxplot(data=df, x='previous', y='y')

In [None]:
# - duration is removed because it is not known until the call has been made.
# - campaign outliers are removed: calling the same client that many times in
#   one campaign is not reasonable. The cut-off is mean + 3 standard deviations
#   (roughly 10 contacts).
# - pdays is mostly 999; note that this cell leaves the pdays column in place.

df.drop(columns=['duration'], inplace=True)

campaign_cutoff = df['campaign'].mean() + 3 * np.std(df['campaign'])
campaign_outliers = df.loc[df['campaign'] > campaign_cutoff].index
df.drop(index=campaign_outliers, inplace=True)


In [None]:
# Update the list of numerical features (duration has been dropped; age is not listed here)
numerical = ['campaign','previous','pdays','emp.var.rate','cons.conf.idx','euribor3m','nr.employed']
# Summary statistics of the numeric columns that remain in the frame
df.describe()

In [None]:
# Review the relationship between emp.var.rate and nr.employed:
# scatter plot with a fitted regression line, then the 2x2 correlation matrix
sns.lmplot(data=df, x='emp.var.rate', y='nr.employed')
np.corrcoef(df['emp.var.rate'],df['nr.employed'])

In [None]:
# nr.employed and emp.var.rate are strongly related, so nr.employed is removed
# to avoid carrying a duplicated feature.
df.drop(columns=['nr.employed'], inplace=True)
numerical = [
    'campaign', 'previous', 'pdays',
    'emp.var.rate', 'cons.conf.idx', 'euribor3m',
]
df.describe()

In [None]:
# Draw one count plot per categorical feature, with the bars split by the target

for var in categorical:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.countplot(data=df, x=var, hue=target, palette='muted', alpha=0.7, ax=ax)
    ax.set_xlabel(var)
    ax.set_ylabel('Count')
    ax.set_title(f'Stacked Histogram for {var} based on Target')
    ax.legend(title='Target', loc='upper right')
    plt.show()

The above analysis indicates that the success rate has little dependency on day_of_week and on the existence of a housing loan, as the outcomes are distributed uniformly across their categorical values. The default column is removed since there are only three default cases. Further, whether a client is in default is not relevant to a deposit business; it matters more for a loan business.

In [None]:
# Draw one box plot per numerical feature, comparing its distribution between
# the target classes.
for var in numerical:
    plt.figure(figsize=(10, 6))
    sns.boxplot(x=target, data=df, y=var)
    # The x-axis holds the target classes and the y-axis the feature values, so
    # label them accordingly (previously the feature name and 'Count').
    plt.xlabel(target)
    plt.ylabel(var)
    plt.title(f'Boxplot for {var} based on Target')
    # No legend: the plot has no hue, so there are no labelled artists to list
    # and plt.legend() would only emit a warning.
    plt.show()

In [None]:
# Check how imbalanced the target is (only about 11% of the rows are 'yes')

sns.histplot(data=df, x='y')
yes_count = df['y'].eq('yes').sum()
no_count = df['y'].eq('no').sum()
yes_count / (yes_count + no_count)

In [None]:
# # More detailed exploration
# sns.pairplot(df, hue=target)
# plt.show()

In [None]:
# Import vairous liabraries for machine learning

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, precision_recall_curve, classification_report
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

In [None]:
# Summarize the features used for model building.
# These lists override the exploration-time lists of the same names.
numerical = ['age','campaign','previous','pdays','cons.conf.idx','euribor3m']
categorical = ['job','marital','education','housing','loan','contact','month','day_of_week','poutcome']
# Name of the label column
target = 'y'

In [None]:
# Separate features and target
X = df[categorical + numerical]
y = df[target]

# Transform the target variable to integer codes using LabelEncoder
# (classes are encoded in sorted order, so 'no' -> 0 and 'yes' -> 1)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Preprocessing: one-hot encode the categorical columns (categories unseen at
# fit time are ignored at transform time) and standardise the numerical ones
cat_transformer = OneHotEncoder(handle_unknown='ignore')
num_transformer = StandardScaler()
preprocessor = ColumnTransformer(transformers = [('cat',cat_transformer, categorical),
                                                 ('num', num_transformer, numerical)])

# Keep the preprocessing and SMOTE steps together; they are used individually
# below via pipeline['preprocessor'] and pipeline['smote']
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=1234))  # Include SMOTE in the pipeline
])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

# Fit the preprocessor on the training data only, then oversample the minority
# class of the training set with SMOTE (the test set is never resampled)
X_train_preprocessed = pipeline['preprocessor'].fit_transform(X_train)
X_train_transformed, y_train_transformed = pipeline['smote'].fit_resample(X_train_preprocessed, y_train)

# Transform the test set with the encoder categories and scaler statistics
# learned from the training set. Calling fit_transform here would re-fit on the
# test data: that leaks test statistics and can produce a different one-hot
# column layout than the one the models were trained on.
X_test_transformed = pipeline['preprocessor'].transform(X_test)

In [None]:

# Fit a 5-nearest-neighbours classifier on the preprocessed, resampled training set
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model = knn_model.fit(X_train_transformed, y_train_transformed)

# Predict the labels of the held-out test set
y_pred = knn_model.predict(X_test_transformed)

# Report accuracy, the per-class classification report and the F1 score
knn_accuracy = accuracy_score(y_test, y_pred)
knn_report = classification_report(y_test, y_pred)
knn_f1 = f1_score(y_test, y_pred)
print("Accuracy:", knn_accuracy)
print("Classification Report:")
print(knn_report)
print("F1:", knn_f1)

In [None]:
# params_knn = {'n_neighbors': list(range(1,500)), 'weights': ['uniform', 'distance']}
# randomsearch_knn = RandomizedSearchCV(knn_model, params_knn, cv=10, n_iter=100, scoring='recall')

# randomsearch_knn.fit(X_train_transformed, y_train_transformed)

# print("Best parameters from RandomSearch: ", randomsearch_knn.best_params_)

Best parameters from RandomSearch:  {'weights': 'uniform', 'n_neighbors': 7}

In [None]:
# Import vairous liabraries for machine learning

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, precision_recall_curve, classification_report

# Fit a logistic regression classifier (up to 300 solver iterations) on the
# preprocessed, resampled training set
lr_model = LogisticRegression(max_iter=300)
lr_model = lr_model.fit(X_train_transformed, y_train_transformed)

# Predict the labels of the held-out test set
y_pred = lr_model.predict(X_test_transformed)

# Report accuracy, the per-class classification report and the F1 score
lr_accuracy = accuracy_score(y_test, y_pred)
lr_report = classification_report(y_test, y_pred)
lr_f1 = f1_score(y_test, y_pred)
print("Accuracy:", lr_accuracy)
print("Classification Report:")
print(lr_report)
print("F1:", lr_f1)

In [None]:
from sklearn.svm import SVC

# Fit a support vector classifier with a linear kernel on the preprocessed,
# resampled training set
svc_linear_model = SVC(kernel='linear')
svc_linear_model = svc_linear_model.fit(X_train_transformed, y_train_transformed)

# Predict the labels of the held-out test set
y_pred = svc_linear_model.predict(X_test_transformed)

# Report accuracy, the per-class classification report and the F1 score
svc_linear_accuracy = accuracy_score(y_test, y_pred)
svc_linear_report = classification_report(y_test, y_pred)
svc_linear_f1 = f1_score(y_test, y_pred)
print("Accuracy:", svc_linear_accuracy)
print("Classification Report:")
print(svc_linear_report)
print("F1:", svc_linear_f1)

In [None]:
from sklearn.svm import SVC

# Fit a support vector classifier with an RBF kernel (gamma=0.1) on the
# preprocessed, resampled training set
svc_model = SVC(kernel='rbf', gamma=0.1)
svc_model = svc_model.fit(X_train_transformed, y_train_transformed)

# Predict the labels of the held-out test set
y_pred = svc_model.predict(X_test_transformed)

# Report accuracy, the per-class classification report and the F1 score
svc_rbf_accuracy = accuracy_score(y_test, y_pred)
svc_rbf_report = classification_report(y_test, y_pred)
svc_rbf_f1 = f1_score(y_test, y_pred)
print("Accuracy:", svc_rbf_accuracy)
print("Classification Report:")
print(svc_rbf_report)
print("F1:", svc_rbf_f1)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Fit a gradient boosting classifier (200 trees, learning rate 0.01) on the
# preprocessed, resampled training set.
# random_state is fixed to the same seed used for the train/test split and for
# SMOTE so that re-running the notebook reproduces the same model.
gbc_model = GradientBoostingClassifier(n_estimators=200, learning_rate=0.01, random_state=1234)
gbc_model.fit(X_train_transformed, y_train_transformed)

# Make predictions on the test set
y_pred = gbc_model.predict(X_test_transformed)

# Evaluate the model: accuracy, per-class classification report and F1 score
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("F1:", f1_score(y_test, y_pred))

In [None]:
# # Perform cross-validation
# cv_scores_svc = cross_val_score(svc_linear_model, X_train_transformed, y_train_transformed, cv=5, scoring='recall')

# print("Cross-validation scores for KNN: ", cv_scores_svc)

# # Evaluate on the test set
# y_pred_svc = svc_linear_model.predict(X_test_transformed)
# print("Test accuracy for svc: ", recall_score(y_test, y_pred_svc))

In [None]:
# params_gbc = {'n_estimators': list(range(50,500)),'learning_rate': [0.01,0.01,0.1,0.2],'max_depth': list(range(1,5))}

# randomsearch_gbc = RandomizedSearchCV(gbc_model, params_gbc, cv=5, n_iter=50, scoring='recall', random_state=1234)
# randomsearch_gbc.fit(X_train_transformed, y_train_transformed)

# print("Best parameters from RandomSearch: ", randomsearch_gbc.best_params_)

The code above explores hyperparameter tuning for GBC over n_estimators, learning rate and max depth. The code is commented out due to the long runtime (~3 hours), and the result is pasted below.

### Best parameters from RandomSearch:  {'n_estimators': 84, 'max_depth': 4, 'learning_rate': 0.2}

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Fit a gradient boosting classifier with the hyperparameters reported by the
# random search (n_estimators=84, max_depth=4, learning_rate=0.2) on the
# preprocessed, resampled training set.
# random_state is fixed to the same seed used for the train/test split and for
# SMOTE so that re-running the notebook reproduces the same model.
gbc_model_best = GradientBoostingClassifier(n_estimators=84, max_depth=4, learning_rate=0.2, random_state=1234)
gbc_model_best.fit(X_train_transformed, y_train_transformed)

# Make predictions on the test set
y_pred = gbc_model_best.predict(X_test_transformed)

# Evaluate the model: accuracy, per-class classification report and F1 score
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("F1:", f1_score(y_test, y_pred))

In [None]:
# # Perform cross-validation
# cv_scores_gbc = cross_val_score(gbc_model, X_train_transformed, y_train_transformed, cv=5, scoring='recall')

# print("Cross-validation scores for gbc: ", cv_scores_gbc)

# # Evaluate on the test set
# y_pred_gbc = gbc_model.predict(X_test_transformed)
# print("Test accuracy for gbc: ", recall_score(y_test, y_pred_gbc))

In [None]:
# # Perform cross-validation
# cv_scores_gbc_best = cross_val_score(gbc_model_best, X_train_transformed, y_train_transformed, cv=5, scoring='recall')

# print("Cross-validation scores for gbc: ", cv_scores_gbc_best)

# # Evaluate on the test set
# y_pred_gbc_best = gbc_model_best.predict(X_test_transformed)
# print("Test accuracy for gbc: ", recall_score(y_test, y_pred_gbc_best))

The code below explores the hyperparameters of the SVC model using a polynomial kernel. "C" is the regularization parameter that controls the trade-off between margin width and classification error: a larger "C" gives a narrower margin but a lower classification error on the training data. "Degree" is the degree of the polynomial kernel.

Due to the long runtime (~two hours), this portion of code is commented and the result is extracted below.

### Results: Best parameters from RandomSearch:  {'C': 4.7985894002072, 'degree': 3, 'kernel': 'poly'}

In [None]:
# from scipy.stats import expon
# import pandas as pd
# import numpy as np
# import matplotlib.pyplot as plt
# import seaborn as sns
# from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
# from sklearn.compose import ColumnTransformer
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, precision_recall_curve, classification_report
# from sklearn.svm import SVC

# params_svc = {'C': expon(scale=1),
#               'kernel': ['poly'],
#               'degree': [2, 3, 4],  # Applicable for polynomial kernel
#               }

# svc_model_cv = SVC()
# randomsearch_gbc = RandomizedSearchCV(svc_model_cv, params_svc, cv=5, n_iter=10, scoring='recall')
# randomsearch_gbc.fit(X_train_transformed, y_train_transformed)

# print("Best parameters from RandomSearch: ", randomsearch_gbc.best_params_)

In [None]:
from sklearn.svm import SVC

# Fit a support vector classifier with a degree-3 polynomial kernel and
# C=4.7985894 on the preprocessed, resampled training set
svc_model_best = SVC(kernel='poly', C=4.7985894, degree=3)
svc_model_best = svc_model_best.fit(X_train_transformed, y_train_transformed)

# Predict the labels of the held-out test set
y_pred_svc_best = svc_model_best.predict(X_test_transformed)

# Report accuracy, the per-class classification report and the F1 score
svc_best_accuracy = accuracy_score(y_test, y_pred_svc_best)
svc_best_report = classification_report(y_test, y_pred_svc_best)
svc_best_f1 = f1_score(y_test, y_pred_svc_best)
print("Accuracy:", svc_best_accuracy)
print("Classification Report:")
print(svc_best_report)
print("F1:", svc_best_f1)