In [None]:
#Step 1: import libraries and dataset

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Semicolon-delimited bank marketing dataset, read from the author's local download folder
csv_path = r"C:\Users\samue\Downloads\bank-additional-full.csv"
df = pd.read_csv(csv_path, sep=';')

In [None]:
# Step 2: preliminary view of the data (first five rows)

df.head(n=5)

In [None]:
# Print the column names, non-null counts and dtypes of the loaded frame
df.info()

In [None]:
# Class counts of the target: the classes are imbalanced, so a sampling
# technique needs to be applied in the model design.
sns.histplot(x='y', data=df)

   # bank client data:
   1 - age (numeric)
   2 - job : type of job (categorical: "admin.","blue-collar","entrepreneur","housemaid","management","retired","self-employed","services","student","technician","unemployed","unknown")
   3 - marital : marital status (categorical: "divorced","married","single","unknown"; note: "divorced" means divorced or widowed)
   4 - education (categorical: "basic.4y","basic.6y","basic.9y","high.school","illiterate","professional.course","university.degree","unknown")
   5 - default: has credit in default? (categorical: "no","yes","unknown")
   6 - housing: has housing loan? (categorical: "no","yes","unknown")
   7 - loan: has personal loan? (categorical: "no","yes","unknown")
   8 - contact: contact communication type (categorical: "cellular","telephone") 
   9 - month: last contact month of year (categorical: "jan", "feb", "mar", ..., "nov", "dec")
  10 - day_of_week: last contact day of the week (categorical: "mon","tue","wed","thu","fri")
  11 - duration: last contact duration, in seconds (numeric).
  12 - campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)
  13 - pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric; 999 means client was not previously contacted)
  14 - previous: number of contacts performed before this campaign and for this client (numeric)
  15 - poutcome: outcome of the previous marketing campaign (categorical: "failure","nonexistent","success")
  16 - emp.var.rate: employment variation rate - quarterly indicator (numeric)
  17 - cons.price.idx: consumer price index - monthly indicator (numeric)     
  18 - cons.conf.idx: consumer confidence index - monthly indicator (numeric)     
  19 - euribor3m: euribor 3 month rate - daily indicator (numeric)
  20 - nr.employed: number of employees - quarterly indicator (numeric)

In [None]:
# Count exact duplicate rows and print their share of the dataset, so the
# decision to drop them is backed by a visible number (a bare expression in the
# middle of a cell is never displayed).
n_rows = len(df)
n_duplicates = int(df.duplicated().sum())
duplicate_share = n_duplicates / n_rows if n_rows else 0.0
print(f"Duplicate rows: {n_duplicates} of {n_rows} ({duplicate_share:.2%})")

# Drop the duplicates (keeps the first occurrence of each duplicated row)
df.drop_duplicates(inplace=True)

In [None]:
# Group the feature columns by type
categorical = [
    'job', 'marital', 'education', 'default', 'housing',
    'loan', 'contact', 'month', 'day_of_week', 'poutcome',
]
numerical = [
    'age', 'duration', 'campaign', 'pdays', 'previous',
    'emp.var.rate', 'cons.conf.idx', 'euribor3m', 'nr.employed',
]
target = 'y'

# For every categorical column, print its name followed by the number of rows
# holding the placeholder value 'unknown'
for column in categorical:
    unknown_count = int((df[column] == 'unknown').sum())
    print(column)
    print(unknown_count)

In [None]:
# Target split per job category, to see whether the 'unknown' jobs hold many positive targets
sns.histplot(x='job', hue='y', data=df)
plt.xticks(rotation=90)

In [None]:
# Target split per marital status, to see whether the 'unknown' rows hold many positive targets
sns.histplot(x='marital', hue='y', data=df)
plt.xticks(rotation=90)

In [None]:
# Rows with an 'unknown' job or marital status are immaterial in number, so they are removed.
# For default, housing and loan the 'unknown' placeholder is imputed as 'no' for now
# (default is left out of the model features later because so few clients defaulted).
# education keeps its 'unknown' category at this point.

unknown_job_or_marital = df['job'].eq('unknown') | df['marital'].eq('unknown')
df.drop(index=df.index[unknown_job_or_marital], inplace=True)

for column in ('default', 'housing', 'loan'):
    df[column] = df[column].mask(df[column] == 'unknown', 'no')

In [None]:
# Imputation for education: replace 'unknown' with the most frequent *known*
# education level among clients holding the same job.
is_unknown_edu = df['education'] == 'unknown'

# Most frequent known education level per job. Rows whose education is 'unknown'
# are excluded so the placeholder itself can never be picked as the fill value.
job_mapping = (
    df.loc[~is_unknown_edu]
    .groupby('job')['education']
    .agg(lambda levels: levels.mode().iloc[0])
)
# Transform the summary to a dictionary (job -> education level)
job_mapping_dict = job_mapping.to_dict()

# Vectorised lookup on the affected rows only. A job with no known education at
# all has no entry in the mapping; those rows keep the value 'unknown'.
df.loc[is_unknown_edu, 'education'] = (
    df.loc[is_unknown_edu, 'job'].map(job_mapping_dict).fillna('unknown')
)

In [None]:
# Inspect the education distribution (split by target) after the imputation
sns.histplot(x='education', hue='y', data=df)
plt.xticks(rotation=90)

In [None]:
# Summary statistics of the numerical columns
df.loc[:, numerical].describe()

In [None]:
# Number of rows with more than 10 contacts in the current campaign
int((df['campaign'] > 10).sum())

In [None]:
# Distribution of pdays, split by target
sns.histplot(x='pdays', hue='y', data=df)

In [None]:
# Spread of the number of previous contacts for each target class
sns.boxplot(y='y', x='previous', data=df)

In [None]:
# duration is only known once the call has ended, so it is removed.
# Rows whose campaign count exceeds mean + 3 s.d. (about 10 contacts) are treated
# as outliers: calling that many times within one campaign is not reasonable.
# pdays (mostly 999) is not removed in this cell; it stays in the frame.

df.drop(columns=['duration'], inplace=True)

# ddof=0 gives the population standard deviation, the same quantity np.std computes
campaign_cutoff = df['campaign'].mean() + 3 * df['campaign'].std(ddof=0)
outlier_index = df.index[df['campaign'] > campaign_cutoff]
df.drop(index=outlier_index, inplace=True)


In [None]:
# Update the list of numerical features after duration was dropped.
# 'age' is still a column of df and is part of every later feature list, so it
# belongs here as well.
numerical = ['age','campaign','previous','pdays','emp.var.rate','cons.conf.idx','euribor3m','nr.employed']
df.describe()

In [None]:
# Relationship between emp.var.rate and nr.employed: regression plot plus correlation matrix
sns.lmplot(x='emp.var.rate', y='nr.employed', data=df)
emp_var_rate = df['emp.var.rate']
nr_employed = df['nr.employed']
np.corrcoef(emp_var_rate, nr_employed)

In [None]:
# nr.employed is strongly related to emp.var.rate, so it is removed to avoid duplicated information
df.drop(columns=['nr.employed'], inplace=True)
numerical = [
    'age', 'campaign', 'previous', 'pdays',
    'emp.var.rate', 'cons.conf.idx', 'euribor3m',
]
df.describe()

In [None]:
# One count plot per categorical feature, with the bars split by target class

for feature in categorical:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.countplot(data=df, x=feature, hue=target, palette='muted', alpha=0.7, ax=ax)
    ax.set_xlabel(feature)
    ax.set_ylabel('Count')
    ax.set_title(f'Stacked Histogram for {feature} based on Target')
    ax.legend(title='Target', loc='upper right')
    plt.show()

The above analysis indicates that the success rate depends little on day_of_week and on the existence of a housing loan, as the outcomes are distributed roughly uniformly across their categories. The default column is removed since there are only three default cases. Further, if a client has defaulted, the bank would benefit from taking deposits from them, as this reduces its overall credit exposure. Hence, default is less relevant in the term deposit subscription scenario.

In [None]:
# One box plot per numerical feature, comparing its distribution between the target classes
for var in numerical:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(data=df, x=target, y=var, ax=ax)
    # The x axis carries the target class and the y axis the feature's values
    ax.set_xlabel(target)
    ax.set_ylabel(var)
    ax.set_title(f'Boxplot for {var} based on Target')
    # No legend: without a hue there are no labelled artists, so a legend would be empty
    plt.show()

In [None]:
# Target imbalance: class counts and the share of 'yes' (about 11% of the data)

sns.histplot(x='y', data=df)
n_yes = (df['y'] == 'yes').sum()
n_no = (df['y'] == 'no').sum()
n_yes / (n_yes + n_no)

In [None]:
# See the distribution of all numeric variables (one histogram per numeric column)
df.hist(figsize=(20,10))

In [None]:
# Heatmap of the pairwise correlations between the numerical features
numeric_corr = df[numerical].corr()
sns.heatmap(numeric_corr, cmap='coolwarm', annot=True)

The above heatmap shows that there is a high correlation between emp.var.rate and euribor3m. Thus, we'll include only one of them in our model.

In [None]:
# Pairwise scatter plots of the numeric columns, coloured by target class
sns.pairplot(data=df, hue=target)
plt.show()

### The code below checks the impact of removing certain unimportant features identified in the exploratory analysis above.

In [None]:
# Import vairous liabraries for machine learning

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, precision_recall_curve, classification_report
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

#### Results when including full list of features

In [None]:
# Feature set for this run: the full list of candidate features
numerical = ['age','campaign','previous','pdays','emp.var.rate','cons.conf.idx','euribor3m']
categorical = ['job','marital','education','housing','loan','contact','month','day_of_week','poutcome']
target = 'y'

# Separate features and target
X = df[categorical + numerical]
y = df[target]

# Transform the target variable to integer labels using LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Preprocessing: one-hot encode the categorical columns, standardise the numerical ones
cat_transformer = OneHotEncoder(handle_unknown='ignore')
num_transformer = StandardScaler()
preprocessor = ColumnTransformer(transformers = [('cat',cat_transformer, categorical),
                                                 ('num', num_transformer, numerical)])

# Hold the preprocessor and SMOTE together; the steps are applied one by one below
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=1234))
])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

# Fit the preprocessor on the training split, then oversample the training data with SMOTE
X_train_transformed, y_train_transformed = pipeline['smote'].fit_resample(
    pipeline['preprocessor'].fit_transform(X_train), y_train)
# Only transform the test split: re-fitting here would scale/encode the test data
# with its own statistics (leakage) and could produce a different column layout
# from the one the model was trained on.
X_test_transformed = pipeline['preprocessor'].transform(X_test)

# Build the LogisticRegression model
lr_model = LogisticRegression(max_iter=300)
lr_model.fit(X_train_transformed, y_train_transformed)

# Make predictions on the test set
y_pred_lr = lr_model.predict(X_test_transformed)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Classification Report:")
print(classification_report(y_test, y_pred_lr))

#### Results when removing empvar (cumulative from above)

In [None]:
# Feature set for this run: emp.var.rate removed
numerical = ['age','campaign','previous','pdays','cons.conf.idx','euribor3m']
categorical = ['job','marital','education','housing','loan','contact','month','day_of_week','poutcome']
target = 'y'

# Separate features and target
X = df[categorical + numerical]
y = df[target]

# Transform the target variable to integer labels using LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Preprocessing: one-hot encode the categorical columns, standardise the numerical ones
cat_transformer = OneHotEncoder(handle_unknown='ignore')
num_transformer = StandardScaler()
preprocessor = ColumnTransformer(transformers = [('cat',cat_transformer, categorical),
                                                 ('num', num_transformer, numerical)])

# Hold the preprocessor and SMOTE together; the steps are applied one by one below
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=1234))
])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

# Fit the preprocessor on the training split, then oversample the training data with SMOTE
X_train_transformed, y_train_transformed = pipeline['smote'].fit_resample(
    pipeline['preprocessor'].fit_transform(X_train), y_train)
# Only transform the test split: re-fitting here would scale/encode the test data
# with its own statistics (leakage) and could produce a different column layout
# from the one the model was trained on.
X_test_transformed = pipeline['preprocessor'].transform(X_test)

# Build the LogisticRegression model
lr_model = LogisticRegression(max_iter=300)
lr_model.fit(X_train_transformed, y_train_transformed)

# Make predictions on the test set
y_pred_lr = lr_model.predict(X_test_transformed)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Classification Report:")
print(classification_report(y_test, y_pred_lr))

#### Results when removing housing and day of week (cumulative from above)

In [None]:
# Feature set for this run: housing and day_of_week removed as well
numerical = ['age','campaign','previous','pdays','cons.conf.idx','euribor3m']
categorical = ['job','marital','education','loan','contact','month','poutcome']
target = 'y'

# Separate features and target
X = df[categorical + numerical]
y = df[target]

# Transform the target variable to integer labels using LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Preprocessing: one-hot encode the categorical columns, standardise the numerical ones
cat_transformer = OneHotEncoder(handle_unknown='ignore')
num_transformer = StandardScaler()
preprocessor = ColumnTransformer(transformers = [('cat',cat_transformer, categorical),
                                                 ('num', num_transformer, numerical)])

# Hold the preprocessor and SMOTE together; the steps are applied one by one below
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=1234))
])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

# Fit the preprocessor on the training split, then oversample the training data with SMOTE
X_train_transformed, y_train_transformed = pipeline['smote'].fit_resample(
    pipeline['preprocessor'].fit_transform(X_train), y_train)
# Only transform the test split: re-fitting here would scale/encode the test data
# with its own statistics (leakage) and could produce a different column layout
# from the one the model was trained on.
X_test_transformed = pipeline['preprocessor'].transform(X_test)

# Build the LogisticRegression model
lr_model = LogisticRegression(max_iter=300)
lr_model.fit(X_train_transformed, y_train_transformed)

# Make predictions on the test set
y_pred_lr = lr_model.predict(X_test_transformed)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Classification Report:")
print(classification_report(y_test, y_pred_lr))

#### Results when removing pdays (cumulative from above)

In [None]:
# Feature set for this run: pdays removed as well
numerical = ['age','campaign','previous','cons.conf.idx','euribor3m']
categorical = ['job','marital','education','loan','contact','month','poutcome']
target = 'y'

# Separate features and target
X = df[categorical + numerical]
y = df[target]

# Transform the target variable to integer labels using LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Preprocessing: one-hot encode the categorical columns, standardise the numerical ones
cat_transformer = OneHotEncoder(handle_unknown='ignore')
num_transformer = StandardScaler()
preprocessor = ColumnTransformer(transformers = [('cat',cat_transformer, categorical),
                                                 ('num', num_transformer, numerical)])

# Hold the preprocessor and SMOTE together; the steps are applied one by one below
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=1234))
])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

# Fit the preprocessor on the training split, then oversample the training data with SMOTE
X_train_transformed, y_train_transformed = pipeline['smote'].fit_resample(
    pipeline['preprocessor'].fit_transform(X_train), y_train)
# Only transform the test split: re-fitting here would scale/encode the test data
# with its own statistics (leakage) and could produce a different column layout
# from the one the model was trained on.
X_test_transformed = pipeline['preprocessor'].transform(X_test)

# Build the LogisticRegression model
lr_model = LogisticRegression(max_iter=300)
lr_model.fit(X_train_transformed, y_train_transformed)

# Make predictions on the test set
y_pred_lr = lr_model.predict(X_test_transformed)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Classification Report:")
print(classification_report(y_test, y_pred_lr))

### The below code is the actual model building after confirming the list of features.

In [None]:
# Import vairous liabraries for machine learning

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, precision_recall_curve, classification_report, fbeta_score
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

In [None]:
# Final feature lists used for model building
numerical = [
    'age', 'campaign', 'previous', 'cons.conf.idx', 'euribor3m',
]
categorical = [
    'job', 'marital', 'education', 'loan', 'contact', 'month', 'poutcome',
]
target = 'y'

In [None]:
# Separate features and target
X = df[categorical + numerical]
y = df[target]

# Transform the target variable to integer labels using LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Preprocessing: one-hot encode the categorical columns, standardise the numerical ones
cat_transformer = OneHotEncoder(handle_unknown='ignore')
num_transformer = StandardScaler()
preprocessor = ColumnTransformer(transformers = [('cat',cat_transformer, categorical),
                                                 ('num', num_transformer, numerical)])

# Hold the preprocessor and SMOTE together; the steps are applied one by one below
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=1234))
])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

# Fit the preprocessor on the training split, then oversample the training data with SMOTE
X_train_transformed, y_train_transformed = pipeline['smote'].fit_resample(
    pipeline['preprocessor'].fit_transform(X_train), y_train)
# Only transform the test split: re-fitting here would scale/encode the test data
# with its own statistics (leakage) and could produce a different column layout
# from the one the models are trained on.
X_test_transformed = pipeline['preprocessor'].transform(X_test)

In [None]:
# Logistic regression fitted on the resampled training data
lr_model = LogisticRegression(max_iter=300)
lr_model.fit(X_train_transformed, y_train_transformed)

# Predictions for the held-out test set
y_pred_lr = lr_model.predict(X_test_transformed)

# Test-set metrics
lr_accuracy = accuracy_score(y_test, y_pred_lr)
lr_report = classification_report(y_test, y_pred_lr)
print("Accuracy:", lr_accuracy)
print("Classification Report:")
print(lr_report)
print("F1:", f1_score(y_test, y_pred_lr))
print("F2:", fbeta_score(y_test, y_pred_lr, beta=2))

In [None]:

# k-nearest-neighbours classifier (k = 5) fitted on the resampled training data
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train_transformed, y_train_transformed)

# Predictions for the held-out test set
y_pred_knn = knn_model.predict(X_test_transformed)

# Test-set metrics
knn_accuracy = accuracy_score(y_test, y_pred_knn)
knn_report = classification_report(y_test, y_pred_knn)
print("Accuracy:", knn_accuracy)
print("Classification Report:")
print(knn_report)
print("F1:", f1_score(y_test, y_pred_knn))
print("F2:", fbeta_score(y_test, y_pred_knn, beta=2))

In [None]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel, fitted on the resampled training data
svc_linear_model = SVC(kernel='linear')
svc_linear_model.fit(X_train_transformed, y_train_transformed)

# Predictions for the held-out test set
y_pred_svc_linear = svc_linear_model.predict(X_test_transformed)

# Test-set metrics
svc_linear_accuracy = accuracy_score(y_test, y_pred_svc_linear)
svc_linear_report = classification_report(y_test, y_pred_svc_linear)
print("Accuracy:", svc_linear_accuracy)
print("Classification Report:")
print(svc_linear_report)
print("F1:", f1_score(y_test, y_pred_svc_linear))
print("F2:", fbeta_score(y_test, y_pred_svc_linear, beta=2))

Linear SVC: Run-time 6mins

In [None]:
from sklearn.svm import SVC

# Support vector classifier with an RBF kernel (gamma = 0.1), fitted on the resampled training data
svc_model_rbf = SVC(kernel='rbf', gamma=0.1)
svc_model_rbf.fit(X_train_transformed, y_train_transformed)

# Predictions for the held-out test set
y_pred_svc_rbf = svc_model_rbf.predict(X_test_transformed)

# Test-set metrics
svc_rbf_accuracy = accuracy_score(y_test, y_pred_svc_rbf)
svc_rbf_report = classification_report(y_test, y_pred_svc_rbf)
print("Accuracy:", svc_rbf_accuracy)
print("Classification Report:")
print(svc_rbf_report)
print("F1:", f1_score(y_test, y_pred_svc_rbf))
print("F2:", fbeta_score(y_test, y_pred_svc_rbf, beta=2))

RBF SVC: Runtime 3mins

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting classifier (200 trees, learning rate 0.01), fitted on the resampled training data
gbc_model = GradientBoostingClassifier(learning_rate=0.01, n_estimators=200)
gbc_model.fit(X_train_transformed, y_train_transformed)

# Predictions for the held-out test set
y_pred_gbc = gbc_model.predict(X_test_transformed)

# Test-set metrics
gbc_accuracy = accuracy_score(y_test, y_pred_gbc)
gbc_report = classification_report(y_test, y_pred_gbc)
print("Accuracy:", gbc_accuracy)
print("Classification Report:")
print(gbc_report)
print("F1:", f1_score(y_test, y_pred_gbc))
print("F2:", fbeta_score(y_test, y_pred_gbc, beta=2))

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with many shallow trees, a small learning rate and row
# subsampling. subsample < 1 makes the fit stochastic, so a seed is fixed to keep
# the results reproducible.
gbc_model_2 = GradientBoostingClassifier(n_estimators=2000, learning_rate=0.001, max_depth=2, subsample=0.8,
                                         random_state=1234)
gbc_model_2.fit(X_train_transformed, y_train_transformed)

# Make predictions on the test set
y_pred_gbc_2 = gbc_model_2.predict(X_test_transformed)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred_gbc_2))
print("Classification Report:")
print(classification_report(y_test, y_pred_gbc_2))
print("F1:", f1_score(y_test, y_pred_gbc_2))
# F2 must score this model's own predictions with beta=2
# (the literal 2_2 is the integer 22, not 2)
print("F2:", fbeta_score(y_test, y_pred_gbc_2, beta=2))

In [None]:
# Randomised hyper-parameter search for the gradient boosting model, scored with F2
from sklearn.metrics import fbeta_score, make_scorer

params_gbc = {
    'n_estimators': list(range(1000, 3000)),
    'learning_rate': [0.01, 0.001],
    'max_depth': list(range(1, 5)),
    'subsample': [0.5, 0.8, 1],
}

ftwo_scorer = make_scorer(fbeta_score, beta=2)

randomsearch_gbc = RandomizedSearchCV(
    gbc_model_2,
    params_gbc,
    n_iter=500,
    scoring=ftwo_scorer,
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=1234,
)
randomsearch_gbc.fit(X_train_transformed, y_train_transformed)

print("Best parameters from RandomSearch: ", randomsearch_gbc.best_params_)

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes; the transformed matrices are converted to dense arrays for fitting and prediction
nb_model = GaussianNB()
nb_model.fit(X_train_transformed.toarray(), y_train_transformed)

# Predictions for the held-out test set
y_pred_nb = nb_model.predict(X_test_transformed.toarray())

# Test-set metrics
nb_accuracy = accuracy_score(y_test, y_pred_nb)
nb_report = classification_report(y_test, y_pred_nb)
print("Accuracy:", nb_accuracy)
print("Classification Report:")
print(nb_report)
print("F1:", f1_score(y_test, y_pred_nb))
print("F2:", fbeta_score(y_test, y_pred_nb, beta=2))

In [None]:
# # # Perform randomsearch
# params_gbc = {'n_estimators': list(range(50,100)),'learning_rate': [0.1,0.2],'max_depth': list(range(1,5))}

# from sklearn.metrics import fbeta_score, make_scorer
# ftwo_scorer = make_scorer(fbeta_score, beta=2)

# randomsearch_gbc = RandomizedSearchCV(gbc_model, params_gbc, cv=5, n_iter=300, scoring=ftwo_scorer, random_state=1234)
# randomsearch_gbc.fit(X_train_transformed, y_train_transformed)

# print("Best parameters from RandomSearch: ", randomsearch_gbc.best_params_)

The code above explores hyperparameter tuning for GBC over n_estimators, learning rate and max depth. n_iter is set to 300 so that the search covers the majority (75%) of all combinations (50*2*4 = 400, since range(50,100) yields 50 values). The code is commented out due to its long runtime (~160 mins), and the result is pasted below.

Best parameters from RandomSearch:  {'n_estimators': 92, 'max_depth': 4, 'learning_rate': 0.2}

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting refitted with the best parameters reported by the random search
gbc_model_best = GradientBoostingClassifier(learning_rate=0.2, max_depth=4, n_estimators=92)
gbc_model_best.fit(X_train_transformed, y_train_transformed)

# Predictions for the held-out test set
y_pred_gbc_best = gbc_model_best.predict(X_test_transformed)

# Test-set metrics
gbc_best_accuracy = accuracy_score(y_test, y_pred_gbc_best)
gbc_best_report = classification_report(y_test, y_pred_gbc_best)
print("Accuracy:", gbc_best_accuracy)
print("Classification Report:")
print(gbc_best_report)
print("F1:", f1_score(y_test, y_pred_gbc_best))
print("F2:", fbeta_score(y_test, y_pred_gbc_best, beta=2))

In [None]:
# 5-fold cross-validation of the first GBC model, scored with F2.
# The scorer is built in this cell so it does not rely on the long-running
# random-search cell having been executed.
# NOTE(review): the folds are cut from training data that was already
# SMOTE-resampled, so synthetic samples can end up in both the fitting and the
# validation folds; treat these scores as optimistic.
from sklearn.metrics import make_scorer

ftwo_scorer = make_scorer(fbeta_score, beta=2)
cv_scores_gbc = cross_val_score(gbc_model, X_train_transformed, y_train_transformed, cv=5, scoring=ftwo_scorer)

print("Cross-validation scores for gbc: ", cv_scores_gbc)

# Evaluate on the test set with the model fitted earlier
y_pred_gbc = gbc_model.predict(X_test_transformed)
# The value printed here is recall, so it is labelled as recall
print("Test recall for gbc: ", recall_score(y_test, y_pred_gbc))
print("F1:", f1_score(y_test, y_pred_gbc))
print("F2:", fbeta_score(y_test, y_pred_gbc, beta=2))

In [None]:
# 5-fold cross-validation of the tuned GBC model, scored with F2.
# The scorer is built in this cell so it does not rely on the long-running
# random-search cell having been executed.
# NOTE(review): the folds are cut from training data that was already
# SMOTE-resampled, so synthetic samples can end up in both the fitting and the
# validation folds; treat these scores as optimistic.
from sklearn.metrics import make_scorer

ftwo_scorer = make_scorer(fbeta_score, beta=2)
cv_scores_gbc_best = cross_val_score(gbc_model_best, X_train_transformed, y_train_transformed, cv=5, scoring=ftwo_scorer)

print("Cross-validation scores for gbc_best: ", cv_scores_gbc_best)

# Evaluate on the test set with the model fitted earlier
y_pred_gbc_best = gbc_model_best.predict(X_test_transformed)
# The value printed here is recall, so it is labelled as recall
print("Test recall for gbc_best: ", recall_score(y_test, y_pred_gbc_best))
print("F1:", f1_score(y_test, y_pred_gbc_best))
print("F2:", fbeta_score(y_test, y_pred_gbc_best, beta=2))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyper-parameters. The forest is built from random
# bootstrap samples and feature subsets, so a seed is fixed to make the reported
# metrics reproducible across runs.
rfc_model = RandomForestClassifier(random_state=1234)
rfc_model.fit(X_train_transformed, y_train_transformed)

# Make predictions on the test set
y_pred_rfc = rfc_model.predict(X_test_transformed)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred_rfc))
print("Classification Report:")
print(classification_report(y_test, y_pred_rfc))
print("F1:", f1_score(y_test, y_pred_rfc))
print("F2:", fbeta_score(y_test, y_pred_rfc, beta=2))

### The code below draws the detection error tradeoff (DET) curves, which plot the false negative rate on the y-axis against the false positive rate on the x-axis. A model performs better the closer its curve is to the bottom-left corner.

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import det_curve

def plot_det_curve_for_model(model, X_test, y_test, model_name):
    """Add one model's detection error tradeoff (DET) curve to the current figure.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba``.
    X_test : test feature matrix; may be dense or a scipy sparse matrix.
    y_test : true binary labels of the test set.
    model_name : label shown for this curve in the legend.
    """
    # The Naive Bayes model in this notebook is fitted on dense arrays, while the
    # transformed test matrix is sparse. Densify once here so every model in the
    # list can be scored through the same call.
    X_eval = X_test.toarray() if hasattr(X_test, "toarray") else X_test
    # Column 1 of predict_proba holds the scores for the second class in model.classes_
    positive_scores = model.predict_proba(X_eval)[:, 1]
    fpr, fnr, _ = det_curve(y_test, positive_scores)
    plt.plot(fpr, fnr, label=model_name)

# (fitted model, legend label) pairs to compare on one chart
models_list = [
    (gbc_model_best, "GBC_RandomSearchCV"),
    (gbc_model, "GBC_default"),
    (lr_model, "LogistricRegression"),
    (knn_model, "KNN_model"),
    (nb_model, "NaiveBayes_model"),
]

# Draw every model's DET curve on a shared figure
plt.figure(figsize=(8, 6))
for fitted_model, curve_label in models_list:
    plot_det_curve_for_model(fitted_model, X_test_transformed, y_test, curve_label)

# Title, axis labels, legend and grid
plt.title("Detection Error Tradeoff (DET) curves")
plt.xlabel("False Positive Rate (FPR)")
plt.ylabel("False Negative Rate (FNR)")
plt.legend()
plt.grid(True)
plt.show()
