# Model development and evaluation

The objective of this notebook is to engineer ml models and test against preprocessed data to gather the best f1 score metric

## List of candidates:

1 - RandomForestClassifier
2 - XGBClassifier
3 - RidgeClassifier
4 - SVC

In [1]:
#---------Importing libraries---------#

#---Data analysis---#
import pandas as pd
import numpy as np


#---Data splitting---#
from sklearn.model_selection import train_test_split

#---classification models---#
from sklearn.linear_model import LogisticRegression

#---evaluation---#
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score

#---visualization---#
import matplotlib.pyplot as plt

#---utils---#
import os

#---data---#
# Define the relative path to your CSV file in the data folder
relative_path = '../data/preprocessed/df_smoted.csv'
# Read the CSV file
df = pd.read_csv(relative_path)


#---------Models---------#
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import SVC

In [2]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,day,month,duration,campaign,y
0,2,4,1,3,0,2143,1,0,5,5,261,1,0
1,2,9,2,2,0,29,1,0,5,5,151,1,0
2,2,2,1,2,0,2,1,1,5,5,76,1,0
3,2,4,1,3,0,231,1,0,5,5,139,1,0
4,1,4,2,3,0,447,1,1,5,5,217,1,0


In [3]:
# matrix and vector data
X = df.drop('y', axis=1)
y = df['y']


In [10]:
# Random forest parameters
random_forest_params = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['auto', 'sqrt'],
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy']
}


# XGBoost parameters
xgboost_params = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 4],
    'min_child_weight': [1, 3],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'gamma': [0, 0.1],
    'reg_alpha': [0, 0.1],
    'reg_lambda': [0, 0.1],
    'scale_pos_weight': [1, 2],
    'objective': ['binary:logistic'],
    'eval_metric': ['logloss']
}

# Ridge classifier parameters
ridge_params = {
    'alpha': [0.1, 1.0, 10.0],
    'fit_intercept': [True, False],
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', 'lbfgs']
}

# Support vector classifier parameters
svc_params = {
    'C': [0.1, 1.0, 10.0],
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    'gamma': ['scale', 'auto', 0.1, 1.0],
    'degree': [2, 3, 4],
    'coef0': [0.0, 1.0]
}

# Create a list of tuples where each tuple contains (model, parameter_grid)
models_and_params = [
    
    
    (RidgeClassifier(), ridge_params),
    (SVC(), svc_params)
]

models_and_complex_params = [
    (RandomForestClassifier(), random_forest_params),
    (XGBClassifier(), xgboost_params)
                             ]

In [None]:
# Iterate through models and parameters
for model, param_grid in models_and_complex_params:
    print(f"Model: {model.__class__.__name__}")
    
    # Create a StratifiedKFold cross-validation strategy
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) # good for representing the data set equally for each class as
                                                                    # they are imbalanced ( new data in the future per se)
    
    # Create a custom scorer using f1_score
    custom_scorer = make_scorer(f1_score)
    
    # Create GridSearchCV
    grid_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_grid,
        scoring=custom_scorer,
        cv=cv,
        verbose=1,
        n_jobs=-1,
        n_iter=20,
        random_state=42
    )
    
    # Fit the GridSearchCV object
    grid_search.fit(X, y)  # X is your feature matrix, y is your target vector
    
    # Print the best parameters and F1 score
    print("Best Parameters:", grid_search.best_params_)
    print("Best F1 Score:", grid_search.best_score_)
    print()



Model: XGBClassifier
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Parameters: {'subsample': 0.8, 'scale_pos_weight': 2, 'reg_lambda': 0.1, 'reg_alpha': 0.1, 'objective': 'binary:logistic', 'n_estimators': 200, 'min_child_weight': 1, 'max_depth': 4, 'learning_rate': 0.1, 'gamma': 0, 'eval_metric': 'logloss', 'colsample_bytree': 1.0}
Best F1 Score: 0.9184835992354359

Model: RandomForestClassifier


TypeError: GridSearchCV.__init__() got an unexpected keyword argument 'random_state'

In [11]:
# Iterate through models and parameters
for model, param_grid in models_and_params:
    print(f"Model: {model.__class__.__name__}")
    
    # Create a StratifiedKFold cross-validation strategy
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) # good for representing the data set equally for each class as
                                                                    # they are imbalanced ( new data in the future per se)
    
    # Create a custom scorer using f1_score
    custom_scorer = make_scorer(f1_score)
    
    # Create GridSearchCV
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring=custom_scorer,
        cv=cv,
        verbose=1,
        n_jobs=-1, 
        
    )
    
    # Fit the GridSearchCV object
    grid_search.fit(X, y)  # X is your feature matrix, y is your target vector
    
    # Print the best parameters and F1 score
    print("Best Parameters:", grid_search.best_params_)
    print("Best F1 Score:", grid_search.best_score_)
    print()

Model: RidgeClassifier
Fitting 5 folds for each of 48 candidates, totalling 240 fits


# Conclusion