# Model development and evaluation

The objective of this notebook is to engineer ml models and test against preprocessed data to gather the best f1 score metric

## List of candidates:

1 - RandomForestClassifier
2 - XGBClassifier
3 - RidgeClassifier
4 - SVC
5  - KNeighborsClassifier
6 - LogisticRegression
7 - DecisionTreeClassifier

In [29]:
#---------Importing libraries---------#

#---Data analysis---#
import pandas as pd
import numpy as np


#---Data splitting---#
from sklearn.model_selection import train_test_split


#---evaluation---#
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score

#---visualization---#
import matplotlib.pyplot as plt

#---utils---#
import os

#---data---#
df_test = pd.read_csv('../data/preprocessed/df_test.csv')
df_train = pd.read_csv('../data/preprocessed/df_train.csv')


#---------Models---------#
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [30]:
#dataset shapes
print("Train data shape: ", df_train.shape)
print("Test data shape: ", df_test.shape)

Train data shape:  (4510, 11)
Test data shape:  (7668, 11)


In [35]:
# test data
X_test = df_test.drop('y', axis=1)
y_test = df_test['y']

In [31]:
# matrix and vector data undersampled

X = df_train.drop('y', axis=1)
y = df_train['y']


In [32]:
# Random forest parameters
random_forest_params = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt'],
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy']
}


# XGBoost parameters
xgboost_params = {
    'n_estimators': [100, 200, 400, 800, 1600],
    'learning_rate': [0.01, 0.1, 1.0],
    'max_depth': [2, 4, 8, 16, 32],
    'gamma': [0, 0.1, 0.5, 1.0],
    'subsample': [0.5, 0.75, 1.0],
    'objective': ['binary:logistic'],
    'eval_metric': ['logloss']
}

# Ridge classifier parameters
ridge_params = {
    'alpha': [0.1, 1.0, 10.0],
    
    'fit_intercept': [True, False],
    'solver': [  'cholesky', 'lsqr', 'sparse_cg',  'lbfgs'],
    'positive': [True]
}

# Support vector classifier parameters
svc_params = {
    'C': [0.01, 0.1, 1.0, 10.0],
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    'gamma': ['scale', 'auto'],
    'degree': [2, 3, 4]
    
}

# Logistic regression parameters
logistic_params = {
    'penalty': ['l2', 'none'],
    'C': [0.1, 1.0, 10.0],
    'fit_intercept': [True, False],
    'solver': ['newton-cg','liblinear'],
    'l1_ratio': [0.0, 0.5, 1.0]
}

# K-nearest neighbors parameters
knn_params = {
    'n_neighbors': [3, 5, 7],
    'weights': ['uniform', 'distance'],
    'algorithm': ['ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [10, 30, 50],
    'p': [1, 2]
}

# decision tree parameters
decision_tree_params = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [3, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5],
    'max_features': ['sqrt', 'log2'],
    'max_leaf_nodes': [3, 10, 20],
    'min_impurity_decrease': [0.01, 0.1, 0.2]
}


# Create a list of tuples where each tuple contains (model, parameter_grid)
models_and_params = [
    (LogisticRegression(), logistic_params),
    (KNeighborsClassifier(), knn_params),
    (DecisionTreeClassifier(), decision_tree_params),
    (RandomForestClassifier(), random_forest_params),
    (XGBClassifier(), xgboost_params),

    
]



In [37]:
#list of the candidates models with best hyperparameters for each 
best_models = []
scores = []
# Iterate through models and parameters
for model, param_grid in models_and_params:
    print(f"Model: {model.__class__.__name__}")
    
    # Create a StratifiedKFold cross-validation strategy
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) # good for representing the data set equally for each class as
                                                                    # they are imbalanced ( new data in the future per se)
    
    # Create a custom scorer using f1_score
    custom_scorer = make_scorer(f1_score)
    
    # Create GridSearchCV
    grid_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_grid,
        scoring=custom_scorer,
        cv=cv,
        verbose=1,
        n_jobs=-1,
        n_iter=10,
        random_state=42
    )
    
    # Fit the GridSearchCV object
    grid_search.fit(X, y)  # X is your feature matrix, y is your target vector
    
    # Print the best parameters and F1 score
    print("Best Parameters:", grid_search.best_params_)
    print('Best estimator: ', grid_search.best_estimator_)
    print("Best F1 Score on validation set: {:.2%}".format(grid_search.best_score_))
    best_models.append(grid_search.best_estimator_)
    scores.append(grid_search.best_score_)
    print('----------------------------------------------------------')



Model: LogisticRegression
Fitting 5 folds for each of 10 candidates, totalling 50 fits




Best Parameters: {'solver': 'newton-cg', 'penalty': 'l2', 'l1_ratio': 0.0, 'fit_intercept': False, 'C': 0.1}
Best estimator:  LogisticRegression(C=0.1, fit_intercept=False, l1_ratio=0.0, solver='newton-cg')
Best F1 Score on validation set: 81.64%
----------------------------------------------------------
Model: KNeighborsClassifier
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Parameters: {'weights': 'uniform', 'p': 2, 'n_neighbors': 7, 'leaf_size': 10, 'algorithm': 'ball_tree'}
Best estimator:  KNeighborsClassifier(algorithm='ball_tree', leaf_size=10, n_neighbors=7)
Best F1 Score on validation set: 77.79%
----------------------------------------------------------
Model: DecisionTreeClassifier
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Parameters: {'splitter': 'random', 'min_samples_split': 2, 'min_samples_leaf': 1, 'min_impurity_decrease': 0.01, 'max_leaf_nodes': 3, 'max_features': 'log2', 'max_depth': 10, 'criterion': 'entropy'}
Best estimat

45 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\joao-lemos\AppData\Local\anaconda3\envs\jao\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\joao-lemos\AppData\Local\anaconda3\envs\jao\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\joao-lemos\AppData\Local\anaconda3\envs\jao\Lib\site-packages\sklearn\linear_model\_ridge.py", line 1446, in fit
    super().fit(X, Y, sample_weight=sam

## Performance analysis on the test set

In [38]:
test_scores = []

for model in best_models:
    print(model)
    print('------------------')
    print('Test score: ', model.score(X_test, y_test))
    print('------------------ \n')
    test_scores.append(model.score(X_test, y_test))
    

LogisticRegression(C=0.1, fit_intercept=False, l1_ratio=0.0, solver='newton-cg')
------------------
Test score:  0.837376108502869
------------------ 

KNeighborsClassifier(algorithm='ball_tree', leaf_size=10, n_neighbors=7)
------------------
Test score:  0.7952529994783516
------------------ 

DecisionTreeClassifier(criterion='entropy', max_depth=10, max_features='log2',
                       max_leaf_nodes=3, min_impurity_decrease=0.01,
                       splitter='random')
------------------
Test score:  0.9299687010954617
------------------ 

RandomForestClassifier(max_depth=10, min_samples_leaf=2)
------------------
Test score:  0.8495044340114762
------------------ 

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='logloss',
              feature_types=None, gamma=0, gpu_id=None, grow_p

In [45]:
# best models names
models_names = []
for model in best_models:
    print(model.__class__.__name__)
    models_names.append(model.__class__.__name__)

LogisticRegression
KNeighborsClassifier
DecisionTreeClassifier
RandomForestClassifier
XGBClassifier
RidgeClassifier


In [48]:
performance = pd.DataFrame({'model': models_names, 'score': scores, 'test_score': test_scores})

performance.sort_values(by='test_score', ascending=False)



Unnamed: 0,model,score,test_score
2,DecisionTreeClassifier,0.359822,0.929969
5,RidgeClassifier,0.76509,0.878326
3,RandomForestClassifier,0.858253,0.849504
4,XGBClassifier,0.859273,0.847679
0,LogisticRegression,0.816426,0.837376
1,KNeighborsClassifier,0.777901,0.795253


# Conclusion

In [50]:
best_models[3]

In [51]:
best_models[4]


Random forest and XGBClassifier were the best, because the performe well both on inbalanced test data as well as balanced training data. ALso, the train and test socres are similar, with train score a little higher as espected (?).

Finally, they beat the scoreline proposed for this project, which was 81%.

In [60]:
#saving best models
import pickle


pickle.dump(best_models[3], open('../models/random_forest_classifier_model.pkl','wb'))

file_name = 'XGBoost_model.pkl'
pickle.dump(best_models[4], open('../models/'+file_name,'wb'))


#loading best models
rf = pickle.load(open('../models/random_forest_classifier_model.pkl','rb'))
xgb = pickle.load(open('../models/'+file_name,'rb'))


In [57]:
a = '../models/' + file_name 

In [58]:
a

'../data/models/XGBoost_model.pkl'