# Model tuning

## Installing and importing modules

In [2]:
pip install pandas matplotlib seaborn

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [45]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.model_selection import train_test_split

## Read the dataset

In [46]:
ds = pd.read_csv("prepared_dataset.csv")

## Model tuning

#### Define target and features columns

In [47]:
y_column = ['Status']  # target variable
X_columns = [
    'loan_limit', 'approv_in_adv', 'Credit_Worthiness',
    'business_or_commercial', 'Neg_ammortization', 'interest_only',
    'lump_sum_payment', 'construction_type', 'income',
    'co-applicant_credit_type', 'age', 'submission_of_application',
    'loan_type_type2', 'loan_type_type3', 'occupancy_type_pr',
    'credit_type_EQUI'
]
X = ds[X_columns]
y = ds[y_column]

In [48]:
# Let's say we want to split the data in 80:10:10 for train:valid:test dataset
# In the first step we will split the data in training and remaining dataset
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.8)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)


(115452, 16)
(115452, 1)
(28864, 16)
(28864, 1)


#### Building a Baseline Gradient Boosting Model
First, we will build a baseline Gradient Boosting model that will serve as a comparison with the model using the optimal set of hyperparameters.

In [49]:
gb = GradientBoostingClassifier(n_estimators=50, max_depth=3, learning_rate=0.1)

In [50]:
gb.fit(X_train, y_train)
y_pred = gb.predict(X_test)

  y = column_or_1d(y, warn=True)


In [51]:
print('test set metrics: ', metrics.classification_report(y_test, y_pred))

test set metrics:                precision    recall  f1-score   support

           0       0.86      0.99      0.92     21765
           1       0.96      0.50      0.66      7099

    accuracy                           0.87     28864
   macro avg       0.91      0.75      0.79     28864
weighted avg       0.88      0.87      0.86     28864



In [52]:
gb.get_params()

{'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'log_loss',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 50,
 'n_iter_no_change': None,
 'random_state': None,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

#### Hyperparameter Tuning

n_estimators - кількість дерев рішень, які використовуються для побудови моделі. 

max_depth - максимальна глибина кожного дерева.

learning_rate - швидкість навчання, яка визначає внесок кожного дерева до загальної моделі. 

min_samples_split - мінімальна кількість зразків, необхідних для розбиття внутрішнього вузла. 

min_samples_leaf - мінімальна кількість зразків, необхідних для формування листка дерева.

In [53]:
%%time
# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [25, 50, 75],
    'max_depth': [2, 3, 4],
    'learning_rate': [0.05, 0.1],
    'min_samples_split': [10, 15],
    'min_samples_leaf': [5, 7]
}


grid = GridSearchCV(estimator=gb, param_grid=param_grid, scoring='f1', cv=5, verbose=3, return_train_score=True, n_jobs=-1)

grid.fit(X_train, y_train)

print("The best parameters are %s with a score of %.2f" % (grid.best_params_, grid.best_score_))

Fitting 5 folds for each of 72 candidates, totalling 360 fits


  y = column_or_1d(y, warn=True)


The best parameters are {'learning_rate': 0.1, 'max_depth': 4, 'min_samples_leaf': 5, 'min_samples_split': 10, 'n_estimators': 75} with a score of 0.66
CPU times: total: 8.47 s
Wall time: 1min 54s


In [54]:
# Creating a DataFrame to view grid search results
grid_results = pd.concat([
    pd.DataFrame(grid.cv_results_["params"]),
    pd.DataFrame(grid.cv_results_["mean_test_score"], columns=["f1-score"])], axis=1)

grid_results

Unnamed: 0,learning_rate,max_depth,min_samples_leaf,min_samples_split,n_estimators,f1-score
0,0.05,2,5,10,25,0.596356
1,0.05,2,5,10,50,0.602524
2,0.05,2,5,10,75,0.631419
3,0.05,2,5,15,25,0.596356
4,0.05,2,5,15,50,0.602524
...,...,...,...,...,...,...
67,0.10,4,7,10,50,0.656698
68,0.10,4,7,10,75,0.659014
69,0.10,4,7,15,25,0.650812
70,0.10,4,7,15,50,0.656698


In [55]:
grid_results.columns

Index(['learning_rate', 'max_depth', 'min_samples_leaf', 'min_samples_split',
       'n_estimators', 'f1-score'],
      dtype='object')

In [56]:
# Grouping results to find the mean score for each parameter combination
grid_contour = grid_results.groupby(['learning_rate', 'max_depth', 'min_samples_leaf', 'min_samples_split', 'n_estimators']).mean()
print(grid_contour)

                                                                         f1-score
learning_rate max_depth min_samples_leaf min_samples_split n_estimators          
0.05          2         5                10                25            0.596356
                                                           50            0.602524
                                                           75            0.631419
                                         15                25            0.596356
                                                           50            0.602524
...                                                                           ...
0.10          4         7                10                50            0.656698
                                                           75            0.659014
                                         15                25            0.650812
                                                           50            0.656698
                

#### Висновок

Найкращий F1-score (0.66) був досягнутий при learning_rate = 0.10, max_depth = 4, min_samples_leaf = 7, min_samples_split = 10, і n_estimators = 75. Найбільший вплив на результат моделі мали гіперпараметри learning_rate, max_depth та n_estimators. Це свідчить про те, що комбінація більшої кількості дерев, глибшого дерева, і підвищеної швидкості навчання дає найкращі результати для вашого набору даних.