## Hyperparameter tuning

In [1]:
import xgboost as xgb
import numpy as np
import pandas as pd
from joblib import load
import os
import sys
from sklearn.preprocessing import LabelEncoder
sys.path.insert(0, os.path.abspath(os.path.join("..", "transformers")))
from column_drop import *

In this final part we will try different methods of finding optimal hyperparameters for three chosen models:
1. XGBoost
2. ...
3. ...

## Data transformation

In [2]:
# Load the train and test splits of every dataset into dictionaries.
# Keys follow the pattern 'X{i}' / 'Y{i}' for train and
# 'X_test{i}' / 'Y_test{i}' for test.
X_train_data = {}
Y_train_data = {}

X_test_data = {}
Y_test_data = {}

for i in range(1, 7):  # assumes six X/Y dataset pairs are available
    try:
        # Read all four files before storing anything, so that a missing file
        # cannot leave this dataset half-registered (e.g. X without its Y).
        x_train_df = pd.read_csv(os.path.join('../data/train', f'X{i}_train.csv'))
        y_train_df = pd.read_csv(os.path.join('../data/train', f'Y{i}_train.csv'))
        x_test_df = pd.read_csv(os.path.join('../data/test', f'X{i}_test.csv'))
        y_test_df = pd.read_csv(os.path.join('../data/test', f'Y{i}_test.csv'))
    except FileNotFoundError as e:
        # Best effort: report the missing file and move on to the next dataset.
        print(f"File not found: {e}")
        continue

    X_train_data[f'X{i}'] = x_train_df
    Y_train_data[f'Y{i}'] = y_train_df

    X_test_data[f'X_test{i}'] = x_test_df
    Y_test_data[f'Y_test{i}'] = y_test_df


In [3]:
# Load the serialized preprocessing pipeline and show it as the cell output.
# NOTE(review): joblib artifacts are pickle-based — only load files from a trusted source.
preprocessing_pipeline = load('../pipelines/preprocessing_pipeline.joblib')
preprocessing_pipeline

In [5]:
# Fit the preprocessing pipeline on each training set and apply it to the
# matching test set. Results are collected in dictionaries keyed by
# 'X{i}_transformed' / 'Y{i}_transformed' (train) and
# 'X{i}_test_transformed' / 'Y{i}_test_transformed' (test).
transformed_X_train = {}
transformed_Y_train = {}
transformed_X_test = {}
transformed_Y_test = {}

for i in range(1, 7):
    # Fetch the frames that will be pushed through the pipeline.
    X_train = X_train_data.get(f'X{i}')
    Y_train = Y_train_data.get(f'Y{i}')

    X_test = X_test_data.get(f'X_test{i}')
    Y_test = Y_test_data.get(f'Y_test{i}')

    # All four frames are dereferenced below, so skip the dataset unless every
    # one of them was loaded (checking only the train frames would crash on a
    # missing test file).
    if any(frame is None for frame in (X_train, Y_train, X_test, Y_test)):
        print(f"Skipping dataset {i}: train/test data is incomplete")
        continue

    # Reload from disk for every dataset so each fit starts from the saved
    # pipeline rather than from one already fitted on the previous dataset.
    preprocessing_pipeline = load('../pipelines/preprocessing_pipeline.joblib')

    # Encode the target only when it is not already numeric. Testing for
    # "not numeric" (instead of dtype == 'object') also covers pandas string
    # and categorical dtypes.
    if pd.api.types.is_numeric_dtype(Y_train['target']):
        # Target is already numeric (presumably binary 0/1), so just convert
        # it to an array.
        Y_train_encoded = Y_train['target'].values
        Y_test_encoded = Y_test['target'].values
    else:
        label_encoder = LabelEncoder()
        # Fit the encoder on train only; test labels are mapped with the same
        # encoding.
        Y_train_encoded = label_encoder.fit_transform(Y_train['target'])
        Y_test_encoded = label_encoder.transform(Y_test['target'])

    # Transform the features: fit on train, apply the fitted pipeline to test.
    X_train_transformed = preprocessing_pipeline.fit_transform(X_train)
    X_test_transformed = preprocessing_pipeline.transform(X_test)

    # Store the processed data in the result dictionaries.
    transformed_X_train[f'X{i}_transformed'] = pd.DataFrame(X_train_transformed)
    transformed_Y_train[f'Y{i}_transformed'] = Y_train_encoded

    transformed_X_test[f'X{i}_test_transformed'] = pd.DataFrame(X_test_transformed)
    transformed_Y_test[f'Y{i}_test_transformed'] = Y_test_encoded

        

In [7]:
transformed_Y_test # sanity check

{'Y1_test_transformed': array([0, 1, 0, ..., 1, 0, 0]),
 'Y2_test_transformed': array([0, 1, 1, ..., 0, 1, 1]),
 'Y3_test_transformed': array([1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
        0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0,
        0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1,
        0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1,
        0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0,
        0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1,
        0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1,
        1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,
        0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0,
        1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1

## Param Grids

In [None]:
# Open question: could this be done with Optuna, or is there something better?

# Search space for XGBoost; the values below are meant to be adjusted.
# Fractional parameters use 0.1 .. 1.0 in steps of 0.1: a value of 0 is not a
# valid sampling ratio (and an eta of 0 means no learning), while 1.0 is a
# valid setting that np.arange(0, 1, step=0.1) left out.
_fraction_values = np.round(np.linspace(0.1, 1.0, 10), 1)

XGBoost_grid = {
    'n_estimators': list(range(1, 500)),
    'eta': _fraction_values,
    'subsample': _fraction_values,
    # NOTE(review): tree-specific parameters below are presumably ignored by
    # 'gblinear' — confirm before including it in the search.
    'booster': ['gbtree', 'gblinear', 'dart'],
    'max_depth': list(range(1, 10)),
    'min_child_weight': list(range(1, 7)),
    'colsample_bytree': _fraction_values,
    'colsample_bylevel': _fraction_values,
    'reg_lambda': [0.1, 1, 10, 20],    # tentative values — TODO confirm
    'reg_alpha': [0, 0.1, 1, 10]
}
    