# Optymalizacja hiperparametrów w bibliotece Optuna

Zbiór danych do analizy: https://www.kaggle.com/datasets/prishasawhney/mushroom-dataset

Mamy dane dotyczące grzybów. Model ma za zadanie ocenić, czy grzyb jest jadalny.
Cel biznesowy: stworzenie aplikacji, która pomoże użytkownikowi ocenić, czy grzyb jest jadalny, i w ten sposób poprawi jego bezpieczeństwo.

Zmienne:
- Cap Diameter
- Cap Shape
- Gill Attachment
- Gill Color
- Stem Height
- Stem Width
- Stem Color
- Season
- Target Class - Is it edible or not?

In [1]:
import pandas as pd
import optuna
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import recall_score, accuracy_score

In [2]:
# Run this cell when the notebook is started from the solutions folder
# while the data frame lives in the `data` folder one level up.
import os

# Step up only when `data` is not reachable from the current directory,
# so that re-running the cell does not keep climbing the directory tree.
if not os.path.isdir('data'):
    os.chdir('../')

In [3]:
# Load the mushroom data set from the CSV file in the data folder
df = pd.read_csv('data/mushroom.csv')

In [None]:
# Header: first rows of the data frame
df.head()

In [None]:
# Descriptive statistics of the data frame (DataFrame.describe)
df.describe()

In [6]:
# Split into training and test sets: the 'class' column is the target,
# every other column is a feature; 20% of the rows go to the test set
train_x, test_x, train_y, test_y = train_test_split(df.drop('class', axis=1),df['class'],test_size=0.2,random_state=1000)

In [7]:
# Split the training part again into training and validation sets
# (20% of it goes to validation). Note: this rebinds train_x / train_y.
train_x,valid_x, train_y, valid_y = train_test_split(train_x,train_y, test_size=0.2, random_state=1000)

In [None]:
# Show a decision tree classifier created with its default settings
DecisionTreeClassifier()

In [8]:
# Objective
def objective(trial: optuna.Trial):
    """Score one decision-tree configuration sampled from ``trial``.

    Samples the split criterion, depth limit, minimum split size, leaf-node
    cap and minimum impurity decrease, fits a ``DecisionTreeClassifier`` on
    ``train_x`` / ``train_y`` and returns the recall of its predictions for
    ``test_x`` measured against ``test_y``.

    NOTE(review): the score is computed on the test split, while the
    validation split is used for the later check of the fitted model —
    confirm that these roles are intended.
    """
    # Keys double as DecisionTreeClassifier keyword arguments; the dict is
    # built in the same order in which the parameters are sampled.
    tree_params = {
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy', 'log_loss']),
        'max_depth': trial.suggest_int('max_depth', 3, 30),
        'min_samples_split': trial.suggest_int('min_samples_split', 5, 100),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 20, 100),
        'min_impurity_decrease': trial.suggest_float('min_impurity_decrease', 0, 1),
    }

    tree = DecisionTreeClassifier(**tree_params)
    tree.fit(train_x, train_y)
    return recall_score(test_y, tree.predict(test_x))


In [None]:
# Create the study; the objective's return value is to be maximized
study  = optuna.create_study(direction='maximize',study_name='DT_optimization')

In [None]:
# Run 20 trials of the objective
study.optimize(objective, n_trials=20)

In [None]:
# Parameters of the best trial
study.best_params

In [12]:
# Model: decision tree built from the best parameters, fitted on the training set
model = DecisionTreeClassifier(**study.best_params).fit(train_x,train_y)


In [None]:
# Parameters of the model
model.get_params()

In [14]:
# Predictions for the validation set
valid_pred = model.predict(valid_x)

In [None]:
# Feature importances
model.feature_importances_

In [None]:
# Predictions - how many times each value was predicted
pd.Series(valid_pred).value_counts()

In [17]:
from sklearn.tree import plot_tree

In [None]:
# Draw the fitted tree
plot_tree(model)

In [None]:
# ocena


In [24]:
# Objective function
def objective(trial: optuna.Trial):
    """Score one sampled model: a decision tree or a logistic regression.

    The trial first picks ``model_type`` (``'DT'`` or ``'LR'``) and then only
    the hyperparameters that belong to that family. The chosen estimator is
    fitted on ``train_x`` / ``train_y`` and the recall of its predictions for
    ``test_x`` against ``test_y`` is returned.

    NOTE(review): the score is computed on the test split, while the
    validation split is used for the later check of the fitted model —
    confirm that these roles are intended.
    """
    model_type = trial.suggest_categorical('model_type',['DT','LR'])
    if model_type=='DT':
        params = {
            'criterion': trial.suggest_categorical('criterion',['gini','entropy','log_loss']),
            'max_depth': trial.suggest_int('max_depth',3,30),
            'min_samples_split': trial.suggest_int('min_samples_split',5,100),
            'max_leaf_nodes': trial.suggest_int('max_leaf_nodes',20,100),
            'min_impurity_decrease': trial.suggest_float('min_impurity_decrease',0,0.01)
        }
        model = DecisionTreeClassifier(**params).fit(train_x,train_y)
    else:
        params = {'penalty': trial.suggest_categorical('penalty',['l1','l2']),
                  # An integer Cs is the number of regularization strengths
                  # on the grid and must be at least 1, so 0 is excluded
                  # from the search range.
                  'Cs': trial.suggest_int('Cs',1,50)}
        model=LogisticRegressionCV(**params,max_iter=10000,cv=3,solver ='liblinear').fit(train_x,train_y)
    preds = model.predict(test_x)
    return recall_score(test_y,preds)


In [None]:
# Create the study (objective value is maximized) and run 20 trials
study = optuna.create_study(direction='maximize',study_name='DT_LR_optimization')
study.optimize(objective,n_trials=20)

In [26]:
# Store the parameters of the best trial in a variable
best_params = study.best_params

In [None]:
# Model type selected in the best trial
best_params['model_type']

In [28]:
# Remove the model type: it is a search-space switch, not an estimator argument
del best_params['model_type']

In [None]:
# Parameters left after removing the model type
best_params

In [30]:
# Model: rebuild the estimator family that won the search.
# When the best trial is 'LR', the parameters are 'penalty' and 'Cs', which
# DecisionTreeClassifier does not accept, so the class must follow the
# model type. The type is read from the study; 'model_type' is filtered out
# of the keyword arguments in case it is still present in `best_params`.
best_model_type = study.best_params['model_type']
estimator_params = {name: value for name, value in best_params.items() if name != 'model_type'}
if best_model_type == 'LR':
    # Same fixed settings as used for logistic regression in the objective
    model = LogisticRegressionCV(**estimator_params, max_iter=10000, cv=3, solver='liblinear').fit(train_x, train_y)
else:
    model = DecisionTreeClassifier(**estimator_params).fit(train_x, train_y)

In [31]:
# Predictions for the validation set
valid_pred = model.predict(valid_x)

In [None]:
# Accuracy on the validation set
accuracy_score(valid_y,valid_pred)

In [None]:
# Recall on the validation set
recall_score(valid_y,valid_pred)