In [None]:
# Install CatBoost and enable the required notebook extensions (uncomment to run)
# !pip install catboost
# !pip install ipywidgets
# !jupyter nbextension enable --py widgetsnbextension

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the tab-separated dataset from ./data and preview it
df = pd.read_csv("./data/mushrooms_small.tsv", sep='\t')
df

In [None]:
# Column names of the loaded frame
df.columns

In [None]:
# Per-column dtypes and non-null counts
df.info()

In [None]:
# Number of distinct categories in each column
df.nunique()

In [None]:
# Count of missing values per column
df.isna().sum()

In [None]:
X, y = df.drop(columns='class'), df['class'] # separate the labels from the features

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=3+2+2024)

# Обучим как-то простую модель

In [None]:
# Class balance of the target
y.value_counts()

In [None]:
# One-hot encode the features; drop_first drops the first dummy level of every feature
X_encoded = pd.get_dummies(X, drop_first=True, dtype=int)
# Binary target: 'p' -> 1, 'e' -> 0 (any label outside this mapping would become NaN)
y_encoded = y.map({'p':1, 'e':0})

In [None]:
# Inspect the generated dummy columns
X_encoded.columns

In [None]:
# Hold out 25% of the one-hot encoded data for testing; the seed is fixed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.25, random_state=3+2+2024)

In [None]:
# Sanity-check the sizes of the two splits
X_train.shape, X_test.shape

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
from sklearn.metrics import f1_score

In [None]:
# Baseline classifiers with default hyper-parameters, keyed by a short display name.
# The tree-based models are randomized, so they are seeded (same seed expression as
# the train/test split) to make the scores below reproducible between runs.
models = {
    "LogReg": LogisticRegression(),
    "KNN": KNeighborsClassifier(),
    "SVM": SVC(),
    "Tree": DecisionTreeClassifier(random_state=3+2+2024),
    "RandomForest": RandomForestClassifier(random_state=3+2+2024),
}

In [None]:
# Fit every baseline on the training split and record its F1 score on the test split.
prefit = {}
for model_name, model in models.items():
    # scikit-learn's fit() returns the estimator itself, so fit and predict can be chained
    y_pred = model.fit(X_train, y_train).predict(X_test)
    prefit[model_name] = {"model": model, "score": f1_score(y_test, y_pred)}
prefit

In [None]:
# Rank the one-hot features by the magnitude of their logistic-regression coefficient.
imp = (
    pd.DataFrame({'cols': X_train.columns, 'coef': prefit['LogReg']['model'].coef_[0]})
    .assign(abs=lambda frame: frame['coef'].abs())
    .sort_values(by='abs', ascending=False)
)
imp.head(10)

# Catboost

In [None]:
import catboost
# Record the installed CatBoost version for reproducibility
catboost.__version__

In [None]:
from catboost import CatBoostClassifier, Pool

In [None]:
# Create a simple model
# https://catboost.ai/en/docs/concepts/python-reference_catboostclassifier

model = CatBoostClassifier(iterations=100,
                           depth=3,
                           learning_rate=0.5,
                   # if the target is a class label use Logloss; if the target is a probability use CrossEntropy
                           loss_function='Logloss', 
                           verbose=True)

In [None]:
# Re-split using the raw (not one-hot encoded) features, with the same test_size and seed as the earlier split
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.25, random_state=3+2+2024)

In [None]:
# Wrap the splits in CatBoost Pools; every column is declared categorical.
# Both splits come from the same frame, so they share one list of column names.
categorical_columns = X_train.columns.values
data_train = Pool(data=X_train, label=y_train, cat_features=categorical_columns)
data_test = Pool(data=X_test, label=y_test, cat_features=categorical_columns)

In [None]:
# Train the model
model.fit(data_train)

In [None]:
# Print progress less often (every 10th iteration)
model.fit(data_train, 
         metric_period = 10)

In [None]:
# Interactive training monitor (plot=True) with text logging silenced
model.fit(data_train, 
          plot=True, 
          silent=True)

In [None]:
# Quick summary of the trained model: fitted flag, constructor parameters, best scores.
print(f"Model is fitted: {model.is_fitted()}")
print(model.get_params())
print(f"Best score: {model.best_score_}")

## Добавим в монитор другие метрики

In [None]:
# Train the model while also tracking AUC, F1 and Accuracy (custom_loss) alongside the Logloss objective
model = CatBoostClassifier(iterations=100,
                                   depth=3,
                                   learning_rate=0.5,
                                   loss_function='Logloss', 
                                   custom_loss=['AUC', 'F1', 'Accuracy'])

model.fit(data_train, 
          plot=True, 
          silent=True)
         

## Добавим валидацию

In [None]:
# Train the model, this time also evaluating on a held-out set (eval_set)
model = CatBoostClassifier(iterations=100,
                                   depth=3,
                                   learning_rate=0.5,
                                   loss_function='Logloss', 
                                   custom_loss=['AUC', 'F1', 'Accuracy'])

model.fit(data_train, 
          # plot=True, 
          # silent=True,
          eval_set=data_test)
         

In [None]:
# Same fit, logging only every 10th iteration
model.fit(data_train, eval_set=data_test, metric_period=10)

In [None]:
# Double the number of trees, to 200; F1 is the evaluation metric
model = CatBoostClassifier(iterations=200,
                                   depth=3,
                                   learning_rate=0.5,
                                   loss_function='Logloss', 
                                   eval_metric='F1')

model.fit(data_train, 
          # plot=True, 
          # silent=True,
          eval_set=data_test,
          metric_period = 25)
         

In [None]:
# Logloss is what gets optimized, yet our target metrics stop improving from one of the very first iterations

In [None]:
# Switch the evaluation metric to AUC.
# The objective being optimized is still Logloss (loss_function); eval_metric only
# selects the metric used for overfitting detection and best-model selection.
model = CatBoostClassifier(iterations=100,
                                   depth=3,
                                   learning_rate=0.5,
                                   loss_function='Logloss', 
                                   eval_metric='AUC')

# The fit below is intentionally left commented out, so this model is never trained here.
# model.fit(data_train, 
#           plot=True, 
#           silent=True,
#           eval_set=data_test)
         

In [None]:
# Use F1 as the evaluation metric, with up to 200 iterations
model = CatBoostClassifier(iterations=200,
                                   depth=3,
                                   learning_rate=0.5,
                                   loss_function='Logloss', 
                                   eval_metric='F1')

# Stop training early if the evaluation metric has NOT IMPROVED for 20 consecutive iterations
model.fit(data_train, 
          eval_set=data_test,
          early_stopping_rounds=20)
         

# Сравним модели

In [None]:
# Two models that differ only in learning rate. Each one logs to its own
# train_dir so the learning curves can be compared afterwards.
common_f1_params = dict(iterations=100, depth=3, loss_function='Logloss', eval_metric='F1')
fit_kwargs = dict(eval_set=data_test, silent=True)

model1 = CatBoostClassifier(learning_rate=0.5, train_dir='lr0.5', **common_f1_params)
# NOTE(review): train_dir 'lr1' does not match learning_rate=0.01; the name is kept
# because the MetricVisualizer call in the following cell reads this directory.
model2 = CatBoostClassifier(learning_rate=0.01, train_dir='lr1', **common_f1_params)

model1.fit(data_train, **fit_kwargs)
model2.fit(data_train, **fit_kwargs)

In [None]:
from catboost import MetricVisualizer
# Plot the training logs stored in the two train_dir folders side by side
MetricVisualizer(['lr0.5', 'lr1']).start()

# Кросс-валидация

In [None]:
from catboost import cv

# Model parameters used for every fold
params = {'loss_function': 'Logloss',
          'iterations': 100,
          'eval_metric': 'F1',
          'learning_rate':0.2}

# 5-fold cross-validation on the training pool; shuffling uses a fixed seed
cv_data = cv(
    params = params,
    pool = data_train,
    fold_count = 5,
    shuffle = True,
    partition_random_seed = 42,
    verbose=False,
   # plot = True,
   # stratified=True # class proportions in every fold are the same as in the full data
)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

# Candidate learning rates to compare
param_grid = {
    "learning_rate": [0.0001, 0.01, 0.25, 0.7]
}

# cat_features is given to the estimator itself because the grid search below
# is fitted on the raw X_train / y_train rather than on a Pool
model = CatBoostClassifier(
    iterations = 50,
    metric_period=50,
    verbose=True,
    eval_metric='F1',
    cat_features=X_train.columns.values
)

# Rank candidates by F1 (higher is better)
scorer = make_scorer(f1_score, greater_is_better=True)

# 5-fold grid search, run sequentially (n_jobs=1)
grid_search = GridSearchCV(model, param_grid=param_grid, cv=5, verbose=True, n_jobs=1, scoring=scorer)
results = grid_search.fit(X_train, y_train)

In [None]:
# Estimator selected by the grid search
results.best_estimator_

In [None]:
# Parameters of the selected estimator
results.best_estimator_.get_params()

# Предскажем и оценим

In [None]:
# Final model: F1 as the evaluation metric, at most 200 trees,
# stop after 20 iterations without improvement on the evaluation set.
model = CatBoostClassifier(
    eval_metric='F1',
    iterations=200,
    learning_rate=0.5,
    early_stopping_rounds=20
)

# Fit on the TRAINING pool; the test pool is used only as the evaluation set.
# Fitting on data_test itself would leak the evaluation data into training and
# make every score and prediction computed on the test split meaningless.
model.fit(
    data_train,
    eval_set=data_test,
#     silent=True,
#     plot=True
)

In [None]:
# Parameters the model was constructed with
model.get_params()

In [None]:
# Predicted class labels for the test objects
model.predict(X_test)

In [None]:
model.predict_proba(X_test) # probabilities (obtained by applying a sigmoid to the values produced in the leaves)

In [None]:
# Raw values of the prediction function (before the sigmoid)
model.predict(X_test, prediction_type='RawFormulaVal')

$$ S(x) = \frac{1}{1 + e^{-x}}$$

In [None]:
def sigmoid(x):
    """Hand-written logistic function 1 / (1 + exp(-x)), applied element-wise."""
    return 1 / (1 + np.exp(-x))


# Raw (pre-sigmoid) model outputs for the test objects
raw = model.predict(X_test, prediction_type='RawFormulaVal')

# Turn the raw scores into probabilities by hand
probabilities = sigmoid(raw)

In [None]:
# Number of test objects whose hand-computed probability differs from predict_proba.
# The comparison uses a floating-point tolerance: the two values come from different
# sigmoid implementations, so exact `!=` can report spurious last-bit mismatches.
# A result of 0 means the target-class probabilities agree everywhere.
(~np.isclose(model.predict_proba(X_test)[:, 1], probabilities)).sum()

 # Feature importance

In [None]:
# Feature importances as a table (prettified=True)
model.get_feature_importance(prettified=True)

In [None]:
# Total of the Importances column
model.get_feature_importance(prettified=True).Importances.sum()

# SHAP

https://medium.com/dataman-in-ai/explain-your-model-with-the-shap-values-bc36aac4de3d
<img src="https://raw.githubusercontent.com/slundberg/shap/master/docs/artwork/mnist_image_plot.png" />

In [None]:
# !pip install shap

In [None]:
# SHAP values of the model for every object in the test pool
shap_values = model.get_feature_importance(data_test, 'ShapValues')
shap_values

In [None]:
# Test split shape, for comparison with the SHAP array below
X_test.shape

In [None]:
# Shape of the SHAP array (compare with X_test.shape)
shap_values.shape

In [None]:
# Take the last column of row 1 as the expected (base) value used by the force plots below
expected_value = shap_values[1, -1]
expected_value

In [None]:
# Keep only the per-feature SHAP columns and drop the trailing expected-value column.
# Slicing to the feature count (instead of [:, :-1]) makes this cell safe to re-run:
# a second execution no longer strips a real feature column.
shap_values = shap_values[:, :X_test.shape[1]]

In [None]:
# SHAP vector of a single object (row 1)
shap_values[1, :].shape

In [None]:
# True label of the object explained in the next force plot
y_test.values[1]

In [None]:
import shap

# Load the JS needed to render SHAP plots in the notebook, then explain test object 1
shap.initjs()
shap.force_plot(expected_value, shap_values[1, :], X_test.iloc[1, :])

In [None]:
# True label of the object explained in the next force plot
y_test.values[5]

In [None]:
shap.initjs() # an example where the features make a "negative" contribution
shap.force_plot(expected_value, shap_values[5, :], X_test.iloc[5, :])

In [None]:
# Recompute the full SHAP array (this brings back the last column that was sliced off earlier)
shap_values = model.get_feature_importance(data_test, 'ShapValues')
shap_values

In [None]:
shap_values = model.get_feature_importance(data_test, 'ShapValues')
shap_values = shap_values[:, :-1] # the last column is the mean value of the "function" (identical for every object: it is the constant term)
shap.summary_plot(shap_values, X_test)

# окраски нет, потому что данные категориальные, а их кодирование в числа происходит внутри 

In [None]:
# Shape of the test split used for the summary plot
X_test.shape

# Практика

In [None]:
from sklearn.datasets import load_digits # scikit-learn's small 8x8 handwritten-digits dataset (MNIST-like, not MNIST itself)

In [None]:
# Load all 10 digit classes as arrays (as_frame=False); note this rebinds X and y from the previous dataset
X, y = load_digits(n_class=10, return_X_y=True, as_frame=False)

In [None]:
# Feature matrix shape
X.shape

In [None]:
# Label vector shape
y.shape

In [None]:
# Show the first ten samples as 8x8 images on a 2x5 grid.
plt.figure(figsize=(16, 6))
for position, pixels in enumerate(X[:10], start=1):
    plt.subplot(2, 5, position)
    plt.imshow(pixels.reshape([8, 8]));

In [None]:
# Hold out 25% of the digits for testing, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=15062023, test_size=0.25)

# Многоклассовая логистическая регрессия

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Fit the scaler on the training split only, then apply the same transform to the test split
X_train_sc = scaler.fit_transform(X_train)
X_test_sc = scaler.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# Baseline: logistic regression on the raw (unscaled) features
model = LogisticRegression()

model.fit(X_train, y_train)

accuracy_score(y_test, model.predict(X_test))

In [None]:
# Same model on the standardized features, for comparison with the unscaled run
model = LogisticRegression()

model.fit(X_train_sc, y_train)

accuracy_score(y_test, model.predict(X_test_sc))

# SVM

# Метод K-ближайших соседей

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Neighbour counts to try
params = {'n_neighbors': [2, 3, 4, 5, 7, 10, 15]}

# Cross-validated search over k on the raw features; fit() returns the fitted search object
model = GridSearchCV(KNeighborsClassifier(), params, verbose=False).fit(X_train, y_train)

print(model.best_params_)
# The search refits the best estimator by default, so predict() delegates to it
accuracy_score(y_test, model.predict(X_test))

In [None]:
# Neighbour counts to try
params = {'n_neighbors': [2, 3, 4, 5, 7, 10, 15]}

# Cross-validated search over k on the standardized features; fit() returns the fitted search object
model = GridSearchCV(KNeighborsClassifier(), params, verbose=False).fit(X_train_sc, y_train)

print(model.best_params_)
# The search refits the best estimator by default, so predict() delegates to it
accuracy_score(y_test, model.predict(X_test_sc))

# Дерево решений

# Случайный лес

# Градиентный бустинг решающих деревьев

In [None]:
from catboost import CatBoostClassifier, Pool

In [None]:
# Wrap the digits splits in Pools; no cat_features are declared this time
data_train = Pool(data=X_train, label=y_train)
data_test = Pool(data=X_test, label=y_test)

In [None]:
# Up to 100 trees; stop if the evaluation metric has not improved for 10 iterations
# NOTE(review): the target loaded above has 10 classes, so 'AUC' here is evaluated in a multiclass setting; confirm this is the intended metric
model = CatBoostClassifier(iterations=100,
                           early_stopping_rounds=10,
                           eval_metric='AUC')

In [None]:
# Train on the training pool, evaluating on the test pool
model.fit(data_train, eval_set=data_test)

In [None]:
# Accuracy on the test split
accuracy_score(y_test, model.predict(data_test))

In [None]:
# NOTE: shap_values has not been recomputed for the digits model yet, so this is still the shape left over from the previous model
shap_values.shape

In [None]:
# SHAP values of the digits model on the test pool
shap_values = model.get_feature_importance(data_test, 'ShapValues')

In [None]:
# Shape of the SHAP array for the multiclass model
shap_values.shape

In [None]:
# Keep only the per-feature SHAP values. The last entry along the final axis is the
# mean value of the "function" (the constant term, identical for every object).
# Slicing to the feature count (instead of [:, :, :-1]) keeps this cell idempotent:
# re-running it no longer strips a real feature.
shap_values = shap_values[:, :, :X_test.shape[1]]

In [None]:
# Shape after dropping the constant-term entry
shap_values.shape

In [None]:

# Summary plot of the SHAP values at index 3 of the second (class) axis
shap.summary_plot(shap_values[:, 3, :], X_test)

# Note: unlike the categorical example earlier, the features here are numeric pixel intensities, so the points in this plot are coloured by feature value.