# Лабораторная работа №3

# Решающие деревья и ансамбли

In [1]:
from typing import Dict

import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import StackingRegressor

from catboost import CatBoostRegressor
from catboost import CatBoostClassifier

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import r2_score

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

ModuleNotFoundError: No module named 'catboost'

#### Универсальная функция для обучения и подбора параметров к моделям различного типа

In [6]:
def optimizer(X: pd.DataFrame, y: pd.DataFrame, estimator, params: Dict, type_of_model) -> None:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    model = GridSearchCV(estimator, params, cv=3).fit(X_train, y_train)

    y_pred = model.predict(X_test)
    
    print("[+] Estimator:", estimator)
    print("[+] Best params:", model.best_params_)

    if type_of_model == "regression":
        print("[+] MAE:", mean_absolute_error(y_pred, y_test))
        print("[+] MSE:", mean_squared_error(y_pred, y_test))
        print("[+] MAPE:", mean_absolute_percentage_error(y_pred, y_test))
        print("[+] R2-Score:", r2_score(y_pred, y_test))

    elif type_of_model == "classification":
        print("[+] Accuracy:", accuracy_score(y_test, y_pred))
        print("[+] Precision:", precision_score(y_test, y_pred))
        print("[+] Recall:", recall_score(y_test, y_pred))
        print("[+] F1-Score:", f1_score(y_test, y_pred))
        print("[+] ROC-AUC Score:", roc_auc_score(y_test, y_pred))

## Применение деревьев решений для решения задач регрессии

#### Подготовка и обработка данных

In [3]:
regression_data = pd.read_csv("data/houses.csv")

regression_data.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [4]:
regression_data["date"] = [date[:8] for date in regression_data["date"]]

In [5]:
regression_data.shape, regression_data.dropna().shape

((21613, 21), (21613, 21))

In [6]:
X_regression = regression_data.drop(["price"], axis=1)
y_regression = regression_data["price"]

#### Обучение моделей и оптимизация их параметров

In [7]:
optimizer(X_regression, y_regression, DecisionTreeRegressor(), {"max_depth": range(1, 10)}, "regression")

[+] Estimator: DecisionTreeRegressor()
[+] Best params: {'max_depth': 9}
[+] MAE: 95833.52644984219
[+] MSE: 30821486551.28345
[+] MAPE: 0.16659603601583356
[+] R2-Score: 0.7557602570198915


In [8]:
optimizer(
    X_regression,
    y_regression,
    BaggingRegressor(),
    {
        "max_samples": range(1, 10)
    },
    "regression"
)

[+] Estimator: BaggingRegressor()
[+] Best params: {'max_samples': 9}
[+] MAE: 188795.87365823565
[+] MSE: 85572213046.49902
[+] MAPE: 0.32288369535915273
[+] R2-Score: -0.7361220339746759


In [9]:
optimizer(
    X_regression,
    y_regression,
    GradientBoostingRegressor(),
    {
        "loss": [
            "squared_error",
            "absolute_error",
            "huber",
            "quantile"
        ],
        "learning_rate": [0.01, 0.1, 1],
    },
    "regression"
)

[+] Estimator: GradientBoostingRegressor()
[+] Best params: {'learning_rate': 0.1, 'loss': 'squared_error'}
[+] MAE: 80384.18490718286
[+] MSE: 20104624958.350227
[+] MAPE: 0.13998582822621353
[+] R2-Score: 0.8336209634447188


In [10]:
optimizer(
    X_regression,
    y_regression,
    StackingRegressor(estimators=[("lr", LinearRegression())], verbose=False),
    {
        "cv": range(1, 10)
    },
    "regression"
)

3 fits failed out of a total of 27.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/lib/python3.10/site-packages/sklearn/ensemble/_stacking.py", line 958, in fit
    return super().fit(X, y, sample_weight)
  File "/usr/lib/python3.10/site-packages/sklearn/ensemble/_stacking.py", line 190, in fit
    self._validate_params()
  File "/usr/lib/python3.10/site-packages/sklearn/base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "/usr/lib/python3.10/site-packages/sklea

[+] Estimator: StackingRegressor(estimators=[('lr', LinearRegression())], verbose=False)
[+] Best params: {'cv': 5}
[+] MAE: 10879982.147748347
[+] MSE: 220433846138657.0
[+] MAPE: 1.2327912564866694
[+] R2-Score: 0.04088915694280748


In [11]:
optimizer(
    X_regression,
    y_regression,
    CatBoostRegressor(verbose=False),
    {
        "max_depth": range(1, 10)
    },
    "regression"
)

[+] Estimator: <catboost.core.CatBoostRegressor object at 0x7ffaa6d9b700>
[+] Best params: {'max_depth': 4}
[+] MAE: 67298.31135806702
[+] MSE: 15365355343.599192
[+] MAPE: 0.1188380300468967
[+] R2-Score: 0.8878922433313643


## Применение деревьев решений для задач классификации

#### Подготовка и обработка данных

In [2]:
classification_data = pd.read_csv("data/card_transdata.csv")

classification_data.head()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
0,57.877857,0.31114,1.94594,1.0,1.0,0.0,0.0,0.0
1,10.829943,0.175592,1.294219,1.0,0.0,0.0,0.0,0.0
2,5.091079,0.805153,0.427715,1.0,0.0,0.0,1.0,0.0
3,2.247564,5.600044,0.362663,1.0,1.0,0.0,1.0,0.0
4,44.190936,0.566486,2.222767,1.0,1.0,0.0,1.0,0.0


In [3]:
classification_data.shape, classification_data.dropna().shape

((1000000, 8), (1000000, 8))

In [4]:
X_classification = classification_data.drop(["fraud"], axis=1)
y_classification = classification_data["fraud"]

#### Обучение моделей и оптимизация их параметров

In [15]:
optimizer(X_classification, y_classification, DecisionTreeClassifier(), {"criterion": ["gini", "entropy"], "max_depth": range(1, 10)}, "classification")

[+] Estimator: DecisionTreeClassifier()
[+] Best params: {'criterion': 'entropy', 'max_depth': 8}
[+] Accuracy: 0.99999
[+] Precision: 0.9999617268830373
[+] Recall: 0.9999234566956255
[+] F1-Score: 0.9999425914231586
[+] ROC-AUC Score: 0.9999599026707605


In [16]:
optimizer(
    X_classification,
    y_classification,
    BaggingClassifier(),
    {
        "max_samples": range(1, 10)
    },
    "classification"
)



[+] Estimator: BaggingClassifier()
[+] Best params: {'max_samples': 7}
[+] Accuracy: 0.0023133867982726712


ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].

In [None]:
optimizer(
    X_classification,
    y_classification,
    GradientBoostingClassifier(),
    {
        "loss": [
            "log_loss",
            "deviance",
            "exponential",
        ],
        "learning_rate": [0.01, 0.1, 1],
    },
    "classification"
)



In [None]:
optimizer(
    X_classification,
    y_classification,
    StackingClassifier(estimators=[("lr", LogisticRegression())], verbose=False),
    {
        "cv": range(1, 10)
    },
    "classification"
)

In [None]:
optimizer(X_classification, y_classification, CatBoostClassifier(verbose=False), {"max_depth": range(1, 10)}, "classification")