# Лабораторная работа №3. Решающее дерево (Decision Tree)

Используются те же датасеты:

- **Классификация:** съедобность грибов
- **Регрессия:** популярность треков

Цели лабораторной:

1. Построить базовые модели решающего дерева для задач классификации и регрессии
2. Оценить качество по выбранным метрикам и сравнить с KNN и логистической/линейной регрессией
3. Улучшить бейзлайн с помощью настройки гиперпараметров (max_depth, min_samples_leaf и др.)


In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    mean_absolute_error, mean_squared_error, r2_score,
    root_mean_squared_error,  # requires scikit-learn >= 1.4
    classification_report
)
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

# Single seed reused for the train/test splits and the tree models below,
# so the reported metrics can be reproduced on a fresh kernel.
RANDOM_STATE = 42
# Also seed NumPy's legacy global RNG for any code that relies on it.
np.random.seed(RANDOM_STATE)


In [2]:
# Load both datasets from one data directory (relative to the notebook).
DATA_DIR = Path("../data")
mushroom_path = DATA_DIR / "mushroom-data.csv"
spotify_path = DATA_DIR / "modified-spotify-dataset.csv"

mushroom_df = pd.read_csv(mushroom_path)
spotify_df = pd.read_csv(spotify_path)

print("Mushrooms shape:", mushroom_df.shape)
print("Spotify shape:", spotify_df.shape)

# Show each preview as its own table: a bare tuple of DataFrames is rendered
# as a single plain-text repr that is hard to read.
# `display` is provided by the IPython/Jupyter kernel.
display(mushroom_df.head(), spotify_df.head())

Mushrooms shape: (8124, 23)
Spotify shape: (6987, 159)


(  Mushroom_quality cap_shape cap_surface cap_color bruises odor  \
 0                p         x           s         n       t    p   
 1                e         x           s         y       t    a   
 2                e         b           s         w       t    l   
 3                p         x           y         w       t    p   
 4                e         x           s         g       f    n   
 
   gill_attachment gill_spacing gill_size gill_color  ...  \
 0               f            c         n          k  ...   
 1               f            c         b          k  ...   
 2               f            c         b          n  ...   
 3               f            c         n          n  ...   
 4               f            w         b          k  ...   
 
   stalk_surface_below_ring stalk_color_above_ring stalk_color_below_ring  \
 0                        s                      w                      w   
 1                        s                      w                  

In [3]:
# Mushroom classification: separate the target column from the features.
target_col_mush = "Mushroom_quality"
y_mush = mushroom_df[target_col_mush]
X_mush = mushroom_df.drop(columns=target_col_mush)

# 80/20 hold-out split, stratified on the target so that both parts keep
# the same class proportions.
mush_split = train_test_split(
    X_mush,
    y_mush,
    test_size=0.2,
    stratify=y_mush,
    random_state=RANDOM_STATE,
)
X_mush_train, X_mush_test, y_mush_train, y_mush_test = mush_split

print("Train mushrooms:", X_mush_train.shape, "Test:", X_mush_test.shape)


Train mushrooms: (6499, 22) Test: (1625, 22)


In [4]:
# Spotify regression: target column and columns excluded from the features.
target_col_spot = "popularity"

# Identifier/text columns plus `popularity_level`
# (presumably derived from the target, i.e. a leak -- TODO confirm).
cols_to_drop = [
    "track_id", "track_name", "artist_name", "album_name", "popularity_level"
]
# Keep only the names actually present so `drop` does not raise on a missing column.
cols_to_drop = [c for c in cols_to_drop if c in spotify_df.columns]

features_spot_df = spotify_df.drop(columns=[target_col_spot] + cols_to_drop)
y_spot = spotify_df[target_col_spot]

# "number" matches every numeric dtype (int32, float32, ... as well as
# int64/float64), so numeric columns of other widths are not missed.
numeric_features_spot = features_spot_df.select_dtypes(include="number").columns.tolist()
categorical_features_spot = features_spot_df.select_dtypes(include=["object", "category", "bool"]).columns.tolist()

# Columns in neither list are silently dropped by a ColumnTransformer that
# keeps its default `remainder`, so make any such column visible here.
covered_features_spot = set(numeric_features_spot) | set(categorical_features_spot)
uncovered_features_spot = [
    c for c in features_spot_df.columns if c not in covered_features_spot
]
if uncovered_features_spot:
    print("WARNING: columns not assigned to any feature group:", uncovered_features_spot)

# 80/20 hold-out split (no stratification: the target is continuous).
X_spot_train, X_spot_test, y_spot_train, y_spot_test = train_test_split(
    features_spot_df, y_spot, test_size=0.2, random_state=RANDOM_STATE
)

print("Numeric features (Spotify):", len(numeric_features_spot))
print("Categorical features (Spotify):", len(categorical_features_spot))
print("Train Spotify:", X_spot_train.shape, "Test:", X_spot_test.shape)


Numeric features (Spotify): 156
Categorical features (Spotify): 1
Train Spotify: (5589, 157) Test: (1398, 157)


In [5]:
# Baseline classifier: one-hot encode every mushroom feature column and fit
# a decision tree with default hyperparameters.
mush_feature_cols = X_mush_train.columns.tolist()
mush_preprocess = ColumnTransformer(
    transformers=[
        # Categories not seen during fit are ignored instead of raising.
        ("cat", OneHotEncoder(handle_unknown="ignore"), mush_feature_cols),
    ]
)

dt_clf_baseline = Pipeline(
    steps=[
        ("preprocess", mush_preprocess),
        ("model", DecisionTreeClassifier(random_state=RANDOM_STATE)),
    ]
)

# Pipeline.fit returns the fitted pipeline, so fit and predict can be chained.
y_mush_pred_dt = dt_clf_baseline.fit(X_mush_train, y_mush_train).predict(X_mush_test)

# Hold-out metrics; precision/recall/F1 are macro-averaged over the classes.
macro_avg = {"average": "macro"}
acc_dt = accuracy_score(y_mush_test, y_mush_pred_dt)
prec_dt = precision_score(y_mush_test, y_mush_pred_dt, **macro_avg)
rec_dt = recall_score(y_mush_test, y_mush_pred_dt, **macro_avg)
f1_dt = f1_score(y_mush_test, y_mush_pred_dt, **macro_avg)

print("=== Decision Tree (Mushrooms, baseline) ===")
print(f"Accuracy: {acc_dt:.4f}")
print(f"Precision (macro): {prec_dt:.4f}")
print(f"Recall (macro): {rec_dt:.4f}")
print(f"F1-macro: {f1_dt:.4f}")
print("\nClassification report:")
print(classification_report(y_mush_test, y_mush_pred_dt))


=== Decision Tree (Mushrooms, baseline) ===
Accuracy: 1.0000
Precision (macro): 1.0000
Recall (macro): 1.0000
F1-macro: 1.0000

Classification report:
              precision    recall  f1-score   support

           e       1.00      1.00      1.00       842
           p       1.00      1.00      1.00       783

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625



In [7]:
# Baseline regressor for Spotify popularity: a decision tree with default
# hyperparameters. Numeric columns are passed through unchanged (no scaling
# step is used for the tree); categorical columns are one-hot encoded and
# categories not seen during fit are ignored.
spot_preprocess_tree = ColumnTransformer(
    transformers=[
        ("num", "passthrough", numeric_features_spot),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features_spot),
    ]
)

dt_reg_baseline = Pipeline(
    steps=[
        ("preprocess", spot_preprocess_tree),
        ("model", DecisionTreeRegressor(random_state=RANDOM_STATE))
    ]
)

dt_reg_baseline.fit(X_spot_train, y_spot_train)
y_spot_pred_dt = dt_reg_baseline.predict(X_spot_test)

# Hold-out regression metrics.
mae_dt = mean_absolute_error(y_spot_test, y_spot_pred_dt)
rmse_dt = root_mean_squared_error(y_spot_test, y_spot_pred_dt)
r2_dt = r2_score(y_spot_test, y_spot_pred_dt)

print("=== Decision Tree (Spotify, baseline) ===")
print(f"MAE: {mae_dt:.4f}")
print(f"RMSE: {rmse_dt:.4f}")
print(f"R2: {r2_dt:.4f}")


=== Decision Tree (Spotify, baseline) ===
MAE: 0.2390
RMSE: 0.4636
R2: 0.7942


In [8]:
# Grid search over tree hyperparameters for the mushroom classifier,
# selected by 3-fold cross-validated macro F1 on the training split.
dt_clf_pipeline = Pipeline(
    steps=[
        ("preprocess", mush_preprocess),
        ("model", DecisionTreeClassifier(random_state=RANDOM_STATE)),
    ]
)

# The "model__" prefix routes each parameter to the pipeline's "model" step.
param_grid_dt_clf = {
    "model__max_depth": [3, 5, 7, 10, None],
    "model__min_samples_split": [2, 5, 10],
    "model__min_samples_leaf": [1, 2, 5],
    "model__criterion": ["gini", "entropy"],  # or "log_loss" in newer scikit-learn versions
}

grid_dt_clf = GridSearchCV(
    estimator=dt_clf_pipeline,
    param_grid=param_grid_dt_clf,
    scoring="f1_macro",
    cv=3,
    n_jobs=-1,
)
grid_dt_clf.fit(X_mush_train, y_mush_train)
print("Лучшие параметры (DecisionTree Mushrooms):", grid_dt_clf.best_params_)

# Best pipeline found by the search, evaluated once on the hold-out test split.
best_dt_clf = grid_dt_clf.best_estimator_
y_mush_pred_dt_best = best_dt_clf.predict(X_mush_test)

macro_avg_b = {"average": "macro"}
acc_dt_b = accuracy_score(y_mush_test, y_mush_pred_dt_best)
prec_dt_b = precision_score(y_mush_test, y_mush_pred_dt_best, **macro_avg_b)
rec_dt_b = recall_score(y_mush_test, y_mush_pred_dt_best, **macro_avg_b)
f1_dt_b = f1_score(y_mush_test, y_mush_pred_dt_best, **macro_avg_b)

print("=== Decision Tree (Mushrooms, improved) ===")
print(f"Accuracy: {acc_dt_b:.4f}")
print(f"Precision (macro): {prec_dt_b:.4f}")
print(f"Recall (macro): {rec_dt_b:.4f}")
print(f"F1-macro: {f1_dt_b:.4f}")


Лучшие параметры (DecisionTree Mushrooms): {'model__criterion': 'gini', 'model__max_depth': 5, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2}
=== Decision Tree (Mushrooms, improved) ===
Accuracy: 0.9988
Precision (macro): 0.9988
Recall (macro): 0.9987
F1-macro: 0.9988


In [10]:
# Grid search over tree hyperparameters for the Spotify regressor, selected
# by 3-fold cross-validated MAE (passed as the negated score
# "neg_mean_absolute_error").
dt_reg_pipeline = Pipeline(
    steps=[
        ("preprocess", spot_preprocess_tree),
        ("model", DecisionTreeRegressor(random_state=RANDOM_STATE)),
    ]
)

# The "model__" prefix routes each parameter to the pipeline's "model" step.
param_grid_dt_reg = {
    "model__max_depth": [3, 5, 10, None],
    "model__min_samples_split": [2, 10, 20],
    "model__min_samples_leaf": [1, 5, 10],
}

grid_dt_reg = GridSearchCV(
    estimator=dt_reg_pipeline,
    param_grid=param_grid_dt_reg,
    scoring="neg_mean_absolute_error",
    cv=3,
    n_jobs=-1,
)
grid_dt_reg.fit(X_spot_train, y_spot_train)
print("Лучшие параметры (DecisionTree Spotify):", grid_dt_reg.best_params_)

# Best pipeline found by the search, evaluated once on the hold-out test split.
best_dt_reg = grid_dt_reg.best_estimator_
y_spot_pred_dt_best = best_dt_reg.predict(X_spot_test)

spot_eval_best = (y_spot_test, y_spot_pred_dt_best)
mae_dt_b = mean_absolute_error(*spot_eval_best)
rmse_dt_b = root_mean_squared_error(*spot_eval_best)
r2_dt_b = r2_score(*spot_eval_best)

print("=== Decision Tree (Spotify, improved) ===")
print(f"MAE: {mae_dt_b:.4f}")
print(f"RMSE: {rmse_dt_b:.4f}")
print(f"R2: {r2_dt_b:.4f}")


Лучшие параметры (DecisionTree Spotify): {'model__max_depth': 10, 'model__min_samples_leaf': 1, 'model__min_samples_split': 10}
=== Decision Tree (Spotify, improved) ===
MAE: 0.2046
RMSE: 0.3992
R2: 0.8473


In [11]:
# Reference metrics of the earlier models, hard-coded here.
# NOTE(review): presumably copied from the lab 1-2 notebooks -- re-check
# these numbers if those notebooks are re-run.

# Mushrooms
knn_mush_acc = 1.000000
knn_mush_f1 = 1.000000
logreg_mush_acc = 0.999385
logreg_mush_f1 = 0.999384

# Spotify
knn_spot_mae = 0.357071
knn_spot_rmse = 0.560296
knn_spot_r2 = 0.685171

linreg_mae = 0.304748
linreg_rmse = 0.489485
linreg_r2 = 0.770496

ridge_mae = 0.304321
ridge_rmse = 0.489291
ridge_r2 = 0.770679

# One row per model: the reference models first, then both decision trees.
results_mush_dt = pd.DataFrame(
    [
        ("KNN", knn_mush_acc, knn_mush_f1),
        ("LogReg", logreg_mush_acc, logreg_mush_f1),
        ("DT_baseline", acc_dt, f1_dt),
        ("DT_improved", acc_dt_b, f1_dt_b),
    ],
    columns=["model", "accuracy", "f1_macro"],
)

print("=== Mushrooms: KNN / LogReg / Decision Tree ===")
print(results_mush_dt)

results_spot_dt = pd.DataFrame(
    [
        ("KNN", knn_spot_mae, knn_spot_rmse, knn_spot_r2),
        ("LinReg", linreg_mae, linreg_rmse, linreg_r2),
        ("Ridge", ridge_mae, ridge_rmse, ridge_r2),
        ("DT_baseline", mae_dt, rmse_dt, r2_dt),
        ("DT_improved", mae_dt_b, rmse_dt_b, r2_dt_b),
    ],
    columns=["model", "MAE", "RMSE", "R2"],
)

print("\n=== Spotify: KNN / Linear / Ridge / Decision Tree ===")
print(results_spot_dt)


=== Mushrooms: KNN / LogReg / Decision Tree ===
         model  accuracy  f1_macro
0          KNN  1.000000  1.000000
1       LogReg  0.999385  0.999384
2  DT_baseline  1.000000  1.000000
3  DT_improved  0.998769  0.998767

=== Spotify: KNN / Linear / Ridge / Decision Tree ===
         model       MAE      RMSE        R2
0          KNN  0.357071  0.560296  0.685171
1       LinReg  0.304748  0.489485  0.770496
2        Ridge  0.304321  0.489291  0.770679
3  DT_baseline  0.238983  0.463551  0.794172
4  DT_improved  0.204579  0.399242  0.847320


## 5. Краткие выводы

- **Классификация (грибы).** Решающее дерево достигает качества, сопоставимого с KNN и логистической регрессией: базовое дерево даёт 100% accuracy и F1-macro на тесте, так как классы хорошо разделимы по набору категориальных признаков после one-hot-кодирования (OHE).
  Настройка `max_depth`, `min_samples_leaf` и `min_samples_split` позволяет контролировать переобучение: слишком глубокие деревья могут выучивать шум, но в данном датасете это не критично из-за ярко выраженных закономерностей. Подобранное по сетке дерево (`max_depth=5`) оказалось на тесте чуть хуже базового (F1-macro 0.9988 против 1.0000), то есть ограничение глубины здесь выигрыша не даёт.

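- **Регрессия (Spotify).** Базовое дерево без ограничений склонно к переобучению (ошибка на train в этой работе отдельно не измерялась). Подбор глубины дерева и минимального числа объектов в листе улучшает обобщающую способность: на тесте MAE снижается с 0.2390 до 0.2046, а R² растёт с 0.7942 до 0.8473. Оба дерева на тесте превосходят KNN и линейные модели (LinReg/Ridge: R² ≈ 0.77), но одиночные деревья по-прежнему чувствительны к шуму и выбросам и менее стабильны, чем линейные модели и KNN.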

- **Роль гиперпараметров**  
  - `max_depth` управляет сложностью дерева: маленькие значения → недообучение, большие → переобучение
  - `min_samples_leaf` и `min_samples_split` сглаживают дерево и уменьшают вариативность, что важно для больших табличных датасетов

- **Сравнение с ЛР1–2**  
  - На грибах все модели (KNN, логистическая регрессия, решающее дерево) показывают почти идеальное качество; выбор алгоритма можно делать по интерпретируемости и скорости
  - На Spotify решающее дерево даёт наглядную, но менее стабильную модель; в следующих лабораторных (случайный лес, градиентный бустинг) ожидается улучшение качества за счёт ансамблей деревьев
