# Лабораторная работа №4. Случайный лес (Random Forest)

используются те же датасеты:

- **Классификация:** съедобность грибов
- **Регрессия:** популярность треков

Цели лабораторной:

1. Построить базовые модели случайного леса для задач классификации и регрессии
2. Оценить качество по выбранным метрикам и сравнить с KNN, логистической/линейной регрессией и решающим деревом
3. Улучшить бейзлайн с помощью подбора гиперпараметров (число деревьев, глубина, min_samples_leaf и др.)



In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    mean_absolute_error, mean_squared_error, r2_score,
    classification_report
)

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)


In [2]:
mushroom_path = "../data/mushroom-data.csv"
spotify_path = "../data/modified-spotify-dataset.csv"

mushroom_df = pd.read_csv(mushroom_path)
spotify_df = pd.read_csv(spotify_path)

print("Mushrooms shape:", mushroom_df.shape)
print("Spotify shape:", spotify_df.shape)

mushroom_df.head(), spotify_df.head()

Mushrooms shape: (8124, 23)
Spotify shape: (6987, 159)


(  Mushroom_quality cap_shape cap_surface cap_color bruises odor  \
 0                p         x           s         n       t    p   
 1                e         x           s         y       t    a   
 2                e         b           s         w       t    l   
 3                p         x           y         w       t    p   
 4                e         x           s         g       f    n   
 
   gill_attachment gill_spacing gill_size gill_color  ...  \
 0               f            c         n          k  ...   
 1               f            c         b          k  ...   
 2               f            c         b          n  ...   
 3               f            c         n          n  ...   
 4               f            w         b          k  ...   
 
   stalk_surface_below_ring stalk_color_above_ring stalk_color_below_ring  \
 0                        s                      w                      w   
 1                        s                      w                  

In [3]:
target_col_mush = "Mushroom_quality" 
X_mush = mushroom_df.drop(columns=[target_col_mush])
y_mush = mushroom_df[target_col_mush]

X_mush_train, X_mush_test, y_mush_train, y_mush_test = train_test_split(
    X_mush, y_mush, test_size=0.2, random_state=RANDOM_STATE, stratify=y_mush
)

print("Train mushrooms:", X_mush_train.shape, "Test:", X_mush_test.shape)


Train mushrooms: (6499, 22) Test: (1625, 22)


In [4]:
target_col_spot = "popularity"

cols_to_drop = [
    "track_id", "track_name", "artist_name", "album_name", "popularity_level"  
]
cols_to_drop = [c for c in cols_to_drop if c in spotify_df.columns]

features_spot_df = spotify_df.drop(columns=[target_col_spot] + cols_to_drop)
y_spot = spotify_df[target_col_spot]

numeric_features_spot = features_spot_df.select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical_features_spot = features_spot_df.select_dtypes(include=["object", "category", "bool"]).columns.tolist()

X_spot_train, X_spot_test, y_spot_train, y_spot_test = train_test_split(
    features_spot_df, y_spot, test_size=0.2, random_state=RANDOM_STATE
)

print("Numeric features (Spotify):", len(numeric_features_spot))
print("Categorical features (Spotify):", len(categorical_features_spot))
print("Train Spotify:", X_spot_train.shape, "Test:", X_spot_test.shape)


Numeric features (Spotify): 156
Categorical features (Spotify): 1
Train Spotify: (5589, 157) Test: (1398, 157)


In [5]:
mush_preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), X_mush_train.columns.tolist())
    ]
)

rf_clf_baseline = Pipeline(
    steps=[
        ("preprocess", mush_preprocess),
        ("model", RandomForestClassifier(
            n_estimators=100,
            random_state=RANDOM_STATE,
            n_jobs=-1
        ))
    ]
)

rf_clf_baseline.fit(X_mush_train, y_mush_train)
y_mush_pred_rf = rf_clf_baseline.predict(X_mush_test)

acc_rf = accuracy_score(y_mush_test, y_mush_pred_rf)
prec_rf = precision_score(y_mush_test, y_mush_pred_rf, average="macro")
rec_rf = recall_score(y_mush_test, y_mush_pred_rf, average="macro")
f1_rf = f1_score(y_mush_test, y_mush_pred_rf, average="macro")

print("=== Random Forest (Mushrooms, baseline) ===")
print(f"Accuracy: {acc_rf:.4f}")
print(f"Precision (macro): {prec_rf:.4f}")
print(f"Recall (macro): {rec_rf:.4f}")
print(f"F1-macro: {f1_rf:.4f}")
print("\nClassification report:")
print(classification_report(y_mush_test, y_mush_pred_rf))


=== Random Forest (Mushrooms, baseline) ===
Accuracy: 1.0000
Precision (macro): 1.0000
Recall (macro): 1.0000
F1-macro: 1.0000

Classification report:
              precision    recall  f1-score   support

           e       1.00      1.00      1.00       842
           p       1.00      1.00      1.00       783

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625



In [7]:
from sklearn.metrics import root_mean_squared_error


spot_preprocess_rf = ColumnTransformer(
    transformers=[
        ("num", "passthrough", numeric_features_spot),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features_spot),
    ]
)

rf_reg_baseline = Pipeline(
    steps=[
        ("preprocess", spot_preprocess_rf),
        ("model", RandomForestRegressor(
            n_estimators=100,
            random_state=RANDOM_STATE,
            n_jobs=-1
        ))
    ]
)

rf_reg_baseline.fit(X_spot_train, y_spot_train)
y_spot_pred_rf = rf_reg_baseline.predict(X_spot_test)

mae_rf = mean_absolute_error(y_spot_test, y_spot_pred_rf)
rmse_rf = root_mean_squared_error(y_spot_test, y_spot_pred_rf)
r2_rf = r2_score(y_spot_test, y_spot_pred_rf)

print("=== Random Forest (Spotify, baseline) ===")
print(f"MAE: {mae_rf:.4f}")
print(f"RMSE: {rmse_rf:.4f}")
print(f"R2: {r2_rf:.4f}")


=== Random Forest (Spotify, baseline) ===
MAE: 0.1790
RMSE: 0.3155
R2: 0.9046


In [8]:
rf_clf_pipeline = Pipeline(
    steps=[
        ("preprocess", mush_preprocess),
        ("model", RandomForestClassifier(
            random_state=RANDOM_STATE,
            n_jobs=-1
        ))
    ]
)

param_grid_rf_clf = {
    "model__n_estimators": [100, 200],
    "model__max_depth": [None, 5, 10],
    "model__min_samples_leaf": [1, 2, 5],
    "model__min_samples_split": [2, 5, 10],
    "model__max_features": ["sqrt", "log2"]
}

grid_rf_clf = GridSearchCV(
    rf_clf_pipeline,
    param_grid=param_grid_rf_clf,
    cv=3,
    scoring="f1_macro",
    n_jobs=-1
)

grid_rf_clf.fit(X_mush_train, y_mush_train)
print("Лучшие параметры (RF Mushrooms):", grid_rf_clf.best_params_)

best_rf_clf = grid_rf_clf.best_estimator_
y_mush_pred_rf_best = best_rf_clf.predict(X_mush_test)

acc_rf_b = accuracy_score(y_mush_test, y_mush_pred_rf_best)
prec_rf_b = precision_score(y_mush_test, y_mush_pred_rf_best, average="macro")
rec_rf_b = recall_score(y_mush_test, y_mush_pred_rf_best, average="macro")
f1_rf_b = f1_score(y_mush_test, y_mush_pred_rf_best, average="macro")

print("=== Random Forest (Mushrooms, improved) ===")
print(f"Accuracy: {acc_rf_b:.4f}")
print(f"Precision (macro): {prec_rf_b:.4f}")
print(f"Recall (macro): {rec_rf_b:.4f}")
print(f"F1-macro: {f1_rf_b:.4f}")


Лучшие параметры (RF Mushrooms): {'model__max_depth': None, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 100}
=== Random Forest (Mushrooms, improved) ===
Accuracy: 1.0000
Precision (macro): 1.0000
Recall (macro): 1.0000
F1-macro: 1.0000


In [10]:
rf_reg_pipeline = Pipeline(
    steps=[
        ("preprocess", spot_preprocess_rf),
        ("model", RandomForestRegressor(
            random_state=RANDOM_STATE,
            n_jobs=-1
        ))
    ]
)

param_grid_rf_reg = {
    "model__n_estimators": [100, 200],
    "model__max_depth": [None, 10, 20],
    "model__min_samples_leaf": [1, 2, 5],
    "model__min_samples_split": [2, 5, 10],
    "model__max_features": ["sqrt", "log2"]
}

# X_spot_train_sub, _, y_spot_train_sub, _ = train_test_split(
#     X_spot_train, y_spot_train, train_size=20000, random_state=RANDOM_STATE
# )
# grid_rf_reg.fit(X_spot_train_sub, y_spot_train_sub)

grid_rf_reg = GridSearchCV(
    rf_reg_pipeline,
    param_grid=param_grid_rf_reg,
    cv=3,
    scoring="neg_mean_absolute_error",
    n_jobs=-1
)

grid_rf_reg.fit(X_spot_train, y_spot_train)
print("Лучшие параметры (RF Spotify):", grid_rf_reg.best_params_)

best_rf_reg = grid_rf_reg.best_estimator_
y_spot_pred_rf_best = best_rf_reg.predict(X_spot_test)

mae_rf_b = mean_absolute_error(y_spot_test, y_spot_pred_rf_best)
rmse_rf_b = root_mean_squared_error(y_spot_test, y_spot_pred_rf_best)
r2_rf_b = r2_score(y_spot_test, y_spot_pred_rf_best)

print("=== Random Forest (Spotify, improved) ===")
print(f"MAE: {mae_rf_b:.4f}")
print(f"RMSE: {rmse_rf_b:.4f}")
print(f"R2: {r2_rf_b:.4f}")


Лучшие параметры (RF Spotify): {'model__max_depth': None, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 100}
=== Random Forest (Spotify, improved) ===
MAE: 0.2974
RMSE: 0.4272
R2: 0.8252


In [15]:
# cравнение по грибам

knn_mush_acc = 1.000000
knn_mush_f1 = 1.000000

logreg_mush_acc = 0.999385
logreg_mush_f1 = 0.999384

dt_mush_acc = 1.0000
dt_mush_f1 = 1.0000
dt_mush_acc_b = 0.9988
dt_mush_f1_b = 0.9988

results_mush_rf = pd.DataFrame({
    "model": ["KNN", "LogReg", "DT_baseline", "DT_improved", "RF_baseline", "RF_improved"],
    "accuracy": [knn_mush_acc, logreg_mush_acc, dt_mush_acc, dt_mush_acc_b, acc_rf, acc_rf_b],
    "f1_macro": [knn_mush_f1, logreg_mush_f1, dt_mush_f1, dt_mush_f1_b, f1_rf, f1_rf_b]
})

print("=== Mushrooms: KNN / LogReg / DT / RF ===")
print(results_mush_rf)


=== Mushrooms: KNN / LogReg / DT / RF ===
         model  accuracy  f1_macro
0          KNN  1.000000  1.000000
1       LogReg  0.999385  0.999384
2  DT_baseline  1.000000  1.000000
3  DT_improved  0.998800  0.998800
4  RF_baseline  1.000000  1.000000
5  RF_improved  1.000000  1.000000


In [13]:
# cравнение по spotify 

knn_spot_mae = 0.357071
knn_spot_rmse = 0.560296
knn_spot_r2 = 0.685171

linreg_mae = 0.304748
linreg_rmse = 0.489485
linreg_r2 = 0.770496

ridge_mae = 0.304321
ridge_rmse = 0.489291
ridge_r2 = 0.770679

dt_mae = 0.2390
dt_rmse = 0.4636
dt_r2 = 0.7942

dt_mae_b = 0.2046
dt_rmse_b =  0.3992
dt_r2_b = 0.8473

results_spot_rf = pd.DataFrame({
    "model": ["KNN", "LinReg", "Ridge", "DT_baseline", "DT_improved", "RF_baseline", "RF_improved"],
    "MAE": [knn_spot_mae, linreg_mae, ridge_mae, dt_mae, dt_mae_b, mae_rf, mae_rf_b],
    "RMSE": [knn_spot_rmse, linreg_rmse, ridge_rmse, dt_rmse, dt_rmse_b, rmse_rf, rmse_rf_b],
    "R2": [knn_spot_r2, linreg_r2, ridge_r2, dt_r2, dt_r2_b, r2_rf, r2_rf_b]
})

print("=== Spotify: KNN / Linear / Ridge / DT / RF ===")
print(results_spot_rf)


=== Spotify: KNN / Linear / Ridge / DT / RF ===
         model       MAE      RMSE        R2
0          KNN  0.357071  0.560296  0.685171
1       LinReg  0.304748  0.489485  0.770496
2        Ridge  0.304321  0.489291  0.770679
3  DT_baseline  0.239000  0.463600  0.794200
4  DT_improved  0.204600  0.399200  0.847300
5  RF_baseline  0.178969  0.315531  0.904634
6  RF_improved  0.297365  0.427198  0.825189


## 5. Краткие выводы 

- Для задачи классификации грибов случайный лес показывает качество, сопоставимое с KNN, логистической регрессией и решающим деревом, часто достигая практически идеальной точности и F1‑меры
- Для задачи регрессии (Spotify) случайный лес улучшает качество по сравнению с одиночным деревом, лучше улавливая нелинейные зависимости между аудиопризнаками и популярностью треков 
- Подбор гиперпараметров (`n_estimators`, `max_depth`, `min_samples_leaf`, `max_features`) позволяет уменьшить переобучение и добиться баланса между качеством и сложностью модели
- В сравнении с моделями из ЛР1–3 случайный лес часто даёт одно из лучших соотношений качество/устойчивость, но требует больше ресурсов и менее прозрачен, чем линейные модели и одиночные деревья
