In [33]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import (
    StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder)
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import (
    f1_score, precision_score, recall_score, accuracy_score)
from sklearn.metrics import (
    mean_absolute_error, mean_absolute_percentage_error,
    root_mean_squared_error)

import re
from typing import Literal
from joblib import Parallel, delayed
from itertools import product

# Датасеты


In [17]:
classification_df = pd.read_csv("data/consumer_electronics_sales_data.csv")
classification_df

Unnamed: 0,ProductID,ProductCategory,ProductBrand,ProductPrice,CustomerAge,CustomerGender,PurchaseFrequency,CustomerSatisfaction,PurchaseIntent
0,5874,Smartphones,Other Brands,312.949668,18,0,2,1,0
1,5875,Smart Watches,Samsung,980.389404,35,1,7,2,1
2,5876,Tablets,Samsung,2606.718293,63,0,1,5,1
3,5877,Smartphones,Samsung,870.395450,63,1,10,3,1
4,5878,Tablets,Sony,1798.955875,57,0,17,3,0
...,...,...,...,...,...,...,...,...,...
8995,14869,Smart Watches,Samsung,1041.149163,36,1,16,4,0
8996,14870,Smartphones,Samsung,1485.694311,57,0,5,1,1
8997,14871,Headphones,Samsung,2887.369597,28,0,18,4,0
8998,14872,Tablets,HP,1490.453964,38,0,4,2,1


In [18]:
regression_df = pd.read_csv("data/laptop_price.csv", encoding='latin1')
regression_df

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
0,1,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69
1,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,898.94
2,3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,575.00
3,4,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,2537.45
4,5,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.60
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1298,1316,Lenovo,Yoga 500-14ISK,2 in 1 Convertible,14.0,IPS Panel Full HD / Touchscreen 1920x1080,Intel Core i7 6500U 2.5GHz,4GB,128GB SSD,Intel HD Graphics 520,Windows 10,1.8kg,638.00
1299,1317,Lenovo,Yoga 900-13ISK,2 in 1 Convertible,13.3,IPS Panel Quad HD+ / Touchscreen 3200x1800,Intel Core i7 6500U 2.5GHz,16GB,512GB SSD,Intel HD Graphics 520,Windows 10,1.3kg,1499.00
1300,1318,Lenovo,IdeaPad 100S-14IBR,Notebook,14.0,1366x768,Intel Celeron Dual Core N3050 1.6GHz,2GB,64GB Flash Storage,Intel HD Graphics,Windows 10,1.5kg,229.00
1301,1319,HP,15-AC110nv (i7-6500U/6GB/1TB/Radeon,Notebook,15.6,1366x768,Intel Core i7 6500U 2.5GHz,6GB,1TB HDD,AMD Radeon R5 M330,Windows 10,2.19kg,764.00


# Baseline

## Классификация

### Создание baseline

Используется Случайный Лес для задачи классификации

Разделение датасета на `train` и `test`

In [4]:
cX = classification_df.drop(columns=["ProductID", "PurchaseIntent"])
cy = classification_df["PurchaseIntent"]

cX_train, cX_test, cy_train, cy_test = train_test_split(
    cX, cy, test_size=0.2, random_state=42
)

Препроцессинг данных

In [5]:
cat_cols = ["ProductCategory", "ProductBrand"]
num_cols = ["ProductPrice",
            "CustomerAge",
            "PurchaseFrequency",
            "CustomerSatisfaction"]

preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
])

cX_train_prep = preprocessor.fit_transform(cX_train)
cX_test_prep = preprocessor.transform(cX_test)

Создание и обучение модели

In [6]:
model = RandomForestClassifier()
model.fit(cX_train_prep, cy_train)
cy_pred = model.predict(cX_test_prep)

Оценка качества модели

In [7]:
print(f"""
F1 Score:   {f1_score(cy_test, cy_pred)}
==============================
Accuracy:   {accuracy_score(cy_test, cy_pred)}
Precision:  {precision_score(cy_test, cy_pred)}
Recall:     {recall_score(cy_test, cy_pred)}
""")


F1 Score:   0.717491984324902
Accuracy:   0.5594444444444444
Precision:  0.5594444444444444
Recall:     1.0



### Улучшение baseline

Не смотря на то, что случайный лес
работает корректно без стандартизации признаков,
её использование может дать более точный результат.


In [8]:
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ('num', StandardScaler(), num_cols),
])

cX_train_scaled = preprocessor.fit_transform(cX_train)
cX_test_scaled = preprocessor.transform(cX_test)

Подбор гиперпараметров с помощью `GridSearch`

In [12]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())
])

params = {
    "classifier__n_estimators": [150, 200],
    "classifier__criterion": ['gini', 'entropy'],
    "classifier__max_depth": [3, 5, 10],
    "classifier__min_samples_split": [2, 5, 10],
    "classifier__min_samples_leaf": [1, 2, 4],
}


grid_search = GridSearchCV(
    pipeline,
    params,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=10,
    return_train_score=True,
)
grid_search.fit(cX_train, cy_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


0,1,2
,estimator,Pipeline(step...lassifier())])
,param_grid,"{'classifier__criterion': ['gini', 'entropy'], 'classifier__max_depth': [3, 5, ...], 'classifier__min_samples_leaf': [1, 2, ...], 'classifier__min_samples_split': [2, 5, ...], ...}"
,scoring,'f1'
,n_jobs,-1
,refit,True
,cv,5
,verbose,10
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,True

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_estimators,150
,criterion,'gini'
,max_depth,3
,min_samples_split,10
,min_samples_leaf,4
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [None]:
print("Лучшие параметры:", grid_search.best_params_)
print("Лучший F1:", grid_search.best_score_)

# Лучший F1: 0.7912488774013982

Лучшие параметры: {'classifier__criterion': 'gini', 'classifier__max_depth': 3, 'classifier__min_samples_leaf': 4, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 150}
Лучший F1: 0.7912488774013982


In [14]:
cy_pred = grid_search.best_estimator_.predict(cX_test)
print(f"""
F1 Score:   {f1_score(cy_test, cy_pred)}
==============================
Accuracy:   {accuracy_score(cy_test, cy_pred)}
Precision:  {precision_score(cy_test, cy_pred)}
Recall:     {recall_score(cy_test, cy_pred)}
""")


F1 Score:   0.7944
Accuracy:   0.7144444444444444
Precision:  0.6651038178164769
Recall:     0.9860973187686196



- F1 Score в обычном baseline: **0.717491984324902**
- F1 Score в улучшенном baseline: **0.7944**

За счёт препроцессинга данных и подбора гиперпараметров удалось повысить точность модели.

Улучшенный baseline для случайного леса лучше чем для предыдущих моделей, максимальный F1 Score которых: *0.7870485678704857*.

## Регрессия

### Создание baseline

Используется *Случайный лес* для задачи регрессии

Повторим преобразование колонок из первой лабораторной работы.

In [19]:
def extract_resolution(value):
    match = re.search(r'(\d{3,4})x(\d{3,4})', str(value))
    if match:
        return int(match.group(1)), int(match.group(2))
    return None, None

def parse_cpu(cpu):
    cpu = str(cpu)

    if 'Intel' in cpu:
        brand = 'Intel'
    elif 'AMD' in cpu:
        brand = 'AMD'
    elif 'Samsung' in cpu:
        brand = 'Samsung'
    else:
        brand = 'Other'

    if 'Core i3' in cpu:
        series = 'i3'
    elif 'Core i5' in cpu:
        series = 'i5'
    elif 'Core i7' in cpu:
        series = 'i7'
    elif 'Celeron' in cpu:
        series = 'Celeron'
    elif 'Pentium' in cpu:
        series = 'Pentium'
    elif 'Atom' in cpu:
        series = 'Atom'
    elif 'Xeon' in cpu:
        series = 'Xeon'
    elif 'M' in cpu and 'Core' in cpu:
        series = 'Core M'
    elif 'Ryzen' in cpu:
        series = 'Ryzen'
    elif 'A9-Series' in cpu:
        series = 'A9-Series'
    elif 'A6-Series' in cpu:
        series = 'A6-Series'
    elif 'E-Series' in cpu:
        series = 'E-Series'
    elif 'A12-Series' in cpu:
        series = 'A12-Series'
    elif 'A10-Series' in cpu:
        series = 'A10-Series'
    elif 'A8-Series' in cpu:
        series = 'A8-Series'
    elif 'FX' in cpu:
        series = 'FX'
    else:
        series = 'Other'

    freq_match = re.search(r'(\d+\.\d+)GHz', cpu)
    freq = float(freq_match.group(1)) if freq_match else None

    return pd.Series([brand, series, freq])

def parse_memory(memory_str):
    memory_str = memory_str.replace('TB', '000GB')

    parts = re.findall(r'(\d+)\s*GB\s*([A-Za-z\s]*)', memory_str)

    total_gb = 0
    types = []

    for size, mtype in parts:
        total_gb += int(size)
        mtype_clean = (
            mtype.strip()
            .replace('Storage', '')
            .replace('Flash', 'Flash')
            .replace('Hybrid', 'Hybrid')
            .replace('SSD', 'SSD')
            .replace('HDD', 'HDD')
        )
        if mtype_clean:
            types.append(mtype_clean.strip())

    main_type = types[0] if types else None

    return pd.Series([total_gb, main_type])

regression_df['Touchscreen'] = \
    regression_df['ScreenResolution'].str.contains('Touchscreen')
regression_df['IPS'] = \
    regression_df['ScreenResolution'].str.contains('IPS')
regression_df['RetinaDisplay'] = \
    regression_df['ScreenResolution'].str.contains('Retina')

regression_df['ResX'], regression_df['ResY'] = zip(
    *regression_df['ScreenResolution'].map(extract_resolution))
regression_df['TotalPixels'] = regression_df['ResX'] * regression_df['ResY']
regression_df.drop(columns=['ScreenResolution', 'ResX', 'ResY'], inplace=True)

regression_df[['CpuBrand', 'CpuSeries', 'CpuFreqGHz']] = \
    regression_df['Cpu'].apply(parse_cpu)
regression_df.drop(columns='Cpu', inplace=True)
regression_df['CpuFreqGHz'] = regression_df['CpuFreqGHz'].fillna(
    regression_df['CpuFreqGHz'].mean())

regression_df['Ram'] = regression_df['Ram'].str.replace('GB', '', regex=False).astype(int)

regression_df[['Memory_Storage', 'Memory_Type']] = \
    regression_df['Memory'].apply(parse_memory)
regression_df.drop(columns=['Memory'], inplace=True)

regression_df['Weight'] = \
    regression_df['Weight'].str.replace('kg', '', regex=False).astype(float)

regression_df['GpuBrand'] = regression_df['Gpu'].str.split().str[0]
regression_df.drop(columns=['Gpu'], inplace=True)

Разделение датасета на `train` и `test`

In [None]:
rX = regression_df.drop(columns=["laptop_ID", "Product", "Price_euros"])
ry = regression_df["Price_euros"]

rX_train, rX_test, ry_train, ry_test = train_test_split(
    rX, ry, test_size=0.2, random_state=42
)

Препроцессинг данных

In [22]:
cat_cols = ["Company", "TypeName", "OpSys", "CpuBrand",
            "CpuSeries", "Memory_Type", "GpuBrand"]
num_cols = ["Inches", "Ram", "Weight", "TotalPixels",
            "CpuFreqGHz", "Memory_Storage"]

preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
])

rX_train_prep = preprocessor.fit_transform(rX_train)
rX_test_prep = preprocessor.transform(rX_test)

Создание и обучение модели

In [23]:
model = RandomForestRegressor()
model.fit(rX_train_prep, ry_train)
ry_pred = model.predict(rX_test_prep)

Оценка качества модели

In [24]:
print(f"""
MAE:   {mean_absolute_error(ry_test, ry_pred)}
==============================
MAPE:   {mean_absolute_percentage_error(ry_test, ry_pred)}
RMSE:  {root_mean_squared_error(ry_test, ry_pred)}
""")


MAE:   272.38087138993944
MAPE:   0.2222978339489446
RMSE:  442.8175366116527



### Улучшение baseline

Используем стандартизацию данных для улучшения результата.

In [25]:
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ('num', MinMaxScaler(), num_cols),
])

rX_train_scaled = preprocessor.fit_transform(rX_train)
rX_test_scaled = preprocessor.transform(rX_test)

Подбор гиперпараметров с помощью `GridSearch`

In [29]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

params = {
    "regressor__n_estimators": [200, 250],
    "regressor__criterion": ['squared_error', 'absolute_error'],
    "regressor__max_depth": [10, 20, None],
    "regressor__min_samples_split": [2, 5],
    "regressor__min_samples_leaf": [1, 2],
}

grid_search = GridSearchCV(
    pipeline,
    params,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=-1,
    verbose=True,
)
grid_search.fit(rX_train, ry_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


0,1,2
,estimator,Pipeline(step...Regressor())])
,param_grid,"{'regressor__criterion': ['squared_error', 'absolute_error'], 'regressor__max_depth': [10, 20, ...], 'regressor__min_samples_leaf': [1, 2], 'regressor__min_samples_split': [2, 5], ...}"
,scoring,'neg_mean_absolute_error'
,n_jobs,-1
,refit,True
,cv,5
,verbose,True
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,n_estimators,200
,criterion,'absolute_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [30]:
print("Лучшие параметры:", grid_search.best_params_)
print("Лучший MAE:", -grid_search.best_score_)

# Лучший MAE: 179.9903906257131

Лучшие параметры: {'regressor__criterion': 'absolute_error', 'regressor__max_depth': None, 'regressor__min_samples_leaf': 1, 'regressor__min_samples_split': 2, 'regressor__n_estimators': 200}
Лучший MAE: 179.14952343346545


Оценка качества модели:

In [31]:
ry_pred = grid_search.best_estimator_.predict(rX_test)
print(f"""
MAE:   {mean_absolute_error(ry_test, ry_pred)}
==============================
MAPE:   {mean_absolute_percentage_error(ry_test, ry_pred)}
RMSE:  {root_mean_squared_error(ry_test, ry_pred)}
""")


MAE:   195.67773913043473
MAPE:   0.17439558670472302
RMSE:  307.62115824859376



- MAE в обычном baseline: **272.38087138993944**
- MAE в улучшенном baseline: **195.67773913043473**

За счёт препроцессинга данных и подбора гиперпараметров удалось повысить точность модели.

В задаче регрессии случайный лес проявил себя немного хуже, чем KNN, но лучше, чем линейная регрессия и дерево решений.

# Имплементация Случайного Леса

Случайный лес - это ансамбль из большого количества деревьев решений.

1. Для каждого дерева сначала случайным выбирается n строк датасета (возможно с копиями)
2. Далее для каждого дерева выбирается не весь набор признаков, а случайная его часть (например, `sqrt` или `log2` от общего количества)
3. Каждое дерево обучается на своих данных. Итоговое предсказание получается по следующему принципу:
    - для классификации - голосование
    - для регрессии - среднее по деревьям


Для реализации случайного леса будет использоваться написанная ранее собственная реализация дерева решений

In [34]:
class DecisionTree:
    EPS = 1e-15

    class Node:
        def __init__(self, *, feature=None, threshold=None,
                    left=None, right=None, value=None):
            self.feature = feature
            self.threshold = threshold
            self.left = left
            self.right = right
            self.value = value

        def is_leaf(self):
            return self.value is not None

    def __init__(self, criterion: Literal['gini', 'entropy', 'mae', 'mse'],
                 max_depth=None, min_samples_split=2, min_samples_leaf=1):
        self.criterion = criterion
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf

    def fit(self, X, y):
        if hasattr(X, "toarray"):
            X = X.toarray()
        self.tree_ = self.__build_tree(X, y)

    def predict(self, X, threshold=0.5):
        return np.array([self.__predict_one(x, self.tree_) for x in X])

    def __build_tree(self, X, y, depth=0):
        node = self.Node()

        if ((self.max_depth is not None and depth >= self.max_depth) or
            (len(y) < self.min_samples_split) or
            (len(np.unique(y)) == 1)):

            node.value = self.__leaf_value(y)
            return node

        feature, threshold = self.__best_split(X, y)
        if feature is None:
            node.value = self.__leaf_value(y)
            return node

        left = X[:, feature] <= threshold
        right = ~left

        node.feature = feature
        node.threshold = threshold
        node.left = self.__build_tree(X[left], y[left], depth + 1)
        node.right = self.__build_tree(X[right], y[right], depth + 1)
        return node


    def __best_split(self, X, y):
        rows, cols = X.shape
        best_feat, best_thresh = None, None
        best_impurity = float('inf')

        for feature in range(cols):
            values = np.unique(X[:, feature])
            for threshold in values:
                left = X[:, feature] <= threshold
                right = ~left
                if (left.sum() < self.min_samples_leaf or
                    right.sum() < self.min_samples_leaf):
                    continue
                y_left, y_right = y[left], y[right]
                impurity = (
                    (len(y_left) / rows) * self.__impurity(y_left) +
                    (len(y_right) / rows) * self.__impurity(y_right))
                if impurity < best_impurity:
                    best_impurity = impurity
                    best_feat = feature
                    best_thresh = threshold
        return best_feat, best_thresh

    def __leaf_value(self, y):
        if self.criterion in ['gini', 'entropy']:
            values, counts = np.unique(y, return_counts=True)
            return values[np.argmax(counts)]
        if self.criterion == 'mae':
            return np.median(y)
        return np.mean(y)

    def __predict_one(self, x, node):
        if hasattr(x, "toarray"):
            x = x.toarray().ravel()

        if node.is_leaf():
            return node.value
        if x[node.feature] <= node.threshold:
            return self.__predict_one(x, node.left)
        return self.__predict_one(x, node.right)

    def __impurity(self, y):
        match self.criterion:
            case 'gini':
                return self.__gini(y)
            case 'entropy':
                return self.__entropy(y)
            case 'mae':
                return self.__mae(y)
            case 'mse':
                return self.__mse(y)

    def __gini(self, y):
        _, counts = np.unique(y, return_counts=True)
        p = counts / counts.sum()
        return 1 - np.sum(p**2)

    def __entropy(self, y):
        _, counts = np.unique(y, return_counts=True)
        p = counts / counts.sum()
        return -np.sum(p * np.log2(p + self.EPS))

    def __mae(self, y):
        m = np.median(y)
        return np.mean(np.abs(y - m))

    def __mse(self, y):
        m = np.mean(y)
        return np.mean((y - m)**2)

In [36]:
class RandomForest:
    def __init__(self, criterion: Literal['gini', 'entropy', 'mae', 'mse'],
                 n_estimators=100, max_depth=None, min_samples_leaf=1,
                 min_samples_split=2, max_features='sqrt', n_jobs=10):
        self.n_estimators = n_estimators
        self.criterion = criterion
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.max_features = max_features
        self.n_jobs = n_jobs

    def fit(self, X, y):
        if hasattr(X, "toarray"):
            X = X.toarray()
        y = np.asarray(y)

        results = Parallel(n_jobs=self.n_jobs)(
            delayed(self._build_one)(X, y)
            for _ in range(self.n_estimators)
        )
        self.trees, self.features = zip(*results)

    def predict(self, X):
        if hasattr(X, "toarray"):
            X = X.toarray()
        all_preds = []
        for tree, feature in zip(self.trees, self.features):
            pred = tree.predict(X[:, feature])
            all_preds.append(pred.reshape(-1, 1))

        all_preds = np.hstack(all_preds)

        if self.criterion in ['gini', 'entropy']:
            preds = []
            for row in all_preds:
                values, counts = np.unique(row, return_counts=True)
                preds.append(values[np.argmax(counts)])
            return np.array(preds)

        return np.mean(all_preds, axis=1)

    def _build_one(self, X, y):
        rows, cols = X.shape

        idx = np.random.choice(rows, size=rows, replace=True)
        X_boot = X[idx]
        y_boot = y[idx]

        features = self.__select_features(cols)

        tree = DecisionTree(self.criterion, self.max_depth,
                            self.min_samples_split, self.min_samples_leaf)
        tree.fit(X_boot[:, features], y_boot)

        return tree, features

    def __select_features(self, n_features):
        if self.max_features == 'sqrt':
            k = int(np.sqrt(n_features))
        elif self.max_features == 'log2':
            k = int(np.log2(n_features))
        elif isinstance(self.max_features, int):
            k = self.max_features
        elif isinstance(self.max_features, float):
            k = int(self.max_features * n_features)
        else:
            k = n_features

        k = max(1, min(k, n_features))
        return np.random.choice(n_features, size=k, replace=False)


### Испытание `RandomForest` в задаче классификации

In [37]:
my_classifier = RandomForest(criterion='gini')
my_classifier.fit(cX_train_prep, cy_train)
cy_pred = my_classifier.predict(cX_test_prep)
f1 = f1_score(cy_test, cy_pred)
print(f"F1 score for self-implemented RandomForest: {f1}")

F1 score for self-implemented RandomForest: 0.717491984324902


Результат получился почти такой же, что и в обычном baseline.

Воспользуемся подбором гиперпараметров, как в улучшенном baseline

In [39]:
def forest_grid_search(param_grid, X, y, scorer, cv=5):
    kf = KFold(n_splits=cv, shuffle=True, random_state=42)

    best_score = None
    best_params = None

    keys = list(param_grid.keys())
    for values in product(*param_grid.values()):
        params = dict(zip(keys, values))
        fold_scores = []
        for train_idx, val_idx in kf.split(X):
            X_train = X.iloc[train_idx] if hasattr(X, "iloc") else X[train_idx]
            X_val   = X.iloc[val_idx]   if hasattr(X, "iloc") else X[val_idx]
            y_train = y.iloc[train_idx] if hasattr(y, "iloc") else y[train_idx]
            y_val   = y.iloc[val_idx]   if hasattr(y, "iloc") else y[val_idx]

            model = RandomForest(**params)
            model.fit(X_train, y_train)
            preds = model.predict(X_val)

            fold_scores.append(scorer(y_val, preds))

        mean_score = np.mean(fold_scores)
        if (best_score is None) or (mean_score > best_score):
            best_score = mean_score
            best_params = params

    return {
        "best_params": best_params,
        "best_score": best_score,
    }

In [40]:
params = {
    'criterion': ["gini", "entropy"],
    'n_estimators': [150],
    'max_depth': [3, 5],
    'min_samples_split': [5, 10],
    'min_samples_leaf': [2, 4],
}

forest_grid_search(params, cX_train_scaled, cy_train, f1_score)

{'best_params': {'criterion': 'gini',
  'n_estimators': 150,
  'max_depth': 5,
  'min_samples_split': 10,
  'min_samples_leaf': 2},
 'best_score': np.float64(0.7384516381999594)}

In [None]:
my_classifier = RandomForest('gini', n_estimators=150,
                             max_depth=5,
                             min_samples_leaf=2, min_samples_split=10)
my_classifier.fit(cX_train_scaled, cy_train)
cy_pred = my_classifier.predict(cX_test_scaled)
print(f"""
F1 Score:   {f1_score(cy_test, cy_pred)}
==============================
Accuracy:   {accuracy_score(cy_test, cy_pred)}
Precision:  {precision_score(cy_test, cy_pred)}
Recall:     {recall_score(cy_test, cy_pred)}
""")


F1 Score:   0.7190289182434845
Accuracy:   0.5627777777777778
Precision:  0.5613154960981048
Recall:     1.0



Результат получился хуже, чем в улучшенном baseline

### Испытание `RandomForest` в задаче регрессии

In [46]:
my_regressor = RandomForest('mae')
my_regressor.fit(rX_train_prep, ry_train)
ry_pred = my_regressor.predict(rX_test_prep)
mae = mean_absolute_error(ry_test, ry_pred)
print(f"MAE for self-implemented RandomForest: {mae}")

MAE for self-implemented RandomForest: 406.33191163682864


Подбор гиперпараметров

In [48]:
params = {
    'criterion': ["mae"],
    'n_estimators': [200],
    'max_depth': [3, 10, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 3],
}

forest_grid_search(params, rX_train_scaled, ry_train, lambda t, p: -mean_absolute_error(t, p))

{'best_params': {'criterion': 'mae',
  'n_estimators': 200,
  'max_depth': 10,
  'min_samples_split': 2,
  'min_samples_leaf': 1},
 'best_score': np.float64(-299.72734593181406)}

In [50]:
my_regressor = RandomForest('mae', n_estimators=200,
                            max_depth=10,
                            min_samples_leaf=1, min_samples_split=2)
my_regressor.fit(rX_train_scaled, ry_train)
ry_pred = my_regressor.predict(rX_test_scaled)
print(f"""
MAE:   {mean_absolute_error(ry_test, ry_pred)}
==============================
MAPE:   {mean_absolute_percentage_error(ry_test, ry_pred)}
RMSE:  {root_mean_squared_error(ry_test, ry_pred)}
""")


MAE:   315.471780370844
MAPE:   0.3073578024522283
RMSE:  494.3543133286861



Результат получился хуже, чем в улучшенном baseline.

# Вывод

В ходе выполнения лабораторной работы был исследован алгоритм случайного леса.
Для задач классификации и регрессии были созданы обычный и улучшенный baseline.

Случайный лес показал лучший результат в задаче классификации. В задаче регрессии немного отстаёт от KNN, хотя обогнало линейную регрессию.

Собственная реализация оказалась хуже библиотечной в точности. Вероятно, если расширить диапазон параметров при подборе, то можно достичь лучших результатов, однако это займёт чрезмерно много времени.