In [1]:
from datetime import datetime as time

import numpy as np
import pandas as pd

from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

In [2]:
%reload_ext autoreload
%autoreload 2

In [3]:
data = pd.read_csv("../assets/auto-mpg.csv") # mpg - расход топлива (miles per gallon)

data = data.drop(["car name", "origin"], axis=1)

data["horsepower"] = data["horsepower"].replace({"?": np.nan}).astype(np.float32)
mean_hp = np.nanmean(data["horsepower"])
data["horsepower"] = data["horsepower"].fillna(mean_hp).astype(np.uint16)

data.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year
0,18.0,8,307.0,130,3504,12.0,70
1,15.0,8,350.0,165,3693,11.5,70
2,18.0,8,318.0,150,3436,11.0,70
3,16.0,8,304.0,150,3433,12.0,70
4,17.0,8,302.0,140,3449,10.5,70


In [4]:
y = data[["mpg"]].to_numpy().ravel()
X = data.drop(["mpg"], axis=1).to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train.shape, X_test.shape

((318, 6), (80, 6))

## custom

In [5]:
from bagging import BaggingRegressor as CustomBaggingRegressor

In [6]:
custom_model = CustomBaggingRegressor(n_estimators=20)

In [7]:
t1_custom = time.now()

custom_model.fit(X_train, y_train)

t2_custom = time.now()

In [8]:
y_pred = custom_model.predict(X_test)
r2_score(y_pred, y_test)

0.8733422720002222

## sklearn

In [9]:
model = BaggingRegressor(n_estimators=20)

In [10]:
t1_sklearn = time.now()
model.fit(X_train, y_train)
t2_sklearn = time.now()

In [11]:
y_pred = model.predict(X_test)
r2_score(y_pred, y_test)

0.8764154795518277

## Comparison

In [12]:
print(f"Время, затраченное кастомным алгоритмом: {(t2_custom - t1_custom).microseconds / 1000:.3f} мс")
print(f"Время, затраченное Sklearn алгоритмом: {(t2_sklearn - t1_sklearn).microseconds / 1000:.3f} мс")

Время, затраченное кастомным алгоритмом: 324.568 мс
Время, затраченное Sklearn алгоритмом: 97.897 мс


In [93]:
def cross_validate(model, X, y, n_folds=5):
    scores = []
    for n in range(n_folds):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
        model.fit(X_train, y_train)
        scores += [r2_score(model.predict(X_test), y_test)]
    return scores

In [96]:
n_folds = 10

custom_scores = cross_validate(custom_model, X, y, n_folds=n_folds)
sklearn_scores = cross_validate(model, X, y, n_folds=n_folds)

In [97]:
print(f"Среднее R2-score для {n_folds} выборок у кастомного алгоритма: {np.mean(custom_scores):.3f}")
print(f"Среднее R2-score для {n_folds} выборок у Sklearn алгоритма: {np.mean(sklearn_scores):.3f}")

Среднее R2-score для 10 выборок у кастомного алгоритма: 0.938
Среднее R2-score для 10 выборок у Sklearn алгоритма: 0.834
