In [1]:
import pandas as pd
import numpy as np
from time import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.ensemble import BaggingRegressor

In [2]:
df = pd.read_csv('../data/Clean_Dataset.csv')

In [3]:
df = df.drop('Unnamed: 0', axis=1)
df = df.drop('flight', axis=1)

df['class'] = df['class'].apply(lambda x: 1 if x == 'Business' else 0)

In [4]:
df.stops = pd.factorize(df.stops)[0]

In [5]:
df = df.join(pd.get_dummies(df.airline, prefix='airline')).drop('airline', axis=1)
df = df.join(pd.get_dummies(df.source_city, prefix='source_city')).drop('source_city', axis=1)
df = df.join(pd.get_dummies(df.destination_city, prefix='destination_city')).drop('destination_city', axis=1)
df = df.join(pd.get_dummies(df.arrival_time, prefix='arrival_time')).drop('arrival_time', axis=1)
df = df.join(pd.get_dummies(df.departure_time, prefix='departure_time')).drop('departure_time', axis=1)

In [6]:
X = df.drop(['price'], axis=1).to_numpy()
y = df['price'].to_numpy()

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, y_train.shape

((240122, 34), (240122,))

In [8]:
from Regressor import BaggingRegressor as BR

model1 = BR(n_estimators=20)

start1 = time()
model1.fit(X_train, y_train)
end1 = time()

In [16]:
y_pred = model1.predict(X_test)
print(f'{r2_score(y_pred, y_test):.4f}')

0.9839


In [10]:
model2 = BaggingRegressor(n_estimators=20)

start2 = time()
model2.fit(X_train, y_train)
end2 = time()

In [18]:
y_pred = model2.predict(X_test)
print(f'{r2_score(y_pred, y_test):.4f}')

0.9944


In [12]:
print(f"Время обучения кастомного алгоритма: {(end1 - start1):.2f} с")
print(f"Время обучения библиотечного алгоритма: {(end2 - start2):.2f} с")

Время обучения кастомного алгоритма: 46.91 с
Время обучения библиотечного алгоритма: 37.40 с


In [13]:
def cross_validate(model, X, y, n_folds=5):
    scores = []
    for n in range(n_folds):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
        model.fit(X_train, y_train)
        scores += [r2_score(model.predict(X_test), y_test)]
    return scores

In [14]:
custom_scores = cross_validate(model1, X, y, n_folds=10)
sklearn_scores = cross_validate(model2, X, y, n_folds=10)

In [15]:
print(f"Средний R2 на {10} выборках у кастомного алгоритма: {np.mean(custom_scores):.3f}")
print(f"Средний R2 на {10} выборках у библиотечного алгоритма: {np.mean(sklearn_scores):.3f}")

Средний R2 на 10 выборках у кастомного алгоритма: 0.995
Средний R2 на 10 выборках у библиотечного алгоритма: 0.985
