## Импорт библиотек

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

from sklearn.linear_model import RidgeCV
from sklearn.svm import LinearSVR, LinearSVC

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, GradientBoostingRegressor, StackingRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier, StackingClassifier, RandomForestClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# from pycaret.regression import *

from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, RocCurveDisplay

import warnings
warnings.filterwarnings("ignore")

## Применение древовидных моделей для решения задачи регрессии

#### Чтение датасета

In [2]:
regression_df = pd.read_csv("../data/preprocessed_moldova_cars_dataset.csv")

In [3]:
del regression_df["Unnamed: 0"]

In [4]:
regression_df.head(2)

Unnamed: 0,make,model,year,style,distance,engine_capacity,fuel_type,transmission,price
0,79,570,2011.0,4,195000.0,1800.0,2,0,7750.0
1,66,383,2014.0,11,135000.0,1500.0,0,1,8550.0


#### Разбиение на обучающую и тестовую выборки

In [5]:
y = regression_df.loc[:, "price"]
X = regression_df.drop("price", axis=1)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

#### Функция для оценки качества обученной модели

In [7]:
def eval_model_quality(y_test: np.ndarray, y_pred: np.ndarray) -> None:
    print(f"[+] MAE: {mean_absolute_error(y_test, y_pred)}")
    print(f"[+] MSE: {mean_squared_error(y_test, y_pred)}")
    print(f"[+] RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}")
    print(f"[+] MAPE: {mean_absolute_percentage_error(y_test, y_pred)}")

#### Baseline-модель для оценки качества последующих моделей

In [8]:
eval_model_quality(y_test, [np.mean(y_test)] * len(y_test))

[+] MAE: 6541.619242021724
[+] MSE: 136389735.67596138
[+] RMSE: 11678.601614746578
[+] MAPE: 1.9125926316890582


In [9]:
eval_model_quality(y_test, [np.median(y_test)] * len(y_test))

[+] MAE: 5981.292503567117
[+] MSE: 144760164.59686095
[+] RMSE: 12031.631834329912
[+] MAPE: 1.270316490087161


#### Обучение модели `DecisionTreeRegressor`

In [10]:
dtr = DecisionTreeRegressor(max_depth=3)
dtr.fit(X_train, y_train)

y_pred = dtr.predict(X_test)

eval_model_quality(y_test, y_pred)
print(f"[+] R^2-score: {dtr.score(X_test, y_test)}")

[+] MAE: 4047.084412245656
[+] MSE: 82177193.11690244
[+] RMSE: 9065.163711533423
[+] MAPE: 0.8400826425500367
[+] R^2-score: 0.3974825692738245


#### Обучение модели `BaggingRegressor`

In [11]:
br = BaggingRegressor(n_estimators=1)
br.fit(X_train, y_train)

y_pred = br.predict(X_test)

eval_model_quality(y_test, y_pred)
print(f"[+] R^2-score: {br.score(X_test, y_test)}")

[+] MAE: 2832.48934046234
[+] MSE: 75774535.8364077
[+] RMSE: 8704.85702561551
[+] MAPE: 0.6849416821122677
[+] R^2-score: 0.44442640451745574


#### Обучение модели `GradientBoostingRegressor`

In [12]:
gbr = GradientBoostingRegressor(n_estimators=16)
gbr.fit(X_train, y_train)

y_pred = gbr.predict(X_test)

eval_model_quality(y_test, y_pred)
print(f"[+] R^2-score: {gbr.score(X_test, y_test)}")

[+] MAE: 3558.418921983685
[+] MSE: 76883997.5752484
[+] RMSE: 8768.352044440757
[+] MAPE: 0.9145090016212851
[+] R^2-score: 0.43629190866744105


#### Обучение модели `StackingRegressor`

In [13]:
estimators = [
    ("lr", RidgeCV()),
    ("svr", LinearSVR(random_state=42)),
]

sr = StackingRegressor(estimators=estimators)
sr.fit(X_train, y_train)

y_pred = sr.predict(X_test)

eval_model_quality(y_test, y_pred)
print(f"[+] R^2-score: {sr.score(X_test, y_test)}")

[+] MAE: 6527.716704589964
[+] MSE: 134412231.27498436
[+] RMSE: 11593.628908800918
[+] MAPE: 1.9206758516876283
[+] R^2-score: 0.014498923919577189


#### Обучение модели `CatBoostRegressor`

In [14]:
cbr = CatBoostRegressor(cat_features=["make", "model", "style", "fuel_type", "transmission"], verbose=False)
cbr.fit(X_train, y_train)

y_pred = cbr.predict(X_test)

eval_model_quality(y_test, y_pred)
print(f"[+] R^2-score: {cbr.score(X_test, y_test)}")

[+] MAE: 2710.8420199190964
[+] MSE: 109651184.95357603
[+] RMSE: 10471.44617297802
[+] MAPE: 0.6019824433761582
[+] R^2-score: 0.19604518323806686


#### Обучение модели `XGBRegressor`

In [15]:
xgbr = XGBRegressor()
xgbr.fit(X_train, y_train)

y_pred = xgbr.predict(X_test)

eval_model_quality(y_test, y_pred)
print(f"[+] R^2-score: {xgbr.score(X_test, y_test)}")

[+] MAE: 2891.250059406447
[+] MSE: 5024579243.086061
[+] RMSE: 70884.26654121534
[+] MAPE: 0.7477005951929298
[+] R^2-score: -35.8398634852083


#### Обучение модели `LGBMRegressor`

In [16]:
lgbmr = LGBMRegressor()
lgbmr.fit(X_train, y_train)

y_pred = lgbmr.predict(X_test)

eval_model_quality(y_test, y_pred)
print(f"[+] R^2-score: {lgbmr.score(X_test, y_test)}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001530 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 861
[LightGBM] [Info] Number of data points in the train set: 27330, number of used features: 8
[LightGBM] [Info] Start training from score 9654.576948
[+] MAE: 3695.6841414733467
[+] MSE: 354836192.32396454
[+] RMSE: 18837.09617547154
[+] MAPE: 0.8294080236323355
[+] R^2-score: -1.6016341373884213


## Применение древовидных моделей для решения задачи классификации

#### Чтение датасета

In [17]:
classification_df = pd.read_csv("../data/preprocessed_airlines_dataset.csv")

In [30]:
del classification_df["Unnamed: 0"]

In [32]:
classification_df.head(2)

Unnamed: 0,airline,flight,airport_from,airport_to,day_of_week,time,length,delay
0,16,3036,135,54,4.0,1195,131.0,0
1,13,315,79,207,7.0,707,145.0,0


#### Разбиение на обучающую и тестовую выборки

In [33]:
y = classification_df.loc[:, "delay"]
X = classification_df.drop("delay", axis=1)

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

#### Функция для оценки качества обученной модели

In [20]:
def eval_model_quality(y_test: np.ndarray, y_pred: np.ndarray) -> None:
    print("[+] Accuracy:", accuracy_score(y_test, y_pred))
    print("[+] Confusion matrix:", *confusion_matrix(y_test, y_pred))
    print("[+] Classification report:", classification_report(y_test, y_pred))

#### Baseline-модель для оценки качества последующих моделей

In [21]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)

lr.score(X_test, y_test)

0.7485477261973079

#### Обучение модели `DecisionTreeClassifier`

In [22]:
dtc = DecisionTreeClassifier(max_depth=3)
dtc.fit(X_train, y_train)

y_pred = dtc.predict(X_test)

eval_model_quality(y_test, y_pred)

[+] Accuracy: 0.766297180021545
[+] Confusion matrix: [71219  3136] [20077  4895]
[+] Classification report:               precision    recall  f1-score   support

           0       0.78      0.96      0.86     74355
           1       0.61      0.20      0.30     24972

    accuracy                           0.77     99327
   macro avg       0.69      0.58      0.58     99327
weighted avg       0.74      0.77      0.72     99327



#### Обучение модели `BaggingClassifier`

In [23]:
bc = BaggingClassifier(n_estimators=1)
bc.fit(X_train, y_train)

y_pred = bc.predict(X_test)

eval_model_quality(y_test, y_pred)

[+] Accuracy: 0.6768149647125152
[+] Confusion matrix: [57803 16552] [15549  9423]
[+] Classification report:               precision    recall  f1-score   support

           0       0.79      0.78      0.78     74355
           1       0.36      0.38      0.37     24972

    accuracy                           0.68     99327
   macro avg       0.58      0.58      0.58     99327
weighted avg       0.68      0.68      0.68     99327



#### Обучение модели `GradientBoostingClassifier`

In [24]:
gbc = GradientBoostingClassifier(n_estimators=16)
gbc.fit(X_train, y_train)

y_pred = gbc.predict(X_test)

eval_model_quality(y_test, y_pred)

[+] Accuracy: 0.7688342545330071
[+] Confusion matrix: [72812  1543] [21418  3554]
[+] Classification report:               precision    recall  f1-score   support

           0       0.77      0.98      0.86     74355
           1       0.70      0.14      0.24     24972

    accuracy                           0.77     99327
   macro avg       0.73      0.56      0.55     99327
weighted avg       0.75      0.77      0.71     99327



#### Обучение модели `StackingClassifier`

In [25]:
estimators = [
    ("rf", RandomForestClassifier()),
]

sc = StackingClassifier(estimators=estimators)
sc.fit(X_train, y_train)

y_pred = sc.predict(X_test)

eval_model_quality(y_test, y_pred)

[+] Accuracy: 0.7645957292579058
[+] Confusion matrix: [70478  3877] [19505  5467]
[+] Classification report:               precision    recall  f1-score   support

           0       0.78      0.95      0.86     74355
           1       0.59      0.22      0.32     24972

    accuracy                           0.76     99327
   macro avg       0.68      0.58      0.59     99327
weighted avg       0.73      0.76      0.72     99327



#### Обучение модели `CatBoostClassifier`

In [26]:
cbc = CatBoostClassifier(verbose=False)
cbc.fit(X_train, y_train)

y_pred = cbc.predict(X_test)

eval_model_quality(y_test, y_pred)

[+] Accuracy: 0.7779657092230713
[+] Confusion matrix: [71307  3048] [19006  5966]
[+] Classification report:               precision    recall  f1-score   support

           0       0.79      0.96      0.87     74355
           1       0.66      0.24      0.35     24972

    accuracy                           0.78     99327
   macro avg       0.73      0.60      0.61     99327
weighted avg       0.76      0.78      0.74     99327



#### Обучение модели `XGBClassifier`

In [27]:
xgbc = XGBClassifier()
xgbc.fit(X_train, y_train)

y_pred = xgbc.predict(X_test)

eval_model_quality(y_test, y_pred)

[+] Accuracy: 0.7780764545390478
[+] Confusion matrix: [71180  3175] [18868  6104]
[+] Classification report:               precision    recall  f1-score   support

           0       0.79      0.96      0.87     74355
           1       0.66      0.24      0.36     24972

    accuracy                           0.78     99327
   macro avg       0.72      0.60      0.61     99327
weighted avg       0.76      0.78      0.74     99327



#### Обучение модели `LGBMClassifier`

In [35]:
lgbmc = LGBMClassifier()
lgbmc.fit(X_train, y_train)

y_pred = lgbmc.predict(X_test)

eval_model_quality(y_test, y_pred)

[LightGBM] [Info] Number of positive: 75148, number of negative: 222832
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.031488 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1160
[LightGBM] [Info] Number of data points in the train set: 297980, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.252191 -> initscore=-1.086959
[LightGBM] [Info] Start training from score -1.086959
[+] Accuracy: 0.7742507072598589
[+] Confusion matrix: [71395  2869] [19554  5509]
[+] Classification report:               precision    recall  f1-score   support

           0       0.79      0.96      0.86     74264
           1       0.66      0.22      0.33     25063

    accuracy                           0.77     99327
   macro avg       0.72      0.59      0.60     99327
weighted avg       0.75      0.77      0.73     99327



## Заключение

- Лучшая регрессионная модель: `BaggingRegressor`
- Лучшая классификационная модель: `XGBClassifier`