## Импорт библиотек

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

from sklearn.linear_model import RidgeCV
from sklearn.svm import LinearSVR, LinearSVC

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, GradientBoostingRegressor, StackingRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier, StackingClassifier, RandomForestClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# from pycaret.regression import *

from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, RocCurveDisplay

import warnings
warnings.filterwarnings("ignore")

## Применение древовидных моделей для решения задачи регрессии

#### Чтение датасета

In [2]:
regression_df = pd.read_csv("../data/preprocessed_moldova_cars_dataset.csv")

In [3]:
del regression_df["Unnamed: 0"]

In [4]:
regression_df.head(2)

Unnamed: 0,make,model,year,style,distance,engine_capacity,fuel_type,transmission,price
0,79,570,2011.0,4,195000.0,1800.0,2,0,7750.0
1,66,383,2014.0,11,135000.0,1500.0,0,1,8550.0


#### Разбиение на обучающую и тестовую выборки

In [5]:
y = regression_df.loc[:, "price"]
X = regression_df.drop("price", axis=1)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

#### Функция для оценки качества обученной модели

In [7]:
def eval_model_quality(y_test: np.ndarray, y_pred: np.ndarray) -> None:
    print(f"[+] MAE: {mean_absolute_error(y_test, y_pred)}")
    print(f"[+] MSE: {mean_squared_error(y_test, y_pred)}")
    print(f"[+] RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}")
    print(f"[+] MAPE: {mean_absolute_percentage_error(y_test, y_pred)}")

#### Baseline-модель для оценки качества последующих моделей

In [8]:
eval_model_quality(y_test, [np.mean(y_test)] * len(y_test))

[+] MAE: 6350.321611549966
[+] MSE: 99776619.26127224
[+] RMSE: 9988.824718718026
[+] MAPE: 1.6006386519733338


In [9]:
eval_model_quality(y_test, [np.median(y_test)] * len(y_test))

[+] MAE: 5830.830095488969
[+] MSE: 107644097.4943475
[+] RMSE: 10375.167347775528
[+] MAPE: 1.0683666750495469


#### Обучение модели `DecisionTreeRegressor`

In [10]:
dtr = DecisionTreeRegressor(max_depth=3)
dtr.fit(X_train, y_train)

y_pred = dtr.predict(X_test)

eval_model_quality(y_test, y_pred)
print(f"[+] R^2-score: {dtr.score(X_test, y_test)}")

[+] MAE: 3946.5889321110644
[+] MSE: 47043948.501551285
[+] RMSE: 6858.859125361249
[+] MAPE: 0.726512293213625
[+] R^2-score: 0.5285072910882727


#### Обучение модели `BaggingRegressor`

In [11]:
br = BaggingRegressor(n_estimators=1)
br.fit(X_train, y_train)

y_pred = br.predict(X_test)

eval_model_quality(y_test, y_pred)
print(f"[+] R^2-score: {br.score(X_test, y_test)}")

[+] MAE: 2784.97258076318
[+] MSE: 50210152.96655786
[+] RMSE: 7085.912288940491
[+] MAPE: 0.47001629693252306
[+] R^2-score: 0.49677436118496887


#### Обучение модели `GradientBoostingRegressor`

In [12]:
gbr = GradientBoostingRegressor(n_estimators=16)
gbr.fit(X_train, y_train)

y_pred = gbr.predict(X_test)

eval_model_quality(y_test, y_pred)
print(f"[+] R^2-score: {gbr.score(X_test, y_test)}")

[+] MAE: 3480.4105363534604
[+] MSE: 53734171.66548624
[+] RMSE: 7330.359586369978
[+] MAPE: 0.8380382160455829
[+] R^2-score: 0.46145527816712806


#### Обучение модели `StackingRegressor`

In [13]:
estimators = [
    ("lr", RidgeCV()),
    ("svr", LinearSVR(random_state=42)),
]

sr = StackingRegressor(estimators=estimators)
sr.fit(X_train, y_train)

y_pred = sr.predict(X_test)

eval_model_quality(y_test, y_pred)
print(f"[+] R^2-score: {sr.score(X_test, y_test)}")

[+] MAE: 13006.220530838436
[+] MSE: 233496612.4755339
[+] RMSE: 15280.595946347574
[+] MAPE: 4.140332843550931
[+] R^2-score: -1.3401936666555745


#### Обучение модели `CatBoostRegressor`

In [14]:
cbr = CatBoostRegressor(cat_features=["make", "model", "style", "fuel_type", "transmission"], verbose=False)
cbr.fit(X_train, y_train)

y_pred = cbr.predict(X_test)

eval_model_quality(y_test, y_pred)
print(f"[+] R^2-score: {cbr.score(X_test, y_test)}")

[+] MAE: 4412.894802577889
[+] MSE: 247067497.92097202
[+] RMSE: 15718.380893748948
[+] MAPE: 1.2204286467820753
[+] R^2-score: -1.47620634724061


#### Обучение модели `XGBRegressor`

In [15]:
xgbr = XGBRegressor()
xgbr.fit(X_train, y_train)

y_pred = xgbr.predict(X_test)

eval_model_quality(y_test, y_pred)
print(f"[+] R^2-score: {xgbr.score(X_test, y_test)}")

[+] MAE: 2859.8642869335854
[+] MSE: 4054760889.1813707
[+] RMSE: 63677.004398616074
[+] MAPE: 0.7218478180579773
[+] R^2-score: -39.63838722139591


#### Обучение модели `LGBMRegressor`

In [16]:
lgbmr = LGBMRegressor()
lgbmr.fit(X_train, y_train)

y_pred = lgbmr.predict(X_test)

eval_model_quality(y_test, y_pred)
print(f"[+] R^2-score: {lgbmr.score(X_test, y_test)}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001318 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 860
[LightGBM] [Info] Number of data points in the train set: 27330, number of used features: 8
[LightGBM] [Info] Start training from score 9684.002452
[+] MAE: 3574.850195040415
[+] MSE: 254527679.45964706
[+] RMSE: 15953.923638392127
[+] MAPE: 0.8520809803129872
[+] R^2-score: -1.5509751818023423


## Применение древовидных моделей для решения задачи классификации

#### Чтение датасета

In [17]:
classification_df = pd.read_csv("../data/preprocessed_airlines_dataset.csv")

In [18]:
del classification_df["Unnamed: 0"]

In [19]:
classification_df.head(2)

Unnamed: 0,airline,flight,airport_from,airport_to,day_of_week,time,length,delay
0,16,3036,135,54,4.0,1195,131.0,0
1,13,315,79,207,7.0,707,145.0,0


#### Разбиение на обучающую и тестовую выборки

In [20]:
y = classification_df.loc[:, "delay"]
X = classification_df.drop("delay", axis=1)

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

#### Функция для оценки качества обученной модели

In [22]:
def eval_model_quality(y_test: np.ndarray, y_pred: np.ndarray) -> None:
    print("[+] Accuracy:", accuracy_score(y_test, y_pred))
    print("[+] Confusion matrix:", *confusion_matrix(y_test, y_pred))
    print("[+] Classification report:", classification_report(y_test, y_pred))

#### Baseline-модель для оценки качества последующих моделей

In [23]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)

lr.score(X_test, y_test)

0.7473899342575533

#### Обучение модели `DecisionTreeClassifier`

In [24]:
dtc = DecisionTreeClassifier(max_depth=3)
dtc.fit(X_train, y_train)

y_pred = dtc.predict(X_test)

eval_model_quality(y_test, y_pred)

[+] Accuracy: 0.7661763669495706
[+] Confusion matrix: [71181  3068] [20157  4921]
[+] Classification report:               precision    recall  f1-score   support

           0       0.78      0.96      0.86     74249
           1       0.62      0.20      0.30     25078

    accuracy                           0.77     99327
   macro avg       0.70      0.58      0.58     99327
weighted avg       0.74      0.77      0.72     99327



#### Обучение модели `BaggingClassifier`

In [25]:
bc = BaggingClassifier(n_estimators=1)
bc.fit(X_train, y_train)

y_pred = bc.predict(X_test)

eval_model_quality(y_test, y_pred)

[+] Accuracy: 0.695792684768492
[+] Confusion matrix: [60198 14051] [16165  8913]
[+] Classification report:               precision    recall  f1-score   support

           0       0.79      0.81      0.80     74249
           1       0.39      0.36      0.37     25078

    accuracy                           0.70     99327
   macro avg       0.59      0.58      0.59     99327
weighted avg       0.69      0.70      0.69     99327



#### Обучение модели `GradientBoostingClassifier`

In [26]:
gbc = GradientBoostingClassifier(n_estimators=16)
gbc.fit(X_train, y_train)

y_pred = gbc.predict(X_test)

eval_model_quality(y_test, y_pred)

[+] Accuracy: 0.7680389018091758
[+] Confusion matrix: [72726  1523] [21517  3561]
[+] Classification report:               precision    recall  f1-score   support

           0       0.77      0.98      0.86     74249
           1       0.70      0.14      0.24     25078

    accuracy                           0.77     99327
   macro avg       0.74      0.56      0.55     99327
weighted avg       0.75      0.77      0.70     99327



#### Обучение модели `StackingClassifier`

In [27]:
estimators = [
    ("rf", RandomForestClassifier()),
]

sc = StackingClassifier(estimators=estimators)
sc.fit(X_train, y_train)

y_pred = sc.predict(X_test)

eval_model_quality(y_test, y_pred)

[+] Accuracy: 0.7573368771834446
[+] Confusion matrix: [71206  3043] [21060  4018]
[+] Classification report:               precision    recall  f1-score   support

           0       0.77      0.96      0.86     74249
           1       0.57      0.16      0.25     25078

    accuracy                           0.76     99327
   macro avg       0.67      0.56      0.55     99327
weighted avg       0.72      0.76      0.70     99327



#### Обучение модели `CatBoostClassifier`

In [28]:
cbc = CatBoostClassifier(verbose=False)
cbc.fit(X_train, y_train)

y_pred = cbc.predict(X_test)

eval_model_quality(y_test, y_pred)

[+] Accuracy: 0.7787107231669134
[+] Confusion matrix: [71153  3096] [18884  6194]
[+] Classification report:               precision    recall  f1-score   support

           0       0.79      0.96      0.87     74249
           1       0.67      0.25      0.36     25078

    accuracy                           0.78     99327
   macro avg       0.73      0.60      0.61     99327
weighted avg       0.76      0.78      0.74     99327



#### Обучение модели `XGBClassifier`

In [29]:
xgbc = XGBClassifier()
xgbc.fit(X_train, y_train)

y_pred = xgbc.predict(X_test)

eval_model_quality(y_test, y_pred)

[+] Accuracy: 0.7776938798111289
[+] Confusion matrix: [71047  3202] [18879  6199]
[+] Classification report:               precision    recall  f1-score   support

           0       0.79      0.96      0.87     74249
           1       0.66      0.25      0.36     25078

    accuracy                           0.78     99327
   macro avg       0.72      0.60      0.61     99327
weighted avg       0.76      0.78      0.74     99327



#### Обучение модели `LGBMClassifier`

In [30]:
lgbmc = LGBMClassifier()
lgbmc.fit(X_train, y_train)

y_pred = lgbmc.predict(X_test)

eval_model_quality(y_test, y_pred)

[LightGBM] [Info] Number of positive: 75133, number of negative: 222847
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011858 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1161
[LightGBM] [Info] Number of data points in the train set: 297980, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.252141 -> initscore=-1.087226
[LightGBM] [Info] Start training from score -1.087226
[+] Accuracy: 0.7753984314436155
[+] Confusion matrix: [71409  2840] [19469  5609]
[+] Classification report:               precision    recall  f1-score   support

           0       0.79      0.96      0.86     74249
           1       0.66      0.22      0.33     25078

    accuracy                           0.78     99327
   macro avg       0.72      0.59      0.60     99327
weighted avg       0.75      0.78      0.73     99327



## Заключение

- Лучшая регрессионная модель: `BaggingRegressor`
- Лучшая классификационная модель: `XGBClassifier`