## Загрузка датасета

In [7]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from heamy.estimator import Regressor
from heamy.pipeline import ModelsPipeline
from heamy.dataset import Dataset
from sklearn.neural_network import MLPRegressor
from gmdhpy import gmdh
from warnings import simplefilter

simplefilter('ignore')

In [6]:
data = pd.read_csv('cars_price_2_processed.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,Year,Kilometers_Driven,Mileage,Engine,Power,Seats,Price,Location_Ahmedabad,Location_Bangalore,...,Fuel_Type_CNG,Fuel_Type_Diesel,Fuel_Type_LPG,Fuel_Type_Petrol,Transmission_Automatic,Transmission_Manual,Owner_Type_First,Owner_Type_Fourth & Above,Owner_Type_Second,Owner_Type_Third
0,0,2010,72000,26.6,998,58.16,5.0,1.75,0,0,...,1,0,0,0,0,1,1,0,0,0
1,1,2015,41000,19.67,1582,126.2,5.0,12.5,0,0,...,0,1,0,0,0,1,1,0,0,0
2,2,2011,46000,18.2,1199,88.7,5.0,4.5,0,0,...,0,0,0,1,0,1,1,0,0,0
3,3,2012,87000,20.77,1248,88.76,7.0,6.0,0,0,...,0,1,0,0,0,1,1,0,0,0
4,4,2013,40670,15.2,1968,140.8,5.0,17.74,0,0,...,0,1,0,0,1,0,0,0,1,0


In [7]:
print('Признаки, имеющие максимальную по модулю корреляцию с ценой автомобиля')
best_params = data.corr()['Price'].map(abs).sort_values(ascending=False)[1:]
best_params = best_params[best_params.values > 0.3]
best_params

Признаки, имеющие максимальную по модулю корреляцию с ценой автомобиля


Power                     0.772843
Engine                    0.658047
Transmission_Automatic    0.585623
Transmission_Manual       0.585623
Mileage                   0.341652
Fuel_Type_Diesel          0.321035
Fuel_Type_Petrol          0.309363
Name: Price, dtype: float64

 Разделение выборки на обучающую и тестовую

In [12]:
y = data['Price']
X = data[best_params.index]
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=3)

 Стекинг

In [21]:
dataset = Dataset(x_train, y_train, x_test)

In [22]:
# Создаем экземпляры трех различных моделей
model_lr = Regressor(dataset=dataset, estimator=LinearRegression, name='lr')
model_rf = Regressor(dataset=dataset, estimator=RandomForestRegressor, 
                     parameters={'criterion': 'absolute_error', 'n_estimators': 1000, 'random_state': 17}, name='rf')
model_gb = Regressor(dataset=dataset, estimator=GradientBoostingRegressor, 
                     parameters={'loss': 'huber', 'random_state': 17}, name='rf')

In [23]:
pipeline = ModelsPipeline(model_lr, model_rf)
stack_ds = pipeline.stack(k=10, seed=1)
stacker = Regressor(dataset=stack_ds, estimator=GradientBoostingRegressor)
results = stacker.validate(k=10, scorer=mean_absolute_error)

Metric: mean_absolute_error
Folds accuracy: [2.118518079925378, 2.366942214889117, 2.654642030745365, 2.4726790895523836, 2.343767119086641, 2.341834955257515, 2.82948363821963, 2.09283849153532, 2.8587044048057866, 2.3338125113348562]
Mean accuracy: 2.441322253535199
Standard Deviation: 0.2516971648119532
Variance: 0.06335146277437553


In [24]:
y_pred_stack = stacker.predict()
print_metrics(y_test, y_pred_stack)

R^2: 0.7840610516996481
MSE: 25.21054291365927
MAE: 2.3494797099849425


 Многослойный персептрон

In [25]:
print_metrics(y_test, MLPRegressor(random_state=17).fit(x_train, y_train).predict(x_test))

R^2: 0.6461253526988189
MSE: 41.31432542421033
MAE: 3.6332677362127788


Подбор гиперпараметров

In [26]:
mlp = MLPRegressor(random_state=17)
params = {'solver': ['lbfgs', 'sgd', 'adam'], 'hidden_layer_sizes': [(100,), (50, 30,), (100, 40,)], 
          'alpha': [1e-4, 3e-4, 5e-4], 'max_iter': [500, 1000]}
grid_cv = GridSearchCV(estimator=mlp, cv=5, param_grid=params, n_jobs=-1, scoring='r2')
grid_cv.fit(x_train, y_train)
print(grid_cv.best_params_)

{'alpha': 0.0001, 'hidden_layer_sizes': (100, 40), 'max_iter': 1000, 'solver': 'lbfgs'}


In [27]:
best_mlp = grid_cv.best_estimator_
best_mlp.fit(x_train, y_train)
y_pred_mlp = best_mlp.predict(x_test)
print_metrics(y_test, y_pred_mlp)

R^2: 0.6795190063389385
MSE: 37.415667286044616
MAE: 3.4568890196314794


 Метод группового учёта аргументов

In [28]:
gm = gmdh.Regressor(n_jobs=-1)
gm.fit(np.array(x_train_scaled), np.array(y_train))
y_pred_gm = gm.predict(np.array(x_test_scaled))
print()
print_metrics(y_test, y_pred_gm)

train layer0 in 0.02 sec
train layer1 in 0.05 sec
train layer2 in 0.05 sec
train layer3 in 0.05 sec

R^2: 0.6866573748606544
MSE: 36.58227364693177
MAE: 3.3962429053289607


 Сравнение моделей

In [29]:


print("\nСтекинг")
print_metrics(y_test, y_pred_stack)

print("\nМногослойный персептрон")
print_metrics(y_test, y_pred_mlp)

print("\nМетод группового учёта аргументов")
print_metrics(y_test, y_pred_gm)

Случайный лес
R^2: 0.8208117147424516
MSE: 20.919959047068673
MAE: 2.1854572077185015

Градиентный бустинг
R^2: 0.7750617066966918
MSE: 26.26120271902374
MAE: 2.66518306958836

Стекинг
R^2: 0.7840610516996481
MSE: 25.21054291365927
MAE: 2.3494797099849425

Многослойный персептрон
R^2: 0.6795190063389385
MSE: 37.415667286044616
MAE: 3.4568890196314794

Метод группового учёта аргументов
R^2: 0.6866573748606544
MSE: 36.58227364693177
MAE: 3.3962429053289607
