# Лаборатораня работа №5: Ансамбли моделей машинного обучения.

## 1) Бибилиотеки, загрузка датасета, кодирование категориальных признаков

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from heamy.estimator import Regressor
from heamy.pipeline import ModelsPipeline
from heamy.dataset import Dataset
from sklearn.neural_network import MLPRegressor
from gmdhpy import gmdh
from warnings import simplefilter

simplefilter('ignore')

In [4]:
df = pd.read_csv('../lab04/kc_house_data.csv')
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [5]:
df = df[:1000]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             1000 non-null   int64  
 1   date           1000 non-null   object 
 2   price          1000 non-null   float64
 3   bedrooms       1000 non-null   int64  
 4   bathrooms      1000 non-null   float64
 5   sqft_living    1000 non-null   int64  
 6   sqft_lot       1000 non-null   int64  
 7   floors         1000 non-null   float64
 8   waterfront     1000 non-null   int64  
 9   view           1000 non-null   int64  
 10  condition      1000 non-null   int64  
 11  grade          1000 non-null   int64  
 12  sqft_above     1000 non-null   int64  
 13  sqft_basement  1000 non-null   int64  
 14  yr_built       1000 non-null   int64  
 15  yr_renovated   1000 non-null   int64  
 16  zipcode        1000 non-null   int64  
 17  lat            1000 non-null   float64
 18  long     

Определим категориальные признаки и закодируем их.

In [6]:
category_cols = ['date']
print('Количество уникальных значений\n')
for col in category_cols:
    print(f'{col}: {df[col].unique().size}')

Количество уникальных значений

date: 272


In [7]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['date'] = le.fit_transform(df[['date']])
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,119,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,160,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,210,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,160,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,204,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


## 2) Разделение выборки на обучающую и на тестовую

Для начала проведем корреляционный анализ, чтобы выявить признаки, имеющие наибольшее значение для прогнозирования успеваемости.

In [8]:
print('Признаки, имеющие максимальную по модулю корреляцию с итоговой оценкой')
best_params = df.corr()['price'].map(abs).sort_values(ascending=False)[1:]
best_params = best_params[best_params.values > 0.3]
best_params

Признаки, имеющие максимальную по модулю корреляцию с итоговой оценкой


sqft_living      0.704776
grade            0.647349
sqft_living15    0.645106
sqft_above       0.582407
bathrooms        0.487157
view             0.445316
sqft_basement    0.367365
lat              0.365770
waterfront       0.317143
bedrooms         0.307058
Name: price, dtype: float64

In [9]:
y = df['price']
X = df[best_params.index]
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=3)

## 3) Масштабирование даных

In [10]:
scaler = StandardScaler().fit(x_train)
x_train_scaled = pd.DataFrame(scaler.transform(x_train), columns=x_train.columns)
x_test_scaled = pd.DataFrame(scaler.transform(x_test), columns=x_train.columns)

## 4) Метрики

In [11]:
def print_metrics(y_test, y_pred):
    print(f"R^2: {r2_score(y_test, y_pred)}")
    print(f"MSE: {mean_squared_error(y_test, y_pred)}")
    print(f"MAE: {mean_absolute_error(y_test, y_pred)}")

## Модель №1: Случайный лес

In [12]:
rf = RandomForestRegressor(random_state=17)
print_metrics(y_test, rf.fit(x_train, y_train).predict(x_test))

R^2: 0.7654445247114293
MSE: 28199263612.49626
MAE: 99302.73867777777


Подбор гиперпараметров

In [13]:
params = {'n_estimators': [100, 1000], 'criterion': ['squared_error', 'absolute_error', 'poisson'],
          'max_features': ['auto', 'sqrt'], 'min_samples_leaf': [1, 3, 5]}
grid_cv = GridSearchCV(estimator=rf, cv=5, param_grid=params, n_jobs=-1, scoring='neg_mean_absolute_error')
grid_cv.fit(x_train, y_train)
print(grid_cv.best_params_)

{'criterion': 'absolute_error', 'max_features': 'auto', 'min_samples_leaf': 1, 'n_estimators': 100}


In [14]:
best_rf = grid_cv.best_estimator_
best_rf.fit(x_train, y_train)
y_pred_rf = best_rf.predict(x_test)
print_metrics(y_test, y_pred_rf)

R^2: 0.7776132847777704
MSE: 26736283170.339466
MAE: 96753.70763333334


## Модель №2: Градиентный бустинг

In [15]:
gb = GradientBoostingRegressor(random_state=17)
print_metrics(y_test, gb.fit(x_train, y_train).predict(x_test))

R^2: 0.7646153778071789
MSE: 28298947203.75618
MAE: 97955.24909419045


In [16]:
params = {'loss': ['squared_error', 'absolute_error', 'huber'], 'n_estimators': [10, 50, 100, 200],
          'criterion': ['friedman_mse', 'squared_error', 'mse', 'mae'], 'min_samples_leaf': [1, 3, 5]}
grid_cv = GridSearchCV(estimator=gb, cv=5, param_grid=params, n_jobs=-1, scoring='r2')
grid_cv.fit(x_train, y_train)
print(grid_cv.best_params_)

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


{'criterion': 'squared_error', 'loss': 'huber', 'min_samples_leaf': 1, 'n_estimators': 100}


In [17]:
best_gb = grid_cv.best_estimator_
best_gb.fit(x_train, y_train)
y_pred_gb = best_gb.predict(x_test)
print_metrics(y_test, y_pred_gb)

R^2: 0.776187052210596
MSE: 26907750956.73412
MAE: 92423.67488566117


## Модель №3: Стекинг

In [18]:
dataset = Dataset(x_train, y_train, x_test)
model_lr = Regressor(dataset=dataset, estimator=LinearRegression, name='lr')
model_rf = Regressor(dataset=dataset, estimator=RandomForestRegressor, parameters={'criterion': 'absolute_error', 'max_features': 'auto', 'min_samples_leaf': 1, 'n_estimators': 100}, name=rf)

In [19]:
pipeline = ModelsPipeline(model_lr, model_rf)
stack_ds = pipeline.stack(k=10, seed=1)
stacker = Regressor(dataset=stack_ds, estimator=GradientBoostingRegressor, parameters={'criterion': 'mae', 'loss': 'huber', 'min_samples_leaf': 3, 'n_estimators': 200})
results = stacker.validate(k=10, scorer=mean_absolute_error)

Metric: mean_absolute_error
Folds accuracy: [74532.61140030168, 87735.85281211081, 113550.14952901372, 80719.77169006542, 86039.7025657782, 85341.21222986304, 122962.81205954435, 104490.51130012181, 156685.78165463157, 66314.99226509455]
Mean accuracy: 97837.33975065252
Standard Deviation: 25715.254094394146
Variance: 661274293.1392549


In [20]:
y_pred_stack = stacker.predict()
print_metrics(y_test, y_pred_stack)

R^2: 0.68409352698156
MSE: 37979628906.897705
MAE: 109961.02346195404


## Модель №4: Многослойный персептрон

In [21]:
mlp = MLPRegressor(random_state=17)
print_metrics(y_test, mlp.fit(x_train, y_train).predict(x_test))

R^2: 0.20414962441088735
MSE: 95680540007.56516
MAE: 189816.71680723643


In [22]:
params = {'solver': ['lbfgs', 'sgd', 'adam'], 'hidden_layer_sizes': [(100,), (50, 30,), (100, 40,)],
          'alpha': [1e-4, 3e-4, 5e-4], 'max_iter': [500, 1000]}
grid_cv = GridSearchCV(estimator=mlp, cv=5, param_grid=params, n_jobs=-1, scoring='r2')
grid_cv.fit(x_train, y_train)
print(grid_cv.best_params_)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
  return ((y_true - y_pred) ** 2).mean() / 2
  ret = a @ b
  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0, dtype=np.float64)
  return ((y_true - y_pred) ** 2).mean() / 2
  ret = a @ b
  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0, dtype=np.float64)
  return ((y_true - y_pred) ** 2).mean() / 2
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
  return ((y_true - y_pred) ** 2).mean() / 2
  ret = a @ b
  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0, dtype=np.float64)
STOP: TOTAL NO. of ITERATION

{'alpha': 0.0001, 'hidden_layer_sizes': (100, 40), 'max_iter': 1000, 'solver': 'lbfgs'}


In [23]:
best_mlp = grid_cv.best_estimator_
best_mlp.fit(x_train, y_train)
y_pred_mlp = best_mlp.predict(x_test)
print_metrics(y_test, y_pred_mlp)

R^2: 0.5150782554891344
MSE: 58299368573.980095
MAE: 157630.17146659532


## Модель №5: Метод группового учёта аргументов

In [24]:
gm = gmdh.Regressor(n_jobs=-1)
gm.fit(np.array(x_train_scaled), np.array(y_train))
y_pred_gm = gm.predict(np.array(x_test_scaled))

train layer0 in 0.10 sec
train layer1 in 0.15 sec
train layer2 in 0.14 sec
train layer3 in 0.12 sec
train layer4 in 0.11 sec
train layer5 in 0.11 sec
train layer6 in 0.11 sec
train layer7 in 0.12 sec
train layer8 in 0.12 sec
train layer9 in 0.12 sec
train layer10 in 0.12 sec
train layer11 in 0.13 sec


In [25]:
print_metrics(y_test, y_pred_gm)

R^2: 0.7079749729980879
MSE: 35108499205.75069
MAE: 121590.0769278112


## Сравнение моделей

In [26]:
print("Случайный лес")
print_metrics(y_test, y_pred_rf)

print("\nГрадиентный бустинг")
print_metrics(y_test, y_pred_gb)

print("\nСтекинг")
print_metrics(y_test, y_pred_stack)

print("\nМногослойный персептрон")
print_metrics(y_test, y_pred_mlp)

print("\nМетод группового учёта аргументов")
print_metrics(y_test, y_pred_gm)

Случайный лес
R^2: 0.7776132847777704
MSE: 26736283170.339466
MAE: 96753.70763333334

Градиентный бустинг
R^2: 0.776187052210596
MSE: 26907750956.73412
MAE: 92423.67488566117

Стекинг
R^2: 0.68409352698156
MSE: 37979628906.897705
MAE: 109961.02346195404

Многослойный персептрон
R^2: 0.5150782554891344
MSE: 58299368573.980095
MAE: 157630.17146659532

Метод группового учёта аргументов
R^2: 0.7079749729980879
MSE: 35108499205.75069
MAE: 121590.0769278112
