In [55]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import load_boston

In [56]:
# Load the Boston housing dataset bundled with scikit-learn.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell only runs on older releases -- confirm the pinned version.
boston = load_boston()
data, target, feature_names = boston.data, boston.target, boston.feature_names

# Features as a labelled frame; target as a one-column frame named 'price'.
x = pd.DataFrame(data=data, columns=feature_names)
y = pd.DataFrame({'price': target})

In [57]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

In [58]:
# Random forest with 1000 trees, depth capped at 12 and a fixed seed.
model = RandomForestRegressor(
    n_estimators=1000,
    max_depth=12,
    random_state=42,
)

In [59]:
# Pass the first (and only) column of y_train as a flat 1-D array instead of
# the 2-D frame.
model.fit(X=x_train, y=y_train.iloc[:, 0].to_numpy())

RandomForestRegressor(max_depth=12, n_estimators=1000, random_state=42)

In [60]:
# Predictions of the fitted forest on the held-out test features.
y_pred = model.predict(X=x_test)

In [61]:
from sklearn.metrics import r2_score

In [62]:
# Coefficient of determination on the test split.
r2_score(y_true=y_test, y_pred=y_pred)

0.87472606157312

Модель, построенная методом RandomForestRegressor, имеет более высокий коэффициент детерминации (0.875), чем модель, построенная методом LinearRegression (0.711) => RandomForestRegressor лучше, чем LinearRegression.

In [63]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [64]:
# Test-split error metrics: mean squared error and mean absolute error.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print(f'mse = {mse}\nmae = {mae}')

mse = 9.334548946165196
mae = 2.0831253879090608


In [65]:
# Predictions on the training features, used to compare train vs. test fit.
y_train_pred = model.predict(X=x_train)

In [66]:
# Coefficient of determination on the training split.
r2_score(y_true=y_train, y_pred=y_train_pred)

0.9784468411299835

Но в случае RandomForestRegressor присутствует **сильное переобучение**: R² на обучающей выборке (0.978) заметно выше, чем на тестовой (0.875).

## -------------

## *Задание 3
Вызовите документацию для класса RandomForestRegressor,
найдите информацию об атрибуте feature_importances_.
С помощью этого атрибута найдите сумму всех показателей важности,
установите, какие два признака показывают наибольшую важность.


In [67]:
# Print the class documentation for RandomForestRegressor (Task 3 asks to look
# up the feature_importances_ attribute there).
help(RandomForestRegressor)

Help on class RandomForestRegressor in module sklearn.ensemble._forest:

class RandomForestRegressor(ForestRegressor)
 |  RandomForestRegressor(n_estimators=100, *, criterion='mse', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, ccp_alpha=0.0, max_samples=None)
 |  
 |  A random forest regressor.
 |  
 |  A random forest is a meta estimator that fits a number of classifying
 |  decision trees on various sub-samples of the dataset and uses averaging
 |  to improve the predictive accuracy and control over-fitting.
 |  The sub-sample size is controlled with the `max_samples` parameter if
 |  `bootstrap=True` (default), otherwise the whole dataset is used to build
 |  each tree.
 |  
 |  Read more in the :ref:`User Guide <forest>`.
 |  
 |  Parameters
 |  ----------

In [68]:
# Two largest importance values: negate so that an ascending sort gives
# descending order, negate back, then keep the first two entries.
importances_desc = -np.sort(-model.feature_importances_)
importances_desc[:2]

array([0.41584732, 0.40268179])

In [69]:
# Pair every feature name with its importance, then show the two features
# with the largest weight.
features_weight = pd.DataFrame(
    {
        'name_of_featurs': x_train.columns,
        'weight': model.feature_importances_.flatten(),
    },
    columns=['name_of_featurs', 'weight'],
)
features_weight.sort_values(by='weight', ascending=False).iloc[:2]

Unnamed: 0,name_of_featurs,weight
12,LSTAT,0.415847
5,RM,0.402682
