In [48]:
#загрузим основные библиотеки
import pandas as pd
import numpy as np
from sklearn import pipeline
from sklearn import ensemble
from sklearn import preprocessing
from sklearn import linear_model
from sklearn import tree
 

from sklearn import metrics
from sklearn import model_selection
from sklearn import compose
import joblib

In [33]:
df_wine = pd.read_csv('data/Red.csv')

In [34]:
ct = compose.make_column_transformer(
    (preprocessing.StandardScaler(), ['Price']),
    (preprocessing.OneHotEncoder(), ['Country']),
    (preprocessing.OrdinalEncoder(), ['Region'])
)


In [35]:
pipe = pipeline.Pipeline([
    ('ct', ct),
    ('rf', ensemble.RandomForestRegressor(random_state=42))
])


In [36]:
X = df_wine[['Price', 'Country', 'Region']]
y = df_wine['Rating']

In [37]:
def rmse(y_hat, y):
    return metrics.mean_squared_error(y_hat, y, squared = False)

In [38]:
pipe.fit(X, y)


Pipeline(steps=[('ct',
                 ColumnTransformer(transformers=[('standardscaler',
                                                  StandardScaler(), ['Price']),
                                                 ('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['Country']),
                                                 ('ordinalencoder',
                                                  OrdinalEncoder(),
                                                  ['Region'])])),
                ('rf', RandomForestRegressor(random_state=42))])

In [39]:
df_wine_test = pd.read_csv('data/Red_test.csv')

X_test = df_wine_test[['Price', 'Country', 'Region']]
y_test = df_wine_test['Rating']

In [40]:
X_test

Unnamed: 0,Price,Country,Region
0,15.50,France,Lirac
1,7.45,Italy,Toscana
2,8.72,Italy,Bardolino
3,29.15,Austria,Carnuntum
4,19.90,Italy,Toscana
...,...,...,...
275,486.42,Italy,Toscana
276,48.40,Italy,Toscana
277,7.25,Italy,Toscana
278,13.90,Italy,Toscana


In [41]:
y_pred = pipe.predict(X_test)

In [42]:
print(f'Качество по метрике R2: { round(metrics.r2_score(y_test, y_pred),4)}')
print(f'Качество по RSME: {round(rmse(y_test, y_pred),4)}')

Качество по метрике R2: 0.9396
Качество по RSME: 0.0763


In [43]:
joblib.dump(pipe, 'pipeline_wine.plk')

['pipeline_wine.plk']

In [44]:
pipe2 = joblib.load('pipeline_wine.plk')

pipe.set_params(rf__n_estimators=200)

pipe.fit(X, y)

y_pred = pipe.predict(X_test)

In [45]:
print(f'Качество по метрике R2: { round(metrics.r2_score(y_test, y_pred),4)}')
print(f'Качество по RSME: {round(rmse(y_test, y_pred),4)}')

Качество по метрике R2: 0.94
Качество по RSME: 0.076


In [50]:
estimators = [
    ('rc', linear_model.RidgeCV()),
    ('dt', tree.DecisionTreeRegressor(random_state=42))
]

pipe = pipeline.Pipeline([
    ('ct', ct),
    ('sr', ensemble.StackingRegressor(
        estimators=estimators,
        final_estimator=ensemble.RandomForestRegressor(
            n_estimators=10,
            random_state=42
        )
    ))
])

pipe.fit(X, y)

Pipeline(steps=[('ct',
                 ColumnTransformer(transformers=[('standardscaler',
                                                  StandardScaler(), ['Price']),
                                                 ('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['Country']),
                                                 ('ordinalencoder',
                                                  OrdinalEncoder(),
                                                  ['Region'])])),
                ('sr',
                 StackingRegressor(estimators=[('rc',
                                                RidgeCV(alphas=array([ 0.1,  1. , 10. ]))),
                                               ('dt',
                                                DecisionTreeRegressor(random_state=42))],
                                   final_estimator=RandomForestRegressor(n_estimators=10,
                          

In [51]:
y_pred = pipe.predict(X_test)

print(f'Качество по метрике R2: { round(metrics.r2_score(y_test, y_pred),4)}')
print(f'Качество по RSME: {round(rmse(y_test, y_pred),4)}')

Качество по метрике R2: 0.6653
Качество по RSME: 0.1795
