# ML-8. Pipelines

In [None]:
# загрузим основные библиотеки
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

from sklearn.datasets import fetch_california_housing
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score


Попробуем предсказать цену на недвижимость в Калифорнии

## Часть 1. Простейшие пайплайны


Загрузим данные

In [None]:
# Fetch the California housing dataset with scikit-learn's loader; the result
# is indexed below by 'data', 'feature_names' and 'target'.
data = fetch_california_housing()


In [None]:
# Build one frame holding the feature columns plus the regression target,
# then show summary statistics as the cell output.
df = (
    pd.DataFrame(data['data'], columns=data['feature_names'])
    .assign(target=data['target'])
)
df.describe()


In [None]:
def rmse(y_hat, y):
    """Return the root mean squared error between two sets of values.

    Parameters
    ----------
    y_hat, y : array-like
        The two value sets to compare. They are forwarded to
        ``mean_squared_error`` in this order; for the 1-D targets used in
        this notebook the error is symmetric, so the order does not affect
        the result.

    Returns
    -------
    float
        Square root of the mean squared error.
    """
    # The `squared=False` flag was deprecated in scikit-learn 1.4 and removed
    # in 1.6; taking the square root explicitly works on every version.
    return np.sqrt(mean_squared_error(y_hat, y))


In [None]:
# Features are every column except the target; the split uses a fixed seed.
X, Y = df.drop(columns='target'), df['target']
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=42)


In [None]:
# Report the shapes of the training and test feature matrices.
print(f'Размер обучающей выборки {X_train.shape}')
print(f'Размер тестовой выборки {X_test.shape}')


In [None]:
# Two-step pipeline: standardise the features, then fit a random forest.
# random_state is fixed so that re-running the notebook rebuilds the same
# forest, in line with the seeded train/test split.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('rf', RandomForestRegressor(random_state=42)),
])
pipeline.fit(X_train, y_train)


In [None]:
# Score the fitted pipeline on the held-out split (R2 and RMSE, 4 decimals).
y_pred = pipeline.predict(X_test)
print(f'Качество по метрике R2: {round(r2_score(y_test, y_pred), 4)}')
print(f'Качество по RMSE: {round(rmse(y_test, y_pred), 4)}')


In [None]:
# Show every parameter of the pipeline and its steps; step parameters use the
# `<step>__<param>` key form that set_params and the grid search rely on below.
pipeline.get_params()


In [None]:
# A step can be reached by position or by name: index 1 and 'rf' are the same
# random-forest step, so both lines print the same n_estimators value.
print(pipeline[1].n_estimators)
print(pipeline['rf'].n_estimators)


In [None]:
# Change the forest's n_estimators through the pipeline using the
# `<step>__<param>` naming ('rf' is the step name).
pipeline.set_params(rf__n_estimators=200)


In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate settings: one scaler option and the forest size. Keys follow the
# `<step>__<param>` convention so they reach the right pipeline step.
param_grid = dict(
    scaler__with_mean=[True, False],
    rf__n_estimators=[100, 200, 500],
)

grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=True)


In [None]:
# Run the grid search on the training split and print the best pipeline found.
grid_search.fit(X_train, y_train)
print(grid_search.best_estimator_)


In [None]:
# Score the best pipeline from the grid search on the held-out split.
y_pred = grid_search.best_estimator_.predict(X_test)
print(f'Качество по метрике R2: {round(r2_score(y_test, y_pred), 4)}')
print(f'Качество по RMSE: {round(rmse(y_test, y_pred), 4)}')


#### Также можно создавать pipeline через встроенную функцию sklearn `make_pipeline`

In [None]:
from sklearn.pipeline import make_pipeline
# make_pipeline takes only the estimators; step names are generated for them
# (the forest step is addressed as `randomforestregressor` further down).
new_pipeline = make_pipeline(StandardScaler(), RandomForestRegressor())

In [None]:
# Inspect the parameters (and generated step names) of the new pipeline.
new_pipeline.get_params()

#### Установка параметров пайпа происходит через метод set_params()

In [None]:
# Limit the forest's tree depth via the auto-generated step name.
new_pipeline.set_params(randomforestregressor__max_depth=4)

# Часть 2. Предобработка в пайплайнах

In [14]:
# Load the red-wine dataset; the path is relative to the working directory.
df_wine = pd.read_csv('./data/Red.csv')


In [15]:
# Preview the first rows of the wine data.
df_wine.head()


Unnamed: 0,Name,Country,Region,Winery,Rating,NumberOfRatings,Price,Year
0,Pomerol 2011,France,Pomerol,Château La Providence,4.2,100,95.0,2011
1,Lirac 2017,France,Lirac,Château Mont-Redon,4.3,100,15.5,2017
2,Erta e China Rosso di Toscana 2015,Italy,Toscana,Renzo Masi,3.9,100,7.45,2015
3,Bardolino 2019,Italy,Bardolino,Cavalchina,3.5,100,8.72,2019
4,Ried Scheibner Pinot Noir 2016,Austria,Carnuntum,Markowitsch,3.9,100,29.15,2016


In [16]:
# Column dtypes and non-null counts for the wine data.
df_wine.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8666 entries, 0 to 8665
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             8666 non-null   object 
 1   Country          8666 non-null   object 
 2   Region           8666 non-null   object 
 3   Winery           8666 non-null   object 
 4   Rating           8666 non-null   float64
 5   NumberOfRatings  8666 non-null   int64  
 6   Price            8666 non-null   float64
 7   Year             8666 non-null   object 
dtypes: float64(2), int64(1), object(5)
memory usage: 541.8+ KB


In [17]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer

# Per-column preprocessing: scale the numeric 'Price' column and one-hot
# encode the categorical 'Country' column.
ct = make_column_transformer(
    (StandardScaler(), ['Price']),
    (OneHotEncoder(), ['Country']),
)
print(ct)


ColumnTransformer(transformers=[('standardscaler', StandardScaler(), ['Price']),
                                ('onehotencoder', OneHotEncoder(),
                                 ['Country'])])


In [18]:
# Column-wise preprocessing followed by a random forest.
# NOTE: this rebinds `pipeline`, replacing the Part 1 pipeline object.
pipeline = Pipeline([
    ('ct', ct),
    ('rf', RandomForestRegressor()),
])


In [19]:
# Two input columns and the rating to predict.
X, y = df_wine[['Country', 'Price']], df_wine['Rating']


In [20]:
# Fit the preprocessing steps and the forest on the wine data.
pipeline.fit(X, y)


In [21]:
# Show what the fitted column transformer feeds to the model: the scaled
# price followed by one indicator column per country. The encoder is looked
# up by its generated name, 'onehotencoder'.
pd.DataFrame(
    pipeline.named_steps['ct'].transform(X).toarray(),
    columns=['Price']
    + pipeline.named_steps['ct']
    .named_transformers_['onehotencoder']
    .get_feature_names_out()
    .tolist(),
)


Unnamed: 0,Price,Country_Argentina,Country_Australia,Country_Austria,Country_Brazil,Country_Bulgaria,Country_Canada,Country_Chile,Country_China,Country_Croatia,...,Country_Portugal,Country_Romania,Country_Slovakia,Country_Slovenia,Country_South Africa,Country_Spain,Country_Switzerland,Country_Turkey,Country_United States,Country_Uruguay
0,0.657648,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.278402,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.373184,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.358231,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.117684,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8661,-0.266981,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8662,-0.224358,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8663,-0.178910,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8664,-0.387784,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
import joblib

# Persist the fitted pipeline (preprocessing + model) to disk.
# No pip install is needed here: joblib is installed as a dependency of
# scikit-learn, and an install placed after the import could not help anyway.
joblib.dump(pipeline, 'pipeline.pkl')




['pipeline.pkl']

In [23]:
# Reload the pipeline written by joblib.dump above.
# NOTE: joblib.load unpickles the file, so only load files you trust.
pipeline_loaded = joblib.load('pipeline.pkl')


In [24]:
# Print the reloaded pipeline to check that its steps match the saved one.
print(pipeline_loaded)


Pipeline(steps=[('ct',
                 ColumnTransformer(transformers=[('standardscaler',
                                                  StandardScaler(), ['Price']),
                                                 ('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['Country'])])),
                ('rf', RandomForestRegressor())])


---

### Pipeline practice

задание 6.1

Предлагаем использовать датасет, с которым вы работали при создании пайплайна (файл Red.csv).
Вам следует выполнить следующее:
• Добавить обработку столбца 'Region' в пайплайн, полученный ранее в модуле, с использованием OrdinalEncoder.

Важно! Для совпадения результатов процесс трансформации столбцов должен выполняться в следующей последовательности:
1. Кодирование столбца 'Region'.
2. Стандартизация столбца 'Price'.
3. Кодирование столбца 'Country'.

• Обучить на тренировочном наборе данных пайплайн и оценить качество модели по метрике RMSE на тестовом наборе (файл Red_test.csv).
• Зафиксировать random_state=42 .
• Сохранить пайплайн в файл pipeline_wine.pkl.
В качестве ответа на задание введите в поле ниже полученный результат по метрике RMSE, округленный до четвёртого знака после запятой.

In [25]:
# Training data for the practice tasks.
# NOTE: this rebinds `data`, which earlier held the California housing dataset.
data = pd.read_csv('./data/Red.csv')

In [26]:
# Preview the first rows of the training data.
data.head()

Unnamed: 0,Name,Country,Region,Winery,Rating,NumberOfRatings,Price,Year
0,Pomerol 2011,France,Pomerol,Château La Providence,4.2,100,95.0,2011
1,Lirac 2017,France,Lirac,Château Mont-Redon,4.3,100,15.5,2017
2,Erta e China Rosso di Toscana 2015,Italy,Toscana,Renzo Masi,3.9,100,7.45,2015
3,Bardolino 2019,Italy,Bardolino,Cavalchina,3.5,100,8.72,2019
4,Ried Scheibner Pinot Noir 2016,Austria,Carnuntum,Markowitsch,3.9,100,29.15,2016


In [27]:
# Training inputs (in the column order the task prescribes) and the target.
# NOTE: rebinds X_train / y_train from the California housing split.
X_train, y_train = data[['Region', 'Price', 'Country']], data['Rating']

In [28]:
from sklearn.preprocessing import OrdinalEncoder

# Preprocessing in the order the task requires:
# 1) ordinal-encode 'Region', 2) standardise 'Price', 3) one-hot 'Country'.
ct_prac = make_column_transformer(
    (OrdinalEncoder(), ['Region']),
    (StandardScaler(), ['Price']),
    (OneHotEncoder(), ['Country']),
)

# Preprocessing followed by a seeded random forest.
pipeline_prac = Pipeline(steps=[
    ('ct_prac', ct_prac),
    ('rfr', RandomForestRegressor(random_state=42)),
])

pipeline_prac.fit(X_train, y_train)



In [29]:
# Test data for the practice tasks.
data_test = pd.read_csv('./data/Red_test.csv')

In [30]:
# Preview the first rows of the test data.
# NOTE(review): the rows displayed for this file repeat rows 1-4 of Red.csv
# shown earlier, so the test set appears to overlap the training data and the
# RMSE measured on it may be optimistic; confirm.
data_test.head()

Unnamed: 0,Name,Country,Region,Winery,Rating,NumberOfRatings,Price,Year
0,Lirac 2017,France,Lirac,Château Mont-Redon,4.3,100,15.5,2017
1,Erta e China Rosso di Toscana 2015,Italy,Toscana,Renzo Masi,3.9,100,7.45,2015
2,Bardolino 2019,Italy,Bardolino,Cavalchina,3.5,100,8.72,2019
3,Ried Scheibner Pinot Noir 2016,Austria,Carnuntum,Markowitsch,3.9,100,29.15,2016
4,Capatosta 2015,Italy,Toscana,Poggio Argentiera,3.8,101,19.9,2015


In [31]:
# Test inputs (same columns and order as in training) and the true ratings.
X_test, y_test = data_test[['Region', 'Price', 'Country']], data_test['Rating']

In [32]:
# Predict ratings for the test rows with the fitted practice pipeline.
y_pred = pipeline_prac.predict(X_test)

In [33]:
# RMSE on the test set, rounded to 4 decimals. The square root is taken
# explicitly because mean_squared_error's `squared=False` flag was deprecated
# in scikit-learn 1.4 and removed in 1.6.
print(f'RMSE : {np.sqrt(mean_squared_error(y_test, y_pred)).round(4)}')

RMSE : 0.0765


In [34]:
# Save the fitted practice pipeline under the file name the task asks for.
joblib.dump(pipeline_prac, 'pipeline_wine.pkl')

['pipeline_wine.pkl']

> ---

Задание 6.2

Теперь попробуем изменить параметры случайного леса в пайплайне, полученном в предыдущем задании.
Измените параметр n_estimators в случайном лесу со значения по умолчанию на 200, используя метод set_params.
В качестве ответа на задание введите в поле ниже полученный результат по метрике RMSE, округленный до четвёртого знака после запятой.

In [35]:
# Set the forest size to 200 trees; 'rfr' is the forest's step name.
pipeline_prac.set_params(rfr__n_estimators=200)

In [36]:
# Train the pipeline again after the n_estimators change.
pipeline_prac.fit(X_train, y_train)

In [37]:
# Re-score on the test set with the retrained pipeline (RMSE, 4 decimals).
# np.sqrt replaces `squared=False`, which was deprecated in scikit-learn 1.4
# and removed in 1.6.
y_pred = pipeline_prac.predict(X_test)
print(np.sqrt(mean_squared_error(y_test, y_pred)).round(4))

0.0761


>____

Задание 6.3

Теперь попробуем добавить стекинг в качестве модели в пайплайн.
Вам следует выполнить следующее:
• Собрать StackingRegressor:
1. В качестве базовых моделей возьмите ридж-регрессию RidgeCV() и решающее дерево.
2. В качестве метамодели возьмите случайный лес с настройками (количество базовых моделей 10).
3. Все базовые модели стекинга должны быть с настройками по умолчанию (кроме random_state).
• Зафиксировать random_state=42 (для всех моделей).
• Заменить в пайплайне задачи 6.1 случайный лес на StackingRegressor.
• Обучить модель на тренировочной выборке.
В качестве ответа на задание введите в поле ниже полученный результат по метрике RMSE, округлённый до второго знака после запятой.

In [42]:
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import RidgeCV
from sklearn.tree import DecisionTreeRegressor

# Base models of the stack: ridge regression and a seeded decision tree,
# both otherwise left at their default settings.
estimators = [
    ('lr', RidgeCV()),
    ('dt', DecisionTreeRegressor(random_state=42)),
]

# Meta-model: a seeded random forest of 10 trees.
reg = StackingRegressor(
    estimators=estimators,
    final_estimator=RandomForestRegressor(n_estimators=10, random_state=42),
)


In [43]:
# Reload both files so this task does not depend on variables from earlier cells.
data_train = pd.read_csv('./data/Red.csv')
data_test = pd.read_csv('./data/Red_test.csv')

# Same three input columns and target as in task 6.1.
X_train, y_train = data_train[['Region', 'Price', 'Country']], data_train['Rating']
X_test, y_test = data_test[['Region', 'Price', 'Country']], data_test['Rating']

In [44]:
# Same preprocessing as in task 6.1: ordinal 'Region', scaled 'Price',
# one-hot 'Country'. NOTE: rebinds `ct` from Part 2.
ct = make_column_transformer(
    (OrdinalEncoder(), ['Region']),
    (StandardScaler(), ['Price']),
    (OneHotEncoder(), ['Country']),
)

In [45]:
# Task 6.1 pipeline with the random forest replaced by the stacking regressor.
pipeline = Pipeline([
    ('ct', ct),
    ('reg', reg)
])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# RMSE rounded to 2 decimals. np.sqrt replaces `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in 1.6.
print(np.sqrt(mean_squared_error(y_test, y_pred)).round(2))

0.18
