# ML-8. Pipelines

In [73]:
#загрузим основные библиотеки
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler


from sklearn.datasets import make_regression
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score


Попробуем предсказать цену на недвижимость в Калифорнии. Но сначала, для разминки, сгенерируем небольшой синтетический набор данных для регрессии.

In [74]:
# Build a tiny synthetic regression set (10 samples, 3 features) and show it as a table.
# No random_state is passed, so the generated values differ between runs.
arr, y = make_regression(n_samples=10, n_features=3)
df = pd.DataFrame(arr).assign(y=y)
df

Unnamed: 0,0,1,2,y
0,0.593478,-0.13109,1.765909,192.763046
1,0.011679,0.610439,-0.705852,-41.477912
2,0.674242,-1.398995,0.624257,52.903425
3,0.974498,-1.060305,0.601908,82.414133
4,0.652949,0.24443,0.015478,53.123655
5,1.018619,-1.3229,0.412113,59.43716
6,-0.05136,-0.133197,0.252716,14.623997
7,-0.504775,1.013033,0.462104,42.267096
8,1.402379,-0.127987,0.818646,162.009098
9,0.136605,-0.291947,-1.943866,-174.54993


## Часть 1. Простейшие пайплайны


Загрузим данные

In [75]:
# Load the California housing bundle; its 'data', 'feature_names' and 'target'
# entries are used to build the frame in the next cell.
data = fetch_california_housing()

In [89]:
# Put the feature matrix and the target into one frame (rebinds `df`),
# then summarise every column.
df = pd.DataFrame(data['data'], columns=data['feature_names']).assign(target=data['target'])
df.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704,2.068558
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532,1.153956
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35,0.14999
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8,1.196
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49,1.797
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01,2.64725
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31,5.00001


In [77]:
def rmse(y_hat, y):
    """Return the root mean squared error between two sets of values.

    Parameters
    ----------
    y_hat, y : array-like
        The two collections to compare. Both are forwarded unchanged to
        ``mean_squared_error``; the squared error is symmetric, so the order
        of the two arguments does not change the result.

    Returns
    -------
    float
        Square root of the mean squared error. Intended for single-target
        data, which is how it is used in this notebook.
    """
    # Take the root explicitly instead of passing ``squared=False``: that keyword
    # is deprecated in recent scikit-learn releases and removed in later ones,
    # whereas this form works on every version.
    return np.sqrt(mean_squared_error(y_hat, y))

In [78]:
# Separate the features from the target and hold out a test split
# (the split itself is seeded, so it is repeatable).
X = df.drop(columns='target')
Y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=42)


In [79]:
# Report the (rows, columns) shape of both splits, one per line.
print(
    f'Размер обучающей выборки {X_train.shape}',
    f'Размер тестовой выборки {X_test.shape}',
    sep='\n',
)

Размер обучающей выборки (15480, 8)
Размер тестовой выборки (5160, 8)


In [80]:
# Chain standardisation and a random forest so both are fitted together on the training split.
# NOTE(review): the forest gets no random_state, so the metrics below vary between runs.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('rf', RandomForestRegressor()),
])
pipeline.fit(X_train, y_train)


In [81]:
# Score the fitted pipeline on the held-out split: R^2 and root mean squared error,
# each rounded to 4 digits.
# NOTE(review): the second label reads 'RSME'; presumably RMSE is meant.
y_pred = pipeline.predict(X_test)
print(f'Качество по метрике R2: { round(r2_score(y_test, y_pred),4)}')
print(f'Качество по RSME: {round(rmse(y_test, y_pred),4)}')

Качество по метрике R2: 0.8072
Качество по RSME: 0.5051


In [82]:
# List every parameter of the pipeline; parameters of a step appear as '<step>__<param>'.
pipeline.get_params()

{'memory': None,
 'steps': [('scaler', StandardScaler()), ('rf', RandomForestRegressor())],
 'verbose': False,
 'scaler': StandardScaler(),
 'rf': RandomForestRegressor(),
 'scaler__copy': True,
 'scaler__with_mean': True,
 'scaler__with_std': True,
 'rf__bootstrap': True,
 'rf__ccp_alpha': 0.0,
 'rf__criterion': 'squared_error',
 'rf__max_depth': None,
 'rf__max_features': 1.0,
 'rf__max_leaf_nodes': None,
 'rf__max_samples': None,
 'rf__min_impurity_decrease': 0.0,
 'rf__min_samples_leaf': 1,
 'rf__min_samples_split': 2,
 'rf__min_weight_fraction_leaf': 0.0,
 'rf__n_estimators': 100,
 'rf__n_jobs': None,
 'rf__oob_score': False,
 'rf__random_state': None,
 'rf__verbose': 0,
 'rf__warm_start': False}

In [83]:
# A step can be fetched by position or by name; both lines read the same forest setting.
print(pipeline[1].n_estimators)
print(pipeline['rf'].n_estimators)

100
100


In [84]:
# Change a parameter of the 'rf' step through the pipeline, using the '<step>__<param>' name.
pipeline.set_params(rf__n_estimators=200)

In [85]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid, addressed with the pipeline's '<step>__<param>' names.
param_grid = {'scaler__with_mean':[True,False],
              'rf__n_estimators':[100, 200, 500]}
# 6 candidates x 5 folds means 30 forest fits (up to 500 trees each). The fits are
# independent of each other, so spread them over all CPU cores instead of running
# them one after another.
grid_search = GridSearchCV(pipeline, param_grid=param_grid, n_jobs=-1, verbose = True)


In [86]:
# Run the grid search on the training split and show the best pipeline it found.
# NOTE(review): the saved output of this cell ends in KeyboardInterrupt; the cell below
# reads best_estimator_, so presumably this fit has to run to completion first.
grid_search.fit(X_train, y_train)
print(grid_search.best_estimator_)


Fitting 5 folds for each of 6 candidates, totalling 30 fits


KeyboardInterrupt: 

In [None]:
# Score the best pipeline from the grid search on the held-out split (R^2 and RMSE, 4 digits).
# NOTE(review): the second label reads 'RSME'; presumably RMSE is meant.
y_pred = grid_search.best_estimator_.predict(X_test)
print(f'Качество по метрике R2: { round(r2_score(y_test, y_pred),4)}')
print(f'Качество по RSME: {round(rmse(y_test, y_pred),4)}')

Качество по метрике R2: 0.81
Качество по RSME: 0.5015


## Часть 2. Предобработка в пайплайнах

In [None]:
# Load the red-wine table from a CSV path relative to the notebook's working directory.
df_wine = pd.read_csv('../data/Red.csv')

In [None]:
# Peek at the first rows to see which columns are available.
df_wine.head()

Unnamed: 0,Name,Country,Region,Winery,Rating,NumberOfRatings,Price,Year
0,Pomerol 2011,France,Pomerol,Château La Providence,4.2,100,95.0,2011
1,Lirac 2017,France,Lirac,Château Mont-Redon,4.3,100,15.5,2017
2,Erta e China Rosso di Toscana 2015,Italy,Toscana,Renzo Masi,3.9,100,7.45,2015
3,Bardolino 2019,Italy,Bardolino,Cavalchina,3.5,100,8.72,2019
4,Ried Scheibner Pinot Noir 2016,Austria,Carnuntum,Markowitsch,3.9,100,29.15,2016


In [None]:
# Column dtypes and non-null counts. In the saved output Year is reported as object, not numeric.
df_wine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8666 entries, 0 to 8665
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             8666 non-null   object 
 1   Country          8666 non-null   object 
 2   Region           8666 non-null   object 
 3   Winery           8666 non-null   object 
 4   Rating           8666 non-null   float64
 5   NumberOfRatings  8666 non-null   int64  
 6   Price            8666 non-null   float64
 7   Year             8666 non-null   object 
dtypes: float64(2), int64(1), object(5)
memory usage: 541.8+ KB


In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer

# Per-column preprocessing: scale the numeric price, one-hot encode the country.
ct = make_column_transformer(
    (StandardScaler(), ['Price']),
    (OneHotEncoder(), ['Country']),
)
print(ct)


ColumnTransformer(transformers=[('standardscaler', StandardScaler(), ['Price']),
                                ('onehotencoder', OneHotEncoder(),
                                 ['Country'])])


### Применение преобразований ко всем столбцам какого-то типа

In [None]:
from sklearn.compose import make_column_selector
import numpy as np

# Select columns by dtype instead of by name: every numeric column is scaled,
# every object column is one-hot encoded.
# NOTE(review): ct_filtered is not referenced by any later cell visible in this notebook.
ct_filtered = make_column_transformer(
       (StandardScaler(), make_column_selector(dtype_include=np.number)),
       (OneHotEncoder(), make_column_selector(dtype_include=object))
)

In [None]:
# Rebind `pipeline`: the column transformer `ct` followed by a random forest
# (no random_state is set on the forest).
pipeline = Pipeline([('ct', ct), ('rf', RandomForestRegressor())])

In [None]:
# Limit the depth of the trees in the 'rf' step.
pipeline.set_params(rf__max_depth = 3)

In [None]:
# Features: the two columns that `ct` transforms; target: the wine rating.
# This rebinds `X` (and introduces `y`) for the wine task.
X = df_wine[['Country', 'Price']]
y = df_wine['Rating']

In [None]:
# Fit the preprocessing and the forest on the full wine table (no hold-out split here).
pipeline.fit(X, y)

In [None]:
# transformers_[1] is the fitted one-hot-encoder entry of the column transformer (second in `ct`);
# element [1] of that entry is the encoder itself. List the dummy column names it produced.
pipeline['ct'].transformers_[1][1].get_feature_names_out().tolist()

['Country_Argentina',
 'Country_Australia',
 'Country_Austria',
 'Country_Brazil',
 'Country_Bulgaria',
 'Country_Canada',
 'Country_Chile',
 'Country_China',
 'Country_Croatia',
 'Country_France',
 'Country_Georgia',
 'Country_Germany',
 'Country_Greece',
 'Country_Hungary',
 'Country_Israel',
 'Country_Italy',
 'Country_Lebanon',
 'Country_Mexico',
 'Country_Moldova',
 'Country_New Zealand',
 'Country_Portugal',
 'Country_Romania',
 'Country_Slovakia',
 'Country_Slovenia',
 'Country_South Africa',
 'Country_Spain',
 'Country_Switzerland',
 'Country_Turkey',
 'Country_United States',
 'Country_Uruguay']

In [None]:
# Show the transformed feature matrix as a labelled frame: the scaled price first,
# followed by the one-hot country columns (the transformer output is densified with toarray()).
pd.DataFrame(
    pipeline['ct'].transform(X).toarray(),
    columns=['Price'] + pipeline['ct'].transformers_[1][1].get_feature_names_out().tolist(),
)

Unnamed: 0,Price,Country_Argentina,Country_Australia,Country_Austria,Country_Brazil,Country_Bulgaria,Country_Canada,Country_Chile,Country_China,Country_Croatia,...,Country_Portugal,Country_Romania,Country_Slovakia,Country_Slovenia,Country_South Africa,Country_Spain,Country_Switzerland,Country_Turkey,Country_United States,Country_Uruguay
0,0.657648,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.278402,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.373184,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.358231,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.117684,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8661,-0.266981,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8662,-0.224358,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8663,-0.178910,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8664,-0.387784,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
import joblib
# Persist the fitted pipeline (preprocessing + model) to a file in the working directory.
joblib.dump(pipeline, 'pipeline.pkl')

['pipeline.pkl']

In [None]:
# Read the pipeline back from the file written by the cell above.
pipeline_loaded = joblib.load('pipeline.pkl')


In [None]:
# The restored object prints with the same steps and the max_depth=3 setting.
print(pipeline_loaded)

Pipeline(steps=[('ct',
                 ColumnTransformer(transformers=[('standardscaler',
                                                  StandardScaler(), ['Price']),
                                                 ('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['Country'])])),
                ('rf', RandomForestRegressor(max_depth=3))])


In [None]:
# Parameters of the second step (the random forest) of the restored pipeline.
pipeline_loaded[1].get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': 3,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

***

In [91]:
# Read the wine CSV again and display it.
# NOTE(review): this rebinds `df` (previously the housing frame), while the cells below
# build X from `df_wine`, not from `df` — confirm which frame is intended.
df = pd.read_csv('../data/Red.csv')
df

Unnamed: 0,Name,Country,Region,Winery,Rating,NumberOfRatings,Price,Year
0,Pomerol 2011,France,Pomerol,Château La Providence,4.2,100,95.00,2011
1,Lirac 2017,France,Lirac,Château Mont-Redon,4.3,100,15.50,2017
2,Erta e China Rosso di Toscana 2015,Italy,Toscana,Renzo Masi,3.9,100,7.45,2015
3,Bardolino 2019,Italy,Bardolino,Cavalchina,3.5,100,8.72,2019
4,Ried Scheibner Pinot Noir 2016,Austria,Carnuntum,Markowitsch,3.9,100,29.15,2016
...,...,...,...,...,...,...,...,...
8661,6th Sense Syrah 2016,United States,Lodi,Michael David Winery,3.8,994,16.47,2016
8662,Botrosecco Maremma Toscana 2016,Italy,Maremma Toscana,Le Mortelle,4.0,995,20.09,2016
8663,Haut-Médoc 2010,France,Haut-Médoc,Château Cambon La Pelouse,3.7,996,23.95,2010
8664,Shiraz 2019,Australia,South Eastern Australia,Yellow Tail,3.5,998,6.21,2019


In [126]:
# make_column_transformer is called below, so import it in this cell as well;
# otherwise the cell only works if an earlier cell happened to import it.
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder

# Per-column preprocessing (rebinds `ct`): integer codes for the region,
# scaled price, one-hot encoded country.
ct = make_column_transformer(
    (OrdinalEncoder(), ['Region']),
    (StandardScaler(), ['Price']),
    (OneHotEncoder(), ['Country'])
)
# Preprocessing followed by a seeded random forest.
pl = Pipeline([
    ('ct', ct),
    ('rf', RandomForestRegressor(random_state=42))
])


In [127]:
# Training features (the three columns `ct` transforms) and the rating target.
# NOTE(review): built from `df_wine`, not from the `df` loaded earlier in this section.
X = df_wine[['Country', 'Region','Price']]
y = df_wine['Rating']

In [146]:
# Use 200 trees in the 'rf' step.
pl.set_params(rf__n_estimators = 200)

In [147]:
# Fit the preprocessing and the forest on the full training table.
pl.fit(X, y)

In [178]:
# Persist the fitted three-transformer pipeline; relies on `joblib` imported in an earlier cell.
joblib.dump(pl, 'pipeline_3_trans.pkl')

['pipeline_3_trans.pkl']

In [148]:
# Load the separate test file and rebind X_test / y_test (previously the housing split).
# The column order differs from the training X, but the transformer steps name their columns.
df_test = pd.read_csv('../data/Red_test.csv')
X_test = df_test[['Country','Price','Region']]
y_test = df_test['Rating']

In [149]:
# RMSE of the forest pipeline on the test file, rounded to 4 digits.
print(f"pseudo rmse on test: {round(rmse(y_test, pl.predict(X_test)), 4)}")   

pseudo rmse on test: 0.0761


In [150]:
# Re-create the train and test inputs in one place before the model comparison below.
# NOTE(review): this repeats the definitions from the preceding cells (same columns, same files).
X = df_wine[['Country', 'Region','Price']]
y = df_wine['Rating']
df_test = pd.read_csv('../data/Red_test.csv')
X_test = df_test[['Country','Price','Region']]
y_test = df_test['Rating']

## Stacking model

In [175]:
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import RidgeCV
from sklearn.tree import DecisionTreeRegressor
# Stacked ensemble: a ridge regression and a seeded decision tree are the base estimators,
# and a seeded 100-tree random forest is the final estimator that combines them.
SRmodel = StackingRegressor(
    estimators=[('lr', RidgeCV()),
     ('dt', DecisionTreeRegressor(random_state=42))
     ],
    final_estimator=RandomForestRegressor(n_estimators=100, random_state=42)
)
# Same per-column preprocessing as the forest pipeline, followed by the stacked model.
pl_stacked = Pipeline([
    ('ct', make_column_transformer(
        (OrdinalEncoder(), ['Region']),
        (StandardScaler(), ['Price']),
        (OneHotEncoder(), ['Country'])
    )),
    ('model', SRmodel)  
])

In [176]:
# Fit the preprocessing and the stacked model on the training table.
pl_stacked.fit(X, y)

In [177]:
# RMSE of the stacked pipeline on the test file, rounded to 4 digits.
print(f"pseudo rmse on test: {round(rmse(y_test, pl_stacked.predict(X_test)), 4)}")   

pseudo rmse on test: 0.1762


## Gradient Boosting model

In [172]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting with 300 shallow trees.
# warm_start is deliberately left at its default (off): with it enabled, re-running the
# fit cell would reuse the previously fitted ensemble instead of training from scratch,
# so the result would depend on how many times the cell had been executed.
# random_state is fixed so repeated runs give the same model, matching the seeded
# models used elsewhere in this section.
GBmodel = GradientBoostingRegressor(
    n_estimators=300,
    learning_rate=0.1,
    subsample=1.0,
    min_samples_leaf=3,
    max_depth=3,
    random_state=42,
)
# Per-column preprocessing (ordinal region, scaled price, one-hot country)
# followed by the boosted model.
pl_boost = Pipeline([
    ('ct', make_column_transformer(
        (OrdinalEncoder(), ['Region']),
        (StandardScaler(), ['Price']),
        (OneHotEncoder(), ['Country'])
    )),
    ('model', GBmodel)
])

In [173]:
# Fit the preprocessing and the boosted model on the training table.
pl_boost.fit(X,y)   

In [174]:
# RMSE of the boosting pipeline on the test file, rounded to 4 digits.
print(f"pseudo rmse on test: {round(rmse(y_test, pl_boost.predict(X_test)), 4)}")   

pseudo rmse on test: 0.1658
