# Sklearn pipeline

Zbiór danych: https://www.kaggle.com/datasets/mrsimple07/restaurants-revenue-prediction/data

In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error

In [2]:
# Run this cell when the notebook is launched from the solutions folder
# while the dataset lives in the sibling `data` folder one level up.
import os

# Move up one directory only when the dataset is not reachable from the
# current working directory but is reachable from the parent. The guard makes
# the cell idempotent: re-running it does not keep climbing the directory tree.
if (not os.path.exists('data/Restaurant_revenue.csv')
        and os.path.exists('../data/Restaurant_revenue.csv')):
    os.chdir('../')

In [3]:
# Load the dataset; the path is relative to the current working directory.
df = pd.read_csv('data/Restaurant_revenue.csv')

In [None]:
# Preview the first rows of the loaded DataFrame.
df.head()

In [None]:
# Print the DataFrame summary produced by DataFrame.info().
df.info()

In [None]:
# Correlation matrix computed over every column that is not of object dtype.
(
    df
    .select_dtypes(exclude='object')
    .corr()
)

In [None]:
# Categorical features: names of all object-dtype columns.
cat_features = (
    df
    .select_dtypes(include='object')
    .columns
)
cat_features

In [None]:
# Numeric features: all non-object columns except the prediction target.
# The target is removed by name rather than by position, so the selection does
# not depend on column order; a missing target column raises a KeyError.
num_features = (
    df
    .select_dtypes(exclude='object')
    .columns
    .drop('Monthly_Revenue')
)
num_features

In [8]:
# Train/test split: features are every column except the target
# `Monthly_Revenue`, which is the value to predict.
# A fixed random_state makes the split (and every metric computed below)
# reproducible across kernel restarts.
train_x, test_x, train_y, test_y = train_test_split(
    df.drop(columns='Monthly_Revenue'),
    df['Monthly_Revenue'],
    random_state=42,
)

In [9]:
# Baseline pipeline for the numeric features only:
# min-max scaling followed by a 10-neighbour KNN regressor.
pipe = Pipeline(
    steps=[
        ('scaler', MinMaxScaler()),
        ('knn', KNeighborsRegressor(n_neighbors=10)),
    ]
)

In [None]:
# Fit the baseline pipeline on the numeric training columns.
pipe.fit(X=train_x[num_features], y=train_y)

In [None]:
# Score the baseline pipeline on the held-out test set
# (uses the final estimator's default scoring method).
pipe.score(X=test_x[num_features], y=test_y)

In [12]:
# Predictions of the baseline pipeline for the test set.
test_pred = pipe.predict(X=test_x[num_features])

In [None]:
# Mean absolute percentage error of the baseline predictions on the test set.
mean_absolute_percentage_error(y_true=test_y, y_pred=test_pred)

In [14]:
# Combine per-column-group transformations in a single ColumnTransformer:
# numeric columns are min-max scaled, categorical columns are one-hot encoded.
# handle_unknown='ignore' keeps transform() from raising when the test split
# contains a category that was absent from the training split.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), num_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features),
    ]
)

In [15]:
# Full pipeline: column preprocessing followed by a 10-neighbour KNN regressor.
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ("regressor", KNeighborsRegressor(n_neighbors=10)),
    ]
)



In [None]:
# Train the full pipeline on all training columns; the preprocessor selects
# the numeric and categorical columns it needs.
model.fit(X=train_x, y=train_y)

In [None]:
# Score the full pipeline on the held-out test set
# (uses the final estimator's default scoring method).
model.score(X=test_x, y=test_y)