# Example: Pipelines usage

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
# Training data; its 'SalePrice' column is split off as the target later in the notebook.
train = pd.read_csv('data/house_train.csv')
# Test data; used only for the final predictions at the end of the notebook.
X_test = pd.read_csv('data/house_test.csv')

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target column from the predictor columns.
y = train["SalePrice"]
X = train.drop(columns="SalePrice")

# Hold out 30% of the rows for validation; random_state pins the shuffle.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1121218,
)

In [None]:
# Summary statistics for the numerical columns, transposed so each column is a row;
# only the first 10 rows of that table are displayed.
X_train.describe().T.iloc[:10]

In [None]:
# Summary statistics for the object-dtype columns, transposed so each column is a row;
# only the first 10 rows of that table are displayed.
X_train.describe(include="object").T.iloc[:10]

In [None]:
# Boolean mask indexed by column name: True where the column has at least one missing value.
above_0_missing = X_train.isnull().sum() > 0

In [None]:
# Missing-value counts, restricted to the columns that actually have missing values.
X_train.isnull().sum()[above_0_missing]

In [None]:
# Names of the numeric-dtype columns, as a plain Python list.
numerical_features = X_train.select_dtypes(include='number').columns.tolist()

In [None]:
# The extra '\n' argument makes print emit a blank line after the message.
print(f'There are {len(numerical_features)} numerical features:', '\n')

In [None]:
# Show the collected numeric column names.
print(numerical_features)

In [None]:
# Every column that is not numeric is treated as categorical.
categorical_features = X_train.select_dtypes(exclude='number').columns.tolist()

In [None]:
# The extra '\n' argument makes print emit a blank line after the message.
print(f'There are {len(categorical_features)} categorical features:', '\n')

In [None]:
# Show the collected non-numeric column names.
print(categorical_features)

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline

In [None]:
# Numeric columns: fill missing values with the column mean, then min-max scale.
numeric_pipeline = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="mean")),
        ("scale", MinMaxScaler()),
    ]
)

# Categorical columns: fill missing values with the most frequent category, then
# one-hot encode into a dense array, ignoring categories not seen during fitting.
categorical_pipeline = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="most_frequent")),
        ("one-hot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)

In [None]:
from sklearn.compose import ColumnTransformer

In [None]:
# Route each column group through its own pipeline: the numeric columns go through
# numeric_pipeline and the non-numeric columns through categorical_pipeline.
full_processor = ColumnTransformer(transformers=[
    ('number', numeric_pipeline, numerical_features),
    ('category', categorical_pipeline, categorical_features)
])

In [None]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_absolute_error

In [None]:
# Baseline model: a Lasso regressor behind the shared preprocessing step.
lasso = Lasso(alpha=0.1)
lasso_pipeline = Pipeline(
    steps=[
        ("preprocess", full_processor),
        ("model", lasso),
    ]
)

# Pipeline.fit returns the pipeline itself, so fitting on the training split and
# predicting on the validation split can be chained.
preds = lasso_pipeline.fit(X_train, y_train).predict(X_valid)

In [None]:
# Mean absolute error of the validation predictions.
mean_absolute_error(y_valid, preds)

In [None]:
# Default score of the fitted pipeline on the validation split
# (presumably R^2, the default for scikit-learn regressors — confirm against the sklearn docs).
lasso_pipeline.score(X_valid, y_valid)

In [None]:
def run_training(data, model):
    """Fit ``model`` behind the preprocessing pipeline and print validation metrics.

    The frame is split into training and validation parts (30% validation,
    fixed ``random_state``). Numeric columns are mean-imputed and min-max
    scaled; all other columns are imputed with the most frequent value and
    one-hot encoded. ``model`` is the final step of the resulting pipeline.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``'SalePrice'`` column, which is used as the target.
    model : estimator
        Estimator placed as the ``'model'`` step of the pipeline.

    Returns
    -------
    None
        The mean absolute error and the pipeline's ``score`` on the
        validation part are printed.
    """
    X = data.drop('SalePrice', axis=1)
    y = data["SalePrice"]
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=.3, random_state=1121218)

    # Column groups are derived from the training part only.
    numerical_features = X_train.select_dtypes(include='number').columns.tolist()
    categorical_features = X_train.select_dtypes(exclude='number').columns.tolist()

    # Per-dtype preprocessing pipelines.
    numeric_pipeline = Pipeline(steps=[
        ('impute', SimpleImputer(strategy='mean')),
        ('scale', MinMaxScaler())
    ])
    categorical_pipeline = Pipeline(steps=[
        ('impute', SimpleImputer(strategy='most_frequent')),
        ('one-hot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])
    full_processor = ColumnTransformer(transformers=[
        ('number', numeric_pipeline, numerical_features),
        ('category', categorical_pipeline, categorical_features)
    ])

    full_pipeline = Pipeline(steps=[
        ('preprocess', full_processor),
        ('model', model)
    ])

    full_pipeline.fit(X_train, y_train)
    preds = full_pipeline.predict(X_valid)

    print(f"Mean absolute error: {mean_absolute_error(y_valid, preds)}")
    # Score the pipeline fitted above, not the notebook-level `lasso_pipeline`:
    # the global ignores `model` and may not exist on a fresh kernel.
    print(f"Score: {full_pipeline.score(X_valid, y_valid)}")

In [None]:
# Re-read the training data and run the full split/fit/report routine with a fresh Lasso.
train = pd.read_csv('data/house_train.csv')
lasso = Lasso(alpha=0.1)
run_training(train, lasso)

Exercise: add a feature selection step to the pipeline.

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate alphas from 0.01 upwards in steps of 0.05, stopping below 1.
# The 'model__' prefix targets the 'model' step of lasso_pipeline.
param_dict = {'model__alpha': np.arange(0.01, 1, 0.05)}

# 10-fold cross-validated grid search, scored by negated mean absolute error.
search = GridSearchCV(lasso_pipeline, param_dict, 
                      cv=10, 
                      scoring='neg_mean_absolute_error')

search.fit(X_train, y_train)

In [None]:
# The search was scored with 'neg_mean_absolute_error', so best_score_ is a negated MAE;
# abs() turns it back into a plain MAE for display.
print('Best score:', abs(search.best_score_))
print('Best alpha:', search.best_params_)

In [None]:
# Second search over a much larger alpha range: 1, 6, 11, ... (step 5, stopping below 100).
param_dict = {'model__alpha': np.arange(1, 100, 5)}

# Same scoring as before, but with 5 cross-validation folds.
search = GridSearchCV(lasso_pipeline, param_dict, 
                      cv=5, 
                      scoring='neg_mean_absolute_error')

search.fit(X_train, y_train)

# best_score_ is a negated MAE; abs() shows it as a plain MAE.
print('Best score:', abs(search.best_score_))
print('Best alpha:', search.best_params_)

In [None]:
def run_training_gridcv(data, model, params):
    """Grid-search ``params`` for ``model`` behind preprocessing and feature selection.

    The frame is split into training and validation parts (30% validation,
    fixed ``random_state``); only the training part is used for the search.
    The searched pipeline imputes/scales numeric columns, imputes/one-hot
    encodes the other columns, keeps the top 80% of the resulting features
    ranked by ``f_regression``, and ends with ``model``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``'SalePrice'`` column, which is used as the target.
    model : estimator
        Estimator placed as the ``'model'`` step of the pipeline.
    params : dict
        Parameter grid passed to ``GridSearchCV``; keys address pipeline
        steps (e.g. ``'model__alpha'``).

    Returns
    -------
    GridSearchCV
        The fitted search object. The best score (as a positive MAE) and the
        best parameters are also printed.
    """
    # Local import: no import cell in the notebook brings these two names into
    # scope, so the function raised NameError without it.
    from sklearn.feature_selection import SelectPercentile, f_regression

    X = data.drop('SalePrice', axis=1)
    y = data["SalePrice"]
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=.3, random_state=1121218)

    # Column groups are derived from the training part only.
    numerical_features = X_train.select_dtypes(include='number').columns.tolist()
    categorical_features = X_train.select_dtypes(exclude='number').columns.tolist()

    # Per-dtype preprocessing pipelines.
    numeric_pipeline = Pipeline(steps=[
        ('impute', SimpleImputer(strategy='mean')),
        ('scale', MinMaxScaler())
    ])
    categorical_pipeline = Pipeline(steps=[
        ('impute', SimpleImputer(strategy='most_frequent')),
        ('one-hot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])
    full_processor = ColumnTransformer(transformers=[
        ('number', numeric_pipeline, numerical_features),
        ('category', categorical_pipeline, categorical_features)
    ])

    full_pipeline = Pipeline(steps=[
        ('preprocess', full_processor),
        ("select", SelectPercentile(score_func=f_regression, percentile=80)),
        ('model', model)
    ])

    search = GridSearchCV(full_pipeline, params,
                          cv=5,
                          scoring='neg_mean_absolute_error')

    search.fit(X_train, y_train)

    # best_score_ is a negated MAE under this scoring; abs() shows it as a plain MAE.
    print('Best score:', abs(search.best_score_))
    print('Best params:', search.best_params_)
    return search
    
# Re-read the training data and search alphas 1, 11, 21, ... (step 10, stopping below 300).
train = pd.read_csv('data/house_train.csv')
param_dict = {'model__alpha': np.arange(1, 300, 10)}
lasso = Lasso(alpha=0.1)
# Last expression of the cell, so the returned search object is also displayed as output.
run_training_gridcv(train, lasso, param_dict)

In [None]:
# Final model with a hard-coded alpha.
# NOTE(review): 151 is only on the np.arange(1, 300, 10) grid, which was searched inside
# run_training_gridcv with a "select" feature-selection step; this pipeline has no such
# step — confirm whether it should be added here.
lasso = Lasso(alpha=151)

# Reuses the notebook-level full_processor as the preprocessing step.
final_lasso_pipe = Pipeline(steps=[
    ('preprocess', full_processor),
    ('model', lasso)
])

final_lasso_pipe.fit(X_train, y_train)
preds = final_lasso_pipe.predict(X_valid)

# Mean absolute error of the final model on the validation split.
mean_absolute_error(y_valid, preds)

In [None]:
# Predict sale prices for the test data with the final pipeline.
preds_final = final_lasso_pipe.predict(X_test)

# Pair each test-set Id with its predicted SalePrice; only the first rows are displayed
# and the frame is not written to disk in this cell.
output = pd.DataFrame({'Id': X_test["Id"], 'SalePrice': preds_final})
output.head()