# Example: Pipelines usage

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
train = pd.read_csv('data/house_train.csv')
X_test = pd.read_csv('data/house_test.csv')

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
train

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [5]:
from sklearn.model_selection import train_test_split

X = train.drop('SalePrice', axis=1)
y = train["SalePrice"]

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=.3, random_state=1121218)

In [6]:
X_train.describe().T.iloc[:10] # All numerical cols

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Id,1022.0,728.62818,417.491868,1.0,374.5,734.5,1082.0,1459.0
MSSubClass,1022.0,57.030333,42.86121,20.0,20.0,50.0,70.0,190.0
LotFrontage,838.0,70.190931,24.110495,21.0,60.0,70.0,80.0,313.0
LotArea,1022.0,10472.601761,8782.768055,1491.0,7560.0,9571.0,11742.5,164660.0
OverallQual,1022.0,6.071429,1.374094,1.0,5.0,6.0,7.0,10.0
OverallCond,1022.0,5.578278,1.101703,1.0,5.0,5.0,6.0,9.0
YearBuilt,1022.0,1971.221135,29.863975,1875.0,1954.0,1973.0,2000.0,2009.0
YearRemodAdd,1022.0,1984.813112,20.67152,1950.0,1966.0,1994.0,2003.75,2010.0
MasVnrArea,1015.0,101.768473,180.299391,0.0,0.0,0.0,160.0,1600.0
BsmtFinSF1,1022.0,441.294521,438.43075,0.0,0.0,381.0,707.5,2260.0


In [7]:
X_train.describe(include="object").T.iloc[:10] # All object cols

Unnamed: 0,count,unique,top,freq
MSZoning,1022,5,RL,809
Street,1022,2,Pave,1017
Alley,67,2,Grvl,37
LotShape,1022,4,Reg,654
LandContour,1022,4,Lvl,920
Utilities,1022,2,AllPub,1021
LotConfig,1022,5,Inside,733
LandSlope,1022,3,Gtl,966
Neighborhood,1022,25,NAmes,156
Condition1,1022,9,Norm,881


In [8]:
above_0_missing = X_train.isnull().sum() > 0

In [9]:
X_train.isnull().sum()[above_0_missing]

LotFrontage      184
Alley            955
MasVnrType       622
MasVnrArea         7
BsmtQual          30
BsmtCond          30
BsmtExposure      31
BsmtFinType1      30
BsmtFinType2      31
Electrical         1
FireplaceQu      480
GarageType        58
GarageYrBlt       58
GarageFinish      58
GarageQual        58
GarageCond        58
PoolQC          1018
Fence            821
MiscFeature      988
dtype: int64

In [11]:
numerical_features = X_train.select_dtypes(include='number').columns.tolist()

In [12]:
print(f'There are {len(numerical_features)} numerical features:', '\n')

There are 37 numerical features: 



In [13]:
print(numerical_features)

['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']


In [14]:
categorical_features = X_train.select_dtypes(exclude='number').columns.tolist()

In [15]:
print(f'There are {len(categorical_features)} categorical features:', '\n')

There are 43 categorical features: 



In [16]:
print(categorical_features)

['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']


In [17]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline

In [19]:
numeric_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', MinMaxScaler())
])

categorical_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('one-hot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

In [20]:
from sklearn.compose import ColumnTransformer

In [21]:
full_processor = ColumnTransformer(transformers=[
    ('number', numeric_pipeline, numerical_features),
    ('category', categorical_pipeline, categorical_features)
])

In [22]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_absolute_error

In [23]:
lasso = Lasso(alpha=0.1)

lasso_pipeline = Pipeline(steps=[
    ('preprocess', full_processor),
    ('model', lasso)
])

lasso_pipeline.fit(X_train, y_train)

preds = lasso_pipeline.predict(X_valid)

  model = cd_fast.enet_coordinate_descent(


In [24]:
mean_absolute_error(y_valid, preds)

19827.45940110741

In [26]:
lasso_pipeline.score(X_valid, y_valid)

0.7069796202127814

In [27]:
def run_training(data, model):
    X = data.drop('SalePrice', axis=1)
    y = data["SalePrice"]
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=.3, random_state=1121218)
    
    numerical_features = X_train.select_dtypes(include='number').columns.tolist()
    categorical_features = X_train.select_dtypes(exclude='number').columns.tolist()
    
    # pipeline
    numeric_pipeline = Pipeline(steps=[
        ('impute', SimpleImputer(strategy='mean')),
        ('scale', MinMaxScaler())
    ])
    categorical_pipeline = Pipeline(steps=[
        ('impute', SimpleImputer(strategy='most_frequent')),
        ('one-hot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])  
    full_processor = ColumnTransformer(transformers=[
        ('number', numeric_pipeline, numerical_features),
        ('category', categorical_pipeline, categorical_features)
    ])
    
    full_pipeline = Pipeline(steps=[
        ('preprocess', full_processor),
        ('model', model)
    ])
    
    full_pipeline.fit(X_train, y_train)
    preds = full_pipeline.predict(X_valid)
    
    print(f"Mean absolute error: {mean_absolute_error(y_valid, preds)}")
    print(f"Score: {lasso_pipeline.score(X_valid, y_valid)}")

In [28]:
train = pd.read_csv('data/house_train.csv')
lasso = Lasso(alpha=0.1)
run_training(train, lasso)

Mean absolute error: 19827.45940110741
Score: 0.7069796202127814


  model = cd_fast.enet_coordinate_descent(


Naloga: Dodajte feature selection v pipeline.

In [33]:
from sklearn.feature_selection import SelectPercentile, f_regression

def run_training(data, model):
    X = data.drop('SalePrice', axis=1)
    y = data["SalePrice"]
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=.3, random_state=1121218)
    
    numerical_features = X_train.select_dtypes(include='number').columns.tolist()
    categorical_features = X_train.select_dtypes(exclude='number').columns.tolist()
    
    # pipeline
    numeric_pipeline = Pipeline(steps=[
        ('impute', SimpleImputer(strategy='mean')),
        ('scale', MinMaxScaler())
    ])
    categorical_pipeline = Pipeline(steps=[
        ('impute', SimpleImputer(strategy='most_frequent')),
        ('one-hot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])  
    full_processor = ColumnTransformer(transformers=[
        ('number', numeric_pipeline, numerical_features),
        ('category', categorical_pipeline, categorical_features)
    ])
    
    full_pipeline = Pipeline(steps=[
        ('preprocess', full_processor),
        ("select",SelectPercentile(score_func=f_regression, percentile=80)),
        ('model', model)
    ])
    
    full_pipeline.fit(X_train, y_train)
    preds = full_pipeline.predict(X_valid)
    
    
    print(f"Mean absolute error: {mean_absolute_error(y_valid, preds)}")
    print(f"Score: {full_pipeline.score(X_valid, y_valid)}")
    return full_pipeline
    
train = pd.read_csv('data/house_train.csv')
lasso = Lasso(alpha=0.1)
run_training(train, lasso)

Mean absolute error: 19336.801941618458
Score: 0.7873358035432518


  model = cd_fast.enet_coordinate_descent(


In [34]:
from sklearn.model_selection import GridSearchCV

param_dict = {'model__alpha': np.arange(0.01, 1, 0.05)}

#print(param_dict)

search = GridSearchCV(lasso_pipeline, param_dict, 
                      cv=10, 
                      scoring='neg_mean_absolute_error',
                     n_jobs=-1)

search.fit(X_train, y_train)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

In [35]:
print('Best score:', abs(search.best_score_))
print('Best alpha:', search.best_params_)

Best score: 18196.268521979895
Best alpha: {'model__alpha': 0.9600000000000001}


In [36]:
param_dict = {'model__alpha': np.arange(1, 100, 5)}

search = GridSearchCV(lasso_pipeline, param_dict, 
                      cv=5, 
                      scoring='neg_mean_absolute_error',
                     n_jobs=-1)

search.fit(X_train, y_train)

print('Best score:', abs(search.best_score_))
print('Best alpha:', search.best_params_)

Best score: 16440.07047519717
Best alpha: {'model__alpha': 86}


In [37]:
def run_training_gridcv(data, model, params):
    X = data.drop('SalePrice', axis=1)
    y = data["SalePrice"]
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=.3, random_state=1121218)
    
    numerical_features = X_train.select_dtypes(include='number').columns.tolist()
    categorical_features = X_train.select_dtypes(exclude='number').columns.tolist()
    
    # pipeline
    numeric_pipeline = Pipeline(steps=[
        ('impute', SimpleImputer(strategy='mean')),
        ('scale', MinMaxScaler())
    ])
    categorical_pipeline = Pipeline(steps=[
        ('impute', SimpleImputer(strategy='most_frequent')),
        ('one-hot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])  
    full_processor = ColumnTransformer(transformers=[
        ('number', numeric_pipeline, numerical_features),
        ('category', categorical_pipeline, categorical_features)
    ])
    
    full_pipeline = Pipeline(steps=[
        ('preprocess', full_processor),
        ("select", SelectPercentile(score_func=f_regression, percentile=80)),
        ('model', model)
    ])
    
    
    search = GridSearchCV(full_pipeline, params, 
                      cv=5, 
                      scoring='neg_mean_absolute_error',
                      n_jobs=-1)
    
    search.fit(X_train, y_train)

    print('Best score:', abs(search.best_score_))
    print('Best params:', search.best_params_)
    return search
    
train = pd.read_csv('data/house_train.csv')
param_dict = {'model__alpha': np.arange(1, 300, 10)}
lasso = Lasso(alpha=0.1)
run_training_gridcv(train, lasso, param_dict)

Best score: 16926.537259786382
Best params: {'model__alpha': 141}


In [43]:
lasso = Lasso(alpha=141)

final_lasso_pipe = Pipeline(steps=[
    ('preprocess', full_processor),
    ('model', lasso)
])

final_lasso_pipe.fit(X_train, y_train)
preds = final_lasso_pipe.predict(X_valid)

mean_absolute_error(y_valid, preds)

17883.839490493858

In [44]:
preds_final = final_lasso_pipe.predict(X_test)

output = pd.DataFrame({'Id': X_test["Id"], 'SalePrice': preds_final})
output.head()

Unnamed: 0,Id,SalePrice
0,1461,108251.344922
1,1462,161502.3646
2,1463,183976.991508
3,1464,194509.651727
4,1465,200725.58286
