In [2]:
import numpy as np 
import pandas as pd 

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from sklearn.svm import LinearSVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor


### Pipeline: chain your data-preprocessing steps with a final estimator (model)

In [3]:
# Read the training set from the working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer="train.csv")

In [5]:
# Show only the numeric columns of the training frame.
df.select_dtypes(include="number")

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,0,61,0,0,0,0,0,2,2008,208500
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,298,0,0,0,0,0,0,5,2007,181500
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,0,42,0,0,0,0,0,9,2008,223500
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,0,35,272,0,0,0,0,2,2006,140000
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,192,84,0,0,0,0,0,12,2008,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,62.0,7917,6,5,1999,2000,0.0,0,...,0,40,0,0,0,0,0,8,2007,175000
1456,1457,20,85.0,13175,6,6,1978,1988,119.0,790,...,349,0,0,0,0,0,0,2,2010,210000
1457,1458,70,66.0,9042,7,9,1941,2006,0.0,275,...,0,60,0,0,0,0,2500,5,2010,266500
1458,1459,20,68.0,9717,5,6,1950,1996,0.0,49,...,366,0,112,0,0,0,0,4,2010,142125


In [7]:
# Feature matrix: every numeric column except the target.
X = df.select_dtypes(include="number").drop(columns="SalePrice")
# Target vector: the sale price.
y = df["SalePrice"]

In [8]:
# Baseline model: impute missing values -> robust scaling -> linear regression.
pipe = make_pipeline(
    SimpleImputer(),
    RobustScaler(),
    LinearRegression(),
)

In [9]:
# Per-fold cross-validation scores of the baseline pipeline.
cross_val_score(estimator=pipe, X=X, y=y)

array([0.84504187, 0.8134021 , 0.82235161, 0.81762266, 0.62915964])

In [12]:
# Average the per-fold scores into a single number.
np.mean(cross_val_score(pipe, X, y))

0.7855155758271254

- In only 5 lines of code we imported our training data, separated the descriptive features from the target variable, and set up a pipeline with an imputer (which fills in missing values), a scaler, and a LinearRegression regressor.

In [14]:
# Split the feature columns by dtype so each group gets its own preprocessing.
# The target is dropped before selecting the numeric columns, so it is never
# listed among the numeric features.
num_cols = df.drop(columns="SalePrice").select_dtypes(include="number").columns
cat_cols = df.select_dtypes(include="object").columns

# Numeric branch: impute missing values (SimpleImputer defaults), then
# rescale with RobustScaler.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer()),
    ("scaler", RobustScaler()),
])


In [15]:
# Categorical branch: replace missing values with each column's most frequent
# category, then one-hot encode. With handle_unknown='ignore', categories not
# seen during fit do not raise an error at transform time.
# The former fill_value='missing' argument was removed: SimpleImputer only
# uses fill_value when strategy='constant', so it had no effect here.
# (To treat "missing" as its own category instead, use
# strategy='constant', fill_value='missing'.)
categorical_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown = 'ignore'))
])


- `OneHotEncoder` encodes categorical features using a one-hot (aka one-of-K) scheme. The input to this transformer should be an array-like of integers or strings, denoting the values taken on by categorical (discrete) features. By default the output is a sparse matrix in which each column corresponds to one possible value of one feature.

In [16]:
# Combine the two pipelines created above: the ColumnTransformer hands the
# columns listed in num_cols to the numeric pipeline and the columns listed
# in cat_cols to the categorical pipeline.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, num_cols),
    ("cat", categorical_transformer, cat_cols),
])

In [17]:
# Full model: shared preprocessing followed by a linear regression.
# The final step keeps its original name, 'classifier', even though the
# estimator is a regressor, so parameter names stay unchanged.
pipe = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", LinearRegression()),
])

In [21]:
# This time keep every column except the target; the pipeline's preprocessor
# takes care of both numeric and categorical features.
X = df.drop(columns="SalePrice")
y = df["SalePrice"]

# Report the mean cross-validated score (R2 measures the model's performance).
print("The R2 Score is : ", cross_val_score(pipe, X, y).mean())

The R2 Score is :  0.7939513453947364
