# Authenticate to Kaggle

In [2]:
!mkdir ~/.kaggle

In [3]:
# Copy kaggle.json from the working directory into the Kaggle config directory
# (presumably the API token file the Kaggle CLI reads -- confirm).
!cp kaggle.json ~/.kaggle/kaggle.json

In [4]:
# Restrict the copied file to owner read/write only (mode 600).
!chmod 600 ~/.kaggle/kaggle.json

In [6]:
# Download the competition archive for house-prices-advanced-regression-techniques
# into the current working directory using the Kaggle CLI.
!kaggle competitions download -c house-prices-advanced-regression-techniques

Downloading house-prices-advanced-regression-techniques.zip to /content
  0% 0.00/199k [00:00<?, ?B/s]
100% 199k/199k [00:00<00:00, 87.2MB/s]


In [7]:
!unzip house-prices-advanced-regression-techniques.zip

Archive:  house-prices-advanced-regression-techniques.zip
  inflating: data_description.txt    
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


# Pipeline Practice

In [12]:
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

In [10]:
# Load the training split extracted from the competition archive.
df = pd.read_csv('train.csv')

In [11]:
# Preview the first rows to check the columns and the NaN-heavy fields.
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [16]:
# Work with a small subset for now: keep a handful of predictors plus the
# target, then discard every row that still contains a missing value so NaN
# handling can be ignored in the rest of the notebook.
select_df = df[[
    'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
    'LotShape', 'LandContour', 'Utilities', 'MiscVal', 'MoSold',
    'YrSold', 'SaleType', 'SaleCondition', 'SalePrice',
]].dropna()

In [24]:
# Build the model inputs: one-hot encode the categorical predictors with
# get_dummies and hold SalePrice out as the target.
# `columns=` already identifies the axis, so the redundant `axis=1` is omitted.
X = pd.get_dummies(select_df.drop(columns='SalePrice'))
y = select_df['SalePrice']

In [25]:
# Preview the encoded feature matrix (numeric columns plus dummy columns).
X.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,MiscVal,MoSold,YrSold,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,60,65.0,8450,0,2,2008,False,False,False,True,...,False,False,False,True,False,False,False,False,True,False
1,20,80.0,9600,0,5,2007,False,False,False,True,...,False,False,False,True,False,False,False,False,True,False
2,60,68.0,11250,0,9,2008,False,False,False,True,...,False,False,False,True,False,False,False,False,True,False
3,70,60.0,9550,0,2,2006,False,False,False,True,...,False,False,False,True,True,False,False,False,False,False
4,60,84.0,14260,0,12,2008,False,False,False,True,...,False,False,False,True,False,False,False,False,True,False


In [26]:
# Chain feature scaling and a random forest into one estimator.
# random_state is pinned so that Restart & Run All reproduces the same fitted
# model and the same predictions; an unseeded forest differs on every run.
pipeline = make_pipeline(StandardScaler(), RandomForestRegressor(random_state=42))

In [27]:
# Fit the scaler and the forest in one call on the dummy-encoded features.
pipeline.fit(X,y)

In [28]:
# Predict on the same rows the pipeline was fitted on -- a smoke test only;
# these values say nothing about performance on unseen data.
pipeline.predict(X)

array([201838.5 , 162771.32, 218038.5 , ..., 218806.75, 146789.5 ,
       152271.5 ])

# Save the Pipeline

In [29]:
import pickle

In [98]:
# Serialize the whole pipeline (scaler + forest) to disk in binary mode.
with open('pipelinemodel.pkl', 'wb') as model_file:
    pickle.dump(pipeline, model_file)

In [99]:
# Round-trip check: read the pickle back into a separate object.
# NOTE: pickle.load can execute arbitrary code -- only load files you created.
with open('pipelinemodel.pkl', 'rb') as model_file:
    reloaded_model = pickle.load(model_file)

In [100]:
# Display the unpickled pipeline to confirm both steps survived the round trip.
reloaded_model

In [36]:
# Look up a single step by name. make_pipeline names steps after the lowercased
# class name, hence the 'randomforestregressor' key.
reloaded_model.named_steps['randomforestregressor']

# Using the Pipeline Class

In [38]:
from sklearn.pipeline import Pipeline

In [39]:
# Build the same scaler -> forest chain with the Pipeline class, which takes
# explicit (name, estimator) pairs instead of generating the names.
custom_pipeline = Pipeline(steps=[
    ('scaling', StandardScaler()),
    ('rfmodel', RandomForestRegressor()),
])

In [46]:
# Inspect the (name, estimator) pairs; the names are the ones passed explicitly.
custom_pipeline.steps

[('scaling', StandardScaler()), ('rfmodel', RandomForestRegressor())]

In [44]:
# With the make_pipeline helper function (not a class): step names are
# generated automatically rather than supplied by the caller.
make_pipeline_model = make_pipeline(StandardScaler(),RandomForestRegressor())

In [47]:
# Inspect the auto-generated step names for comparison with custom_pipeline.
make_pipeline_model.steps

[('standardscaler', StandardScaler()),
 ('randomforestregressor', RandomForestRegressor())]

# Column Transformers

In [101]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [102]:
# List the object-dtype columns, i.e. the ones to treat as categorical.
select_df.select_dtypes('object').columns

Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'SaleType', 'SaleCondition'],
      dtype='object')

In [103]:
# Numeric branch: every non-object column except the target, passed through a
# one-step scaling pipeline.
numeric_features = (
    select_df
    .drop(columns='SalePrice')
    .select_dtypes(exclude='object')
    .columns
)
numeric_pipeline = Pipeline(steps=[('scaling', StandardScaler())])

In [104]:
# Categorical features
categorical_features = select_df.select_dtypes('object').columns
categorical_pipeline = Pipeline([('ohe',OneHotEncoder())])

In [105]:
# Route each column group through its own preprocessing pipeline; each entry
# is a (name, transformer, columns) triple.
numeric_branch = ('numeric_preprocessing', numeric_pipeline, numeric_features)
categorical_branch = ('categorical_preprocessing', categorical_pipeline, categorical_features)
transformer = ColumnTransformer(transformers=[numeric_branch, categorical_branch])

In [106]:
# Inspect the configured (name, transformer, columns) triples.
transformer.transformers

[('numeric_preprocessing',
  Pipeline(steps=[('scaling', StandardScaler())]),
  Index(['MSSubClass', 'LotFrontage', 'LotArea', 'MiscVal', 'MoSold', 'YrSold'], dtype='object')),
 ('categorical_preprocessing',
  Pipeline(steps=[('ohe', OneHotEncoder())]),
  Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
         'SaleType', 'SaleCondition'],
        dtype='object'))]

In [107]:
# End-to-end model: column-wise preprocessing followed by a random forest.
# random_state is pinned so that re-running the notebook reproduces the same
# fitted model and predictions; an unseeded forest differs on every run.
ml_pipeline = Pipeline([('all_columns_transform',transformer),
                        ('randforestreg',RandomForestRegressor(random_state=42))])

In [108]:
# Rebind X / y to the raw (not one-hot encoded) columns: the ColumnTransformer
# inside ml_pipeline performs the encoding itself.
y = select_df['SalePrice']
X = select_df.drop(columns='SalePrice')

In [109]:
# Fit the preprocessing and the forest together on the raw feature columns.
ml_pipeline.fit(X,y)

In [110]:
# Predict on the same rows used for fitting -- a smoke test only; these values
# say nothing about performance on unseen data.
ml_pipeline.predict(X)

array([198880.5, 163713. , 220876. , ..., 224003.5, 147585. , 153027. ])

# Save the Model as a Pickle

In [126]:
# Serialize the full pipeline (column transformer + forest) in binary mode.
with open('columntransformermodel.pkl', 'wb') as model_file:
    pickle.dump(ml_pipeline, model_file)

In [127]:
# Round-trip check: read the pickle back into a separate object.
# NOTE: pickle.load can execute arbitrary code -- only load files you created.
with open('columntransformermodel.pkl', 'rb') as model_file:
    reloaded_ml_pipeline = pickle.load(model_file)

In [128]:
# Display the unpickled pipeline to confirm it survived the round trip.
reloaded_ml_pipeline