# Authenticate to Kaggle

In [1]:
!mkdir ~/.kaggle

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [2]:
!cp kaggle.json ~/.kaggle/kaggle.json

In [3]:
!chmod 600 ~/.kaggle/kaggle.json

In [4]:
!kaggle competitions download -c house-prices-advanced-regression-techniques

house-prices-advanced-regression-techniques.zip: Skipping, found more recently modified local copy (use --force to force download)


In [5]:
!unzip house-prices-advanced-regression-techniques.zip

Archive:  house-prices-advanced-regression-techniques.zip
replace data_description.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: data_description.txt    
replace sample_submission.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: sample_submission.csv   
replace test.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: test.csv                
replace train.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: train.csv               


# Pipeline Practice

In [28]:
import pandas as pd
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

In [7]:
 df = pd.read_csv('train.csv')

In [8]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [9]:
pd.set_option('display.max_rows', None)
# df.isnull().sum()

In [10]:
select_df = df[['MSSubClass',	'MSZoning',	'LotFrontage',	'LotArea',	'Street', 'LotShape',	'LandContour',	'Utilities', 'MiscVal',	'MoSold',	'YrSold',	'SaleType', 'SalePrice']].dropna()

In [11]:
X = pd.get_dummies(select_df.drop('SalePrice', axis = 1))
y = select_df.SalePrice

In [12]:
X.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,MiscVal,MoSold,YrSold,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,...,Utilities_AllPub,SaleType_COD,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD
0,60,65.0,8450,0,2,2008,0,0,0,1,...,1,0,0,0,0,0,0,0,0,1
1,20,80.0,9600,0,5,2007,0,0,0,1,...,1,0,0,0,0,0,0,0,0,1
2,60,68.0,11250,0,9,2008,0,0,0,1,...,1,0,0,0,0,0,0,0,0,1
3,70,60.0,9550,0,2,2006,0,0,0,1,...,1,0,0,0,0,0,0,0,0,1
4,60,84.0,14260,0,12,2008,0,0,0,1,...,1,0,0,0,0,0,0,0,0,1


In [13]:
pipeline = make_pipeline(StandardScaler(), RandomForestRegressor())

In [14]:
# X.dropna(inplace= True)

In [15]:
# y.dropna(inplace = True)

In [17]:
pipeline.fit(X,y)

In [19]:
pipeline.predict(X)

array([203331.75      , 167044.83333333, 221511.5       , ...,
       223824.93      , 147695.        , 158609.25      ])

In [20]:
import pickle

In [21]:
with open('pipelinemodel.pkl', 'wb') as f:
  pickle.dump(pipeline, f)

In [23]:
with open('pipelinemodel.pkl', 'rb') as f:
  reloaded_model = pickle.load(f)

In [24]:
reloaded_model

In [27]:
reloaded_model.steps[1][1].predict(X)



array([326142.5, 326142.5, 382197.5, ..., 326142.5, 326142.5, 326142.5])

# Using the Pipeline Class

In [29]:
custom_pipeline = Pipeline([('scaling', StandardScaler()), ('rfmodel', RandomForestRegressor())])

In [31]:
custom_pipeline

In [30]:
make_pipeline_model = make_pipeline(StandardScaler(), RandomForestRegressor())

In [32]:
make_pipeline_model

# Column Transformers

In [48]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [37]:
pd.reset_option('all')

  pd.reset_option('all')
  pd.reset_option('all')
: boolean
    use_inf_as_null had been deprecated and will be removed in a future
    version. Use `use_inf_as_na` instead.

  pd.reset_option('all')


In [40]:
# pd.set_option('display.max_rows', None)
select_df.select_dtypes('int').columns

Index(['MSSubClass', 'LotArea', 'MiscVal', 'MoSold', 'YrSold', 'SalePrice'], dtype='object')

In [74]:
# Separating num and cat vars

# Categorical Features

cat_features = select_df.select_dtypes('object').columns
num_features = select_df.drop('SalePrice', axis= 1).select_dtypes(exclude = 'object').columns

In [75]:
num_features

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'MiscVal', 'MoSold', 'YrSold'], dtype='object')

In [76]:
num_pipeline = Pipeline([(('scaler'), StandardScaler())])
cat_pipeline = Pipeline([('onehot', OneHotEncoder())])

In [77]:
X = select_df.drop('SalePrice', axis = 1)
y = select_df['SalePrice']

In [78]:
preprocessor = ColumnTransformer(
    transformers = [
        ('num', num_pipeline, num_features),
        ('cat', cat_pipeline, cat_features)
    ]
)

In [79]:
preprocessor

In [82]:
ml_pipeline = Pipeline([('all_column_prepro', preprocessor), ('randomforestclassifier', RandomForestRegressor())])

In [83]:
ml_pipeline.fit(X, y)

In [85]:
ml_pipeline.predict(X)

array([204211.5       , 167681.83333333, 216744.62      , ...,
       230889.25      , 143606.75      , 153714.5       ])

In [86]:
with open('columntransformermodel.pkl', 'wb') as f:
  pickle.dump(ml_pipeline, f)

In [87]:
with open('columntransformermodel.pkl', 'rb') as f:
  reloaded_ml_pipeline = pickle.load(f)

In [88]:
reloaded_ml_pipeline