# Data Preparation

In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', 500)
%matplotlib inline

In [56]:
train = pd.read_csv('train.csv')

In [57]:
X = train.drop('Price', axis=1)
y = train['Price'].copy()

In [59]:
X.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Method,SellerG,Date,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Reservoir,2/17 Barton St,3,u,S,HAR,17/03/2018,12.0,3073.0,,,,,,,Darebin City Council,,,Northern Metropolitan,21650.0
1,Bundoora,3/2 Greenwood Dr,2,u,S,Barry,25/11/2017,12.1,3083.0,,,,,,,Banyule City Council,,,Northern Metropolitan,10175.0
2,Newport,29 Latrobe St,4,h,S,Raine&Horne,7/05/2016,8.4,3015.0,,,,,,,Hobsons Bay City Council,,,Western Metropolitan,5498.0
3,Nunawading,8 Haros Av,3,h,S,Fletchers,26/08/2017,15.4,3131.0,3.0,1.0,1.0,,101.0,1950.0,Manningham City Council,-37.82658,145.17577,Eastern Metropolitan,4973.0
4,Fitzroy,17 Bell St,4,h,S,Nelson,27/11/2016,1.6,3065.0,4.0,3.0,2.0,286.0,275.0,1890.0,Yarra City Council,-37.7999,144.9755,Northern Metropolitan,5825.0


## Simple Imputer and Encoder
We will use median values to impute numerical variables and One Hot Encoding for categorical variables. Below is the code to confirm the functionality.

In [60]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

imputer = SimpleImputer(strategy='median')
ohe = OneHotEncoder()

In [61]:
# Creating a copy of the training dataset to test on safely
housing = X.copy()

In [62]:
# Fitting the median values imputer on the numerical columns
housing_num = housing.select_dtypes(exclude='object')
imputer.fit(housing_num)
housing_num_imp = imputer.transform(housing_num)
housing_num_tr = pd.DataFrame(housing_num_imp, columns=housing_num.columns, index=housing_num.index)

In [63]:
housing_num_tr.head()

Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
0,3.0,12.0,3073.0,3.0,1.0,2.0,510.0,133.0,1970.0,-37.800395,145.003085,21650.0
1,2.0,12.1,3083.0,3.0,1.0,2.0,510.0,133.0,1970.0,-37.800395,145.003085,10175.0
2,4.0,8.4,3015.0,3.0,1.0,2.0,510.0,133.0,1970.0,-37.800395,145.003085,5498.0
3,3.0,15.4,3131.0,3.0,1.0,1.0,510.0,101.0,1950.0,-37.82658,145.17577,4973.0
4,4.0,1.6,3065.0,4.0,3.0,2.0,286.0,275.0,1890.0,-37.7999,144.9755,5825.0


In [77]:
# One Hot Encoding the categorical variables
housing_cat = housing.select_dtypes(include='object')
housing_cat_ohe = ohe.fit_transform(housing_cat)
housing_cat_ohe

<21797x22275 sparse matrix of type '<class 'numpy.float64'>'
	with 174376 stored elements in Compressed Sparse Row format>

## Custom Transformers
We will use custom transformers to preprocess the columns in the dataset in order to feed into our Pipeline

In [65]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from datetime import datetime

In [66]:
# Custom Transformer that extracts columns passed as argument to its constructor 
class FeatureSelector(BaseEstimator, TransformerMixin): 
    def __init__(self, feature_names):
        self._feature_names = feature_names 
      
    def fit(self, X, y = None):
        return self 
    
    def transform(self, X, y = None):
        return X[self._feature_names]

In [67]:
# Customer Transformer for numerical variables
YEAR = datetime.now().year

class NumericalTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, add_building_age=True): 
        self._add_building_age = add_building_age
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        if self._add_building_age:
            X.loc[:, 'BuildingAge'] = X['YearBuilt']
        
        return X.values

In [68]:
# Customer Transformer for categorical variables
class CategoricalTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, use_dates = ['year', 'month', 'day']):
        self._use_dates = use_dates
        
    def fit(self, X, y=None):
        return self
    
    def get_year(self, obj):
        return obj.split('/')[2]
    
    def get_month(self, obj):
        return obj.split('/')[1]
        
    def get_day(self, obj):
        return obj.split('/')[0]
        
    def transform(self, X, y=None):
        print(type(X))
        for spec in self._use_dates:
            exec("X.loc[:,'{}'] = X['Date'].apply(self.get_{})".format(spec, spec))
    
        X.drop('Date', axis=1)
        
        return X.values

We also need to grab the column names of each of the numerical and categorical features to use in the Pipeline

In [69]:
numerical_features = X.select_dtypes(exclude='object').columns.tolist()
categorical_features = X.select_dtypes(include='object').columns.tolist()

## Preprocessing Pipeline (using FeatureUnion)

In [119]:
# Numerical pipeline
num_pipeline = Pipeline([
    ('num_selector', FeatureSelector(numerical_features)),
    ('numerical_transformer', NumericalTransformer()),
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
])

In [120]:
# Categorical and Date pipeline
cat_pipeline = Pipeline([
    ('cat_selector', FeatureSelector(categorical_features)),
    ('categorical_transformer', CategoricalTransformer()),
    ('encoder', OneHotEncoder()), 
])

In [121]:
full_pipeline = FeatureUnion([
    ('cat_pipeline', cat_pipeline ),
    ('num_pipeline', num_pipeline)
])

In [122]:
# Testing the components of the pipeline
housing_num_tr = num_pipeline.fit_transform(housing_num)
housing_num_tr

array([[ 0.00789241,  0.10596734, -0.36794961, ...,  0.04332939,
         3.14468544,  0.06056663],
       [-1.03471837,  0.12071815, -0.27861336, ...,  0.04332939,
         0.58464378,  0.06056663],
       [ 1.05050319, -0.42506177, -0.88609986, ...,  0.04332939,
        -0.45878235,  0.06056663],
       ...,
       [ 2.09311397,  0.35673109,  0.65941727, ...,  0.22724383,
        -0.46859863, -0.74977667],
       [ 1.05050319,  0.51898998, -0.81463086, ..., -2.0714275 ,
        -1.08880916,  0.06056663],
       [-1.03471837, -1.25110705, -1.02010424, ..., -0.25716315,
         2.2179392 ,  1.84332189]])

In [123]:
# Testing the components of the pipeline
housing_cat_tr = cat_pipeline.fit_transform(housing_cat)
housing_cat_tr

<class 'pandas.core.frame.DataFrame'>


<21797x22318 sparse matrix of type '<class 'numpy.float64'>'
	with 239767 stored elements in Compressed Sparse Row format>

In [124]:
# Applying the full pipeline using FeatureUnion
X_final = full_pipeline.fit_transform(X)

<class 'pandas.core.frame.DataFrame'>


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


In [None]:
# Export the preprocessed model to csv