In [22]:
import numpy as np 
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline 

from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# potential machine Learning Models
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

# to save machine learning Models
import pickle

In [7]:
data = pd.read_csv('kc_house_data.csv')
print(data.shape)

(21613, 21)


As you can see, we put BaseEstimator and TransformerMixin in parenthesis while declaring the class to let Python know our class is going to inherit from them. Like all the constructors we’re going to write , the fit method only needs to return self. The transform method is what we’re really writing to make the transformer do what we need it to do. In this case it simply means returning a pandas data frame with only the selected columns.

In [2]:
#Custom Transformer that extracts columns passed as argument to its constructor 
class FeatureSelector( BaseEstimator, TransformerMixin ):
    #Class Constructor 
    def __init__( self, feature_names ):
        self._feature_names = feature_names 
    
    #Return self nothing else to do here    
    def fit( self, X, y = None ):
        return self 
    
    #Method that describes what we need this transformer to do
    def transform( self, X, y = None ):
        return X[ self._feature_names ] 

<h2> Categorical pipeline

Below is a list of features our custom transformer will deal with and how, in our categorical pipeline. <br>
> - date : The dates in this column are of the format ‘YYYYMMDDT000000’ and must be cleaned and processed to be used in any meaningful way. The constructor for this transformer will allow us to specify a list of values for the parameter ‘use_dates’ depending on if we want to create a separate column for the year, month and day or some combination of these values or simply disregard the column entirely by passing in an empty list. By not hard coding the specifications for this feature, we give ourselves the ability to try out different combinations of values whenever we want without having to rewrite code. <br>
> - waterfront : Wether the house is waterfront property or not. Convert to binary — Yes or No <br>
> - view : How many times the house has been viewed. Most of the values are 0. The rest are very thinly spread between 1 and 4. Convert to Binary — Yes or No <br>
> - yr_renovated : The year the house was renovated in. Most of the values are 0, presumably for never while the rest are very thinly spread between some years. Convert to Binary — Yes or No <br>
> - Once all these features are handled by our custom transformer in the aforementioned way, they will be converted to a Numpy array and pushed to the next and final transformer in the categorical pipeline. A simple scikit-learn one hot encoder which returns a dense representation of our pre-processed data.  <br>

In [3]:
#Custom transformer that breaks dates column into year, month and day into separate columns and
#converts certain features to binary 
class CategoricalTransformer( BaseEstimator, TransformerMixin ):
    #Class constructor method that takes in a list of values as its argument
    def __init__(self, use_dates = ['year', 'month', 'day'] ):
        self._use_dates = use_dates
        
    #Return self nothing else to do here
    def fit( self, X, y = None  ):
        return self

    #Helper function to extract year from column 'dates' 
    def get_year( self, obj ):
        return str(obj)[:4]
    
    #Helper function to extract month from column 'dates'
    def get_month( self, obj ):
        return str(obj)[4:6]
    
    #Helper function to extract day from column 'dates'
    def get_day(self, obj):
        return str(obj)[6:8]
    
    #Helper function that converts values to Binary depending on input 
    def create_binary(self, obj):
        if obj == 0:
            return 'No'
        else:
            return 'Yes'
    
    #Transformer method we wrote for this transformer 
    def transform(self, X , y = None ):
       #Depending on constructor argument break dates column into specified units
       #using the helper functions written above 
       for spec in self._use_dates:
        
        exec( "X.loc[:,'{}'] = X['date'].apply(self.get_{})".format( spec, spec ) )
        
       #Drop unusable column 
       X = X.drop('date', axis = 1 )
       
       #Convert these columns to binary for one-hot-encoding later
       X.loc[:,'waterfront'] = X['waterfront'].apply( self.create_binary )
       
       X.loc[:,'view'] = X['view'].apply( self.create_binary )
       
       X.loc[:,'yr_renovated'] = X['yr_renovated'].apply( self.create_binary )
       #returns numpy array
       return X.values 


<h2> Numerical pipeline

Below is a list of features our custom numerical transformer will deal with and how, in our numerical pipeline. <br>
> - bathrooms : Number of bathrooms in the house. The constructor for this transformer will have a parameter ‘bath_per_bead’ that takes in a Boolean value. If True, then the constructor will create a new column by computing bathrooms/bedrooms to calculate the number of bathrooms per bedroom and drop the original bathroom column. If False, then it will just pass the bathroom column as it is. <br>
> - yr_built : The year the house was built in. The constructor for this transformer will have another parameter ‘years_old’ that also takes in a Boolean value. If True, then the constructor will create a new column by computing the age of the house in 2019 by the subtracting the year it was built in from 2019 and it will drop the original yr_built column. If False, then it will just pass the yr_built column as it is. <br><br>
Once all these features are handled by our custom numerical transformer in the numerical pipeline as mentioned above, the data will be converted to a Numpy array and passed to the next step in the numerical pipeline, an Imputer which is another kind of scikit-learn transformer. The Imputer will compute the column-wise median and fill in any Nan values with the appropriate median values. From there the data would be pushed to the final transformer in the numerical pipeline, a simple scikit-learn Standard Scaler. 

In [4]:
#Custom transformer we wrote to engineer features ( bathrooms per bedroom and/or how old the house is in 2019  ) 
#passed as boolen arguements to its constructor
class NumericalTransformer(BaseEstimator, TransformerMixin):
    #Class Constructor
    def __init__( self, bath_per_bed = True, years_old = True ):
        self._bath_per_bed = bath_per_bed
        self._years_old = years_old
        
    #Return self, nothing else to do here
    def fit( self, X, y = None ):
        return self 
    
    #Custom transform method we wrote that creates aformentioned features and drops redundant ones 
    def transform(self, X, y = None):
        #Check if needed 
        if self._bath_per_bed:
            #create new column
            X.loc[:,'bath_per_bed'] = X['bathrooms'] / X['bedrooms']
            #drop redundant column
            X.drop('bathrooms', axis = 1 )
        #Check if needed     
        if self._years_old:
            #create new column
            X.loc[:,'years_old'] =  2019 - X['yr_built']
            #drop redundant column 
            X.drop('yr_built', axis = 1)
            
        #Converting any infinity values in the dataset to Nan
        X = X.replace( [ np.inf, -np.inf ], np.nan )
        #returns a numpy array
        return X.values


<h2> Combining pipelines

We can create a feature union class object in Python by giving it two or more pipeline objects consisting of transformers. Calling the fit_transform method for the feature union object pushes the data down the pipelines separately and then results are combined and returned. In our case since the first step for both of our pipelines is to extract the appropriate columns for each pipeline, combining them using feature union and fitting the feature union object on the entire dataset means that the appropriate set of columns will be pushed down the appropriate set of pipelines and combined together after they are transformed!

In [8]:
#Categrical features to pass down the categorical pipeline 
categorical_features = ['date', 'waterfront', 'view', 'yr_renovated']

#Numerical features to pass down the numerical pipeline 
numerical_features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'condition', 'grade', 'sqft_basement', 'yr_built']

#Defining the steps in the categorical pipeline 
categorical_pipeline = Pipeline( steps = [ ( 'cat_selector', FeatureSelector(categorical_features) ),
                                  
                                  ( 'cat_transformer', CategoricalTransformer() ), 
                                  
                                  ( 'one_hot_encoder', OneHotEncoder( sparse = False ) ) ] )
    
#Defining the steps in the numerical pipeline     
numerical_pipeline = Pipeline( steps = [ ( 'num_selector', FeatureSelector(numerical_features) ),
                                  
                                  ( 'num_transformer', NumericalTransformer() ),
                                  
                                  ('imputer', SimpleImputer(strategy = 'median') ),
                                  
                                  ( 'std_scaler', StandardScaler() ) ] )

#Combining numerical and categorical piepline into one full big pipeline horizontally 
#using FeatureUnion
full_pipeline = FeatureUnion( transformer_list = [ ( 'categorical_pipeline', categorical_pipeline ), 
                                                  
                                                  ( 'numerical_pipeline', numerical_pipeline ) ] )

<h2> Running the data

In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

#Leave it as a dataframe becuase our pipeline is called on a 
#pandas dataframe to extract the appropriate columns, remember?
X = data.drop('price', axis = 1)
#You can covert the target variable to numpy 
y = data['price'].values 

X_train, X_test, y_train, y_test = train_test_split( X, y , test_size = 0.2 , random_state = 42 )

#The full pipeline as a step in another pipeline with an estimator as the final step
full_pipeline_m = Pipeline( steps = [ ( 'full_pipeline', full_pipeline),
                                  
                                  ( 'model', LinearRegression() ) ] )

#Can call fit on it just like any other pipeline
full_pipeline_m.fit( X_train, y_train )

#Can predict with it like any other pipeline
y_pred = full_pipeline_m.predict( X_test ) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexe

In [10]:
y_pred.shape

(4323,)

In [11]:
def convert_features_to_array(features):
    # function to convert feature df
    # to an array
    num_rows = len(features)
    num_cols = len(features.columns)
    
    features_array = (np
                      .array(features)
                      .reshape((num_rows, 
                                num_cols)))

    return features_array

def convert_target_to_array(target):
    # function to convert target df
    # to an array
    target_array = (np
                    .array(target)
                    .reshape((-1, )))
    return target_array

In [12]:
X_train_array = convert_features_to_array(X_train)
X_valid_array = convert_features_to_array(X_test)
y_train_array = convert_target_to_array(y_train)
y_valid_array = convert_target_to_array(y_test)
print(X_train_array.shape,X_valid_array.shape,y_train_array.shape,y_valid_array.shape)


(17290, 20) (4323, 20) (17290,) (4323,)


In [27]:
def fit_evaluate_model(model):
    # function to train a given model
    # return mean squared error of the
    # actuals and predictions
    # The full pipeline as a step in another pipeline with an estimator as the final step
    full_pipeline_m = Pipeline( steps = [ ( 'full_pipeline', full_pipeline),                                  
                                          ( 'model', model ) ] )

# Can call fit on it just like any other pipeline
    full_pipeline_m.fit( X_train, y_train )

# Can predict with it like any other pipeline
    y_pred = full_pipeline_m.predict( X_test ) 

    return y_pred, mean_squared_error(y_test, y_pred)

In [23]:
%%time
lr = LinearRegression()
mse_lr = fit_evaluate_model(lr)
print(f'The MSE for Linear regression was {mse_lr:.2f}')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexe

The MSE for Linear regression was 52595747909.46
CPU times: user 356 ms, sys: 71.8 ms, total: 428 ms
Wall time: 234 ms


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [24]:
%%time
knn = KNeighborsRegressor()
mse_knn = fit_evaluate_model(knn)
print(f'The MSE for KNN regression was {mse_knn:.2f}')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexe

The MSE for KNN regression was 60772076637.54
CPU times: user 6.14 s, sys: 33.8 ms, total: 6.18 s
Wall time: 6.18 s


In [25]:
%%time
svm = SVR()
mse_svm = fit_evaluate_model(svm)
print(f'The MSE for SVM regression was {mse_svm:.2f}')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexe

The MSE for SVM regression was 160771312138.35
CPU times: user 36.8 s, sys: 300 ms, total: 37.1 s
Wall time: 37.2 s


In [28]:
%%time
rf = RandomForestRegressor()
y_pred_rf, mse_rf = fit_evaluate_model(rf)
print(f'The MSE for Random forest regression was {mse_rf:.2f}')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexe

The MSE for Random forest regression was 45204722052.37
CPU times: user 18.1 s, sys: 164 ms, total: 18.3 s
Wall time: 18.3 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexe

In [37]:
def PlotPredictedVSActual(predictions, actuals):
    %pylab
    figsize(10, 5)
    plt.rcParams['font.size'] = 14

    # histogram of predictions
    ax = plt.subplot(121)
    ax.hist(predictions,
            bins=10, 
            color = "#971539", 
            edgecolor = 'white')
    ax.set_xlabel("points", size=14)
    ax.set_xticks(range(80, 101))
    ax.set_ylabel("count", size=14)
    ax.set_title("Predicted Distribution", size=16)
    plt.grid(b=True, axis = 'y', alpha=0.3)

    # histogram of actual values
    ax2 = plt.subplot(122)
    ax2.hist(actuals,
             bins=20, 
             color = "#971539", 
             edgecolor = 'white')
    ax2.set_xlabel("points", size=14)
    ax2.set_xticks(range(80, 101))
    ax2.set_ylabel("count", size=14)
    ax2.set_title("Actual Distribution", size=16)
    plt.grid(b=True, axis = 'y', alpha=0.3)

    # plt.savefig("PredictionsVSActuals.png")

In [38]:
PlotPredictedVSActual(y_pred_rf, y_test)

Using matplotlib backend: MacOSX
Populating the interactive namespace from numpy and matplotlib


In [40]:
def PlotFeatureImportances(model, feature_names):
    feature_importances = (pd
                           .DataFrame(
                               {'feature': feature_names,
                                'importance': model
                                .feature_importances_}))

    feature_importances = (feature_importances
                           .sort_values(by="importance",
                                        ascending=False))
    figsize(20, 10)
    plt.rcParams['font.size'] = 14
    sns.set(font_scale=1.5, style="whitegrid")

    # set color
    labels = np.array(feature_importances.feature)
    values = np.array(feature_importances.importance) 
    colors = ["#808080" if (y < max(values))
              else "#971539" for y in values]

    # set the plot
    ax = sns.barplot(x="importance", 
                     y="feature", 
                     data=feature_importances, 
                     palette = colors)

    # set title and save plot
    plt.title("Feature Importances", size =16)
    # plt.savefig("FeatureImportances.png")

In [43]:
PlotFeatureImportances(rf, X_test.columns)

ValueError: arrays must all be same length