In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

In [None]:
sales_raw = pd.read_csv('../data/kc_house_data.csv')
sales_raw.head()

In [None]:
X, y = sales_raw.drop('price', axis=1), sales_raw['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

In [None]:
date_idx = 1

class YearAdder(BaseEstimator, TransformerMixin):
    def __init__(self):
        super().__init__()
    def fit(self, X, y=None):
        # TODO: think this is the best place to put get_feature_names
        return self
    def transform(self, X):
        yr_sold = pd.to_datetime(X[:,date_idx]).year
        return np.c_[X, yr_sold]
#     def get_feature_names(self):
#         return list(X.columns).append('yr_sold')

In [None]:
# Need this step in order to compute sqft_per_bedroom without dividing by zero,
# there are only about ten instances of zero bedrooms since the dataset is single-family
# home sales

BedroomImputer = SimpleImputer(missing_values=0, strategy='constant', fill_value=1)
bedroom_pipe = ColumnTransformer([
    # original index of bedrooms col is 2, returns transformed bedrooms col
    # and then all other columns in original order
    ('bedroom_imputer', BedroomImputer, [2]),
], remainder='passthrough')

In [None]:
# column indices after the first two pipeline steps from above
bedrooms_idx, sqft_living_idx, yr_sold_idx, yr_built_idx, yr_renovated_idx = 0, 4, -1, 13, 14

# add attributes, a custom sklearn transformer needs init, fit, and transform methods
# fit_transform is created by adding TransformerMixin and BaseEstimator gives get/
# set_params()
class FeatureAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_sqft_per_bedroom = True):
        self.add_sqft_per_bedroom = add_sqft_per_bedroom
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        # Subtract the larger of yr_built or yr_renovated from yr_sold,
        # if this is negative (yr_reno after yr_sold) return 0
        effective_age_guess = X[:, yr_sold_idx] - np.maximum(X[:, yr_renovated_idx], X[:, yr_built_idx])
        effective_age = np.maximum(effective_age_guess, 0)
        if self.add_sqft_per_bedroom:
            sqft_per_bedroom = X[:, sqft_living_idx] / X[:, bedrooms_idx]
            return np.c_[X, effective_age, sqft_per_bedroom]
        else:
            return np.c_[X, effective_age]
        
def drop_columns(X):
    # drop id, date, yr_renovated, zipcode
    return X[:,[0, *range(3, 14), *range(16, 23)]]

In [None]:
# the feature names in order resulting from 'features_pipe' below
features = list(X_train.columns[2:])
features.remove('zipcode')
features.remove('yr_renovated')
features += ['yr_sold', 'effective_age', 'sqft_per_bedroom']

In [None]:
# pipeline of the above feature engineering steps
features_pipe = Pipeline([
    ('yr_sold_adder', DateAdder()),
    ('bedroom_pipe', bedroom_pipe),
    ('feature_adder', FeatureAdder()),
    ('column_dropper', FunctionTransformer(drop_columns)),
    
])

In [1]:
# preprocessing steps
cat_indices = [*range(5,9)]


num_indices = [x for x in range(0, 17) if x not in cat_indices]
cat_indices.remove(5)

print(cat_indices)
num_indices

In [1]:
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', QuantileTransformer(output_distribution='normal'))
])

cat_pipeline = Pipeline([
    ('one_hot', OneHotEncoder())
])


# Waterfront has only 0 and 1 so is effectively already one-hotted (only column in 'passthrough')
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_indices),
    ('cat', cat_pipeline, cat_indices)
    ], remainder='passthrough')

In [1]:
prepared = preprocessor.fit_transform(test_res)

[6, 7, 8]


[0, 1, 2, 3, 4, 9, 10, 11, 12, 13, 14, 15, 16]

In [345]:
cat_features = ['waterfront'] + list(preprocessor.transformers_[2][1]['one_hot']\
                   .get_feature_names(features[6:9]))
cat_features

['waterfront',
 'view_0',
 'view_1',
 'view_2',
 'view_3',
 'view_4',
 'condition_1',
 'condition_2',
 'condition_3',
 'condition_4',
 'condition_5',
 'grade_3',
 'grade_4',
 'grade_5',
 'grade_6',
 'grade_7',
 'grade_8',
 'grade_9',
 'grade_10',
 'grade_11',
 'grade_12',
 'grade_13']

In [346]:
num_features = [features[idx] for idx in num_indices]
num_features

['bedrooms',
 'bathrooms',
 'sqft_living',
 'sqft_lot',
 'floors',
 'view',
 'condition',
 'grade',
 'sqft_above',
 'sqft_basement',
 'yr_built',
 'lat',
 'long',
 'sqft_living15',
 'sqft_lot15',
 'yr_sold']

In [347]:
features_prepared = num_features + cat_features

In [348]:
features_prepared

['bedrooms',
 'bathrooms',
 'sqft_living',
 'sqft_lot',
 'floors',
 'view',
 'condition',
 'grade',
 'sqft_above',
 'sqft_basement',
 'yr_built',
 'lat',
 'long',
 'sqft_living15',
 'sqft_lot15',
 'yr_sold',
 'waterfront',
 'view_0',
 'view_1',
 'view_2',
 'view_3',
 'view_4',
 'condition_1',
 'condition_2',
 'condition_3',
 'condition_4',
 'condition_5',
 'grade_3',
 'grade_4',
 'grade_5',
 'grade_6',
 'grade_7',
 'grade_8',
 'grade_9',
 'grade_10',
 'grade_11',
 'grade_12',
 'grade_13']