In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import statsmodels.formula.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
# Let pandas render up to 300 columns so wide frames are not truncated on display.
pd.options.display.max_columns = 300

## Step 1: Read in holdout data, scaler, and best model

In [3]:
# Raw holdout features, read from a CSV file in the working directory.
holdout_csv_path = 'kc_house_data_test_features.csv'
holdout = pd.read_csv(holdout_csv_path)

In [8]:
import pickle

# Load the scaler and model objects that were saved as pickle files.
# SECURITY: unpickling can execute arbitrary code -- only load pickle files that
# you created yourself or obtained from a trusted source.
with open("kc_house_scaler.pickle", "rb") as scaler_file:
    final_scaler = pickle.load(scaler_file)

with open("kc_house_model.pickle", "rb") as model_file:
    final_model = pickle.load(model_file)

## Step 2: Feature Engineering for holdout set

Remember we have to perform the same transformations on our holdout data (feature engineering, extreme values, and scaling) that we performed on the original data.

In [13]:
def engineering(dataframe, expected_columns=None):
    """Clean a raw housing feature frame and derive the model features.

    The input frame is never modified: all work happens on a copy, so the
    function can safely be called more than once on the same frame.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Raw features. The function reads the columns ``date`` (strings that
        start with ``YYYYMMDD``), ``bedrooms``, ``bathrooms``, ``yr_built``,
        ``yr_renovated``, ``zipcode`` and ``grade``, and removes ``id``,
        ``lat`` and ``long``. A leftover ``Unnamed: 0`` column is dropped
        when present.
    expected_columns : sequence of column labels, optional
        When given, the result is re-indexed to exactly these columns in this
        order: columns missing from the engineered frame (for example a
        zipcode dummy that does not occur in this data) are added and filled
        with 0, and columns not listed are dropped. Pass the training column
        layout here so the result lines up with a fitted scaler/model.
        Default ``None`` leaves the engineered columns unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame with the added columns ``home_age``, ``yrs_since_reno``,
        one indicator column per zipcode value (labelled with the zipcode
        value itself) and ``sqrt_grade``; the columns ``id``, ``date``,
        ``zipcode``, ``lat``, ``long``, ``yr_renovated``, ``sale_yr`` and
        ``yr_built`` are removed.
    """
    frame = dataframe.copy()
    # errors='ignore' keeps the function usable when the CSV index column is absent.
    frame = frame.drop(columns=['Unnamed: 0'], errors='ignore')

    # Keep only the leading YYYYMMDD part (drops the trailing 'T000000').
    # %m is the month directive; %M would read the month digits as minutes.
    frame['date'] = pd.to_datetime(frame['date'].str[:8], format='%Y%m%d', errors='coerce')

    # Replace implausible bedroom counts (far more bedrooms than bathrooms)
    # with the median bedroom count of this frame.
    bed_to_bath = (frame['bedrooms'] + 1) / (frame['bathrooms'] + 1)
    frame['bedrooms'] = np.where(bed_to_bath > 4,
                                 frame['bedrooms'].median(),
                                 frame['bedrooms'])

    # Same treatment for bathrooms; note this uses the already-corrected bedrooms.
    bath_to_bed = (frame['bathrooms'] + 1) / (frame['bedrooms'] + 1)
    frame['bathrooms'] = np.where(bath_to_bed > 4,
                                  frame['bathrooms'].median(),
                                  frame['bathrooms'])

    frame['sale_yr'] = frame['date'].dt.year  # year component of the sale date

    frame['home_age'] = frame['sale_yr'] - frame['yr_built']

    # A home sold the year before it was built gets age 0 instead of -1
    # (the value is clamped; no rows are removed).
    frame['home_age'] = np.where(frame['home_age'] == -1,
                                 0,
                                 frame['home_age'])

    # A renovation year later than the sale year is capped at the sale year.
    frame['yr_renovated'] = np.where(frame['yr_renovated'] > frame['sale_yr'],
                                     frame['sale_yr'],
                                     frame['yr_renovated'])

    # yr_renovated == 0 is treated as "never renovated": fall back to the home age.
    frame['yrs_since_reno'] = np.where(frame['yr_renovated'] != 0,
                                       frame['sale_yr'] - frame['yr_renovated'],
                                       frame['home_age'])

    # One indicator column per zipcode value present in this frame.
    zip_dummies = pd.get_dummies(frame['zipcode'])
    frame = frame.join(zip_dummies)

    frame['sqrt_grade'] = np.sqrt(frame['grade'])

    frame = frame.drop(columns=['id', 'date', 'zipcode', 'lat', 'long',
                                'yr_renovated', 'sale_yr', 'yr_built'])

    if expected_columns is not None:
        # Align with a reference layout: absent indicator columns become 0.
        frame = frame.reindex(columns=list(expected_columns), fill_value=0)

    return frame

In [12]:
# Build the model-ready holdout features. engineering() already removes the
# id/date/zipcode/lat/long/yr_renovated/sale_yr/yr_built columns, so they must
# not be dropped a second time here.
transformed_holdout = engineering(holdout)
transformed_holdout.shape

(4323, 85)

In [9]:
# Fail early with an actionable message when the engineered holdout frame does
# not have the number of features the scaler was fitted on (for example when a
# one-hot zipcode column seen during training is absent from the holdout data).
# NOTE: this cell overwrites transformed_holdout with the scaled values, so run
# it only once per kernel session; re-running would scale already-scaled data.
expected_n_features = getattr(final_scaler, 'n_features_in_', None)
if expected_n_features is not None and transformed_holdout.shape[1] != expected_n_features:
    raise ValueError(
        f"Scaler was fitted on {expected_n_features} features but the holdout frame has "
        f"{transformed_holdout.shape[1]}; align the holdout columns with the training "
        "columns before scaling."
    )
transformed_holdout = final_scaler.transform(transformed_holdout)

ValueError: operands could not be broadcast together with shapes (4323,85) (86,) (4323,85) 

## Step 3: Predict the holdout set

In [6]:
# final_answers = final_model.predict(transformed_holdout)

## Step 4: Export your predictions

In [7]:
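# If final_answers is a NumPy array (as scikit-learn's predict returns) it has no .to_csv;
# wrap it in a pandas object before exporting:
# pd.Series(final_answers).to_csv('housing_preds_your_name.csv')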