# Running Predictions on Unseen Data

## Import Libraries

In [9]:
import pandas as pd
from pathlib import Path
import numpy as np
import warnings
import pickle
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides pandas parser and deprecation warnings raised by later cells.
warnings.filterwarnings("ignore")
# Let pandas display up to 300 columns before it truncates a wide frame
pd.set_option('display.max_columns', 300)

## Run Data Cleaning

The test set must go through exactly the same data cleaning steps as the initial set.

In order to do so, I run the same data cleaning in the cell below.

In [21]:
# Read the unseen feature set, using the file's first column as the row index
df_clean = pd.read_csv('../data/kc_house_data_test_features.csv', index_col=0)

# Keep only the first eight characters of every raw date string (this removes
# the time portion) and convert the trimmed strings to datetime in one step
df_clean['date'] = pd.to_datetime(
    df_clean['date'].map(lambda raw_date: raw_date[:8])
)

# Locate every king_county*.csv anywhere under the data folder.
# The paths are sorted so the files are always read -- and their columns
# concatenated -- in the same order, however the filesystem lists them.
files = sorted(Path('../data/').rglob('king_county*.csv'))

# Read each file into its own frame:
#   - only the 1st and 4th columns are kept, with the zip code column used as
#     the index so the frames can be aligned on it when concatenated
#   - cells equal to 0 are read as NaN
#   - the last 11 lines of each file are skipped
#   - skipfooter is only supported by the python parser, so that engine is
#     requested explicitly instead of relying on pandas' silent fallback
all_csvs = [
    pd.read_csv(
        file,
        index_col='Zip Code Tabulation Area',
        usecols=[0, 3],
        na_values=0,
        skipfooter=11,
        engine='python',
    )
    for file in files
]

# Short, code-friendly names for the zip code column and each source column
king_county_column_names = {
    'Zip Code Tabulation Area': 'zipcode',
    'Estimated number of housing units, between 2015-2019.': 'number_of_housing_units',
    'Estimated median value of an owner-occupied home, between 2015-2': 'median_value_occupied_home',
    'Estimated median income of a household, between 2015-2019.': 'household_median_income',
    'Estimated number of people per square mile, between 2015-2019.': 'people_per_square_mile',
    'Estimated percent of all people that are living in poverty as of': 'poverty_rate',
}

# Place the per-file frames side by side (aligned on the zip code index), then
# turn the zip code index back into a regular column and apply the short names
king_county_df = (
    pd.concat(all_csvs, axis=1)
    .reset_index()
    .rename(columns=king_county_column_names)
)

# Keep only the zip codes that also occur in the housing data and renumber the
# remaining rows from zero
in_housing_data = king_county_df['zipcode'].isin(list(df_clean['zipcode']))
king_county_df = king_county_df[in_housing_data].reset_index(drop=True)

# Left-join the zip-code-level columns onto every house; houses whose zip code
# has no match keep their row and receive NaN in the added columns
df_clean = df_clean.merge(king_county_df, on='zipcode', how='left')

# Bedroom counts above 16 are replaced by the column mean, rounded to the
# nearest whole number. The cut-off of 16 was chosen as the 2nd largest value
# + 5 standard deviations. The mean is taken over the whole column, before any
# value is replaced.
mean_bedrooms = round(df_clean['bedrooms'].mean(), 0)
has_too_many_bedrooms = df_clean['bedrooms'] > 16
df_clean['bedrooms'] = np.where(has_too_many_bedrooms,
                                mean_bedrooms,
                                df_clean['bedrooms'])

# Grade bins: each new column holds 1 where its condition is met and 0 otherwise
grade = df_clean['grade']
grade_bins = {
    'below_average_grade': grade < 7,
    'average_grade': grade == 7,
    'above_average_grade': grade.isin(range(8, 11)),  # grades 8, 9 and 10
    'high_grade': grade > 10,
}
for column_name, in_bin in grade_bins.items():
    df_clean[column_name] = np.where(in_bin, 1, 0)

# Bathroom bins, built the same way. Each range includes its lower bound and
# excludes its upper bound.
baths = df_clean['bathrooms']
bathroom_bins = {
    'bath_5_plus': baths >= 5.5,
    'bath_3_5': (baths >= 3.75) & (baths < 5.5),
    'bath_2_3': (baths >= 2.75) & (baths < 3.75),
    'bath_1_2': (baths >= 1.25) & (baths < 2.75),
    'bath_1_minus': baths < 1.25,
}
for column_name, in_bin in bathroom_bins.items():
    df_clean[column_name] = np.where(in_bin, 1, 0)


# Interaction features: row-wise products of two existing columns
df_clean['sqft_x_median_value'] = df_clean['sqft_living'].mul(df_clean['median_value_occupied_home'])
df_clean['floors_x_view'] = df_clean['floors'].mul(df_clean['view'])

# Flag (1/0) homes that meet all three conditions: marked as waterfront,
# longitude above -122.4 and latitude above 47.5
is_waterfront = df_clean['waterfront'] == 1
long_above_cutoff = df_clean['long'] > -122.4
lat_above_cutoff = df_clean['lat'] > 47.5
df_clean['waterfront_wealth'] = np.where(
    is_waterfront & long_above_cutoff & lat_above_cutoff, 1, 0
)

# One indicator column per zip code present in this file.
# Every level is kept on purpose. Dropping the "first" level here would drop
# whichever zip code happens to be lowest in *this* file, which need not be the
# level that is absent from the model's column list. The prediction step
# selects only the columns listed in other_info, so a surplus indicator column
# is simply ignored.
zipcode_dummies = pd.get_dummies(df_clean['zipcode'])

# Attach the indicator columns to the main dataframe, matching rows by index
df_clean = df_clean.merge(zipcode_dummies,
                          how='left', left_index=True, right_index=True)

## Load Pickle Data

I now load the objects I previously exported as pickle files: the model and the list of columns it uses.

In [14]:
# Load the pickled model. The with-block closes the file even when unpickling
# raises, so the handle is never leaked.
# NOTE(review): pickle.load can execute arbitrary code -- only load pickle
# files that come from a trusted source.
with open("pickle/model.pickle", 'rb') as infile:
    model = pickle.load(infile)

In [15]:
# Sanity check that the model exported/loaded correctly: show how many
# coefficients it holds
n_coefficients = len(model.coef_)
print(n_coefficients)

103


In [16]:
# Load the second pickled object. The with-block closes the file even when
# unpickling raises, so the handle is never leaked.
# NOTE(review): pickle.load can execute arbitrary code -- only load pickle
# files that come from a trusted source.
with open("pickle/other_info.pickle", 'rb') as infile:
    other_info = pickle.load(infile)

In [17]:
# Brief check of the loaded column labels; the prediction cell uses this object
# to select columns from df_clean
other_info

Index([   'bedrooms',   'bathrooms', 'sqft_living',    'sqft_lot',
            'floors',  'waterfront',        'view',   'condition',
             'grade',  'sqft_above',
       ...
               98146,         98148,         98155,         98166,
               98168,         98177,         98178,         98188,
               98198,         98199],
      dtype='object', length=103)

In [22]:
# Select the columns listed in other_info from the cleaned frame and pass them
# to the loaded model to obtain the predictions
model_inputs = df_clean[other_info]
final_answers = model.predict(model_inputs)

In [32]:
# There should be exactly one prediction for every row of the cleaned data
n_predictions, n_rows = len(final_answers), len(df_clean)
n_predictions == n_rows

True

Finally, I export the predictions as a CSV file so they can be checked against the actual values.

In [29]:
# Wrap the predictions in a DataFrame so they can be written out with to_csv
df_final = pd.DataFrame(final_answers)

In [30]:
# Write the predictions to a CSV file, using a path relative to the current
# working directory
df_final.to_csv('housing_preds_isana_mizuma.csv')