In [1]:
import matplotlib.pyplot as plt
from math import exp
from scipy.stats import norm
from scipy import stats
import seaborn as sns
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import pickle

import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', 300)
df_kc = pd.read_csv('kc_house_data_test_features.csv')

In [4]:
import pickle
loaded_model = pickle.load(open('./savedModels/final_kc_lrm_v1.sav', 'rb'))
load_zipgroups = pickle.load(open)

In [2]:

df_kc = pd.read_csv('kc_house_data_test_features.csv')

################################################################ DATA FORMATTING/CLEANING ################################################################

rnmdate = {'date':'selldate'}
df_kc.rename(columns = rnmdate, inplace = True)

df_kc.selldate = df_kc.selldate.apply(lambda x: x[:8])
df_kc.selldate = df_kc.selldate.apply(lambda x: pd.to_datetime(x, yearfirst=True, format='%Y%m%d'))

df_kc['yr_old'] = np.where(df_kc['yr_renovated'] != 0, df_kc['selldate'].apply(lambda x: x.year) - df_kc['yr_renovated'], 
                   df_kc['selldate'].apply(lambda x: x.year) - df_kc['yr_built'])
df_kc['yr_old'] = np.where(df_kc['yr_old'] < 0, 0, df_kc['yr_old'])

# turning the 0s into nans 
df_kc.yr_renovated = np.where(df_kc.yr_renovated == 0, np.nan, df_kc.yr_renovated)
# make a column that has the sqft living / sqft lot. Effectively, how big is the house in comparison to the land
df_kc['liv_lot_ratio'] = df_kc.sqft_living / df_kc.sqft_lot 

# The below np.select function is to adjust all of the houses estimation of number of bathrooms 

conditions = [  (df_kc.sqft_living <= 3000) & ((df_kc.bathrooms >= 0) & (df_kc.bathrooms <= 1.5)),
                ((df_kc.sqft_living  <= 3800) & (df_kc.sqft_living  >= 3000))  & ((df_kc.bathrooms >= 0) & (df_kc.bathrooms <= 3)), 
                ((df_kc.sqft_living  <= 4800) & (df_kc.sqft_living  > 3800))  & ((df_kc.bathrooms >= 0) & (df_kc.bathrooms <= 2)),
               (df_kc.liv_lot_ratio <= 0.001)]
choices = [ df_kc.bathrooms,
            df_kc.bathrooms.median(),
            df_kc.bedrooms,
            df_kc.bathrooms]
df_kc.bathrooms = np.select(conditions, choices, default = df_kc.bathrooms)

#The below np.select() function is to adjust all of the houses with an overestimation of bedrooms

df_kc.at[8597, 'bedrooms'] = 3
conditions = [ (df_kc.sqft_living <= 3000) & ((df_kc.bedrooms >= 1) & (df_kc.bedrooms <= 7)), # do the smaller end
                (df_kc.sqft_living  > 4000) & ((df_kc.bedrooms >= 1) & (df_kc.bedrooms <= 3)) # do the upper end
               ]
choices = [ df_kc.bedrooms.mean(),
           5
            ]
df_kc.bedrooms = np.select(conditions, choices, default = df_kc.bedrooms)




# The below np.select() function is to adjust all of the houses with an underestimation of bedrooms

conditions = [  (df_kc.sqft_living > 3000) & ((df_kc.bedrooms >= 0) & (df_kc.bedrooms <= 3)), # do the upper end
                ((df_kc.sqft_living  <= 3800) & (df_kc.sqft_living  > 2800))  & ((df_kc.bedrooms >= 0) & (df_kc.bedrooms <= 3)) # do the lower end
               ]
choices = [ 6.5,
            4.5
            ]
df_kc.bedrooms = np.select(conditions, choices, default = df_kc.bedrooms)




In [3]:
################################################################ FEATURE ENGINEERING  ################################################################

## bedrooms to bathrooms ratio
#df_kc['bed_bath'] = df_kc['bedrooms']/df_kc['bathrooms']
#            
#
#### Create feature for housing listing proximity to nearest central transit center (sites that host major rail and bus lines) .
#transit_loc = {'Northgate TC': (47.707696,-122.326842),
#           'UW': (47.651572,-122.304242) ,
#           'Westlake': (47.612982,-122.336532) ,
#           'Capitol Hill': (47.621353,-122.320111) ,
#           'Bellevue': (47.617540,-122.-195230),
#           'Pine Street, 9th ave': (47.614489,-122.332074) ,
#           'Pioneer Square': (47.603276,-122.331904) ,
#           'Kings Street': (47.598833,-122.329926) ,
#           "Int'l District/ctown": (47.598308,-122.327837) ,
#           '6th ave S & S Atlantic': (47.591008,-122.325878) ,
#           'Rainier Ave & Mt Baker TC': (47.578748,-122.297142) ,
#           'Rainier Beach': (47.524828,-122.280494),
#           'Columbia City': (47.560558,-122.293057) ,
#           'Kings County South Base': (47.499379,-122.284285) ,
#           'Tukwila': (47.463233,-122.238660) ,
#           'Renton TC': (47.488332,-122.210975),
#           'Burien TC': (47.474578,-122.334315)}
#
#loc_coord = np.array(list(zip(df_kc.lat, df_kc.long)))
#
#df_kc = df_kc[df_kc['lat'].notna()]
#df_kc = df_kc[df_kc['long'].notna()]
#
#import geopy.distance
#metro_prox = []
#for houseloc in loc_coord:
#    sortlist=[]
#    for transitloc in transit_loc.values():
#        sortlist.append(geopy.distance.great_circle(houseloc,transitloc).miles)
#        #print(len(sortlist))
#    metro_prox.append(min(sortlist))
#    
#df_kc['metro_prox'] = metro_prox


### Create zip code grouping dummies.

#zips = df_kc.groupby('zipcode').price.mean().sort_values(ascending = True)
#
#df_kc2 = zips.to_frame().reset_index()
#
#ziplist = []
#sublist = []
#for i in range(69):
#    if len(sublist) == 0:
#        sublist = [df_kc2.zipcode.iloc[i]]
#        if df_kc2.price.iloc[i+1]/df_kc2.price.iloc[i] < 1.02:
#            sublist.append(df_kc2.zipcode.iloc[i+1])
#        else:
#            ziplist.append(sublist)
#            sublist = []
#    else:
#        if df_kc2.price.iloc[i+1]/df_kc2.price.iloc[i] < 1.02:
#            sublist.append(df_kc2.zipcode.iloc[i+1])
#        else:
#            ziplist.append(sublist)
#            sublist = []
#ziplist.append([df_kc2.zipcode.iloc[69]])
#
#zipstr = [str(x) for x in ziplist]
#zipdict = dict(zip(zipstr, ziplist))
#
#zipunique = df_kc.zipcode.unique().tolist()
#ziplist1 = []
#
#for i in zipunique: 
#    for j, k in zipdict.items():
#        if i in k:
#            ziplist1.append(j)
#            
#zipdict1 = dict(zip(zipunique, ziplist1))


df_kc['zipgroup'] = df_kc['zipcode'].map(zipdict1)

df_kc = pd.concat([df_kc, pd.get_dummies(df_kc['zipgroup'])], 1)

AttributeError: 'DataFrameGroupBy' object has no attribute 'price'

In [5]:
final_answers = loaded_model.predict(pd.read_csv('kc_house_data_test_features.csv'))

ValueError: could not convert string to float: '20140827T000000'

In [None]:
final_answers.to_csv('holdout_jp_pt.csv')