In [1]:
import pandas as pd
import numpy as np 
# Let wide frames render up to 300 columns instead of being truncated.
pd.options.display.max_columns = 300

## Step 1: Read in holdout data, scalers, and best model

In [2]:
# Read the holdout feature file, using the first CSV column as the row index.
holdout = pd.read_csv(
    'data/kc_house_data_test_features.csv',
    index_col=0,
)
holdout.head()


Unnamed: 0,id,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,1974300020,20140827T000000,4,2.5,2270,11500,1.0,0,0,3,8,1540,730,1967,0,98034,47.7089,-122.241,2020,10918
1,1974300020,20150218T000000,4,2.5,2270,11500,1.0,0,0,3,8,1540,730,1967,0,98034,47.7089,-122.241,2020,10918
2,3630020380,20141107T000000,3,2.5,1470,1779,2.0,0,0,3,8,1160,310,2005,0,98029,47.5472,-121.998,1470,1576
3,1771000290,20141203T000000,3,1.75,1280,16200,1.0,0,0,3,8,1030,250,1976,0,98077,47.7427,-122.071,1160,10565
4,5126310470,20150115T000000,4,2.75,2830,8126,2.0,0,0,3,8,2830,0,2005,0,98059,47.4863,-122.14,2830,7916


In [4]:
import pickle

In [5]:
# Load the fitted model saved earlier.
# The context manager closes the file even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code -- only load pickle
# files that come from a trusted source.
with open('pickled_files/model.pickle', 'rb') as infile:
    model = pickle.load(infile)

# Quick sanity check of what was loaded: intercept and number of coefficients.
print(model.intercept_)
print(len(model.coef_))

11.425282165459201
117


In [6]:
# Load the auxiliary objects saved alongside the model.
# The context manager closes the file even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code -- only load pickle
# files that come from a trusted source.
with open("pickled_files/other_info.pickle", 'rb') as infile:
    other_info = pickle.load(infile)

In [7]:
# The first element of the unpickled object holds the column labels that are
# later used to subset the holdout frame before prediction.
selected_cols = other_info[0]
selected_cols

Index(['sqft_living', 'sqft_lot', 'waterfront', 'view', 'grade', 'sqft_above',
       'sqft_basement', 'sqft_living15', 'sqft_lot15', 'num_times_sold',
       ...
       'zipcode_98188', 'zipcode_98198', 'zipcode_98199', 'age',
       'is_renovated', 'season_sold_Spring', 'season_sold_Summer',
       'season_sold_Winter', 'winterXsqft_living', 'springXsqft_living'],
      dtype='object', length=117)

In [8]:
# Count missing values per column before any feature engineering.
holdout.isnull().sum()

id               0
date             0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
zipcode          0
lat              0
long             0
sqft_living15    0
sqft_lot15       0
dtype: int64

## Step 2: Feature Engineering for holdout set

Remember we have to perform the same transformations on our holdout data (feature engineering, extreme values, and scaling) that we performed on the original data.

In [9]:
# Raw values look like '20140827T000000'. Keep the leading YYYYMMDD part and
# parse it with a format that matches that text exactly (the data has no
# dashes, so "%Y-%m-%d" is rejected by strict format parsing).
holdout['date'] = pd.to_datetime(holdout['date'].str.slice(0, 8), format="%Y%m%d")

In [10]:
# Number of rows in the holdout file that share each house id.
dictNumTimesSold = holdout['id'].value_counts().to_dict()
holdout['num_times_sold'] = holdout['id'].map(dictNumTimesSold)

In [11]:
# 'id' has already been used for num_times_sold; drop it with 'lat' and 'long'.
holdout = holdout.drop(columns=['id', 'lat', 'long'])

In [12]:
# Store zipcode as text so it is handled as a category, not a number.
holdout['zipcode'] = holdout['zipcode'].astype(str)

In [13]:
# Cap bedroom counts: anything of 7 or more is recorded as 7.
holdout['bedrooms'] = holdout['bedrooms'].clip(upper=7)

In [14]:
# Handle extreme bathroom counts in two passes:
#   1. values strictly between 0 and 1 become 1
#   2. values of 5.75 or more become 7
bath = holdout['bathrooms']
bath = bath.mask((bath > 0) & (bath < 1), 1)
holdout['bathrooms'] = bath.mask(bath >= 5.75, 7)

In [15]:
# Bucket the numeric grade into three bands. np.select takes the first
# condition that matches, so the bands are: <= 8 low, 9-12 medium, 13 high.
conditions = [(holdout['grade'] <= 8),
              (holdout['grade'] <= 12),
              (holdout['grade'] <= 13)]
choices = ['low', 'medium', 'high']
# np.select's implicit default is the integer 0, which cannot be combined
# with string choices on recent NumPy (TypeError: no common dtype). Passing
# the string '0' keeps the label that older NumPy versions produced for rows
# matching none of the conditions (grade above 13 or missing).
holdout['grade_cat'] = np.select(conditions, choices, default='0')

In [16]:
# One-hot encode the categorical features; drop_first removes the first level
# of each feature so it acts as the baseline.
categorical = [
    'condition',
    'grade_cat',
    'bedrooms',
    'bathrooms',
    'floors',
    'zipcode',
]
holdout = pd.get_dummies(data=holdout, columns=categorical, drop_first=True)


In [17]:
# House age relative to the fixed reference year 2021.
# NOTE(review): presumably the same reference year was used when the model
# was trained -- confirm before changing it.
holdout['age'] = 2021 - holdout['yr_built']


In [18]:
# Calendar month (1-12) of the sale date.
sale_dates = holdout['date']
holdout['month_sold'] = sale_dates.dt.month

In [19]:
# Season label for each month, listed January through December:
# Dec-Feb -> Winter, Mar-May -> Spring, Jun-Aug -> Summer, Sep-Nov -> Fall.
seasons = ['Winter'] * 2 + ['Spring'] * 3 + ['Summer'] * 3 + ['Fall'] * 3 + ['Winter']
month_to_season = {month: season for month, season in enumerate(seasons, start=1)}
holdout['season_sold'] = holdout['month_sold'].map(month_to_season)

In [20]:
# Calendar year of the sale date.
sale_dates = holdout['date']
holdout['year_sold'] = sale_dates.dt.year

In [21]:
# 1 when a renovation year is recorded (yr_renovated > 0), otherwise 0.
holdout['is_renovated'] = (holdout['yr_renovated'] > 0).astype(int)


In [22]:
# One-hot encode the sale season (first level dropped as the baseline), then
# build the interaction features from the resulting indicator columns.
holdout = pd.get_dummies(data=holdout, columns=['season_sold'], drop_first=True)
living_area = holdout['sqft_living']
holdout['winterXsqft_living'] = holdout['season_sold_Winter'].mul(living_area)
holdout['springXsqft_living'] = holdout['season_sold_Spring'].mul(living_area)
holdout['ageXnumTimesSold'] = holdout['age'].mul(holdout['num_times_sold'])



In [23]:
# Display the column labels again, just before using them to subset holdout.
selected_cols

Index(['sqft_living', 'sqft_lot', 'waterfront', 'view', 'grade', 'sqft_above',
       'sqft_basement', 'sqft_living15', 'sqft_lot15', 'num_times_sold',
       ...
       'zipcode_98188', 'zipcode_98198', 'zipcode_98199', 'age',
       'is_renovated', 'season_sold_Spring', 'season_sold_Summer',
       'season_sold_Winter', 'winterXsqft_living', 'springXsqft_living'],
      dtype='object', length=117)

In [24]:
# Line the holdout frame up with the exact columns stored in selected_cols.
#
# get_dummies only creates indicator columns for levels that actually occur
# in the holdout file, so a level listed in selected_cols (e.g. a zipcode)
# can be absent here. Plain holdout[selected_cols] would raise KeyError in
# that case; an absent level is correctly encoded as an all-zero indicator.
dummy_prefixes = ('condition_', 'grade_cat_', 'bedrooms_', 'bathrooms_',
                  'floors_', 'zipcode_', 'season_sold_')
missing_cols = [col for col in selected_cols if col not in holdout.columns]

# Any other missing column means a feature-engineering step was skipped, so
# keep failing loudly, as the original lookup did.
unexpected_missing = [col for col in missing_cols
                      if not str(col).startswith(dummy_prefixes)]
if unexpected_missing:
    raise KeyError(f"{unexpected_missing} not in holdout columns")

transformed_holdout = holdout.reindex(columns=selected_cols, fill_value=0)

## Step 3: Predict the holdout set

In [25]:
# Predict, then exponentiate the model output.
# NOTE(review): presumably the model was fit on a log-transformed target, so
# np.exp maps predictions back to the original scale -- confirm.
raw_predictions = model.predict(transformed_holdout)
final_answers = np.exp(raw_predictions)

In [26]:
# Wrap the prediction array in a DataFrame so it can be exported.
final_answers_df = pd.DataFrame(data=final_answers)

## Step 4: Export your predictions

In [27]:
# final_answers_df.to_csv('housing_preds_jason_arikupurathu.csv',index=False)