# Model house prices in King County

In [1]:
#importing libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas import DataFrame
import seaborn as sns
from scipy import stats
import statsmodels.api as sms
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

%matplotlib inline

In [2]:
#loading data
kingcnt = pd.read_csv("data/King_County_House_prices_dataset.csv")

In [3]:
#creating data frame
df_kingcnt = DataFrame(kingcnt,columns=['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15']) 

## Dealing with missing values

In [4]:
#replacing missing values with 0
df_kingcnt.fillna({'waterfront':0, 'view':0}, inplace=True)

## Log transforming skewed data

In [5]:
df_kingcnt['price_log'] = np.log(df_kingcnt['price'])

In [6]:
df_kingcnt['sqft_lot_log'] = np.log(df_kingcnt['sqft_lot'])

In [7]:
df_kingcnt['sqft_lot15_log'] = np.log(df_kingcnt['sqft_lot15'])

## Splitting data into train and test set

In [8]:
train, test = train_test_split(df_kingcnt, test_size=0.33, random_state=42)

## Cleaning train data: removing outliers

In [9]:
train = train[train.bedrooms != 33]

In [10]:
train = train[train.sqft_living < 12000]

In [11]:
train = train[train.sqft_lot < 1100000]

In [12]:
train = train[train.sqft_above < 9000]

In [13]:
train = train[train.sqft_lot15 < 500000]

## Feature engineering

In [14]:
# training set
X_train = train[['bedrooms','bathrooms','sqft_living','sqft_lot_log','waterfront','view','grade','sqft_above','sqft_living15','sqft_lot15_log']]
y_train = train['price_log']

# testing set
X_test = test[['bedrooms','bathrooms','sqft_living','sqft_lot_log','waterfront','view','grade','sqft_above','sqft_living15','sqft_lot15_log']]
y_test = test['price_log']

# add constants
X_train = sms.add_constant(X_train)
X_test = sms.add_constant(X_test)

## Training the model

In [15]:
model = sms.OLS(y_train, X_train)
result = model.fit()
result.summary()

0,1,2,3
Dep. Variable:,price_log,R-squared:,0.6
Model:,OLS,Adj. R-squared:,0.6
Method:,Least Squares,F-statistic:,2172.0
Date:,"Thu, 15 Oct 2020",Prob (F-statistic):,0.0
Time:,10:32:37,Log-Likelihood:,-4617.7
No. Observations:,14465,AIC:,9257.0
Df Residuals:,14454,BIC:,9341.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,11.8452,0.043,273.371,0.000,11.760,11.930
bedrooms,-0.0163,0.004,-4.124,0.000,-0.024,-0.009
bathrooms,-0.0207,0.006,-3.461,0.001,-0.032,-0.009
sqft_living,0.0003,8.25e-06,34.103,0.000,0.000,0.000
sqft_lot_log,0.0001,0.008,0.014,0.989,-0.015,0.016
waterfront,0.4436,0.036,12.273,0.000,0.373,0.514
view,0.0646,0.004,15.387,0.000,0.056,0.073
grade,0.1702,0.004,40.629,0.000,0.162,0.178
sqft_above,-0.0001,7.51e-06,-14.516,0.000,-0.000,-9.43e-05

0,1,2,3
Omnibus:,12.776,Durbin-Watson:,1.987
Prob(Omnibus):,0.002,Jarque-Bera (JB):,11.403
Skew:,0.019,Prob(JB):,0.00334
Kurtosis:,2.868,Cond. No.,57200.0


## Testing the model

In [23]:
predictions = model.predict(X_train)
np.sqrt(mean_squared_error(y_train, predictions))
predictions_test = model.predict(X_test)
np.sqrt(mean_squared_error(y_test, predictions_test))

ValueError: shapes (14465,11) and (14465,11) not aligned: 11 (dim 1) != 14465 (dim 0)