# King County House Price Prediction

In [9]:
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
import warnings
warnings.filterwarnings('ignore')

import statsmodels.formula.api as smf


In [3]:
# Load the raw King County listings from the comma-separated file under data/.
king_county_house = pd.read_csv(
    "data/King_County_House_prices_dataset.csv",
    sep=",",
)

In [5]:
# Set correct format.
# Columns that only need a plain dtype cast (identifier as text, price as
# integer, the rest as categoricals).
simple_casts = {
    "id": "str",
    "price": "int",
    "waterfront": "category",
    "condition": "category",
    "grade": "category",
    "zipcode": "category",
}
for column, target_dtype in simple_casts.items():
    king_county_house[column] = king_county_house[column].astype(target_dtype)

# Columns whose missing values are replaced by 0 before the integer cast.
for column in ("view", "yr_renovated"):
    king_county_house[column] = king_county_house[column].fillna(0).astype("int")

# Keep only the calendar date of each parsed timestamp.
king_county_house["date"] = pd.to_datetime(king_county_house["date"]).dt.date

# Entries that cannot be parsed as numbers become NaN (errors="coerce").
king_county_house["sqft_basement"] = pd.to_numeric(
    king_county_house["sqft_basement"], errors="coerce"
)

# Remove outlier: drop the row(s) recorded with 33 bedrooms.
keep_rows = king_county_house["bedrooms"].ne(33)
king_county_house = king_county_house[keep_rows]

In [6]:
# Show the column labels of the cleaned frame.
king_county_house.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')

In [10]:
# Candidate predictors to screen one at a time against price.
col_names = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'waterfront', 'view', 'condition', 'grade', 'sqft_above',
    'sqft_basement', 'yr_built', 'yr_renovated', 'lat',
    'sqft_living15', 'sqft_lot15',
]

# Fit one single-predictor OLS model per candidate and keep its adjusted R².
adj_r2_scores = []
for predictor in col_names:
    single_fit = smf.ols(formula=f"price ~ {predictor}", data=king_county_house).fit()
    adj_r2_scores.append(single_fit.rsquared_adj)

# One row per predictor, indexed by predictor name.
r_values = pd.DataFrame({"adj_r-squared": adj_r2_scores}, index=col_names)

# Display the predictors ranked from highest to lowest adjusted R².
r_values.sort_values("adj_r-squared", ascending=False)

Unnamed: 0,adj_r-squared
grade,0.519769
sqft_living,0.492681
sqft_above,0.36647
sqft_living15,0.342507
bathrooms,0.276553
view,0.154805
sqft_basement,0.105585
bedrooms,0.099789
lat,0.094015
waterfront,0.076292


In [11]:
# Feature selection: these are the five predictors with the highest
# single-variable adjusted R² in the ranking shown above; price is the target.
feature_columns = ["grade", "sqft_living", "sqft_above", "sqft_living15", "bathrooms"]
X = king_county_house[feature_columns]
Y = king_county_house["price"]

# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
print("-----  Splitting the data in train and test ----")
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.33, random_state=42
)

# sm.OLS does not add an intercept on its own, so give both design matrices
# an explicit constant column.
X_train, X_test = sm.add_constant(X_train), sm.add_constant(X_test)

# Fit ordinary least squares on the training rows and keep the summary table
# so it can be printed later.
print("-----  Training the model ----")
model = sm.OLS(y_train, X_train).fit()
print_model = model.summary()

-----  Splitting the data in train and test ----
-----  Training the model ----


In [12]:

print("-----  Evaluating the model ----")

# Predict on the rows the model was fitted on and on the held-out rows.
predictions = model.predict(X_train)
predictions_test = model.predict(X_test)

# Root-mean-squared error of each set of predictions against the true prices.
err_train = np.sqrt(mean_squared_error(y_train, predictions))
err_test = np.sqrt(mean_squared_error(y_test, predictions_test))

# Regression summary first, then both errors rounded to whole numbers with
# thousands separators.
print(print_model)
print(
    "-------------",
    f"RMSE on train data: {err_train:,.0f}",
    f"RMSE on test data: {err_test:,.0f}",
    sep="\n",
)

-----  Evaluating the model ----
                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.548
Model:                            OLS   Adj. R-squared:                  0.548
Method:                 Least Squares   F-statistic:                     3506.
Date:                Sun, 18 Oct 2020   Prob (F-statistic):               0.00
Time:                        22:34:00   Log-Likelihood:            -2.0031e+05
No. Observations:               14469   AIC:                         4.006e+05
Df Residuals:                   14463   BIC:                         4.007e+05
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const        