# Import libraries and csv file

In [59]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy import stats
import datetime as dt

In [60]:
# Read file in as Pandas dataframe

df = pd.read_csv('kc_house_data.csv')
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,10/13/2014,221900.0,3,1.0,1180,5650,1.0,,0.0,...,7,1180,0.0,1955,0.0,98178,47.5112,-122.257,1340,5650
1,6414100192,12/9/2014,538000.0,3,2.25,2570,7242,2.0,0.0,0.0,...,7,2170,400.0,1951,1991.0,98125,47.721,-122.319,1690,7639
2,5631500400,2/25/2015,180000.0,2,1.0,770,10000,1.0,0.0,0.0,...,6,770,0.0,1933,,98028,47.7379,-122.233,2720,8062
3,2487200875,12/9/2014,604000.0,4,3.0,1960,5000,1.0,0.0,0.0,...,7,1050,910.0,1965,0.0,98136,47.5208,-122.393,1360,5000
4,1954400510,2/18/2015,510000.0,3,2.0,1680,8080,1.0,0.0,0.0,...,8,1680,0.0,1987,0.0,98074,47.6168,-122.045,1800,7503


# Preproccess and clean datatypes, null values, and value ranges

In [61]:
df['sqft_basement'] = pd.to_numeric(df['sqft_basement'], errors = 'coerce')

df['date'] = pd.to_datetime(df['date'], infer_datetime_format = True)

df['yrs_old'] = [i.year for i in df['date']] - df['yr_built']

df.drop(['date'], axis = 1, inplace = True)

In [62]:
df['view'] = df['view'].fillna(df['view'].mode()[0])

df['waterfront'] = df['waterfront'].fillna(df['waterfront'].mode()[0])

df['renovated'] = [0 if i == 0.0 else 1 for i in df['yr_renovated']]
df.drop(['yr_renovated'], axis = 1, inplace = True)

df['basement'] = [0 if i == 0.0 else 1 for i in df['sqft_basement']]
df.drop(['sqft_basement'], axis = 1, inplace = True)

In [63]:
df.drop(['yr_built', 'id'], axis = 1, inplace = True)

In [64]:
df.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,zipcode,lat,long,sqft_living15,sqft_lot15,yrs_old,renovated,basement
0,221900.0,3,1.0,1180,5650,1.0,0.0,0.0,3,7,1180,98178,47.5112,-122.257,1340,5650,59,0,0
1,538000.0,3,2.25,2570,7242,2.0,0.0,0.0,3,7,2170,98125,47.721,-122.319,1690,7639,63,1,1
2,180000.0,2,1.0,770,10000,1.0,0.0,0.0,3,6,770,98028,47.7379,-122.233,2720,8062,82,1,0
3,604000.0,4,3.0,1960,5000,1.0,0.0,0.0,5,7,1050,98136,47.5208,-122.393,1360,5000,49,0,1
4,510000.0,3,2.0,1680,8080,1.0,0.0,0.0,3,8,1680,98074,47.6168,-122.045,1800,7503,28,0,0


In [65]:
df = df[(df['price'] >= 0) & (df['price'] <= 1000000)]
df = df[(df['bedrooms'] >= 0) & (df['bedrooms'] <= 7)]
df = df[(df['bathrooms'] >= 0) & (df['bathrooms'] <= 5)]

# Import distance to city feature

In [66]:
from haversine import haversine

seattle = [47.6092, -122.3363]
bellevue = [47.61555, -122.20392]

In [67]:
haversine(seattle, bellevue, unit ='mi')

6.181695095872431

In [68]:
df['lat_long'] = tuple(zip(df.lat, df.long))

In [69]:
seattle_distances = []
for i in df['lat_long']:
    seattle_distances.append(haversine((seattle), (i), unit = 'mi'))
df['distance_from_seattle'] = pd.Series(seattle_distances)

bellevue_distances = []
for i in df['lat_long']:
    bellevue_distances.append(haversine((bellevue), (i), unit = 'mi'))
df['distance_from_bellevue'] = pd.Series(bellevue_distances)

In [70]:
df['distance_from_city'] = df[['distance_from_bellevue', 'distance_from_seattle']].min(axis = 1)

In [71]:
df.drop(['lat', 'long', 'lat_long', 'distance_from_seattle', 'distance_from_bellevue'], axis = 1, inplace = True)

In [72]:
df.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,zipcode,sqft_living15,sqft_lot15,yrs_old,renovated,basement,distance_from_city
0,221900.0,3,1.0,1180,5650,1.0,0.0,0.0,3,7,1180,98178,1340,5650,59,0,0,7.622786
1,538000.0,3,2.25,2570,7242,2.0,0.0,0.0,3,7,2170,98125,1690,7639,63,1,1,7.766476
2,180000.0,2,1.0,770,10000,1.0,0.0,0.0,3,6,770,98028,2720,8062,82,1,0,8.561145
3,604000.0,4,3.0,1960,5000,1.0,0.0,0.0,5,7,1050,98136,1360,5000,49,0,1,6.65534
4,510000.0,3,2.0,1680,8080,1.0,0.0,0.0,3,8,1680,98074,1800,7503,28,0,0,7.402273


In [49]:
df.corr()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,zipcode,sqft_living15,sqft_lot15,yrs_old,renovated,basement,distance_from_city
price,1.0,0.295275,0.440243,0.604644,0.094318,0.269486,0.051813,0.21312,0.033946,0.617743,0.515704,-0.01954,0.549575,0.082325,-0.05876,0.025985,0.15305,-0.009714
bedrooms,0.295275,1.0,0.499567,0.60472,0.026786,0.157301,-0.027363,0.031535,0.023294,0.330037,0.479849,-0.159746,0.389061,0.024143,-0.173569,-0.013175,0.142414,0.004784
bathrooms,0.440243,0.499567,1.0,0.709769,0.062676,0.504069,-0.006228,0.080674,-0.139028,0.6082,0.630987,-0.214744,0.52403,0.063067,-0.55438,-0.005848,0.127784,-0.003733
sqft_living,0.604644,0.60472,0.709769,1.0,0.157876,0.33663,0.00069,0.147725,-0.074536,0.69499,0.849608,-0.211589,0.734272,0.172156,-0.358993,-0.004587,0.168651,-0.005816
sqft_lot,0.094318,0.026786,0.062676,0.157876,1.0,-0.018346,0.025736,0.081479,-0.002288,0.090518,0.159969,-0.128953,0.140817,0.705859,-0.040458,-0.008173,-0.033794,-0.002941
floors,0.269486,0.157301,0.504069,0.33663,-0.018346,1.0,-0.006607,-0.018082,-0.278563,0.457574,0.528446,-0.067107,0.26722,-0.022568,-0.515483,-0.010041,-0.278099,-0.017548
waterfront,0.051813,-0.027363,-0.006228,0.00069,0.025736,-0.006607,1.0,0.28232,0.011941,-0.013079,-0.009791,0.043204,0.005155,0.031845,0.034956,0.018796,0.017433,0.002341
view,0.21312,0.031535,0.080674,0.147725,0.081479,-0.018082,0.28232,1.0,0.030999,0.114924,0.05313,0.101812,0.168911,0.078011,0.068833,0.02401,0.1334,-0.014288
condition,0.033946,0.023294,-0.139028,-0.074536,-0.002288,-0.278563,0.011941,0.030999,1.0,-0.17414,-0.179969,-0.007297,-0.122257,0.002989,0.350477,-0.026073,0.123152,0.005276
grade,0.617743,0.330037,0.6082,0.69499,0.090518,0.457574,-0.013079,0.114924,-0.17414,1.0,0.700805,-0.189264,0.662988,0.100542,-0.499014,-0.018061,9.8e-05,-0.016431


## Model with Distance to City Feature

In [48]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

outcome = 'price'
predictors = df.drop(['price'], axis=1)
pred_sum = '+'.join(predictors.columns)
formula = outcome + '~' + pred_sum

model = ols(formula=formula, data=df).fit()
model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.567
Model:,OLS,Adj. R-squared:,0.567
Method:,Least Squares,F-statistic:,1446.0
Date:,"Thu, 18 Feb 2021",Prob (F-statistic):,0.0
Time:,14:46:52,Log-Likelihood:,-247620.0
No. Observations:,18771,AIC:,495300.0
Df Residuals:,18753,BIC:,495400.0
Df Model:,17,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-1.191e+07,1.98e+06,-6.031,0.000,-1.58e+07,-8.04e+06
bedrooms,-1.303e+04,1415.908,-9.204,0.000,-1.58e+04,-1.03e+04
bathrooms,2.643e+04,2355.568,11.219,0.000,2.18e+04,3.1e+04
sqft_living,45.1424,4.545,9.932,0.000,36.233,54.051
sqft_lot,0.1522,0.033,4.627,0.000,0.088,0.217
floors,4.635e+04,2623.732,17.665,0.000,4.12e+04,5.15e+04
waterfront,1.035e+05,1.92e+04,5.385,0.000,6.58e+04,1.41e+05
view,1.593e+04,1646.942,9.671,0.000,1.27e+04,1.92e+04
condition,1.976e+04,1563.141,12.640,0.000,1.67e+04,2.28e+04

0,1,2,3
Omnibus:,476.549,Durbin-Watson:,1.968
Prob(Omnibus):,0.0,Jarque-Bera (JB):,573.902
Skew:,0.337,Prob(JB):,2.39e-125
Kurtosis:,3.53,Cond. No.,210000000.0


In [50]:
# Drop column due to high p-value

df.drop(['distance_from_city'], axis = 1, inplace = True)

In [51]:
outcome = 'price'
predictors = df.drop(['price'], axis=1)
pred_sum = '+'.join(predictors.columns)
formula = outcome + '~' + pred_sum

model = ols(formula=formula, data=df).fit()
model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.567
Model:,OLS,Adj. R-squared:,0.567
Method:,Least Squares,F-statistic:,1646.0
Date:,"Thu, 18 Feb 2021",Prob (F-statistic):,0.0
Time:,14:48:42,Log-Likelihood:,-265300.0
No. Observations:,20116,AIC:,530600.0
Df Residuals:,20099,BIC:,530800.0
Df Model:,16,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-1.439e+07,1.92e+06,-7.499,0.000,-1.81e+07,-1.06e+07
bedrooms,-1.361e+04,1376.105,-9.892,0.000,-1.63e+04,-1.09e+04
bathrooms,2.536e+04,2264.360,11.200,0.000,2.09e+04,2.98e+04
sqft_living,46.1319,4.417,10.445,0.000,37.475,54.789
sqft_lot,0.1563,0.032,4.832,0.000,0.093,0.220
floors,4.979e+04,2436.079,20.441,0.000,4.5e+04,5.46e+04
waterfront,1.03e+05,1.91e+04,5.386,0.000,6.55e+04,1.41e+05
view,1.565e+04,1612.038,9.708,0.000,1.25e+04,1.88e+04
condition,1.989e+04,1554.384,12.794,0.000,1.68e+04,2.29e+04

0,1,2,3
Omnibus:,521.072,Durbin-Watson:,1.964
Prob(Omnibus):,0.0,Jarque-Bera (JB):,629.514
Skew:,0.34,Prob(JB):,2.01e-137
Kurtosis:,3.537,Cond. No.,211000000.0


In [52]:
# drop column due to correlation with sqft features

df.drop(['grade'], axis = 1, inplace = True)

In [53]:
outcome = 'price'
predictors = df.drop(['price'], axis=1)
pred_sum = '+'.join(predictors.columns)
formula = outcome + '~' + pred_sum

model = ols(formula=formula, data=df).fit()
model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.493
Model:,OLS,Adj. R-squared:,0.492
Method:,Least Squares,F-statistic:,1301.0
Date:,"Thu, 18 Feb 2021",Prob (F-statistic):,0.0
Time:,14:50:02,Log-Likelihood:,-266890.0
No. Observations:,20116,AIC:,533800.0
Df Residuals:,20100,BIC:,533900.0
Df Model:,15,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-2.072e+07,2.07e+06,-9.991,0.000,-2.48e+07,-1.67e+07
bedrooms,-2.36e+04,1478.403,-15.961,0.000,-2.65e+04,-2.07e+04
bathrooms,3.421e+04,2446.002,13.985,0.000,2.94e+04,3.9e+04
sqft_living,63.8724,4.770,13.390,0.000,54.522,73.222
sqft_lot,0.1575,0.035,4.499,0.000,0.089,0.226
floors,6.707e+04,2618.076,25.618,0.000,6.19e+04,7.22e+04
waterfront,6.92e+04,2.07e+04,3.343,0.001,2.86e+04,1.1e+05
view,2.013e+04,1743.257,11.547,0.000,1.67e+04,2.35e+04
condition,2.142e+04,1682.551,12.731,0.000,1.81e+04,2.47e+04

0,1,2,3
Omnibus:,495.462,Durbin-Watson:,1.964
Prob(Omnibus):,0.0,Jarque-Bera (JB):,538.181
Skew:,0.378,Prob(JB):,1.37e-117
Kurtosis:,3.264,Cond. No.,211000000.0


In [54]:
# drop column due to high p-value

df.drop(['renovated'], axis = 1, inplace = True)

In [55]:
outcome = 'price'
predictors = df.drop(['price'], axis=1)
pred_sum = '+'.join(predictors.columns)
formula = outcome + '~' + pred_sum

model = ols(formula=formula, data=df).fit()
model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.493
Model:,OLS,Adj. R-squared:,0.492
Method:,Least Squares,F-statistic:,1394.0
Date:,"Thu, 18 Feb 2021",Prob (F-statistic):,0.0
Time:,14:50:39,Log-Likelihood:,-266890.0
No. Observations:,20116,AIC:,533800.0
Df Residuals:,20101,BIC:,533900.0
Df Model:,14,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-2.069e+07,2.07e+06,-9.974,0.000,-2.48e+07,-1.66e+07
bedrooms,-2.365e+04,1478.083,-16.001,0.000,-2.65e+04,-2.08e+04
bathrooms,3.439e+04,2443.622,14.072,0.000,2.96e+04,3.92e+04
sqft_living,63.9091,4.770,13.397,0.000,54.559,73.259
sqft_lot,0.1569,0.035,4.482,0.000,0.088,0.226
floors,6.711e+04,2618.085,25.633,0.000,6.2e+04,7.22e+04
waterfront,6.957e+04,2.07e+04,3.361,0.001,2.9e+04,1.1e+05
view,2.016e+04,1743.219,11.566,0.000,1.67e+04,2.36e+04
condition,2.125e+04,1679.338,12.653,0.000,1.8e+04,2.45e+04

0,1,2,3
Omnibus:,496.647,Durbin-Watson:,1.963
Prob(Omnibus):,0.0,Jarque-Bera (JB):,539.646
Skew:,0.379,Prob(JB):,6.5700000000000005e-118
Kurtosis:,3.265,Cond. No.,211000000.0


In [56]:
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

In [58]:
dfi = df.drop(['price'], axis = 1)

for i in range(len(dfi.columns[:-1])):
    v = vif(np.matrix(dfi[:-1]), i)
    print('Variance inflation factor for {}: {}'.format(dfi.columns[i], round(v, 2)))

Variance inflation factor for bedrooms: 26.25
Variance inflation factor for bathrooms: 28.41
Variance inflation factor for sqft_living: 102.34
Variance inflation factor for sqft_lot: 2.27
Variance inflation factor for floors: 16.77
Variance inflation factor for waterfront: 1.09
Variance inflation factor for view: 1.26
Variance inflation factor for condition: 34.16
Variance inflation factor for sqft_above: 90.16
Variance inflation factor for zipcode: 76.03
Variance inflation factor for sqft_living15: 27.31
Variance inflation factor for sqft_lot15: 2.48
Variance inflation factor for yrs_old: 6.1


### Even after columns with high p-values are dropped, there remains several columns with high vif due to the correlation between square footage and number of rooms

## Model with Zip to School District Conversion Feature

### Must skip executing model with distance to city cells to run successfully

In [73]:
def group_zips(z):
    if z in [98177, 98133, 98125, 98155]:
        return 'Shoreline'
    
    elif z in [98028, 98011, 98072, 98077]:
        return 'Northshore'
    
    elif z in [98019, 98014]:
        return 'Riverview'
    
    elif z in [98117, 98103, 98115, 98107, 98105, 98199, 98119, 98109, 98102, 
               98112, 98122, 98116, 98136, 98126,
               98106, 98144, 98108, 98118]:
        return 'Seattle'
    
    elif z in [98040]:
        return 'MercerIsland'
    
    elif z in [98034, 98052, 98053, 98033, 98074]:
        return 'LakeWashington'
    
    elif z in [98039, 98004, 98005, 98007, 98008, 98006]:
        return 'Bellevue'
    
    elif z in [98059, 98027, 98075, 98029]:
        return 'Issaquah'
    
    elif z in [98024, 98065, 98045]:
        return 'SnoqualmieValley'
    
    elif z in [98168]:
        return 'Tukwila'
    
    elif z in [98178, 98056, 98055]:
        return 'Renton'
    
    elif z in [98070]:
        return 'VashonIsland'
    
    elif z in [98146, 98166, 98148, 98188]:
        return 'Highline'
    
    elif z in [98198, 98023, 98003, 98001]:
        return 'FederalWay'
    
    elif z in [98032, 98031, 98030, 98042, 98058]:
        return 'Kent'
    
    elif z in [98002, 98092]:
        return 'Auburn'
    
    elif z in [98038]:
        return 'Tahoma'
    
    elif z in [98022, 98010]:
        return 'Enumclaw'

In [74]:
df['zipcode'] = df['zipcode'].map(group_zips)

In [75]:
df['zipcode'].head()

0            Renton
1         Shoreline
2        Northshore
3           Seattle
4    LakeWashington
Name: zipcode, dtype: object

In [76]:
from sklearn.preprocessing import OneHotEncoder
ohc = OneHotEncoder()
ohe = ohc.fit_transform(df.zipcode.values.reshape(-1,1)).toarray()
dfOneHot = pd.DataFrame(ohe, columns = ['SD_' + str(ohc.categories_[0][i]) for i in range(len(ohc.categories_[0]))])
dfh = pd.concat([df, dfOneHot], axis = 1)
dfh.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,...,SD_MercerIsland,SD_Northshore,SD_Renton,SD_Riverview,SD_Seattle,SD_Shoreline,SD_SnoqualmieValley,SD_Tahoma,SD_Tukwila,SD_VashonIsland
0,221900.0,3.0,1.0,1180.0,5650.0,1.0,0.0,0.0,3.0,7.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,538000.0,3.0,2.25,2570.0,7242.0,2.0,0.0,0.0,3.0,7.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,180000.0,2.0,1.0,770.0,10000.0,1.0,0.0,0.0,3.0,6.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,604000.0,4.0,3.0,1960.0,5000.0,1.0,0.0,0.0,5.0,7.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,510000.0,3.0,2.0,1680.0,8080.0,1.0,0.0,0.0,3.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [77]:
dfh.drop(['zipcode'], axis = 1, inplace = True)

In [78]:
dfh.isnull().sum()

price                  1345
bedrooms               1345
bathrooms              1345
sqft_living            1345
sqft_lot               1345
floors                 1345
waterfront             1345
view                   1345
condition              1345
grade                  1345
sqft_above             1345
sqft_living15          1345
sqft_lot15             1345
yrs_old                1345
renovated              1345
basement               1345
distance_from_city     2690
SD_Auburn              1345
SD_Bellevue            1345
SD_Enumclaw            1345
SD_FederalWay          1345
SD_Highline            1345
SD_Issaquah            1345
SD_Kent                1345
SD_LakeWashington      1345
SD_MercerIsland        1345
SD_Northshore          1345
SD_Renton              1345
SD_Riverview           1345
SD_Seattle             1345
SD_Shoreline           1345
SD_SnoqualmieValley    1345
SD_Tahoma              1345
SD_Tukwila             1345
SD_VashonIsland        1345
dtype: int64

In [79]:
# remove null values

dfh.dropna(inplace = True)

In [80]:
dfh.isnull().sum()

price                  0
bedrooms               0
bathrooms              0
sqft_living            0
sqft_lot               0
floors                 0
waterfront             0
view                   0
condition              0
grade                  0
sqft_above             0
sqft_living15          0
sqft_lot15             0
yrs_old                0
renovated              0
basement               0
distance_from_city     0
SD_Auburn              0
SD_Bellevue            0
SD_Enumclaw            0
SD_FederalWay          0
SD_Highline            0
SD_Issaquah            0
SD_Kent                0
SD_LakeWashington      0
SD_MercerIsland        0
SD_Northshore          0
SD_Renton              0
SD_Riverview           0
SD_Seattle             0
SD_Shoreline           0
SD_SnoqualmieValley    0
SD_Tahoma              0
SD_Tukwila             0
SD_VashonIsland        0
dtype: int64

In [23]:
dfh.corr()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,...,SD_MercerIsland,SD_Northshore,SD_Renton,SD_Riverview,SD_Seattle,SD_Shoreline,SD_SnoqualmieValley,SD_Tahoma,SD_Tukwila,SD_VashonIsland
price,1.0,0.291698,0.439438,0.602094,0.094643,0.282191,0.054128,0.219247,0.044266,0.615175,...,0.004805,0.000721,-0.010767,0.003276,0.008573,-0.006962,0.008737,-0.007076,0.008359,0.009228
bedrooms,0.291698,1.0,0.503442,0.59615,0.026489,0.181473,-0.028061,0.031327,0.028414,0.332639,...,0.003114,-0.000378,-0.002284,0.000122,0.002742,-0.014429,0.008821,-0.001687,0.003084,0.015361
bathrooms,0.439438,0.503442,1.0,0.71812,0.073487,0.501442,-0.003788,0.088956,-0.112225,0.605444,...,0.007061,-0.004958,-0.007836,0.0019,0.000446,-0.010297,0.009633,0.001403,0.007579,0.015615
sqft_living,0.602094,0.59615,0.71812,1.0,0.162446,0.368986,0.001794,0.151781,-0.064781,0.702518,...,0.004581,-0.000954,-0.010114,-0.004948,0.008893,-0.01185,0.006008,-0.006124,0.00386,0.025437
sqft_lot,0.094643,0.026489,0.073487,0.162446,1.0,0.002711,0.025533,0.080124,-0.010569,0.099656,...,-0.004971,-0.005997,-0.006032,-0.011182,0.00773,-0.012466,0.012329,-0.009772,-0.000851,0.006621
floors,0.282191,0.181473,0.501442,0.368986,0.002711,1.0,-0.002508,-0.005774,-0.246995,0.454735,...,0.005464,-0.011157,-0.007772,-0.003265,0.019377,-0.015259,0.017033,-0.008524,0.001685,0.000945
waterfront,0.054128,-0.028061,-0.003788,0.001794,0.025533,-0.002508,1.0,0.286224,0.009884,-0.011362,...,-0.00448,0.003689,-0.001548,0.001958,-0.007483,0.000428,-0.00886,-0.002905,0.002982,-0.00395
view,0.219247,0.031327,0.088956,0.151781,0.080124,-0.005774,0.286224,1.0,0.025423,0.123838,...,-0.006406,0.003057,-0.005635,0.003856,-0.005572,0.007787,-0.00409,-0.016019,0.006492,0.008546
condition,0.044266,0.028414,-0.112225,-0.064781,-0.010569,-0.246995,0.009884,0.025423,1.0,-0.154718,...,-0.006348,0.005837,0.007265,0.006589,-0.006055,0.005255,-0.000294,0.000756,-0.002763,-0.001386
grade,0.615175,0.332639,0.605444,0.702518,0.099656,0.454735,-0.011362,0.123838,-0.154718,1.0,...,0.007556,0.001837,-0.011668,-0.003457,0.013915,-0.0066,0.006771,0.002512,0.003234,0.012946


In [81]:
outcome = 'price'
predictors = dfh.drop(['price'], axis=1)
pred_sum = '+'.join(predictors.columns)
formula = outcome + '~' + pred_sum

model = ols(formula=formula, data=dfh).fit()
model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.567
Model:,OLS,Adj. R-squared:,0.566
Method:,Least Squares,F-statistic:,743.0
Date:,"Thu, 18 Feb 2021",Prob (F-statistic):,0.0
Time:,15:26:17,Log-Likelihood:,-247630.0
No. Observations:,18771,AIC:,495300.0
Df Residuals:,18737,BIC:,495600.0
Df Model:,33,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-6.382e+05,1.25e+04,-51.181,0.000,-6.63e+05,-6.14e+05
bedrooms,-1.348e+04,1415.209,-9.523,0.000,-1.63e+04,-1.07e+04
bathrooms,2.597e+04,2357.116,11.016,0.000,2.13e+04,3.06e+04
sqft_living,45.6192,4.551,10.025,0.000,36.700,54.539
sqft_lot,0.1431,0.033,4.347,0.000,0.079,0.208
floors,4.828e+04,2602.148,18.553,0.000,4.32e+04,5.34e+04
waterfront,1.034e+05,1.92e+04,5.374,0.000,6.57e+04,1.41e+05
view,1.692e+04,1639.757,10.320,0.000,1.37e+04,2.01e+04
condition,1.854e+04,1550.530,11.958,0.000,1.55e+04,2.16e+04

0,1,2,3
Omnibus:,440.481,Durbin-Watson:,1.969
Prob(Omnibus):,0.0,Jarque-Bera (JB):,529.826
Skew:,0.32,Prob(JB):,8.910000000000001e-116
Kurtosis:,3.518,Cond. No.,4120000000000000.0


In [82]:
# drop column due to p-value

dfh.drop(['distance_from_city'], axis = 1, inplace = True)

In [83]:
outcome = 'price'
predictors = dfh.drop(['price'], axis=1)
pred_sum = '+'.join(predictors.columns)
formula = outcome + '~' + pred_sum

model = ols(formula=formula, data=dfh).fit()
model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.567
Model:,OLS,Adj. R-squared:,0.566
Method:,Least Squares,F-statistic:,766.3
Date:,"Thu, 18 Feb 2021",Prob (F-statistic):,0.0
Time:,15:26:33,Log-Likelihood:,-247630.0
No. Observations:,18771,AIC:,495300.0
Df Residuals:,18738,BIC:,495600.0
Df Model:,32,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-6.404e+05,1.12e+04,-57.290,0.000,-6.62e+05,-6.19e+05
bedrooms,-1.347e+04,1415.155,-9.521,0.000,-1.62e+04,-1.07e+04
bathrooms,2.597e+04,2356.979,11.020,0.000,2.14e+04,3.06e+04
sqft_living,45.6036,4.550,10.022,0.000,36.685,54.523
sqft_lot,0.1430,0.033,4.346,0.000,0.079,0.208
floors,4.829e+04,2601.913,18.559,0.000,4.32e+04,5.34e+04
waterfront,1.034e+05,1.92e+04,5.371,0.000,6.56e+04,1.41e+05
view,1.693e+04,1639.482,10.329,0.000,1.37e+04,2.01e+04
condition,1.854e+04,1550.487,11.959,0.000,1.55e+04,2.16e+04

0,1,2,3
Omnibus:,440.282,Durbin-Watson:,1.969
Prob(Omnibus):,0.0,Jarque-Bera (JB):,529.496
Skew:,0.32,Prob(JB):,1.05e-115
Kurtosis:,3.517,Cond. No.,2.45e+16


In [84]:
# drop column due to p-value

dfh.drop(['renovated'], axis = 1, inplace = True)

In [85]:
outcome = 'price'
predictors = dfh.drop(['price'], axis=1)
pred_sum = '+'.join(predictors.columns)
formula = outcome + '~' + pred_sum

model = ols(formula=formula, data=dfh).fit()
model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.567
Model:,OLS,Adj. R-squared:,0.566
Method:,Least Squares,F-statistic:,791.0
Date:,"Thu, 18 Feb 2021",Prob (F-statistic):,0.0
Time:,15:26:40,Log-Likelihood:,-247630.0
No. Observations:,18771,AIC:,495300.0
Df Residuals:,18739,BIC:,495600.0
Df Model:,31,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-6.402e+05,1.12e+04,-57.305,0.000,-6.62e+05,-6.18e+05
bedrooms,-1.349e+04,1414.857,-9.535,0.000,-1.63e+04,-1.07e+04
bathrooms,2.604e+04,2354.381,11.060,0.000,2.14e+04,3.07e+04
sqft_living,45.6135,4.550,10.024,0.000,36.695,54.532
sqft_lot,0.1428,0.033,4.340,0.000,0.078,0.207
floors,4.829e+04,2601.862,18.558,0.000,4.32e+04,5.34e+04
waterfront,1.035e+05,1.92e+04,5.378,0.000,6.58e+04,1.41e+05
view,1.694e+04,1639.357,10.336,0.000,1.37e+04,2.02e+04
condition,1.848e+04,1547.293,11.945,0.000,1.55e+04,2.15e+04

0,1,2,3
Omnibus:,440.824,Durbin-Watson:,1.969
Prob(Omnibus):,0.0,Jarque-Bera (JB):,530.401
Skew:,0.32,Prob(JB):,6.68e-116
Kurtosis:,3.518,Cond. No.,1.96e+16


In [86]:
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

In [87]:
dfhi = dfh.drop(['price'], axis = 1)

for i in range(len(dfhi.columns[:-1])):
    v = vif(np.matrix(dfhi[:-1]), i)
    print('Variance inflation factor for {}: {}'.format(dfhi.columns[i], round(v, 2)))

Variance inflation factor for bedrooms: 1.68
Variance inflation factor for bathrooms: 2.96
Variance inflation factor for sqft_living: 12.76
Variance inflation factor for sqft_lot: 2.0
Variance inflation factor for floors: 1.98
Variance inflation factor for waterfront: 1.1
Variance inflation factor for view: 1.19
Variance inflation factor for condition: 1.16
Variance inflation factor for grade: 2.7
Variance inflation factor for sqft_above: 13.2
Variance inflation factor for sqft_living15: 2.6
Variance inflation factor for sqft_lot15: 2.03
Variance inflation factor for yrs_old: 1.83
Variance inflation factor for basement: 3.33
Variance inflation factor for SD_Auburn: 5.17
Variance inflation factor for SD_Bellevue: 9.08
Variance inflation factor for SD_Enumclaw: 3.5
Variance inflation factor for SD_FederalWay: 11.72
Variance inflation factor for SD_Highline: 6.34
Variance inflation factor for SD_Issaquah: 12.04
Variance inflation factor for SD_Kent: 13.47
Variance inflation factor for SD_

In [88]:
dfh.drop(['grade', 'sqft_above'], axis = 1, inplace = True)

In [89]:
dfhi = dfh.drop(['price'], axis = 1)

for i in range(len(dfhi.columns[:-1])):
    v = vif(np.matrix(dfhi[:-1]), i)
    print('Variance inflation factor for {}: {}'.format(dfhi.columns[i], round(v, 2)))

Variance inflation factor for bedrooms: 1.66
Variance inflation factor for bathrooms: 2.94
Variance inflation factor for sqft_living: 3.99
Variance inflation factor for sqft_lot: 1.99
Variance inflation factor for floors: 1.75
Variance inflation factor for waterfront: 1.09
Variance inflation factor for view: 1.18
Variance inflation factor for condition: 1.15
Variance inflation factor for sqft_living15: 2.33
Variance inflation factor for sqft_lot15: 2.03
Variance inflation factor for yrs_old: 1.77
Variance inflation factor for basement: 1.37
Variance inflation factor for SD_Auburn: 2.98
Variance inflation factor for SD_Bellevue: 4.82
Variance inflation factor for SD_Enumclaw: 2.19
Variance inflation factor for SD_FederalWay: 6.1
Variance inflation factor for SD_Highline: 3.51
Variance inflation factor for SD_Issaquah: 6.22
Variance inflation factor for SD_Kent: 6.93
Variance inflation factor for SD_LakeWashington: 8.91
Variance inflation factor for SD_MercerIsland: 1.54
Variance inflati

In [90]:
outcome = 'price'
predictors = dfh.drop(['price'], axis=1)
pred_sum = '+'.join(predictors.columns)
formula = outcome + '~' + pred_sum

model = ols(formula=formula, data=dfh).fit()
model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.493
Model:,OLS,Adj. R-squared:,0.492
Method:,Least Squares,F-statistic:,628.0
Date:,"Thu, 18 Feb 2021",Prob (F-statistic):,0.0
Time:,15:27:16,Log-Likelihood:,-249110.0
No. Observations:,18771,AIC:,498300.0
Df Residuals:,18741,BIC:,498500.0
Df Model:,29,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-1.87e+05,8379.164,-22.313,0.000,-2.03e+05,-1.71e+05
bedrooms,-2.338e+04,1519.119,-15.393,0.000,-2.64e+04,-2.04e+04
bathrooms,3.441e+04,2539.977,13.548,0.000,2.94e+04,3.94e+04
sqft_living,99.6076,2.752,36.200,0.000,94.214,105.001
sqft_lot,0.1389,0.036,3.902,0.000,0.069,0.209
floors,7.184e+04,2647.882,27.130,0.000,6.66e+04,7.7e+04
waterfront,7.46e+04,2.08e+04,3.585,0.000,3.38e+04,1.15e+05
view,2.028e+04,1762.835,11.506,0.000,1.68e+04,2.37e+04
condition,1.777e+04,1668.808,10.650,0.000,1.45e+04,2.1e+04

0,1,2,3
Omnibus:,392.7,Durbin-Watson:,1.965
Prob(Omnibus):,0.0,Jarque-Bera (JB):,422.509
Skew:,0.346,Prob(JB):,1.79e-92
Kurtosis:,3.25,Cond. No.,2.75e+16


In [92]:
# drop columns with high p-value

dfh.drop(['SD_Tukwila', 'SD_VashonIsland', 'SD_SnoqualmieValley', 'SD_Riverview', 'SD_MercerIsland', 'SD_Highline', 'SD_Auburn'], axis = 1, inplace = True)

In [93]:
dfhi = dfh.drop(['price'], axis = 1)

for i in range(len(dfhi.columns[:-1])):
    v = vif(np.matrix(dfhi[:-1]), i)
    print('Variance inflation factor for {}: {}'.format(dfhi.columns[i], round(v, 2)))

Variance inflation factor for bedrooms: 23.83
Variance inflation factor for bathrooms: 26.87
Variance inflation factor for sqft_living: 28.92
Variance inflation factor for sqft_lot: 2.26
Variance inflation factor for floors: 12.73
Variance inflation factor for waterfront: 1.1
Variance inflation factor for view: 1.25
Variance inflation factor for condition: 21.77
Variance inflation factor for sqft_living15: 22.79
Variance inflation factor for sqft_lot15: 2.49
Variance inflation factor for yrs_old: 5.78
Variance inflation factor for basement: 2.25
Variance inflation factor for SD_Bellevue: 1.36
Variance inflation factor for SD_Enumclaw: 1.11
Variance inflation factor for SD_FederalWay: 1.48
Variance inflation factor for SD_Issaquah: 1.49
Variance inflation factor for SD_Kent: 1.56
Variance inflation factor for SD_LakeWashington: 1.75
Variance inflation factor for SD_Northshore: 1.31
Variance inflation factor for SD_Renton: 1.32
Variance inflation factor for SD_Seattle: 2.8
Variance infla

In [94]:
outcome = 'price'
predictors = dfh.drop(['price'], axis=1)
pred_sum = '+'.join(predictors.columns)
formula = outcome + '~' + pred_sum

model = ols(formula=formula, data=dfh).fit()
model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.493
Model:,OLS,Adj. R-squared:,0.492
Method:,Least Squares,F-statistic:,792.0
Date:,"Thu, 18 Feb 2021",Prob (F-statistic):,0.0
Time:,15:30:13,Log-Likelihood:,-249110.0
No. Observations:,18771,AIC:,498300.0
Df Residuals:,18747,BIC:,498500.0
Df Model:,23,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-1.917e+05,9143.097,-20.964,0.000,-2.1e+05,-1.74e+05
bedrooms,-2.337e+04,1518.826,-15.390,0.000,-2.64e+04,-2.04e+04
bathrooms,3.442e+04,2539.379,13.555,0.000,2.94e+04,3.94e+04
sqft_living,99.5593,2.750,36.200,0.000,94.168,104.950
sqft_lot,0.1389,0.036,3.904,0.000,0.069,0.209
floors,7.189e+04,2646.838,27.162,0.000,6.67e+04,7.71e+04
waterfront,7.45e+04,2.08e+04,3.582,0.000,3.37e+04,1.15e+05
view,2.028e+04,1762.447,11.505,0.000,1.68e+04,2.37e+04
condition,1.777e+04,1668.520,10.652,0.000,1.45e+04,2.1e+04

0,1,2,3
Omnibus:,392.696,Durbin-Watson:,1.965
Prob(Omnibus):,0.0,Jarque-Bera (JB):,422.493
Skew:,0.346,Prob(JB):,1.81e-92
Kurtosis:,3.25,Cond. No.,1010000.0


In [95]:
dfh.drop(['bedrooms', 'bathrooms', 'sqft_living15'], axis = 1, inplace = True)

In [96]:
outcome = 'price'
predictors = dfh.drop(['price'], axis=1)
pred_sum = '+'.join(predictors.columns)
formula = outcome + '~' + pred_sum

model = ols(formula=formula, data=dfh).fit()
model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.437
Model:,OLS,Adj. R-squared:,0.437
Method:,Least Squares,F-statistic:,729.0
Date:,"Thu, 18 Feb 2021",Prob (F-statistic):,0.0
Time:,15:45:35,Log-Likelihood:,-250080.0
No. Observations:,18771,AIC:,500200.0
Df Residuals:,18750,BIC:,500400.0
Df Model:,20,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-8.167e+04,8444.502,-9.671,0.000,-9.82e+04,-6.51e+04
sqft_living,157.8647,1.781,88.640,0.000,154.374,161.356
sqft_lot,0.1135,0.037,3.031,0.002,0.040,0.187
floors,7.595e+04,2640.193,28.766,0.000,7.08e+04,8.11e+04
waterfront,5.622e+04,2.19e+04,2.568,0.010,1.33e+04,9.91e+04
view,3.267e+04,1833.116,17.824,0.000,2.91e+04,3.63e+04
condition,1.453e+04,1747.111,8.315,0.000,1.11e+04,1.8e+04
sqft_lot15,-0.1877,0.057,-3.274,0.001,-0.300,-0.075
yrs_old,1662.6288,46.697,35.605,0.000,1571.099,1754.159

0,1,2,3
Omnibus:,360.606,Durbin-Watson:,1.962
Prob(Omnibus):,0.0,Jarque-Bera (JB):,381.324
Skew:,0.344,Prob(JB):,1.57e-83
Kurtosis:,3.114,Cond. No.,1010000.0


In [97]:
dfhi = dfh.drop(['price'], axis = 1)

for i in range(len(dfhi.columns[:-1])):
    v = vif(np.matrix(dfhi[:-1]), i)
    print('Variance inflation factor for {}: {}'.format(dfhi.columns[i], round(v, 2)))

Variance inflation factor for sqft_living: 11.22
Variance inflation factor for sqft_lot: 2.26
Variance inflation factor for floors: 10.16
Variance inflation factor for waterfront: 1.1
Variance inflation factor for view: 1.22
Variance inflation factor for condition: 17.6
Variance inflation factor for sqft_lot15: 2.47
Variance inflation factor for yrs_old: 5.01
Variance inflation factor for basement: 2.09
Variance inflation factor for SD_Bellevue: 1.35
Variance inflation factor for SD_Enumclaw: 1.11
Variance inflation factor for SD_FederalWay: 1.47
Variance inflation factor for SD_Issaquah: 1.48
Variance inflation factor for SD_Kent: 1.54
Variance inflation factor for SD_LakeWashington: 1.72
Variance inflation factor for SD_Northshore: 1.3
Variance inflation factor for SD_Renton: 1.31
Variance inflation factor for SD_Seattle: 2.75
Variance inflation factor for SD_Shoreline: 1.51


### With the square foor centric model approach you either end up with high p-values of school districts and high vif of housing features or the inverse, while retaining a relatively low r-squared