In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy import stats
import datetime as dt

In [135]:
# Read file in as Pandas dataframe

df = pd.read_csv('kc_house_data.csv')
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,10/13/2014,221900.0,3,1.0,1180,5650,1.0,,0.0,...,7,1180,0.0,1955,0.0,98178,47.5112,-122.257,1340,5650
1,6414100192,12/9/2014,538000.0,3,2.25,2570,7242,2.0,0.0,0.0,...,7,2170,400.0,1951,1991.0,98125,47.721,-122.319,1690,7639
2,5631500400,2/25/2015,180000.0,2,1.0,770,10000,1.0,0.0,0.0,...,6,770,0.0,1933,,98028,47.7379,-122.233,2720,8062
3,2487200875,12/9/2014,604000.0,4,3.0,1960,5000,1.0,0.0,0.0,...,7,1050,910.0,1965,0.0,98136,47.5208,-122.393,1360,5000
4,1954400510,2/18/2015,510000.0,3,2.0,1680,8080,1.0,0.0,0.0,...,8,1680,0.0,1987,0.0,98074,47.6168,-122.045,1800,7503


In [136]:
df['sqft_basement'] = pd.to_numeric(df['sqft_basement'], errors = 'coerce')

df['date'] = pd.to_datetime(df['date'], infer_datetime_format = True)

df['yrs_old'] = [i.year for i in df['date']] - df['yr_built']

df.drop(['date'], axis = 1, inplace = True)

In [137]:
df['view'] = df['view'].fillna(df['view'].mode()[0])

df['waterfront'] = df['waterfront'].fillna(df['waterfront'].mode()[0])

df['renovated'] = [0 if i == 0.0 else 1 for i in df['yr_renovated']]
df.drop(['yr_renovated'], axis = 1, inplace = True)

df['basement'] = [0 if i == 0.0 else 1 for i in df['sqft_basement']]
df.drop(['sqft_basement'], axis = 1, inplace = True)

In [138]:
df.drop(['yr_built', 'id'], axis = 1, inplace = True)

In [139]:
df.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,zipcode,lat,long,sqft_living15,sqft_lot15,yrs_old,renovated,basement
0,221900.0,3,1.0,1180,5650,1.0,0.0,0.0,3,7,1180,98178,47.5112,-122.257,1340,5650,59,0,0
1,538000.0,3,2.25,2570,7242,2.0,0.0,0.0,3,7,2170,98125,47.721,-122.319,1690,7639,63,1,1
2,180000.0,2,1.0,770,10000,1.0,0.0,0.0,3,6,770,98028,47.7379,-122.233,2720,8062,82,1,0
3,604000.0,4,3.0,1960,5000,1.0,0.0,0.0,5,7,1050,98136,47.5208,-122.393,1360,5000,49,0,1
4,510000.0,3,2.0,1680,8080,1.0,0.0,0.0,3,8,1680,98074,47.6168,-122.045,1800,7503,28,0,0


In [140]:
df = df[(df['price'] >= 0) & (df['price'] <= 1000000)]
df = df[(df['bedrooms'] >= 0) & (df['bedrooms'] <= 7)]
df = df[(df['bathrooms'] >= 0) & (df['bathrooms'] <= 5)]

In [141]:
from haversine import haversine

seattle = [47.6092, -122.3363]
bellevue = [47.61555, -122.20392]

In [142]:
haversine(seattle, bellevue, unit ='mi')

6.181695095872431

In [143]:
df['lat_long'] = tuple(zip(df.lat, df.long))

In [144]:
seattle_distances = []
for i in df['lat_long']:
    seattle_distances.append(haversine((seattle), (i), unit = 'mi'))
df['distance_from_seattle'] = pd.Series(seattle_distances)

bellevue_distances = []
for i in df['lat_long']:
    bellevue_distances.append(haversine((bellevue), (i), unit = 'mi'))
df['distance_from_bellevue'] = pd.Series(bellevue_distances)

In [145]:
df['distance_from_city'] = df[['distance_from_bellevue', 'distance_from_seattle']].min(axis = 1)

In [146]:
df.drop(['lat', 'long', 'lat_long', 'distance_from_seattle', 'distance_from_bellevue'], axis = 1, inplace = True)

In [147]:
df.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,zipcode,sqft_living15,sqft_lot15,yrs_old,renovated,basement,distance_from_city
0,221900.0,3,1.0,1180,5650,1.0,0.0,0.0,3,7,1180,98178,1340,5650,59,0,0,7.622786
1,538000.0,3,2.25,2570,7242,2.0,0.0,0.0,3,7,2170,98125,1690,7639,63,1,1,7.766476
2,180000.0,2,1.0,770,10000,1.0,0.0,0.0,3,6,770,98028,2720,8062,82,1,0,8.561145
3,604000.0,4,3.0,1960,5000,1.0,0.0,0.0,5,7,1050,98136,1360,5000,49,0,1,6.65534
4,510000.0,3,2.0,1680,8080,1.0,0.0,0.0,3,8,1680,98074,1800,7503,28,0,0,7.402273


In [148]:
def group_zips(z):
    if z in [98177, 98133, 98125, 98155]:
        return 'Shoreline'
    
    elif z in [98028, 98011, 98072, 98077]:
        return 'Northshore'
    
    elif z in [98019, 98014]:
        return 'Riverview'
    
    elif z in [98117, 98103, 98115, 98107, 98105, 98199, 98119, 98109, 98102, 
               98112, 98122, 98116, 98136, 98126,
               98106, 98144, 98108, 98118]:
        return 'Seattle'
    
    elif z in [98040]:
        return 'MercerIsland'
    
    elif z in [98034, 98052, 98053, 98033, 98074]:
        return 'LakeWashington'
    
    elif z in [98039, 98004, 98005, 98007, 98008, 98006]:
        return 'Bellevue'
    
    elif z in [98059, 98027, 98075, 98029]:
        return 'Issaquah'
    
    elif z in [98024, 98065, 98045]:
        return 'SnoqualmieValley'
    
    elif z in [98168]:
        return 'Tukwila'
    
    elif z in [98178, 98056, 98055]:
        return 'Renton'
    
    elif z in [98070]:
        return 'VashonIsland'
    
    elif z in [98146, 98166, 98148, 98188]:
        return 'Highline'
    
    elif z in [98198, 98023, 98003, 98001]:
        return 'FederalWay'
    
    elif z in [98032, 98031, 98030, 98042, 98058]:
        return 'Kent'
    
    elif z in [98002, 98092]:
        return 'Auburn'
    
    elif z in [98038]:
        return 'Tahoma'
    
    elif z in [98022, 98010]:
        return 'Enumclaw'

In [149]:
df['zipcode'] = df['zipcode'].map(group_zips)

In [150]:
df['zipcode'].head()

0            Renton
1         Shoreline
2        Northshore
3           Seattle
4    LakeWashington
Name: zipcode, dtype: object

In [151]:
from sklearn.preprocessing import OneHotEncoder
ohc = OneHotEncoder()
ohe = ohc.fit_transform(df.zipcode.values.reshape(-1,1)).toarray()
dfOneHot = pd.DataFrame(ohe, columns = ['SD_' + str(ohc.categories_[0][i]) for i in range(len(ohc.categories_[0]))])
dfh = pd.concat([df, dfOneHot], axis = 1)
dfh.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,...,SD_MercerIsland,SD_Northshore,SD_Renton,SD_Riverview,SD_Seattle,SD_Shoreline,SD_SnoqualmieValley,SD_Tahoma,SD_Tukwila,SD_VashonIsland
0,221900.0,3.0,1.0,1180.0,5650.0,1.0,0.0,0.0,3.0,7.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,538000.0,3.0,2.25,2570.0,7242.0,2.0,0.0,0.0,3.0,7.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,180000.0,2.0,1.0,770.0,10000.0,1.0,0.0,0.0,3.0,6.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,604000.0,4.0,3.0,1960.0,5000.0,1.0,0.0,0.0,5.0,7.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,510000.0,3.0,2.0,1680.0,8080.0,1.0,0.0,0.0,3.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [152]:
dfh.drop(['zipcode'], axis = 1, inplace = True)

In [153]:
dfh.isnull().sum()

price                  1345
bedrooms               1345
bathrooms              1345
sqft_living            1345
sqft_lot               1345
floors                 1345
waterfront             1345
view                   1345
condition              1345
grade                  1345
sqft_above             1345
sqft_living15          1345
sqft_lot15             1345
yrs_old                1345
renovated              1345
basement               1345
distance_from_city     2690
SD_Auburn              1345
SD_Bellevue            1345
SD_Enumclaw            1345
SD_FederalWay          1345
SD_Highline            1345
SD_Issaquah            1345
SD_Kent                1345
SD_LakeWashington      1345
SD_MercerIsland        1345
SD_Northshore          1345
SD_Renton              1345
SD_Riverview           1345
SD_Seattle             1345
SD_Shoreline           1345
SD_SnoqualmieValley    1345
SD_Tahoma              1345
SD_Tukwila             1345
SD_VashonIsland        1345
dtype: int64

In [154]:
dfh.dropna(inplace = True)

In [155]:
dfh.isnull().sum()

price                  0
bedrooms               0
bathrooms              0
sqft_living            0
sqft_lot               0
floors                 0
waterfront             0
view                   0
condition              0
grade                  0
sqft_above             0
sqft_living15          0
sqft_lot15             0
yrs_old                0
renovated              0
basement               0
distance_from_city     0
SD_Auburn              0
SD_Bellevue            0
SD_Enumclaw            0
SD_FederalWay          0
SD_Highline            0
SD_Issaquah            0
SD_Kent                0
SD_LakeWashington      0
SD_MercerIsland        0
SD_Northshore          0
SD_Renton              0
SD_Riverview           0
SD_Seattle             0
SD_Shoreline           0
SD_SnoqualmieValley    0
SD_Tahoma              0
SD_Tukwila             0
SD_VashonIsland        0
dtype: int64

In [106]:
dfh.corr().head()

Unnamed: 0,price,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_lot15,yrs_old,renovated,...,SD_LakeWashington,SD_MercerIsland,SD_Northshore,SD_Renton,SD_Riverview,SD_Shoreline,SD_SnoqualmieValley,SD_Tahoma,SD_Tukwila,SD_VashonIsland
price,1.0,0.602094,0.094643,0.282191,0.054128,0.219247,0.044266,0.082515,-0.043956,0.025278,...,0.005912,0.004805,0.000721,-0.010767,0.003276,-0.006962,0.008737,-0.007076,0.008359,0.009228
sqft_living,0.602094,1.0,0.162446,0.368986,0.001794,0.151781,-0.064781,0.177146,-0.367637,-0.004248,...,0.00681,0.004581,-0.000954,-0.010114,-0.004948,-0.01185,0.006008,-0.006124,0.00386,0.025437
sqft_lot,0.094643,0.162446,1.0,0.002711,0.025533,0.080124,-0.010569,0.703211,-0.063249,-0.008839,...,-0.000924,-0.004971,-0.005997,-0.006032,-0.011182,-0.012466,0.012329,-0.009772,-0.000851,0.006621
floors,0.282191,0.368986,0.002711,1.0,-0.002508,-0.005774,-0.246995,0.00088,-0.46477,-0.011447,...,0.011086,0.005464,-0.011157,-0.007772,-0.003265,-0.015259,0.017033,-0.008524,0.001685,0.000945
waterfront,0.054128,0.001794,0.025533,-0.002508,1.0,0.286224,0.009884,0.031863,0.032438,0.019305,...,0.005072,-0.00448,0.003689,-0.001548,0.001958,0.000428,-0.00886,-0.002905,0.002982,-0.00395


In [156]:
# Run regression model on all significant features

import statsmodels.api as sm
from statsmodels.formula.api import ols

outcome = 'price'
predictors = dfh.drop(['price'], axis=1)
pred_sum = '+'.join(predictors.columns)
formula = outcome + '~' + pred_sum

model = ols(formula=formula, data=dfh).fit()
model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.567
Model:,OLS,Adj. R-squared:,0.566
Method:,Least Squares,F-statistic:,743.0
Date:,"Wed, 17 Feb 2021",Prob (F-statistic):,0.0
Time:,17:44:53,Log-Likelihood:,-247630.0
No. Observations:,18771,AIC:,495300.0
Df Residuals:,18737,BIC:,495600.0
Df Model:,33,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-6.382e+05,1.25e+04,-51.181,0.000,-6.63e+05,-6.14e+05
bedrooms,-1.348e+04,1415.209,-9.523,0.000,-1.63e+04,-1.07e+04
bathrooms,2.597e+04,2357.116,11.016,0.000,2.13e+04,3.06e+04
sqft_living,45.6192,4.551,10.025,0.000,36.700,54.539
sqft_lot,0.1431,0.033,4.347,0.000,0.079,0.208
floors,4.828e+04,2602.148,18.553,0.000,4.32e+04,5.34e+04
waterfront,1.034e+05,1.92e+04,5.374,0.000,6.57e+04,1.41e+05
view,1.692e+04,1639.757,10.320,0.000,1.37e+04,2.01e+04
condition,1.854e+04,1550.530,11.958,0.000,1.55e+04,2.16e+04

0,1,2,3
Omnibus:,440.481,Durbin-Watson:,1.969
Prob(Omnibus):,0.0,Jarque-Bera (JB):,529.826
Skew:,0.32,Prob(JB):,8.910000000000001e-116
Kurtosis:,3.518,Cond. No.,4120000000000000.0


In [157]:
dfh.drop(['renovated', 'distance_from_city'], axis = 1, inplace = True)

In [158]:
outcome = 'price'
predictors = dfh.drop(['price'], axis=1)
pred_sum = '+'.join(predictors.columns)
formula = outcome + '~' + pred_sum

model = ols(formula=formula, data=dfh).fit()
model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.567
Model:,OLS,Adj. R-squared:,0.566
Method:,Least Squares,F-statistic:,791.0
Date:,"Wed, 17 Feb 2021",Prob (F-statistic):,0.0
Time:,17:45:00,Log-Likelihood:,-247630.0
No. Observations:,18771,AIC:,495300.0
Df Residuals:,18739,BIC:,495600.0
Df Model:,31,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-6.402e+05,1.12e+04,-57.305,0.000,-6.62e+05,-6.18e+05
bedrooms,-1.349e+04,1414.857,-9.535,0.000,-1.63e+04,-1.07e+04
bathrooms,2.604e+04,2354.381,11.060,0.000,2.14e+04,3.07e+04
sqft_living,45.6135,4.550,10.024,0.000,36.695,54.532
sqft_lot,0.1428,0.033,4.340,0.000,0.078,0.207
floors,4.829e+04,2601.862,18.558,0.000,4.32e+04,5.34e+04
waterfront,1.035e+05,1.92e+04,5.378,0.000,6.58e+04,1.41e+05
view,1.694e+04,1639.357,10.336,0.000,1.37e+04,2.02e+04
condition,1.848e+04,1547.293,11.945,0.000,1.55e+04,2.15e+04

0,1,2,3
Omnibus:,440.824,Durbin-Watson:,1.969
Prob(Omnibus):,0.0,Jarque-Bera (JB):,530.401
Skew:,0.32,Prob(JB):,6.68e-116
Kurtosis:,3.518,Cond. No.,1.96e+16


In [159]:
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

In [160]:
dfhi = dfh.drop(['price'], axis = 1)

for i in range(len(dfhi.columns[:-1])):
    v = vif(np.matrix(dfhi[:-1]), i)
    print('Variance inflation factor for {}: {}'.format(dfhi.columns[i], round(v, 2)))

Variance inflation factor for bedrooms: 1.68
Variance inflation factor for bathrooms: 2.96
Variance inflation factor for sqft_living: 12.76
Variance inflation factor for sqft_lot: 2.0
Variance inflation factor for floors: 1.98
Variance inflation factor for waterfront: 1.1
Variance inflation factor for view: 1.19
Variance inflation factor for condition: 1.16
Variance inflation factor for grade: 2.7
Variance inflation factor for sqft_above: 13.2
Variance inflation factor for sqft_living15: 2.6
Variance inflation factor for sqft_lot15: 2.03
Variance inflation factor for yrs_old: 1.83
Variance inflation factor for basement: 3.33
Variance inflation factor for SD_Auburn: 5.17
Variance inflation factor for SD_Bellevue: 9.08
Variance inflation factor for SD_Enumclaw: 3.5
Variance inflation factor for SD_FederalWay: 11.72
Variance inflation factor for SD_Highline: 6.34
Variance inflation factor for SD_Issaquah: 12.04
Variance inflation factor for SD_Kent: 13.47
Variance inflation factor for SD_

In [161]:
dfh.drop(['grade'], axis = 1, inplace = True)

In [162]:
dfhi = dfh.drop(['price'], axis = 1)

for i in range(len(dfhi.columns[:-1])):
    v = vif(np.matrix(dfhi[:-1]), i)
    print('Variance inflation factor for {}: {}'.format(dfhi.columns[i], round(v, 2)))

Variance inflation factor for bedrooms: 1.66
Variance inflation factor for bathrooms: 2.95
Variance inflation factor for sqft_living: 12.71
Variance inflation factor for sqft_lot: 2.0
Variance inflation factor for floors: 1.96
Variance inflation factor for waterfront: 1.09
Variance inflation factor for view: 1.19
Variance inflation factor for condition: 1.16
Variance inflation factor for sqft_above: 12.94
Variance inflation factor for sqft_living15: 2.41
Variance inflation factor for sqft_lot15: 2.03
Variance inflation factor for yrs_old: 1.77
Variance inflation factor for basement: 3.3
Variance inflation factor for SD_Auburn: 2.99
Variance inflation factor for SD_Bellevue: 4.84
Variance inflation factor for SD_Enumclaw: 2.2
Variance inflation factor for SD_FederalWay: 6.14
Variance inflation factor for SD_Highline: 3.53
Variance inflation factor for SD_Issaquah: 6.25
Variance inflation factor for SD_Kent: 6.97
Variance inflation factor for SD_LakeWashington: 8.97
Variance inflation fa

In [164]:
dfh.drop(['SD_Seattle'], axis = 1, inplace = True)

In [166]:
dfhi = dfh.drop(['price'], axis = 1)

for i in range(len(dfhi.columns[:-1])):
    v = vif(np.matrix(dfhi[:-1]), i)
    print('Variance inflation factor for {}: {}'.format(dfhi.columns[i], round(v, 2)))

Variance inflation factor for bedrooms: 23.74
Variance inflation factor for bathrooms: 26.89
Variance inflation factor for sqft_living: 94.47
Variance inflation factor for sqft_lot: 2.27
Variance inflation factor for floors: 14.73
Variance inflation factor for waterfront: 1.1
Variance inflation factor for view: 1.26
Variance inflation factor for condition: 21.38
Variance inflation factor for sqft_above: 86.74
Variance inflation factor for sqft_living15: 23.87
Variance inflation factor for sqft_lot15: 2.49
Variance inflation factor for yrs_old: 5.72
Variance inflation factor for basement: 5.37
Variance inflation factor for SD_Auburn: 1.1
Variance inflation factor for SD_Bellevue: 1.19
Variance inflation factor for SD_Enumclaw: 1.06
Variance inflation factor for SD_FederalWay: 1.26
Variance inflation factor for SD_Highline: 1.13
Variance inflation factor for SD_Issaquah: 1.26
Variance inflation factor for SD_Kent: 1.3
Variance inflation factor for SD_LakeWashington: 1.4
Variance inflatio

In [165]:
outcome = 'price'
predictors = dfh.drop(['price'], axis=1)
pred_sum = '+'.join(predictors.columns)
formula = outcome + '~' + pred_sum

model = ols(formula=formula, data=dfh).fit()
model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.495
Model:,OLS,Adj. R-squared:,0.494
Method:,Least Squares,F-statistic:,612.7
Date:,"Wed, 17 Feb 2021",Prob (F-statistic):,0.0
Time:,17:47:59,Log-Likelihood:,-249060.0
No. Observations:,18771,AIC:,498200.0
Df Residuals:,18740,BIC:,498400.0
Df Model:,30,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-2.032e+05,8941.367,-22.725,0.000,-2.21e+05,-1.86e+05
bedrooms,-2.302e+04,1516.166,-15.184,0.000,-2.6e+04,-2.01e+04
bathrooms,3.516e+04,2535.477,13.866,0.000,3.02e+04,4.01e+04
sqft_living,61.8500,4.902,12.617,0.000,52.242,71.458
sqft_lot,0.1311,0.036,3.691,0.000,0.061,0.201
floors,6.34e+04,2793.473,22.695,0.000,5.79e+04,6.89e+04
waterfront,7.123e+04,2.08e+04,3.431,0.001,3.05e+04,1.12e+05
view,2.188e+04,1767.155,12.379,0.000,1.84e+04,2.53e+04
condition,1.901e+04,1670.323,11.381,0.000,1.57e+04,2.23e+04

0,1,2,3
Omnibus:,381.0,Durbin-Watson:,1.965
Prob(Omnibus):,0.0,Jarque-Bera (JB):,408.807
Skew:,0.341,Prob(JB):,1.69e-89
Kurtosis:,3.242,Cond. No.,1010000.0
