In [1]:
# Import necessary modules
import pandas as pd
from datetime import datetime as dt
import statsmodels.api as sm
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm


In [2]:
# Load the raw sale records plus the zipcode lookup table that supplies
# the `city` column joined in below.
homes = pd.read_csv('data/kc_house_data.csv')
zipcodes = pd.read_csv('data/zips.csv')

# Columns the project description suggests leaving out of the model.
cols_to_drop = [
    'id', 'date', 'sqft_above', 'sqft_basement',
    'lat', 'long', 'sqft_living15', 'sqft_lot15',
]
homes = homes.drop(columns=cols_to_drop)

# Left-join the lookup table on zipcode (keeps every sale row), then
# discard the zipcode itself so only the joined columns remain.
homes = homes.merge(zipcodes, how='left', on='zipcode')
homes = homes.drop(columns='zipcode')

In [3]:
# Overwrite any bedroom count above 20 with 3.
# NOTE(review): presumably these are data-entry errors -- confirm the replacement value.
homes.loc[homes['bedrooms'].gt(20), 'bedrooms'] = 3

In [4]:
# Encode the waterfront flag as 0/1; a missing value is treated as 'NO'.
waterfront_rule = {
    'NO': 0,
    'YES': 1,
}
waterfronts = homes['waterfront'].fillna('NO').map(waterfront_rule)
homes['waterfront'] = waterfronts

In [5]:
# Encode the view rating on a 0-4 scale; a missing value is treated as 'NONE'.
view_rule = {
    'NONE': 0,
    'FAIR': 1,
    'AVERAGE': 2,
    'GOOD': 3,
    'EXCELLENT': 4,
}
views = homes['view'].fillna('NONE').map(view_rule)
homes['view'] = views

In [6]:
# Encode the condition label on a 0-4 scale. No fill is applied here, so
# any label absent from the mapping comes out of `map` as NaN.
condition_rule = {
    'Poor': 0,
    'Fair': 1,
    'Average': 2,
    'Good': 3,
    'Very Good': 4,
}
conditions = homes['condition'].map(condition_rule)
homes['condition'] = conditions

In [7]:
# Keep only the leading whitespace-delimited token of each grade label
# and store it as an integer.
grades = homes['grade'].map(lambda label: int(label.split()[0]))
homes['grade'] = grades

In [8]:
# A missing renovation year becomes 0, which is always earlier than the
# build year, so since_reno() falls back to yr_built for those homes.
# The filled column is assigned back explicitly: calling
# fillna(inplace=True) on `homes.yr_renovated` operates on an intermediate
# Series, which pandas 2.2 warns about and which no longer updates `homes`
# under Copy-on-Write (pandas 3), leaving the NaNs in place.
homes['yr_renovated'] = homes['yr_renovated'].fillna(0)
def since_reno(home):
    """Return the years elapsed since the home was last built or renovated.

    Parameters
    ----------
    home : object with ``yr_built`` and ``yr_renovated`` attributes
        For example a single DataFrame row.

    Returns
    -------
    The current calendar year minus ``yr_renovated``, unless
    ``yr_renovated`` is earlier than ``yr_built`` (e.g. the 0 placeholder),
    in which case ``yr_built`` is used instead.
    """
    renovated_before_built = home.yr_renovated < home.yr_built
    last_work_year = home.yr_built if renovated_before_built else home.yr_renovated
    return dt.today().year - last_work_year
# Evaluate since_reno once per row, then retire the raw renovation year
# now that it has been folded into the new feature.
homes['since_reno'] = homes.apply(since_reno, axis=1)
homes = homes.drop(columns='yr_renovated')

In [9]:
# Age is measured from the build year to the calendar year in which the
# notebook is executed; the raw build year is then dropped.
current_year = dt.today().year
homes['age'] = current_year - homes['yr_built']
homes = homes.drop(columns='yr_built')

In [10]:
# One-hot encode the city name. drop_first=True leaves out the first city
# level, which then serves as the baseline category.
# dtype is pinned to uint8 (the get_dummies default before pandas 2.0):
# newer pandas returns bool columns, and a frame mixing bool with numeric
# columns makes statsmodels' OLS fail with
# "Pandas data cast to numpy dtype of object".
cities = pd.get_dummies(
    homes['city'],
    prefix='city',
    sparse=False,
    drop_first=True,
    dtype='uint8',
)
# The text column is no longer needed once the indicators exist.
homes = homes.drop(columns='city')

In [11]:
# Attach the city indicator columns side by side with the other features,
# then let statsmodels prepend its `const` column.
homes = sm.add_constant(pd.concat([homes, cities], axis='columns'))

# Separate the target (sale price) from the predictors.
y = homes['price']
X = homes.drop(columns='price')

# Fixed random_state keeps the train/test partition the same on every run.
split = train_test_split(X, y, random_state=271828)
X_train, X_test, y_train, y_test = split

In [12]:
# Fit a scikit-learn linear regression on the training split and print
# its score on the training data first, then on the held-out test data.
regressor = LinearRegression()
regressor.fit(X_train, y_train)
train_score = regressor.score(X_train, y_train)
test_score = regressor.score(X_test, y_test)
print(train_score)
print(test_score)

0.7277558032321243
0.7619816269525088


In [13]:
# Cross-validate a fresh, unfitted model on the training split only and
# display the per-fold test scores.
cv_regressor = LinearRegression()
cv_results = cross_validate(cv_regressor, X_train, y_train)
cv_results['test_score']

array([0.74265208, 0.72386329, 0.72707178, 0.70938234, 0.71854113])

In [14]:
# Refit the same training data with statsmodels to get the full
# regression summary table (coefficients, standard errors, diagnostics).
ols_model = sm.OLS(endog=y_train, exog=X_train)
sm_regressor = ols_model.fit()
sm_regressor.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.728
Model:,OLS,Adj. R-squared:,0.727
Method:,Least Squares,F-statistic:,1271.0
Date:,"Fri, 18 Feb 2022",Prob (F-statistic):,0.0
Time:,15:38:42,Log-Likelihood:,-220160.0
No. Observations:,16197,AIC:,440400.0
Df Residuals:,16162,BIC:,440700.0
Df Model:,34,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-8.996e+05,1.9e+04,-47.445,0.000,-9.37e+05,-8.62e+05
bedrooms,-4.094e+04,2221.791,-18.427,0.000,-4.53e+04,-3.66e+04
bathrooms,3.622e+04,3604.616,10.048,0.000,2.92e+04,4.33e+04
sqft_living,175.8060,3.478,50.552,0.000,168.989,182.623
sqft_lot,0.0494,0.039,1.260,0.208,-0.027,0.126
floors,3.079e+04,3681.201,8.365,0.000,2.36e+04,3.8e+04
waterfront,6.048e+05,2.02e+04,29.897,0.000,5.65e+05,6.44e+05
view,4.337e+04,2316.014,18.728,0.000,3.88e+04,4.79e+04
condition,2.313e+04,2631.393,8.790,0.000,1.8e+04,2.83e+04

0,1,2,3
Omnibus:,14326.831,Durbin-Watson:,2.013
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1794755.841
Skew:,3.732,Prob(JB):,0.0
Kurtosis:,54.026,Cond. No.,1180000.0
