In [1]:
# Import necessary modules
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
%matplotlib inline

In [2]:
# Homes contains data on home sales, zipcodes is for encoding cities later
homes = pd.read_csv('data/kc_house_data.csv')
zipcodes = pd.read_csv('data/zips.csv')

# Columns suggested to drop by project description
cols_to_drop = (['id','date','view','sqft_above','sqft_basement','yr_renovated',
                'lat','long','sqft_living15','sqft_lot15'])
homes = homes.drop(cols_to_drop,axis=1)

# Exchange the zipcode column for a city column based on zipcode
homes = homes.merge(zipcodes,how='left',on='zipcode').drop('zipcode',axis=1)
homes.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,condition,grade,yr_built,city
0,221900.0,3,1.0,1180,5650,1.0,,Average,7 Average,1955,Seattle
1,538000.0,3,2.25,2570,7242,2.0,NO,Average,7 Average,1951,Seattle
2,180000.0,2,1.0,770,10000,1.0,NO,Average,6 Low Average,1933,Kenmore
3,604000.0,4,3.0,1960,5000,1.0,NO,Very Good,7 Average,1965,Seattle
4,510000.0,3,2.0,1680,8080,1.0,NO,Average,8 Good,1987,Sammamish


In [3]:
# Check for N/A values
homes.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21597 entries, 0 to 21596
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   price        21597 non-null  float64
 1   bedrooms     21597 non-null  int64  
 2   bathrooms    21597 non-null  float64
 3   sqft_living  21597 non-null  int64  
 4   sqft_lot     21597 non-null  int64  
 5   floors       21597 non-null  float64
 6   waterfront   19221 non-null  object 
 7   condition    21597 non-null  object 
 8   grade        21597 non-null  object 
 9   yr_built     21597 non-null  int64  
 10  city         21597 non-null  object 
dtypes: float64(3), int64(4), object(4)
memory usage: 2.0+ MB


In [4]:
# Waterfront is the only column with N/A values,
homes.waterfront.value_counts()

NO     19075
YES      146
Name: waterfront, dtype: int64

In [5]:
# Replacing N/A with the mode
homes.waterfront.fillna('NO',inplace=True)

In [6]:
#Check basic statistics on the homes data
homes.describe()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,yr_built
count,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0
mean,540296.6,3.3732,2.115826,2080.32185,15099.41,1.494096,1970.999676
std,367368.1,0.926299,0.768984,918.106125,41412.64,0.539683,29.375234
min,78000.0,1.0,0.5,370.0,520.0,1.0,1900.0
25%,322000.0,3.0,1.75,1430.0,5040.0,1.0,1951.0
50%,450000.0,3.0,2.25,1910.0,7618.0,1.5,1975.0
75%,645000.0,4.0,2.5,2550.0,10685.0,2.0,1997.0
max,7700000.0,33.0,8.0,13540.0,1651359.0,3.5,2015.0


In [7]:
# One home has 33 bedrooms and only 1620 sq_ft living
homes[homes.bedrooms > 8]

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,condition,grade,yr_built,city
4092,599999.0,9,4.5,3830,6988,2.5,NO,Average,7 Average,1938,Seattle
4231,700000.0,9,3.0,3680,4400,2.0,NO,Average,7 Average,1908,Seattle
6073,1280000.0,9,4.5,3650,5000,2.0,NO,Average,8 Good,1915,Seattle
8537,450000.0,9,7.5,4050,6504,2.0,NO,Average,7 Average,1996,Seattle
8748,520000.0,11,3.0,3000,4960,2.0,NO,Average,7 Average,1918,Seattle
13301,1150000.0,10,5.25,4590,10920,1.0,NO,Average,9 Better,2008,Bellevue
15147,650000.0,10,2.0,3610,11914,2.0,NO,Good,7 Average,1958,Bellevue
15856,640000.0,33,1.75,1620,6000,1.0,NO,Very Good,7 Average,1947,Seattle
16830,1400000.0,9,4.0,4620,5508,2.5,NO,Average,11 Excellent,1915,Seattle
18428,934000.0,9,3.0,2820,4480,2.0,NO,Average,7 Average,1918,Seattle


In [8]:
# Treating the above case as a data entry issue and replacing it with the median 3
homes.loc[homes.bedrooms > 20,'bedrooms'] = 3

In [9]:
# Checking the values for column with object dtype
homes.grade.value_counts()

7 Average        8974
8 Good           6065
9 Better         2615
6 Low Average    2038
10 Very Good     1134
11 Excellent      399
5 Fair            242
12 Luxury          89
4 Low              27
13 Mansion         13
3 Poor              1
Name: grade, dtype: int64

In [10]:
# There is an ordered numeric in the column so this extracts it
homes.grade = homes.grade.apply(lambda x: int(x.split()[0]))

In [11]:
# Replaced the grade with the numeric value from above
homes.grade.value_counts()

7     8974
8     6065
9     2615
6     2038
10    1134
11     399
5      242
12      89
4       27
13      13
3        1
Name: grade, dtype: int64

In [12]:
# Waterfront is binary so I encode 1 = Yes, 0 = No
Ord = OrdinalEncoder()
waterfronts = Ord.fit_transform(homes[['waterfront']])

# Default output is np array
waterfronts = pd.DataFrame(waterfronts,columns=['waterfront'])

# Remove the un-encoded column from dataset
homes = homes.drop('waterfront',axis=1)

In [13]:
# Checking the values for column with object dtype
homes.condition.value_counts()

Average      14020
Good          5677
Very Good     1701
Fair           170
Poor            29
Name: condition, dtype: int64

In [14]:
# Condition is categorical so I one hot encode
conditions = pd.get_dummies(homes.condition, prefix='cond',drop_first=True).astype(int)
conditions.columns = conditions.columns.str.replace(" ","")

# Remove the un-encoded column from dataset
homes = homes.drop('condition',axis=1)


In [15]:
# Checking the values for column with object dtype
homes.city.value_counts()

Seattle          8973
Renton           1597
Bellevue         1407
Kent             1201
Redmond           977
Kirkland          977
Auburn            911
Sammamish         800
Federal_Way       779
Issaquah          733
Maple_Valley      589
Woodinville       471
Snoqualmie        308
Kenmore           283
Mercer_Island     282
Enumclaw          233
North_Bend        220
Bothell           195
Duvall            190
Carnation         124
Vashon            117
Black_Diamond     100
Fall_City          80
Medina             50
Name: city, dtype: int64

In [16]:
# City is categorical so I one hot encode
cities = pd.get_dummies(homes.city,prefix='city',drop_first=True).astype(int)

# Remove the un-encoded column from dataset
homes = homes.drop('city',axis=1)

In [17]:
# All of the categoricals dropped and string columns changed to numeric
homes.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21597 entries, 0 to 21596
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   price        21597 non-null  float64
 1   bedrooms     21597 non-null  int64  
 2   bathrooms    21597 non-null  float64
 3   sqft_living  21597 non-null  int64  
 4   sqft_lot     21597 non-null  int64  
 5   floors       21597 non-null  float64
 6   grade        21597 non-null  int64  
 7   yr_built     21597 non-null  int64  
dtypes: float64(3), int64(5)
memory usage: 1.5 MB


In [18]:
#pd.plotting.scatter_matrix(homes,figsize=(20,20));

In [19]:
#sns.heatmap(homes.corr())

In [20]:
# Re-add all of the categorical columns in their encoded forms
homes = pd.concat([homes,waterfronts,conditions,cities],axis=1)
homes = sm.add_constant(homes)

In [21]:
# Lots of encoded categoricals
homes.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21597 entries, 0 to 21596
Data columns (total 37 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   const               21597 non-null  float64
 1   price               21597 non-null  float64
 2   bedrooms            21597 non-null  int64  
 3   bathrooms           21597 non-null  float64
 4   sqft_living         21597 non-null  int64  
 5   sqft_lot            21597 non-null  int64  
 6   floors              21597 non-null  float64
 7   grade               21597 non-null  int64  
 8   yr_built            21597 non-null  int64  
 9   waterfront          21597 non-null  float64
 10  cond_Fair           21597 non-null  int64  
 11  cond_Good           21597 non-null  int64  
 12  cond_Poor           21597 non-null  int64  
 13  cond_VeryGood       21597 non-null  int64  
 14  city_Bellevue       21597 non-null  int64  
 15  city_Black_Diamond  21597 non-null  int64  
 16  city

In [22]:
X = homes.drop('price',axis=1)
y = homes.price

X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=75,random_state=42)

In [23]:
# SKLearn model
sk_model = LinearRegression().fit(X_train,y_train)
print(sk_model.score(X_train,y_train))
print(sk_model.score(X_test,y_test))

0.8682783972638124
0.5866712367848487


In [24]:
# Statsmodels model
sm_model = sm.OLS(y_train,X_train).fit()
sm_model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.868
Model:,OLS,Adj. R-squared:,0.797
Method:,Least Squares,F-statistic:,12.17
Date:,"Thu, 17 Feb 2022",Prob (F-statistic):,1.84e-13
Time:,16:08:14,Log-Likelihood:,-1001.6
No. Observations:,75,AIC:,2057.0
Df Residuals:,48,BIC:,2120.0
Df Model:,26,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.621e+06,2.21e+06,2.540,0.014,1.17e+06,1.01e+07
bedrooms,-5.615e+04,3.97e+04,-1.416,0.163,-1.36e+05,2.36e+04
bathrooms,6.11e+04,6.32e+04,0.967,0.339,-6.6e+04,1.88e+05
sqft_living,77.2603,60.850,1.270,0.210,-45.086,199.607
sqft_lot,0.9678,1.489,0.650,0.519,-2.026,3.962
floors,2478.3088,6.26e+04,0.040,0.969,-1.23e+05,1.28e+05
grade,1.712e+05,3.82e+04,4.485,0.000,9.44e+04,2.48e+05
yr_built,-3389.4936,1147.355,-2.954,0.005,-5696.405,-1082.582
waterfront,1.637e+06,2.4e+05,6.833,0.000,1.16e+06,2.12e+06

0,1,2,3
Omnibus:,22.824,Durbin-Watson:,2.085
Prob(Omnibus):,0.0,Jarque-Bera (JB):,39.196
Skew:,1.131,Prob(JB):,3.08e-09
Kurtosis:,5.725,Cond. No.,1.16e+16
