# King County Housing
#### House Price Estimate

**Authors:** Hatice Kastan, Czarina Luna, Ross McKim, Weston Shuken

##### January 2022

***

![image](Images/daria-nepriakhina-LZkbXfzJK4M-unsplash.jpg)

## Overview

    Overview of our project.

## Business Problem

    Stakeholder is a real estate company.
    Business Problem is predicting price and building a house price calculator.

## Data Understanding
    Describe the data being used for this project.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from pandas.api.types import is_numeric_dtype

In [2]:
raw_data = pd.read_csv('Data/kc_house_data.csv')
raw_data.head(2)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,10/13/2014,221900.0,3,1.0,1180,5650,1.0,,NONE,...,7 Average,1180,0.0,1955,0.0,98178,47.5112,-122.257,1340,5650
1,6414100192,12/9/2014,538000.0,3,2.25,2570,7242,2.0,NO,NONE,...,7 Average,2170,400.0,1951,1991.0,98125,47.721,-122.319,1690,7639


## Data Cleaning
    Describe and justify the process for preparing the data for analysis.

In [3]:
# Data prep and cleaning
clean_data = raw_data.copy()

# Change to datetime and add month column
clean_data['date'] = pd.to_datetime(clean_data['date'])
clean_data['month'] = pd.DatetimeIndex(clean_data['date']).month

# Change waterfront missing value to No
clean_data.loc[raw_data.waterfront.isnull(), 'waterfront'] = 'NO'
clean_data['waterfront'] = clean_data['waterfront'].apply(lambda x: 0 if x == 'NO' else 1)

# Change view missing value to None
clean_data.loc[raw_data.view.isnull(), 'view'] = "NONE"
view_dict = {'NONE':0, 'FAIR':1, 'AVERAGE':2, 'GOOD':3, 'EXCELLENT':4}
clean_data['view'].replace(view_dict, inplace=True)

# Change condition to numerical value
cond_dict = {'Poor':0, 'Fair':1, 'Average':2, 'Good':3, 'Very Good':4}
clean_data['condition'].replace(cond_dict, inplace=True)

# Change grade to numerical value
clean_data['grade'] = clean_data['grade'].map(lambda x: int(x.split(' ')[0]))

# Add has_basement column
clean_data['sqft_basement'] = clean_data['sqft_basement'].replace('?', '0').astype('float')
clean_data['basement'] = clean_data['sqft_basement'].apply(lambda x: 0 if x == 0 else 1)

# Change some yr_renovated missing value to 0 and add renovated column
clean_data.loc[raw_data.yr_renovated.isnull(), 'yr_renovated'] = 0
clean_data['renovated'] = clean_data['yr_renovated'].apply(lambda x: 0 if x == 0 else 1)

# Add house_age column
clean_data['age'] = clean_data['date'].dt.year - clean_data['yr_built']

In [4]:
def corr_check(df, threshold):
    '''
    Enter dataframe and threshold for correlation
    Returns table of the highly correlated pairs
    '''
    corr_df = df.corr().abs().stack().reset_index().sort_values(0, ascending=False)
    corr_df['pairs'] = list(zip(corr_df.level_0, corr_df.level_1))
    corr_df.set_index(['pairs'], inplace = True)
    corr_df.drop(columns=['level_1', 'level_0'], inplace = True)
    corr_df.columns = ['cc']
    corr_df = corr_df.drop_duplicates()
    corr_df = corr_df[(corr_df['cc'] > threshold) & (corr_df['cc'] < 1)]
    return corr_df

corr_check(clean_data, .7)

Unnamed: 0_level_0,cc
pairs,Unnamed: 1_level_1
"(yr_renovated, renovated)",0.999968
"(yr_built, age)",0.999873
"(sqft_living, sqft_above)",0.876448
"(basement, sqft_basement)",0.820893
"(grade, sqft_living)",0.762779
"(sqft_living15, sqft_living)",0.756402
"(grade, sqft_above)",0.756073
"(bathrooms, sqft_living)",0.755758
"(sqft_living15, sqft_above)",0.731767
"(sqft_lot15, sqft_lot)",0.718204


In [5]:
# Drop columns
clean_data = clean_data.drop(columns=['id', 'date', 'yr_renovated', 'sqft_above', 'sqft_basement',
                      'yr_built', 'yr_renovated'])

In [6]:
clean_data.head(2)

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,zipcode,lat,long,sqft_living15,sqft_lot15,month,basement,renovated,age
0,221900.0,3,1.0,1180,5650,1.0,0,0,2,7,98178,47.5112,-122.257,1340,5650,10,0,0,59
1,538000.0,3,2.25,2570,7242,2.0,0,0,2,7,98125,47.721,-122.319,1690,7639,12,1,1,63


## Data Exploration
    Generate insights and visualizations about price and its relationships with variables.

In [7]:
clean_data.describe()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,zipcode,lat,long,sqft_living15,sqft_lot15,month,basement,renovated,age
count,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0
mean,540296.6,3.3732,2.115826,2080.32185,15099.41,1.494096,0.00676,0.233181,2.409825,7.657915,98077.951845,47.560093,-122.213982,1986.620318,12758.283512,6.573969,0.3851,0.034449,43.323286
std,367368.1,0.926299,0.768984,918.106125,41412.64,0.539683,0.081944,0.764673,0.650546,1.1732,53.513072,0.138552,0.140724,685.230472,27274.44195,3.115061,0.48663,0.182384,29.377285
min,78000.0,1.0,0.5,370.0,520.0,1.0,0.0,0.0,0.0,3.0,98001.0,47.1559,-122.519,399.0,651.0,1.0,0.0,0.0,-1.0
25%,322000.0,3.0,1.75,1430.0,5040.0,1.0,0.0,0.0,2.0,7.0,98033.0,47.4711,-122.328,1490.0,5100.0,4.0,0.0,0.0,18.0
50%,450000.0,3.0,2.25,1910.0,7618.0,1.5,0.0,0.0,2.0,7.0,98065.0,47.5718,-122.231,1840.0,7620.0,6.0,0.0,0.0,40.0
75%,645000.0,4.0,2.5,2550.0,10685.0,2.0,0.0,0.0,3.0,8.0,98118.0,47.678,-122.125,2360.0,10083.0,9.0,1.0,0.0,63.0
max,7700000.0,33.0,8.0,13540.0,1651359.0,3.5,1.0,4.0,4.0,13.0,98199.0,47.7776,-121.315,6210.0,871200.0,12.0,1.0,1.0,115.0


In [8]:
clean_data.loc[(clean_data.bedrooms == 33), 'bedrooms'] = 3

## Feature Engineering
    Create new variables to predict the price.

In [9]:
from haversine import haversine, Unit

seattle = (47.608013, -122.335167)
redmond = (47.673988, -122.121513)

def get_dist(center, lat, long):
    house = (lat, long)
    return haversine(center, house, unit=Unit.MILES)

feature_data = clean_data.copy()

# Add distance_from_seattle column
feature_data['distance'] = clean_data.apply(lambda x: get_dist(seattle, x.lat, x.long), axis=1)

# Add distance_from_redmond column
feature_data['dist_redmond'] = clean_data.apply(lambda x: get_dist(redmond, x.lat, x.long), axis=1)

### Feature Scaling
    Perform log transformation and standardization.

In [10]:
# Log Transform
log_transform = ['sqft_living', 'distance', 'dist_redmond']
log_data = feature_data.copy()

for col in log_transform:
    log_data[col] = np.log(log_data[col])

In [11]:
# from sklearn.preprocessing import StandardScaler 
# scaler = StandardScaler()

# X_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns, index = X_train.index)

# model = sm.OLS(y_train, sm.add_constant(X_scaled)).fit()
# model.summary()

## Data Modeling
    Describe and justify the process for modeling the data.
    Run multiple linear regression on top ranking features.

In [12]:
lr = LinearRegression()

In [13]:
def get_y_X(data, target):
    y = data[target] 
    X = data.drop(target, axis=1)
    return y, X

In [14]:
def train_test(y, X):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=42)
    return X_train, X_test, y_train, y_test

In [15]:
# def get_coef()

In [16]:
def model_summary(y, X):
    model = sm.OLS(y, sm.add_constant(X)).fit()
    return model.summary()

In [17]:
def get_metrics(y, X):
    model = sm.OLS(y, sm.add_constant(X)).fit()
    rsquared = model.rsquared
    adj_r = model.rsquared_adj
    mse = model.mse_resid
    rmse = np.sqrt(mse)
    return rsquared, adj_r, mse, rmse

In [18]:
def train_test_compare(X_tr, X_te, y_tr, y_te):
    model = lr.fit(X_tr, y_tr)
    
    train_score = lr.score(X_tr, y_tr)
    test_score = lr.score(X_te, y_te)
    
    y_hat_train = lr.predict(X_tr)
    y_hat_test = lr.predict(X_te)
    
    train_rmse = np.sqrt(mean_squared_error(y_tr, y_hat_train))
    test_rmse = np.sqrt(mean_squared_error(y_te, y_hat_test))
    
    print(f'Training data R2: {train_score}\nTesting data R2: {test_score}\nTraining data RMSE: {train_rmse}\nTesting data RMSE: {test_rmse}')

### Baseline Model
    Run simple linear regression on feature highest correlated with price.

In [19]:
baseline = raw_data.drop(columns=['date', 'waterfront', 'view', 'condition', 'grade', 'sqft_basement']).dropna()

y, X = get_y_X(baseline, 'price')
X_train, X_test, y_train, y_test = train_test(y, X)
train_test_compare(X_train, X_test, y_train, y_test)

Training data R2: 0.6319749427235021
Testing data R2: 0.6181910518762154
Training data RMSE: 226292.82659600404
Testing data RMSE: 221129.32483148924


In [20]:
model_summary(y_train, X_train)

0,1,2,3
Dep. Variable:,price,R-squared:,0.632
Model:,OLS,Adj. R-squared:,0.632
Method:,Least Squares,F-statistic:,1631.0
Date:,"Tue, 04 Jan 2022",Prob (F-statistic):,0.0
Time:,20:02:00,Log-Likelihood:,-183080.0
No. Observations:,13316,AIC:,366200.0
Df Residuals:,13301,BIC:,366300.0
Df Model:,14,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.212e+07,4.1e+06,-2.959,0.003,-2.02e+07,-4.09e+06
id,-6.141e-08,6.91e-07,-0.089,0.929,-1.42e-06,1.29e-06
bedrooms,-6.865e+04,2785.413,-24.647,0.000,-7.41e+04,-6.32e+04
bathrooms,6.527e+04,4660.696,14.004,0.000,5.61e+04,7.44e+04
sqft_living,238.6753,6.043,39.496,0.000,226.830,250.520
sqft_lot,0.1020,0.069,1.476,0.140,-0.034,0.238
floors,1.488e+04,5129.298,2.901,0.004,4827.575,2.49e+04
sqft_above,35.3143,6.060,5.828,0.000,23.437,47.192
yr_built,-2382.3015,95.884,-24.846,0.000,-2570.247,-2194.356

0,1,2,3
Omnibus:,11106.714,Durbin-Watson:,1.988
Prob(Omnibus):,0.0,Jarque-Bera (JB):,836006.897
Skew:,3.549,Prob(JB):,0.0
Kurtosis:,41.163,Cond. No.,11300000000000.0


##### Model Metrics Table
    Create table of metrics we care about, and update with every additional model after.

In [21]:
metric_table = pd.DataFrame(columns = ['model', 'rsquared', 'adj_rsquared', 'mse', 'rmse'])

In [22]:
def update_metrics(model_name, y, X):
    rsquared, adj_r, mse, rmse = get_metrics(y, X)
    return metric_table.append({'model': model_name, 'rsquared': rsquared, 'adj_rsquared': adj_r, 'mse': mse, 'rmse': rmse}, ignore_index = True)

In [23]:
metric_table = update_metrics('Preprocessed Data', y_train, X_train)
metric_table

Unnamed: 0,model,rsquared,adj_rsquared,mse,rmse
0,Preprocessed Data,0.631975,0.631588,51266190000.0,226420.389785


### Clean Data

In [24]:
# Run linear regression on clean data

y, X = get_y_X(clean_data, 'price')
X_train, X_test, y_train, y_test = train_test(y, X)
train_test_compare(X_train, X_test, y_train, y_test)

Training data R2: 0.704211962008072
Testing data R2: 0.6907747019134008
Training data RMSE: 199235.56620994594
Testing data RMSE: 205974.80762608827


In [25]:
model_summary(y_train, X_train)

0,1,2,3
Dep. Variable:,price,R-squared:,0.704
Model:,OLS,Adj. R-squared:,0.704
Method:,Least Squares,F-statistic:,2140.0
Date:,"Tue, 04 Jan 2022",Prob (F-statistic):,0.0
Time:,20:02:00,Log-Likelihood:,-220620.0
No. Observations:,16197,AIC:,441300.0
Df Residuals:,16178,BIC:,441400.0
Df Model:,18,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,8.119e+05,3.31e+06,0.245,0.806,-5.68e+06,7.31e+06
bedrooms,-3.693e+04,2259.080,-16.349,0.000,-4.14e+04,-3.25e+04
bathrooms,4.306e+04,3791.358,11.356,0.000,3.56e+04,5.05e+04
sqft_living,166.6275,3.946,42.230,0.000,158.894,174.362
sqft_lot,0.0918,0.055,1.669,0.095,-0.016,0.200
floors,1.342e+04,3960.996,3.388,0.001,5654.704,2.12e+04
waterfront,6.271e+05,2.06e+04,30.385,0.000,5.87e+05,6.68e+05
view,5.056e+04,2410.439,20.974,0.000,4.58e+04,5.53e+04
condition,2.852e+04,2677.751,10.649,0.000,2.33e+04,3.38e+04

0,1,2,3
Omnibus:,13301.078,Durbin-Watson:,1.997
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1080776.024
Skew:,3.434,Prob(JB):,0.0
Kurtosis:,42.424,Cond. No.,213000000.0


In [26]:
metric_table = update_metrics('Clean Data', y_train, X_train)
metric_table

Unnamed: 0,model,rsquared,adj_rsquared,mse,rmse
0,Preprocessed Data,0.631975,0.631588,51266190000.0,226420.389785
1,Clean Data,0.704212,0.703883,39741430000.0,199352.526433


### Feature Engineering Data

In [27]:
# Run linear regression on feature engineered data
y, X = get_y_X(feature_data, 'price')
X_train, X_test, y_train, y_test = train_test(y, X)
train_test_compare(X_train, X_test, y_train, y_test)

Training data R2: 0.7390806134377373
Testing data R2: 0.7249719797557668
Training data RMSE: 187124.10641765405
Testing data RMSE: 194251.80760229845


In [28]:
model_summary(y_train, X_train)

0,1,2,3
Dep. Variable:,price,R-squared:,0.739
Model:,OLS,Adj. R-squared:,0.739
Method:,Least Squares,F-statistic:,2291.0
Date:,"Tue, 04 Jan 2022",Prob (F-statistic):,0.0
Time:,20:02:00,Log-Likelihood:,-219610.0
No. Observations:,16197,AIC:,439300.0
Df Residuals:,16176,BIC:,439400.0
Df Model:,20,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.355e+08,4.63e+06,29.285,0.000,1.26e+08,1.45e+08
bedrooms,-3.332e+04,2128.223,-15.655,0.000,-3.75e+04,-2.91e+04
bathrooms,3.459e+04,3566.637,9.698,0.000,2.76e+04,4.16e+04
sqft_living,179.1706,3.716,48.216,0.000,171.887,186.454
sqft_lot,0.1728,0.052,3.340,0.001,0.071,0.274
floors,-2.665e+04,3875.940,-6.877,0.000,-3.43e+04,-1.91e+04
waterfront,6.624e+05,1.94e+04,34.066,0.000,6.24e+05,7.01e+05
view,4.917e+04,2271.422,21.649,0.000,4.47e+04,5.36e+04
condition,3.051e+04,2516.202,12.126,0.000,2.56e+04,3.54e+04

0,1,2,3
Omnibus:,13874.848,Durbin-Watson:,1.995
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1364739.182
Skew:,3.616,Prob(JB):,0.0
Kurtosis:,47.383,Cond. No.,316000000.0


In [29]:
metric_table = update_metrics('Distance to Cities Data', y_train, X_train)
metric_table

Unnamed: 0,model,rsquared,adj_rsquared,mse,rmse
0,Preprocessed Data,0.631975,0.631588,51266190000.0,226420.389785
1,Clean Data,0.704212,0.703883,39741430000.0,199352.526433
2,Distance to Cities Data,0.739081,0.738758,35060890000.0,187245.531111


#### Log Transform Data

In [30]:
# Run linear regression on log transformed data
y, X = get_y_X(log_data, 'price')
X_train, X_test, y_train, y_test = train_test(y, X)
train_test_compare(X_train, X_test, y_train, y_test)

Training data R2: 0.7137096054016371
Testing data R2: 0.6854924958149243
Training data RMSE: 196010.77852200627
Testing data RMSE: 207726.59561094662


In [31]:
model_summary(y_train, X_train)

0,1,2,3
Dep. Variable:,price,R-squared:,0.714
Model:,OLS,Adj. R-squared:,0.713
Method:,Least Squares,F-statistic:,2016.0
Date:,"Tue, 04 Jan 2022",Prob (F-statistic):,0.0
Time:,20:02:00,Log-Likelihood:,-220360.0
No. Observations:,16197,AIC:,440800.0
Df Residuals:,16176,BIC:,440900.0
Df Model:,20,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,7.373e+07,3.81e+06,19.346,0.000,6.63e+07,8.12e+07
bedrooms,-1.339e+04,2323.613,-5.763,0.000,-1.79e+04,-8836.542
bathrooms,7.675e+04,3675.574,20.882,0.000,6.95e+04,8.4e+04
sqft_living,1.54e+05,8565.225,17.976,0.000,1.37e+05,1.71e+05
sqft_lot,0.3028,0.054,5.599,0.000,0.197,0.409
floors,-4.005e+04,4130.984,-9.694,0.000,-4.81e+04,-3.19e+04
waterfront,7.006e+05,2.03e+04,34.436,0.000,6.61e+05,7.4e+05
view,5.553e+04,2372.567,23.404,0.000,5.09e+04,6.02e+04
condition,3.127e+04,2639.153,11.847,0.000,2.61e+04,3.64e+04

0,1,2,3
Omnibus:,16537.094,Durbin-Watson:,1.989
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3088764.682
Skew:,4.653,Prob(JB):,0.0
Kurtosis:,70.009,Cond. No.,249000000.0


In [32]:
metric_table = update_metrics('Logarithm Data', y_train, X_train)
metric_table

Unnamed: 0,model,rsquared,adj_rsquared,mse,rmse
0,Preprocessed Data,0.631975,0.631588,51266190000.0,226420.389785
1,Clean Data,0.704212,0.703883,39741430000.0,199352.526433
2,Distance to Cities Data,0.739081,0.738758,35060890000.0,187245.531111
3,Logarithm Data,0.71371,0.713356,38470100000.0,196137.969771


#### Data with Categorical Variables

In [33]:
# Get dummies
zipcode_dummies = pd.get_dummies(feature_data['zipcode'], drop_first=True)
waterfront_dummies = pd.get_dummies(feature_data['waterfront'], drop_first=True)
view_dummies = pd.get_dummies(feature_data['view'], drop_first=True)
month_dummies = pd.get_dummies(feature_data['month'], drop_first=True)

dummy_data = pd.concat([feature_data, waterfront_dummies, view_dummies, month_dummies, zipcode_dummies], axis=1)
dummy_data.drop(columns=['zipcode', 'waterfront', 'view', 'month'], inplace=True)

In [34]:
# Run linear regression on categorical data

y, X = get_y_X(dummy_data, 'price')
X_train, X_test, y_train, y_test = train_test(y, X)
train_test_compare(X_train, X_test, y_train, y_test)

Training data R2: 0.81572383248587
Testing data R2: 0.7986094792214566
Training data RMSE: 157257.4216323339
Testing data RMSE: 166224.91133449395


In [35]:
model_summary(y_train, X_train)

0,1,2,3
Dep. Variable:,price,R-squared:,0.816
Model:,OLS,Adj. R-squared:,0.815
Method:,Least Squares,F-statistic:,705.4
Date:,"Tue, 04 Jan 2022",Prob (F-statistic):,0.0
Time:,20:02:03,Log-Likelihood:,-216790.0
No. Observations:,16197,AIC:,433800.0
Df Residuals:,16095,BIC:,434600.0
Df Model:,101,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,6.575e+07,9.99e+06,6.581,0.000,4.62e+07,8.53e+07
bedrooms,-2.758e+04,1814.689,-15.200,0.000,-3.11e+04,-2.4e+04
bathrooms,2.653e+04,3030.438,8.755,0.000,2.06e+04,3.25e+04
sqft_living,185.1143,3.194,57.964,0.000,178.855,191.374
sqft_lot,0.2188,0.044,4.994,0.000,0.133,0.305
floors,-3.941e+04,3463.280,-11.380,0.000,-4.62e+04,-3.26e+04
condition,2.553e+04,2178.583,11.718,0.000,2.13e+04,2.98e+04
grade,6.234e+04,2047.470,30.449,0.000,5.83e+04,6.64e+04
lat,3.735e+04,8.31e+04,0.449,0.653,-1.26e+05,2e+05

0,1,2,3
Omnibus:,15544.991,Durbin-Watson:,2.013
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3201962.794
Skew:,4.124,Prob(JB):,0.0
Kurtosis:,71.385,Cond. No.,401000000.0


In [36]:
metric_table = update_metrics('Categorical Data', y_train, X_train)
metric_table

Unnamed: 0,model,rsquared,adj_rsquared,mse,rmse
0,Preprocessed Data,0.631975,0.631588,51266190000.0,226420.389785
1,Clean Data,0.704212,0.703883,39741430000.0,199352.526433
2,Distance to Cities Data,0.739081,0.738758,35060890000.0,187245.531111
3,Logarithm Data,0.71371,0.713356,38470100000.0,196137.969771
4,Categorical Data,0.815724,0.814567,24886620000.0,157754.934026


### Luxury Houses

In [37]:
# # Run linear regression on luxury houses
# luxury = dummy_data.loc[(dummy_data['price'] >= 1_000_000)]

# y, X = get_y_X(luxury, 'price')
# X_train, X_test, y_train, y_test = train_test(y, X)
# train_test_compare(X_train, X_test, y_train, y_test)

In [38]:
# Run linear regression on luxury houses
luxury = dummy_data.loc[(dummy_data['grade'] >= 10)]

y, X = get_y_X(luxury, 'price')
X_train, X_test, y_train, y_test = train_test(y, X)
train_test_compare(X_train, X_test, y_train, y_test)

Training data R2: 0.8218342239562322
Testing data R2: 0.7180003760666636
Training data RMSE: 305439.03152980853
Testing data RMSE: 347783.9271802332


In [39]:
model_summary(y_train, X_train)

0,1,2,3
Dep. Variable:,price,R-squared:,0.822
Model:,OLS,Adj. R-squared:,0.807
Method:,Least Squares,F-statistic:,56.15
Date:,"Tue, 04 Jan 2022",Prob (F-statistic):,0.0
Time:,20:02:05,Log-Likelihood:,-17223.0
No. Observations:,1226,AIC:,34630.0
Df Residuals:,1132,BIC:,35120.0
Df Model:,93,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.092e+08,9.8e+07,-1.115,0.265,-3.02e+08,8.31e+07
bedrooms,-5.86e+04,1.42e+04,-4.129,0.000,-8.64e+04,-3.08e+04
bathrooms,1.168e+05,1.82e+04,6.418,0.000,8.11e+04,1.53e+05
sqft_living,266.5405,15.896,16.768,0.000,235.352,297.729
sqft_lot,0.5502,0.194,2.838,0.005,0.170,0.931
floors,-7.864e+04,2.76e+04,-2.850,0.004,-1.33e+05,-2.45e+04
condition,5.275e+04,2.34e+04,2.256,0.024,6880.547,9.86e+04
grade,1.415e+05,1.96e+04,7.235,0.000,1.03e+05,1.8e+05
lat,1.036e+06,7.34e+05,1.411,0.158,-4.04e+05,2.48e+06

0,1,2,3
Omnibus:,395.43,Durbin-Watson:,1.953
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3363.879
Skew:,1.245,Prob(JB):,0.0
Kurtosis:,10.724,Cond. No.,1.15e+16


In [40]:
metric_table = update_metrics('Luxury Houses Data', y_train, X_train)
metric_table

Unnamed: 0,model,rsquared,adj_rsquared,mse,rmse
0,Preprocessed Data,0.631975,0.631588,51266190000.0,226420.389785
1,Clean Data,0.704212,0.703883,39741430000.0,199352.526433
2,Distance to Cities Data,0.739081,0.738758,35060890000.0,187245.531111
3,Logarithm Data,0.71371,0.713356,38470100000.0,196137.969771
4,Categorical Data,0.815724,0.814567,24886620000.0,157754.934026
5,Luxury Houses Data,0.821834,0.807197,101039900000.0,317867.814313


### Non-Luxury Houses

In [41]:
# # Run linear regression on feature engineered data
# non_luxury = dummy_data.loc[(dummy_data['price'] < 1_000_000)]

# y, X = get_y_X(non_luxury, 'price')
# X_train, X_test, y_train, y_test = train_test(y, X)
# train_test_compare(X_train, X_test, y_train, y_test)

In [42]:
# Run linear regression on feature engineered data
non_luxury = dummy_data.loc[(dummy_data['grade'] < 10)]

y, X = get_y_X(non_luxury, 'price')
X_train, X_test, y_train, y_test = train_test(y, X)
train_test_compare(X_train, X_test, y_train, y_test)

Training data R2: 0.8075227458936017
Testing data R2: 0.8057088370981149
Training data RMSE: 106703.86681404996
Testing data RMSE: 107773.67698077479


In [43]:
model_summary(y_train, X_train)

0,1,2,3
Dep. Variable:,price,R-squared:,0.808
Model:,OLS,Adj. R-squared:,0.806
Method:,Least Squares,F-statistic:,617.6
Date:,"Tue, 04 Jan 2022",Prob (F-statistic):,0.0
Time:,20:02:06,Log-Likelihood:,-194570.0
No. Observations:,14971,AIC:,389400.0
Df Residuals:,14869,BIC:,390100.0
Df Model:,101,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,6.049e+07,7.03e+06,8.610,0.000,4.67e+07,7.43e+07
bedrooms,-8830.3065,1308.251,-6.750,0.000,-1.14e+04,-6265.972
bathrooms,1.329e+04,2187.926,6.073,0.000,8997.997,1.76e+04
sqft_living,123.1117,2.498,49.288,0.000,118.216,128.008
sqft_lot,0.2693,0.035,7.769,0.000,0.201,0.337
floors,-1.168e+04,2482.764,-4.706,0.000,-1.65e+04,-6816.165
condition,2.783e+04,1511.523,18.411,0.000,2.49e+04,3.08e+04
grade,4.274e+04,1616.036,26.448,0.000,3.96e+04,4.59e+04
lat,-8.002e+04,6e+04,-1.333,0.182,-1.98e+05,3.76e+04

0,1,2,3
Omnibus:,7581.362,Durbin-Watson:,1.979
Prob(Omnibus):,0.0,Jarque-Bera (JB):,184731.588
Skew:,1.913,Prob(JB):,0.0
Kurtosis:,19.778,Cond. No.,354000000.0


In [44]:
metric_table = update_metrics('Non-Luxury Houses Data', y_train, X_train)
metric_table

Unnamed: 0,model,rsquared,adj_rsquared,mse,rmse
0,Preprocessed Data,0.631975,0.631588,51266190000.0,226420.389785
1,Clean Data,0.704212,0.703883,39741430000.0,199352.526433
2,Distance to Cities Data,0.739081,0.738758,35060890000.0,187245.531111
3,Logarithm Data,0.71371,0.713356,38470100000.0,196137.969771
4,Categorical Data,0.815724,0.814567,24886620000.0,157754.934026
5,Luxury Houses Data,0.821834,0.807197,101039900000.0,317867.814313
6,Non-Luxury Houses Data,0.807523,0.806215,11463820000.0,107069.230749


#### Check Assumptions of Linear Regression
    Linearity, independence, normality, homoescadicity

In [45]:
# code

### Feature Selection
    Feature ranking with recursive feature elimination.