In [181]:
import pandas as pd
import numpy as np
from sklearn.ensemble import HistGradientBoostingRegressor
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('seaborn')


In [182]:
df = pd.read_csv('https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/housing.csv')

In [183]:
df.head(1)

Unnamed: 0,id,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,price
0,1565930130,20141104T000000,4,3.25,3760,4675,2.0,0,0,3,8,2740,1020,2007,0,98038,47.3862,-122.048,3280,4033,429900.0


# Data Exploration

In [184]:
df.yr_renovated.value_counts()

0       19171
2014       86
2005       34
2003       34
2013       33
        ...  
1934        1
1951        1
1948        1
1954        1
1956        1
Name: yr_renovated, Length: 70, dtype: int64

In [185]:
df.bedrooms.value_counts()

3     9101
4     6354
2     2567
5     1478
6      251
1      179
7       36
8       12
0       11
9        6
10       3
11       1
33       1
Name: bedrooms, dtype: int64

# Pre-processing

In [186]:
df = (df

    .drop(labels = 'id', axis = 1)

    .assign(date = lambda x: x.date.str.slice(start = 0, stop = 8)

     , year = lambda x: x.date.str.slice(start=0, stop=4)

     , month = lambda x: x.date.str.slice(start=4, stop = 6)

     , day = lambda x: x.date.str.slice(start=6, stop= 8))

    .drop(labels = 'date', axis = 1)

)

In [187]:
df['month'] = df['month'].astype(int)

In [188]:
df['day'] = df['day'].astype(int)

In [189]:
df['year'] = df['year'].astype(int)

In [190]:
from sklearn.preprocessing import MinMaxScaler

In [191]:
df = df.assign(lat_normalized = MinMaxScaler().fit_transform(np.array(df.lat).reshape(-1,1)))


In [192]:
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.90)
IQR = Q3 - Q1


In [193]:
df = df[~((df < (Q1 - 1.5 * IQR)) |(df > (Q3 + 1.5 * IQR))).any(axis=1)]
df.shape

(15901, 23)

# Model

In [194]:
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split

In [195]:
target = df.price
features = df.drop(columns="price",axis=1)

In [196]:
X_train,X_test,y_train,y_test = train_test_split(features,target,test_size=.20,train_size=.80)

In [197]:
model = XGBRegressor()
model.fit(X_train, y_train)



XGBRegressor()

In [198]:
predictions = model.predict(X_test)
predictions

array([ 454448.78, 1272215.6 ,  391443.22, ...,  311545.16,  198709.1 ,
        339202.4 ], dtype=float32)

In [199]:
# Compute the Root Mean Squared Error of the predictions
from sklearn.metrics import mean_squared_error

result = mean_squared_error(y_test, predictions, squared=False)
result


82249.15357505633

In [200]:
feature_df = pd.DataFrame(

    {'features': features.columns, 'importance': model.feature_importances_})

feature_df.sort_values('importance', ascending=False).head(6)

Unnamed: 0,features,importance
8,grade,0.353618
2,sqft_living,0.233614
14,lat,0.158068
11,yr_built,0.047035
13,zipcode,0.041432
16,sqft_living15,0.03832


In [201]:
est = HistGradientBoostingRegressor(max_bins=255, n_iter_no_change=20, max_depth=250, min_samples_leaf=20, max_leaf_nodes=20, max_iter=159, random_state=200000, learning_rate=0.125).fit(X_train, y_train)

In [202]:
predictions_est = est.predict(X_test)
predictions_est

array([ 411326.89567066, 1659104.7838159 ,  396888.96468219, ...,
        298217.87920517,  199185.37314901,  312510.81695918])

In [203]:
est.score(X_train, y_train)

0.9391288041897007

In [204]:
result_est = mean_squared_error(y_test, predictions_est, squared=False)
result_est

71404.47928214207