In [44]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
df_original = pd.read_csv('kc_house_data.csv')

In [35]:
df_original.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')

In [37]:
features_name = ['sqft_living', 'bedrooms', 'sqft_lot', 'bathrooms', 'waterfront', 'grade']

label = df_original['price']
features = df_original[features_name]
y = np.c_[label]
X = np.c_[features]

In [38]:
linear_model = LinearRegression()
linear_model.fit(X=X, y=y)

In [39]:
# my_target_house = [
#     [65], 
#     [102], 
#     [1000]
# ]
# my_target_house = np.array([65, 102, 1000]).reshape(3,1)
# linear_model.predict(my_target_house)
# 'sqft_living', 'bedrooms', 'sqft_lot', 'bathrooms', 'waterfront', 'grade'
my_target_house = [
    [1000, 3, 0, 2, 1, 13],
    [1500, 4, 0, 2, 0, 13]
]
linear_model.predict(my_target_house)

array([[1630519.93411513],
       [ 907910.48708687]])

In [40]:
y_pred = linear_model.predict(X)

mae = mean_absolute_error(y, y_pred)
mse = mean_squared_error(y, y_pred)
r2 = r2_score(y, y_pred)
print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'R2: {r2}')

# Linear model sqft_living:
# MAE: 173688.27335953125
# MSE: 68351286833.039825
# R2: 0.4928532179037931

MAE: 158510.28950720385
MSE: 56685920169.790985
R2: 0.5794068650895444


In [41]:
mean_price = np.mean(y)
y_pred_naive = np.full(y.shape, mean_price)

mae_naive = mean_absolute_error(y, y_pred_naive)
mse_naive = mean_squared_error(y, y_pred_naive)
r2_naive = r2_score(y, y_pred_naive)
print(f'MAE naive: {mae_naive}')
print(f'MSE naive: {mse_naive}')
print(f'R2 naive: {r2_naive}')

MAE naive: 233941.72427250765
MSE naive: 134776142225.57256
R2 naive: 0.0


In [42]:
print(f'Improvments over naive: {(mse_naive-mse)/mse_naive*100:.02f}%')

Improvments over naive: 57.94%


In [50]:
knn_model = KNeighborsRegressor(n_neighbors=1)
knn_model.fit(X, y)

y_pred_knn = knn_model.predict(X)

mae_knn = mean_absolute_error(y, y_pred_knn)
mse_knn = mean_squared_error(y, y_pred_knn)
r2_knn = r2_score(y, y_pred_knn)
print(f'MAE: {mae_knn}')
print(f'MSE: {mse_knn}')
print(f'R2: {r2_knn}')

# Linear model sqft_living:
# MAE: 173688.27335953125
# MSE: 68351286833.039825
# R2: 0.4928532179037931

# Multivariate Linear model
# MAE: 158510.28950720385
# MSE: 56685920169.790985
# R2: 0.5794068650895444

# KNN n=20
# MAE: 154995.17550548282
# MSE: 60151849154.94926
# R2: 0.5794068650895444

MAE: 1610.6845417110073
MSE: 321173210.2882987
R2: 0.9976169876583145
