In [105]:
# autoreload: pick up edits to imported modules without restarting the kernel
%load_ext autoreload
%autoreload 2
# render matplotlib figures inline in the notebook output
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [120]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_absolute_error

from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

from joblib import dump

# House Price Prediction
This notebook attempts to predict house prices, comparing an instance-based approach (kNN) with a model-based approach (linear regression).

The dataset of Daft.ie listings comes from Kaggle and [can be found here](https://www.kaggle.com/datasets/eavannan/daftie-house-price-data/).

### Dataset Analysis

In [107]:
# load the listings, keep only rows that report a property size, then shuffle
# the rows reproducibly and renumber the index from 0
df = (
    pd.read_csv("datasets/house_prices.csv")
    .dropna(subset=["propertySize"])
    .sample(frac=1, random_state=2)
    .reset_index(drop=True)
)

# predictive features - start with the basic ones
features = ["numBedrooms", "numBathrooms", "propertySize"]

# 80% of the rows go to the development set, the rest are held out as the test set
dev_df, test_df = train_test_split(df, train_size=0.8, random_state=2)
dev_X, dev_y = dev_df[features], dev_df["price"].values
test_X, test_y = test_df[features], test_df["price"].values

In [108]:
# one shuffled split of the dev data: 75% for training, the rest for validation
ss = ShuffleSplit(
    n_splits=1,
    train_size=0.75,
    random_state=2,
)

In [109]:
# standardise the feature columns; any other column is passed through untouched
preprocessor = ColumnTransformer(
    transformers=[("scaler", StandardScaler(), features)],
    remainder="passthrough",
)

### kNN Model

In [110]:
# kNN baseline: standardise the features, then predict from the single nearest neighbour (k=1)
knn_model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("predictor", KNeighborsRegressor(n_neighbors=1)),
    ]
)

# negated mean absolute error on the validation split defined by `ss`
cross_val_score(knn_model, dev_X, dev_y, cv=ss, scoring="neg_mean_absolute_error")

array([-187855.87828492])

In [111]:
# same pipeline, this time averaging over the two nearest neighbours (k=2)
knn_model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("predictor", KNeighborsRegressor(n_neighbors=2)),
    ]
)

# negated mean absolute error on the validation split defined by `ss`
cross_val_score(knn_model, dev_X, dev_y, cv=ss, scoring="neg_mean_absolute_error")

array([-170086.34163209])

#### Grid Search
Grid search is used here to automate finding the ideal value for hyperparameter k

In [112]:
# kNN pipeline with k left at its default; the grid search below supplies the candidate values
knn_model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("predictor", KNeighborsRegressor()),
    ]
)

# candidate neighbour counts: k = 1 .. 59
params = {"predictor__n_neighbors": list(range(1, 60))}

# score every candidate on the validation split, then refit the best one on all dev data
gs = GridSearchCV(
    knn_model,
    params,
    scoring="neg_mean_absolute_error",
    cv=ss,
    refit=True,
)

gs.fit(dev_X, dev_y)

In [113]:
# winning hyperparameter value and its (negated) validation MAE
(gs.best_params_, gs.best_score_)

({'predictor__n_neighbors': 42}, -142861.04850819995)

#### Evaluating on test set

In [114]:
# mean absolute error of the refitted best kNN pipeline on the held-out test set
mean_absolute_error(y_true=test_y, y_pred=gs.predict(test_X))

146592.02749785947

### Linear Model

In [115]:
# linear regression on the same standardised features, fitted on all of the dev data
linear_model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("predictor", LinearRegression()),
    ]
)
linear_model.fit(dev_X, dev_y)

In [117]:
# mean absolute error of the linear pipeline on the held-out test set
mean_absolute_error(y_true=test_y, y_pred=linear_model.predict(test_X))

153499.39322928895

### Results
The kNN model had a lower mean absolute error on the test set than the linear model (€146,592 vs €153,499), so it wins.
The error is still very large, though :(
We will save the kNN model.

In [121]:
# Refit the winning kNN pipeline on the full dataset and persist it.
# GridSearchCV fitted clones, so `knn_model` itself still carries the default k;
# apply the tuned value found by the grid search before the final fit.
knn_model.set_params(**gs.best_params_)
# the target is the price column - not the feature matrix
knn_model.fit(df[features], df["price"].values)
dump(knn_model, "models/house_model_1.pkl")

['models/house_model_1.pkl']