# Best Models for Housing Price Prediction

In [478]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn import metrics
import scipy.stats as stats

In [479]:
# Read the modelling dataset from its CSV file into a DataFrame.
modeling_data = pd.read_csv(filepath_or_buffer='../Dataset/bangkok_preprocess.csv')

In [480]:
# Every column except the target is a feature. 'predict_price' and 'residuals' are
# added to modeling_data by the model-fitting cell further down; excluding them here
# keeps this cell safe to re-run without leaking target-derived columns into the features.
features = sorted(set(modeling_data.columns) - {'price', 'predict_price', 'residuals'})

In [481]:
# Split the frame into the feature matrix (X) and the target vector (y)
X = modeling_data.loc[:, features]
y = modeling_data.loc[:, 'price']

In [482]:
from sklearn.model_selection import train_test_split, cross_val_score

# 75% of the rows go to training, the rest to testing; the fixed seed keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.75,
    random_state=42,
)

---

### Model Tuning 

The best-performing model, **Linear Regression**, was selected in [Model_Benchmark](../Code/Model_Benchmark.ipynb).

In [486]:
# Fit the linear model on the training split and score it (R^2) on both splits.
lr = LinearRegression().fit(X_train, y_train)
train_score, test_score = lr.score(X_train, y_train), lr.score(X_test, y_test)

# 6-fold cross-validation scores over the full feature matrix and target.
cvs = cross_val_score(estimator=lr, X=X, y=y, cv=6)

# Predict every row of X (training and test rows together) and store the
# predictions and residuals (actual - predicted) as new columns on modeling_data.
modeling_data['predict_price'] = lr.predict(X)
modeling_data['residuals'] = y - modeling_data['predict_price']
y_preds = modeling_data['predict_price']
residuals = modeling_data['residuals']

In [487]:
# Adjusted R-squared helper
def r2_adj(y, y_preds, p):
    """Return the adjusted R^2 of predictions `y_preds` against targets `y`.

    Parameters
    ----------
    y : array-like
        Observed target values.
    y_preds : array-like
        Predicted target values, aligned with `y`.
    p : int
        Number of predictors used by the model.

    Returns
    -------
    float
        1 - (1 - R^2) * (n - 1) / (n - p - 1), where n = len(y).
    """
    n_obs = len(y)
    unexplained = 1 - metrics.r2_score(y, y_preds)
    return 1 - unexplained * (n_obs - 1) / (n_obs - p - 1)

In [488]:
# Print a summary of the regression metrics for the fitted model
def king_of_metrics(y, y_preds, p):
    """Print MSE, RMSE, MAE, R^2 and adjusted R^2 for `y_preds` against `y`.

    Parameters
    ----------
    y : array-like
        Observed target values.
    y_preds : array-like
        Predicted target values, aligned with `y`.
    p : int
        Number of predictors, forwarded to `r2_adj`.

    Notes
    -----
    Besides its arguments, this function reads the module-level variables
    `train_score`, `test_score` and `cvs`, so they must be defined before it
    is called. Nothing is returned; the report is written with `print`.
    """
    # The metric computations are independent of one another.
    r2a = r2_adj(y, y_preds, p)
    r2 = metrics.r2_score(y, y_preds)
    mae = metrics.mean_absolute_error(y, y_preds)
    rmse = metrics.root_mean_squared_error(y, y_preds)
    mse = metrics.mean_squared_error(y, y_preds)

    # Header: the feature groups the report refers to.
    print("Regression Metric number the features is: ")
    print("bedrooms, baths, floor_area, province, district, property_type, nearby_stations, nearby_supermarkets, facilities, num_facilities\n")

    # Error metrics and goodness of fit for the supplied predictions.
    print(f"The mean squared error is {mse:,.2f}")
    print(f"The root mean squared error is {rmse:,.2f}")
    print(f"The mean absolute error is {mae:,.2f}")
    print(f"The R^2 is {r2:,.2f}")
    print(f"Adjusted R^2 = {r2a:,.2f}\n")

    # Scores taken from module-level state rather than from the arguments.
    print(f"train score = {train_score:,.2f}")
    print(f"test score = {test_score:,.2f}\n")
    print(f"Cross Validation Score cv = 6 : {cvs}")
# Pass the real number of predictor columns (instead of a hard-coded count) so the
# adjusted R^2 penalty always matches the feature matrix the model was fitted on.
king_of_metrics(y, y_preds, X.shape[1])

Regression Metric number the features is: 
bedrooms, baths, floor_area, province, district, property_type, nearby_stations, nearby_supermarkets, facilities, num_facilities

The mean squared error is 1,109,382,202,376.30
The root mean squared error is 1,053,272.14
The mean absolute error is 768,382.45
The R^2 is 0.77
Adjusted R^2 = 0.77

train score = 0.78
test score = 0.74

Cross Validation Score cv = 6 : [0.75729726 0.71371282 0.76446576 0.73024996 0.73422346 0.75591775]


In [489]:
# Keep the actual price, the predicted price and the residual side by side,
# then write them to CSV (the DataFrame index is written as well).
Post_model_OLS = modeling_data.loc[:, ['price', 'predict_price', 'residuals']]
Post_model_OLS.to_csv(path_or_buf='../Dataset/post_ols.csv')

---

# Test Data

In [492]:
import warnings
from pandas.errors import PerformanceWarning

# Ignore PerformanceWarning
warnings.filterwarnings("ignore", category=PerformanceWarning)


In [493]:
# Read the cleaned test set that the trained model will score.
bangkok_test = pd.read_csv(filepath_or_buffer='../Dataset/test_cleaned.csv')

In [494]:
# Preview the first five rows of the test set.
bangkok_test.head(n=5)

Unnamed: 0,id,bedrooms,baths,floor_area,nearby_stations,nearby_station_distance,nearby_supermarkets,nearby_shops,num_facilities,station_name,...,floor_area^2,floor_area nearby_stations,floor_area nearby_supermarkets,floor_area num_facilities,nearby_stations^2,nearby_stations nearby_supermarkets,nearby_stations num_facilities,nearby_supermarkets^2,nearby_supermarkets num_facilities,num_facilities^2
0,10317868,4.0,3.0,120,0,0,9.0,5,6,-,...,14400.0,0.0,1080.0,720.0,0.0,0.0,0.0,81.0,54.0,36.0
1,10885829,4.0,3.0,188,0,0,9.0,5,6,-,...,35344.0,0.0,1692.0,1128.0,0.0,0.0,0.0,81.0,54.0,36.0
2,10765951,1.0,1.0,22,3,"[['BL20 Phra Ram 9 MRT', 270], ['E4 Asok BTS',...",13.0,20,5,Phra Ram 9,...,484.0,66.0,286.0,110.0,9.0,39.0,15.0,169.0,65.0,25.0
3,10003549,1.0,1.0,41,1,"[['E6 Thong Lo BTS', 70]]",16.0,20,6,Thong Lo,...,1681.0,41.0,656.0,246.0,1.0,16.0,6.0,256.0,96.0,36.0
4,10663026,1.0,1.0,29,0,0,10.0,14,5,-,...,841.0,0.0,290.0,145.0,0.0,0.0,0.0,100.0,50.0,25.0


In [495]:
# Collect every test-set column except 'price' as a candidate feature
test_features = list(set(bangkok_test.columns).difference({'price'}))

In [496]:
# Align the test set with the training features: every feature column used for
# training that is missing from the test data is added and filled with False.
features_add = sorted(set(features) - set(test_features))  # sorted for a deterministic column order
test_features = sorted(test_features + features_add)
if features_add:
    # Append all missing columns with a single concat. Assigning a scalar to a list
    # of new columns inserts them one at a time, which fragments the frame and is
    # what makes pandas emit a PerformanceWarning.
    missing_columns = pd.DataFrame(False, index=bangkok_test.index, columns=features_add)
    bangkok_test = pd.concat([bangkok_test, missing_columns], axis=1)

In [497]:
# Feed new data into an already trained model and return the predicted prices
def predict_test_model(features, data, model=None):
    """Predict prices for `data` and return them together with the row ids.

    Parameters
    ----------
    features : list of str
        Column names to pass to the model, in the order used for training.
    data : pandas.DataFrame
        Frame containing an 'id' column and every column in `features`.
        It is modified in place: a 'price' column is added or overwritten.
    model : object with a ``predict`` method, optional
        Fitted estimator to use. When omitted, the module-level `lr` is used,
        which keeps existing two-argument calls working unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns 'id' and 'price'.

    Raises
    ------
    KeyError
        If 'id' or any of `features` is missing from `data`.
    """
    estimator = lr if model is None else model

    # Selecting with a list of columns already yields a new frame, so no extra copy is needed.
    predictions = estimator.predict(data[features])

    # If the predicted value is negative, it will be turned to 0.
    data['price'] = np.where(predictions < 0, 0, predictions)

    return data[['id', 'price']]

# Score the aligned test set with the trained model, using the training feature order.
test_predicted_price = predict_test_model(features=features, data=bangkok_test)

In [498]:
# Save the id/price predictions to CSV without the DataFrame index.
test_predicted_price.to_csv(path_or_buf='../Dataset/test_predict_price.csv', index=False)