# Model Benchmarks
---

This notebook looks for the best model for predicting housing prices, compared by RMSE, among `OLS`, the regularized models (`Ridge`, `Lasso`, `ElasticNet`), and the robust regression models (`TheilSen`, `RANSAC`, `Huber`).

In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn.linear_model  
from sklearn import metrics
import scipy.stats as stats
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

In [34]:
# Silence scikit-learn ConvergenceWarning messages (noted for the Lasso & ElasticNet regressions).
# NOTE(review): the filter is category-wide, so a non-converging fit of any other estimator
# is silenced as well — confirm that is intended.
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter(action="ignore", category=ConvergenceWarning)

In [36]:
# Load the preprocessed dataset that every later cell works from.
modeling_data = pd.read_csv(filepath_or_buffer='../Datasets/bangkok_preprocess.csv')

In [37]:
# Every column except the target ('price') is a predictor; sorting pins the column order.
features = sorted(set(modeling_data.columns).difference({'price'}))

In [38]:
# Separate the predictor matrix (X) from the target vector (y).
X, y = modeling_data[features], modeling_data['price']

In [39]:
from sklearn.model_selection import train_test_split, cross_val_score
# 75% of the rows go to training; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.75,
    random_state=452,
)

In [42]:
## Initialize regression models
# TheilSen and RANSAC rely on random subsampling while fitting; seed them (same seed as the
# train/test split) so the benchmark numbers are reproducible across kernel restarts.
ols = sklearn.linear_model.LinearRegression()
ts = sklearn.linear_model.TheilSenRegressor(random_state=452)
ransac = sklearn.linear_model.RANSACRegressor(random_state=452)
huber = sklearn.linear_model.HuberRegressor()
# Regularized models: penalty strength alpha=1; ElasticNet mixes L1/L2 with l1_ratio=0.3.
lasso = sklearn.linear_model.Lasso(alpha=1)
ridge = sklearn.linear_model.Ridge(alpha=1)
enets = sklearn.linear_model.ElasticNet(alpha=1, l1_ratio=0.3)

In [52]:
def score(model_list):
    """Fit every model on the training split and tabulate its fit quality.

    The data is not passed in: the function reads the notebook-level globals
    ``X_train``, ``X_test``, ``y_train`` and ``y_test``. A model that is one of
    the global ``ridge``, ``lasso`` or ``enets`` objects is trained and
    evaluated on standardized features; every other model sees the raw
    features.

    Parameters
    ----------
    model_list : iterable
        Estimators exposing ``fit``, ``predict`` and ``score``. Each one is
        fitted in place.

    Returns
    -------
    pandas.DataFrame
        One row per model, in input order, with the columns ``model``,
        ``train score``, ``test score``, ``R_square`` and
        ``RMSE (in million)``. An empty ``model_list`` yields an empty frame
        with those columns instead of raising.
    """
    columns = ['model', 'train score', 'test score', 'R_square', 'RMSE (in million)']
    records = []

    for model in model_list:
        # Decide once per model which feature matrices it is trained/evaluated on.
        if model in (ridge, lasso, enets):
            # The scaler is fitted on the training split only, then reused on the test
            # split, so no test-set statistics leak into training.
            scaler = StandardScaler()
            X_fit = scaler.fit_transform(X_train)
            X_eval = scaler.transform(X_test)
        else:
            X_fit, X_eval = X_train, X_test

        model.fit(X_fit, y_train)
        y_preds = model.predict(X_eval)

        # Test-set metrics computed from the predictions.
        r2 = metrics.r2_score(y_test, y_preds)
        rmse = np.sqrt(metrics.mean_squared_error(y_test, y_preds))

        records.append({
            'model': type(model).__name__,
            'train score': model.score(X_fit, y_train),
            'test score': model.score(X_eval, y_test),
            'R_square': r2,
            'RMSE (in million)': rmse / 1_000_000,  # Convert RMSE to millions
        })

    # Build the frame in one shot; explicit columns keep the schema for empty input.
    return pd.DataFrame(records, columns=columns)

# Estimators to benchmark: OLS, the three robust regressors, then the three regularized models.
model_list = [
    ols,
    ts, ransac, huber,
    lasso, ridge, enets,
]

# score() takes only the models; it reads the train/test split from the notebook globals.
results = score(model_list)

In [53]:
# Display the benchmark table returned by score() (one row per model).
results

Unnamed: 0,model,train score,test score,R_square,RMSE (in million)
0,LinearRegression,0.767611,0.741737,0.741737,1.115288
1,TheilSenRegressor,0.225209,0.455147,0.455147,1.619927
2,RANSACRegressor,-1.754229,-0.718992,-0.718992,2.87735
3,HuberRegressor,-0.008001,-0.011714,-0.011714,2.207418
4,Lasso,0.766451,0.741139,0.741139,1.116577
5,Ridge,0.76744,0.742127,0.742127,1.114444
6,ElasticNet,0.666612,0.647636,0.647636,1.302722


In [32]:
# The best model for predicting housing prices is Ridge regression, which yields the following results:
# train score: 0.77
# test score: 0.74
# R_squared: 0.74
# RMSE (in millions): 1.114444 → This is the lowest RMSE value achieved.
# Linear Regression is a near tie (test score 0.74, RMSE 1.115288 million), so the margin is very small.