# Model Benchmarks

In [99]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn.linear_model  
from sklearn import metrics
import scipy.stats as stats
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

In [101]:
# Avoid warning for lasso & ElasticNet Regrssion
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category = ConvergenceWarning)

In [103]:
modeling_data = pd.read_csv('../Dataset/bangkok_preprocess.csv')

In [105]:
features = sorted(list(set(modeling_data.columns)-{'price'}))

In [107]:
X = modeling_data[features]
y = modeling_data['price']

In [140]:
from sklearn.model_selection import train_test_split, cross_val_score
X_train, X_test, y_train, y_test = train_test_split(X, y,train_size= 0.75 , random_state=452)

In [111]:
## Initialize regression models
ols = sklearn.linear_model.LinearRegression()
ts = sklearn.linear_model.TheilSenRegressor()
ransac = sklearn.linear_model.RANSACRegressor()
huber = sklearn.linear_model.HuberRegressor()
lasso = sklearn.linear_model.Lasso(alpha= 1)
ridge = sklearn.linear_model.Ridge(alpha = 1)
enets = sklearn.linear_model.ElasticNet(alpha = 1 , l1_ratio=0.5)

In [134]:
# Function to evaluate a list of regression models
def score(model_list, X_train, X_test, y_train, y_test):
    # Initialize a DataFrame to store the results
    benchmark = pd.DataFrame(columns=['model', 'train score', 'test score', 'R_square', 'RMSE (M)'])
    
    # Iterate over each model in the list
    for model in model_list:
        try:
            # Fit the model to the training data
            model.fit(X_train, y_train)
            
            # Predict the test set
            y_preds = model.predict(X_test)
            
            # Calculate R² and RMSE using y_test
            r2 = metrics.r2_score(y_test, y_preds)
            rmse = np.sqrt(metrics.mean_squared_error(y_test, y_preds))
            
            # Obtain train and test scores
            train_score = model.score(X_train, y_train)
            test_score = model.score(X_test, y_test)
            
            # Create a dictionary to store the results
            results_dict = {
                'model': model.__class__.__name__,
                'train score': train_score,
                'test score': test_score,
                'R_square': r2,
                'RMSE (M)': rmse / 1_000_000  # Convert RMSE to millions
            }
            
            # Convert the dictionary into a DataFrame with one row
            results_df = pd.DataFrame([results_dict])
            
            # Concatenate the new row to the benchmark DataFrame
            benchmark = pd.concat([benchmark, results_df], ignore_index=True)

        except Exception as e:
            print(f"Model {model.__class__.__name__} failed to fit: {e}")

    return benchmark

# List of models to evaluate
model_list = [ols, ts, ransac, huber, lasso, ridge, enets]

# Call the function with the appropriate data splits
results = score(model_list, X_train, X_test, y_train, y_test)

# Display the evaluation results
print(results)

  benchmark = pd.concat([benchmark, results_df], ignore_index=True)
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


               model  train score  test score  R_square  RMSE (M)
0   LinearRegression     0.776155    0.750710  0.750710  1.093502
1  TheilSenRegressor     0.157617    0.570344  0.570344  1.435581
2    RANSACRegressor    -7.326731   -1.110895 -1.110895  3.182001
3     HuberRegressor     0.016635    0.016085  0.016085  2.172430
4              Lasso     0.774800    0.749105  0.749105  1.097016
5              Ridge     0.776021    0.751339  0.751339  1.092120
6         ElasticNet     0.668122    0.645511  0.645511  1.303973


In [136]:
results

Unnamed: 0,model,train score,test score,R_square,RMSE (M)
0,LinearRegression,0.776155,0.75071,0.75071,1.093502
1,TheilSenRegressor,0.157617,0.570344,0.570344,1.435581
2,RANSACRegressor,-7.326731,-1.110895,-1.110895,3.182001
3,HuberRegressor,0.016635,0.016085,0.016085,2.17243
4,Lasso,0.7748,0.749105,0.749105,1.097016
5,Ridge,0.776021,0.751339,0.751339,1.09212
6,ElasticNet,0.668122,0.645511,0.645511,1.303973


In [138]:
# The best model for predicting housing prices is Linear Regression, which yields the following results:
# train score: 0.77
# test score: 0.75
# R_squared: 0.75
# RMSE (in millions): 1.0935 → This is the lowest RMSE value achieved.