# Benchmark code
https://www.kaggle.com/code/sungkeum/eda-and-lr-random-forest-accuracy-86

* Linear Regression from the link above as baseline
* Random Forest Regression as the benchmark

In [21]:
from sklearn.linear_model import LinearRegression
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import r2_score, make_scorer, mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor

# Fixed seed reused by the data splits and the random forest so reruns match.
seed = 42

# Load the dataset; the path is relative to the notebook's working directory.
# NOTE(review): the file name suggests the features are already encoded — confirm.
df = pd.read_csv('./data/encoded_data.csv')

In [46]:
# Target column; every other column of `df` is used as a feature.
response = 'Price'

# Drop incomplete rows by rebinding rather than mutating in place, so the
# frame object loaded earlier is left untouched.
df = df.dropna()

X = df.drop(columns=[response])
y = df[response]

# 75% of the rows go to training; the held-out 25% is then halved into a test
# set and a validation set (12.5% of the data each). Both splits share `seed`.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.25, random_state=seed
)
X_test, X_val, y_test, y_val = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=seed
)

# The three partitions must cover every row exactly once.
assert len(X_train) + len(X_test) + len(X_val) == len(X)


## The tuned model

In [47]:
# Per-model scores and hyper-parameters; the first CSV column (model name)
# becomes the index.
# NOTE(review): presumably written by an earlier tuning notebook — confirm.
results_df = pd.read_csv('./data/model_results.csv', index_col=0)

# The best model is the row with the highest R2; read its stored parameters.
best_model = results_df['R2'].idxmax()
parameter_alpha, parameter_l1_ratio = (
    results_df.loc[best_model, column]
    for column in ('Parameter alpha', 'Parameter l1_ratio')
)
# l1_ratio is NaN for rows that have no value stored in that column.
print(f"The best model is {best_model} with alpha={parameter_alpha} and l1_ratio={parameter_l1_ratio}")
results_df

The best model is Ridge Regression with alpha=70.0 and l1_ratio=nan


Unnamed: 0,R2,Parameter alpha,Parameter l1_ratio
Ridge Regression,0.742394,70.0,
Lasso Regression,0.741236,55.0,
ElasticNet using Grid search,0.724188,0.2,0.3
ElasticNet using Randomized search,0.667084,1.6,0.7


In [53]:
# Fit Ridge on the training split with the alpha read from the results table.
# NOTE(review): this cell always builds a Ridge model, whichever model
# `best_model` names — confirm that is intended.
# NOTE(review): the results table lists R2 ~0.74 for this alpha, while the test
# R2 computed below is ~0.59; presumably the tuning run used different
# preprocessing (e.g. scaling) or a different split — confirm.
ridge = Ridge(alpha=parameter_alpha)
ridge.fit(X=X_train, y=y_train)

In [54]:
# Ridge predictions for the training split and the test split, in that order.
y_ridge_train_pred, y_ridge_test_pred = (
    ridge.predict(features) for features in (X_train, X_test)
)

In [55]:
# Score the Ridge predictions: (MSE, R2) on the training split, then on the
# test split. The validation split is not used here.
ridge_train_mse, ridge_train_r2 = (
    metric(y_train, y_ridge_train_pred) for metric in (mean_squared_error, r2_score)
)
ridge_test_mse, ridge_test_r2 = (
    metric(y_test, y_ridge_test_pred) for metric in (mean_squared_error, r2_score)
)

In [56]:
# One-row summary table for the Ridge model. dtype=object stores the label and
# the metric values side by side without numeric coercion.
ridge_results = pd.DataFrame(
    [['Ridge regression', ridge_train_mse, ridge_train_r2, ridge_test_mse, ridge_test_r2]],
    columns=['Method', 'Training MSE', 'Training R2', 'Test MSE', 'Test R2'],
    dtype=object,
)

ridge_results

Unnamed: 0,Method,Training MSE,Training R2,Test MSE,Test R2
0,Ridge regression,307419170.13927,0.5974,328490735.793455,0.588587


## Linear Regression as baseline

In [6]:
# Baseline: ordinary least squares with default settings, fitted on the
# training split.
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

In [7]:
# Baseline predictions for the training split and the test split, in that order.
y_lr_train_pred, y_lr_test_pred = (
    lr.predict(features) for features in (X_train, X_test)
)

In [8]:
# Score the baseline predictions: (MSE, R2) on the training split, then on the
# test split.
lr_train_mse, lr_train_r2 = (
    metric(y_train, y_lr_train_pred) for metric in (mean_squared_error, r2_score)
)
lr_test_mse, lr_test_r2 = (
    metric(y_test, y_lr_test_pred) for metric in (mean_squared_error, r2_score)
)

In [9]:
# One-row summary table for the linear-regression baseline. dtype=object stores
# the label and the metric values side by side without numeric coercion.
lr_results = pd.DataFrame(
    [['Linear regression', lr_train_mse, lr_train_r2, lr_test_mse, lr_test_r2]],
    columns=['Method', 'Training MSE', 'Training R2', 'Test MSE', 'Test R2'],
    dtype=object,
)

lr_results

Unnamed: 0,Method,Training MSE,Training R2,Test MSE,Test R2
0,Linear regression,228246917.673212,0.701085,268979977.360321,0.663121


## Random Forest Regression as benchmark

In [10]:
# Benchmark: random forest with default hyper-parameters, seeded with the
# shared `seed` and fitted on the training split.
rf = RandomForestRegressor(random_state=seed)
rf.fit(X=X_train, y=y_train)

In [11]:
# Random-forest predictions for the training split and the test split, in that order.
y_rf_train_pred, y_rf_test_pred = (
    rf.predict(features) for features in (X_train, X_test)
)

In [12]:
# Score the random-forest predictions: (MSE, R2) on the training split, then on
# the test split.
rf_train_mse, rf_train_r2 = (
    metric(y_train, y_rf_train_pred) for metric in (mean_squared_error, r2_score)
)
rf_test_mse, rf_test_r2 = (
    metric(y_test, y_rf_test_pred) for metric in (mean_squared_error, r2_score)
)

In [13]:
# One-row summary table for the random-forest benchmark. dtype=object stores
# the label and the metric values side by side without numeric coercion.
rf_results = pd.DataFrame(
    [['Random Forest Regression', rf_train_mse, rf_train_r2, rf_test_mse, rf_test_r2]],
    columns=['Method', 'Training MSE', 'Training R2', 'Test MSE', 'Test R2'],
    dtype=object,
)

rf_results

Unnamed: 0,Method,Training MSE,Training R2,Test MSE,Test R2
0,Random Forest Regression,16657480.039707,0.978185,161424897.894143,0.797826


In [22]:
# Scratch check of StandardScaler on a 2x7 toy matrix: scaling is applied per
# column, so with only two rows every column comes out as -1 and +1.
StandardScaler().fit_transform(
    [
        [0, 1, 2, 3, 4, 5, 6],
        [7, 8, 9, 10, 11, 12, 13],
    ]
)

array([[-1., -1., -1., -1., -1., -1., -1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.]])