In [None]:
# Optional (Google Colab only): uncomment the two lines below to mount Google Drive
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# python script imports
import dataload_process

# package imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

import xgboost as xgb


In [None]:
# Seed NumPy's global random number generator so repeated runs are reproducible
np.random.seed(seed=17)

# Load Data

In [None]:
# Path to the input CSV file; replace this placeholder with the real location of the data
# dataset used: https://drive.google.com/file/d/1vrxc_sT68YGYBgI9K15YlKWO7jQX8zu8/view?usp=drive_link
file_path = 'path/to/your/data.csv'


In [None]:
# Read the CSV located at `file_path` into a pandas DataFrame
df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Categorical columns to convert to categorical type
columns_to_convert = [
    'neighbourhood_cleansed',
    'day_of_week',
    'year',
    'month',
]

# Unnecessary columns that should be dropped
columns_to_drop = [
    'id',
    'host_id',
    'date',
]

In [None]:
# Convert and encode the categorical columns with the helper from
# dataload_process.py; the helper also receives the list of columns to drop.
# NOTE(review): `df` is rebound to the helper's result, so re-running this cell
# feeds the already-processed frame back in — presumably the CSV should be
# reloaded first; verify against the helper.
df = dataload_process.encode_categorical_columns(
    df,
    columns_to_convert,
    columns_to_drop,
)


In [None]:
# Split the data into separate X (x_df) and y objects using the project helper.
# 'price' is passed as the second argument — presumably the target column
# name; confirm in dataload_process.split_data.
target_column = 'price'
x_df, y = dataload_process.split_data(df, target_column)


# Model implementation

In [None]:
# Hold out 20% of the rows for testing and train on the remaining 80%;
# random_state is fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    x_df,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Standardize the features: the scaler is fitted on the training split only,
# and that same fitted scaler is then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Linear Regression with Lasso (L1) regularization

In [None]:
# Lasso: linear regression with an L1 penalty (alpha sets the penalty strength)
lasso = Lasso(alpha=0.01, max_iter=100000, random_state=17)

# Fit on the standardized training features
lasso.fit(X_train_scaled, y_train)



In [None]:
# Predict on the standardized test features
lasso_pred = lasso.predict(X_test_scaled)

# Score the predictions with MSE, RMSE, MAE, and R-squared.
# MSE is computed once and RMSE is derived from it (RMSE = sqrt(MSE)), so the
# error is not recomputed and the two reported values always agree.
lasso_mse = mean_squared_error(y_test, lasso_pred)
lasso_rmse = np.sqrt(lasso_mse)
lasso_mae = mean_absolute_error(y_test, lasso_pred)
lasso_r2 = r2_score(y_test, lasso_pred)

In [None]:
# Report the Lasso test-set metrics, one "label value" line per metric
for metric_label, metric_value in (
    ("Lasso R-squared:", lasso_r2),
    ('Lasso Mean Squared Error (MSE):', lasso_mse),
    ('Lasso Root Mean Squared Error (RMSE):', lasso_rmse),
    ('Lasso Mean Absolute Error (MAE):', lasso_mae),
):
    print(metric_label, metric_value)

Lasso R-squared: 0.48373092835406806
Lasso Mean Squared Error (MSE): 5864.438587338736
Lasso Root Mean Squared Error (RMSE): 76.57962253327406
Lasso Mean Absolute Error (MAE): 56.693529171604425


## Linear Regression with Ridge (L2) regularization

In [None]:
# Ridge: linear regression with an L2 penalty (alpha sets the penalty strength)
# NOTE(review): no solver is specified; depending on which solver scikit-learn
# selects, max_iter / random_state may have no effect — confirm in the Ridge docs.
ridge = Ridge(alpha=0.01, max_iter=100000, random_state=17)

# Fit on the standardized training features
ridge.fit(X_train_scaled, y_train)


In [None]:
# Predict on the standardized test features
ridge_pred = ridge.predict(X_test_scaled)

# Score the predictions with MSE, RMSE, MAE, and R-squared.
# MSE is computed once and RMSE is derived from it (RMSE = sqrt(MSE)), so the
# error is not recomputed and the two reported values always agree.
ridge_mse = mean_squared_error(y_test, ridge_pred)
ridge_rmse = np.sqrt(ridge_mse)
ridge_mae = mean_absolute_error(y_test, ridge_pred)
ridge_r2 = r2_score(y_test, ridge_pred)

In [None]:
# Report the Ridge test-set metrics, one "label value" line per metric
for metric_label, metric_value in (
    ("Ridge R-squared:", ridge_r2),
    ('Ridge Mean Squared Error (MSE):', ridge_mse),
    ('Ridge Root Mean Squared Error (RMSE):', ridge_rmse),
    ('Ridge Mean Absolute Error (MAE):', ridge_mae),
):
    print(metric_label, metric_value)

Ridge R-squared: 0.48562282514746724
Ridge Mean Squared Error (MSE): 5842.948025211704
Ridge Root Mean Squared Error (RMSE): 76.4391786011055
Ridge Mean Absolute Error (MAE): 56.55345674728505


## Linear Regression with Elastic Net regularization

In [None]:
# Elastic Net: linear regression with a combined L1 + L2 penalty
# (l1_ratio is not passed, so scikit-learn's default mix is used)
elastic = ElasticNet(alpha=0.01, max_iter=100000, random_state=17)

# Fit on the standardized training features
elastic.fit(X_train_scaled, y_train)


In [None]:
# Predict on the standardized test features
elastic_pred = elastic.predict(X_test_scaled)

# Score the predictions with MSE, RMSE, MAE, and R-squared.
# MSE is computed once and RMSE is derived from it (RMSE = sqrt(MSE)), so the
# error is not recomputed and the two reported values always agree.
elastic_mse = mean_squared_error(y_test, elastic_pred)
elastic_rmse = np.sqrt(elastic_mse)
elastic_mae = mean_absolute_error(y_test, elastic_pred)
elastic_r2 = r2_score(y_test, elastic_pred)

In [None]:
# Report the Elastic Net test-set metrics, one "label value" line per metric
for metric_label, metric_value in (
    ("Elastic R-squared:", elastic_r2),
    ('Elastic Mean Squared Error (MSE):', elastic_mse),
    ('Elastic Root Mean Squared Error (RMSE):', elastic_rmse),
    ('Elastic Mean Absolute Error (MAE):', elastic_mae),
):
    print(metric_label, metric_value)

Elastic R-squared: 0.4795393434701207
Elastic Mean Squared Error (MSE): 5912.051922100701
Elastic Root Mean Squared Error (RMSE): 76.8898687871211
Elastic Mean Absolute Error (MAE): 57.08046197625295


## Random Forest

In [None]:
# Random forest regressor with library-default hyperparameters; only the
# random_state is fixed so the fitted forest is repeatable.
rf_regressor = RandomForestRegressor(
    random_state=17,
)

# Fit on the standardized training features
rf_regressor.fit(X_train_scaled, y_train)

In [None]:
# Predict on the standardized test features
rf_pred = rf_regressor.predict(X_test_scaled)

# Score the predictions: R-squared, MSE, RMSE (square root of MSE), and MAE
rf_r2 = r2_score(y_test, rf_pred)
rf_mse = mean_squared_error(y_test, rf_pred)
rf_rmse = np.sqrt(rf_mse)
rf_mae = mean_absolute_error(y_test, rf_pred)

In [None]:
# Report the Random Forest test-set metrics, one "label value" line per metric
for metric_label, metric_value in (
    ("Random Forest R-squared:", rf_r2),
    ('Random Forest Mean Squared Error (MSE):', rf_mse),
    ('Random Forest Root Mean Squared Error (RMSE):', rf_rmse),
    ('Random Forest Mean Absolute Error (MAE):', rf_mae),
):
    print(metric_label, metric_value)


Random Forest R-squared: 0.7942680907153807
Random Forest Mean Squared Error (MSE): 2336.963830913035
Random Forest Root Mean Squared Error (RMSE): 48.34215376783533
Random Forest Mean Absolute Error (MAE): 29.833846728665524


## XG Boost

In [None]:
# XGBoost regressor with library-default hyperparameters; only the
# random_state is fixed so the fitted model is repeatable.
xg_boost = xgb.XGBRegressor(
    random_state=17,
)

# Fit on the standardized training features
xg_boost.fit(X_train_scaled, y_train)

In [None]:
# Predict on the standardized test features
xgb_pred = xg_boost.predict(X_test_scaled)

# Score the predictions: R-squared, MSE, RMSE (square root of MSE), and MAE
xgb_r2 = r2_score(y_test, xgb_pred)
xgb_mse = mean_squared_error(y_test, xgb_pred)
xgb_rmse = np.sqrt(xgb_mse)
xgb_mae = mean_absolute_error(y_test, xgb_pred)

In [None]:
# Report the XGBoost test-set metrics, one "label value" line per metric
for metric_label, metric_value in (
    ("XG Boost R-squared:", xgb_r2),
    ('XG Boost Mean Squared Error (MSE):', xgb_mse),
    ('XG Boost Root Mean Squared Error (RMSE):', xgb_rmse),
    ('XG Boost Mean Absolute Error (MAE):', xgb_mae),
):
    print(metric_label, metric_value)

XG Boost R-squared: 0.7676086453712715
XG Boost Mean Squared Error (MSE): 2639.7956071699427
XG Boost Root Mean Squared Error (RMSE): 51.37894128113135
XG Boost Mean Absolute Error (MAE): 35.054171215149175


In [None]:
# Recap of every model's test-set metrics in one place.
# Each entry: (section heading, label prefix, (R-squared, MSE, RMSE, MAE)).
model_summaries = [
    ("Random Forest", "RF", (rf_r2, rf_mse, rf_rmse, rf_mae)),
    ("XG boost", "XG Boost", (xgb_r2, xgb_mse, xgb_rmse, xgb_mae)),
    ("Lasso", "Lasso", (lasso_r2, lasso_mse, lasso_rmse, lasso_mae)),
    ("Ridge", "Ridge", (ridge_r2, ridge_mse, ridge_rmse, ridge_mae)),
    ("Elastic Net", "Elastic", (elastic_r2, elastic_mse, elastic_rmse, elastic_mae)),
]

# Metric label suffixes, in the same order as the score tuples above
metric_suffixes = (
    "R-squared:",
    "Mean Squared Error (MSE):",
    "Root Mean Squared Error (RMSE):",
    "Mean Absolute Error (MAE):",
)

for position, (heading, prefix, scores) in enumerate(model_summaries):
    if position > 0:
        print()  # blank line between model sections (none after the last one)
    print(heading)
    for suffix, score in zip(metric_suffixes, scores):
        print(prefix + " " + suffix, score)

Random Forest
RF R-squared: 0.7942680907153807
RF Mean Squared Error (MSE): 2336.963830913035
RF Root Mean Squared Error (RMSE): 48.34215376783533
RF Mean Absolute Error (MAE): 29.833846728665524

XG boost
XG Boost R-squared: 0.7676086453712715
XG Boost Mean Squared Error (MSE): 2639.7956071699427
XG Boost Root Mean Squared Error (RMSE): 51.37894128113135
XG Boost Mean Absolute Error (MAE): 35.054171215149175

Lasso
Lasso R-squared: 0.48373092835406806
Lasso Mean Squared Error (MSE): 5864.438587338736
Lasso Root Mean Squared Error (RMSE): 76.57962253327406
Lasso Mean Absolute Error (MAE): 56.693529171604425

Ridge
Ridge R-squared: 0.48562282514746724
Ridge Mean Squared Error (MSE): 5842.948025211704
Ridge Root Mean Squared Error (RMSE): 76.4391786011055
Ridge Mean Absolute Error (MAE): 56.55345674728505

Elastic Net
Elastic R-squared: 0.4795393434701207
Elastic Mean Squared Error (MSE): 5912.051922100701
Elastic Root Mean Squared Error (RMSE): 76.8898687871211
Elastic Mean Absolute Err