In [1]:
# Imports
import os
import pandas as pd
import statsmodels.api as sm
import numpy as np
import matplotlib.pyplot as plt
from functions_variables import * # Just in case, but probably not needed.

# Working directories (relative paths, resolved against the current working directory)
processed_path = '../processed_data'
output_path = '../outputs'

# Load the processed training and testing splits, in that order
train, test = (
    pd.read_csv(os.path.join(processed_path, csv_name))
    for csv_name in ('training_data_processed.csv', 'testing_data_processed.csv')
)

In [2]:
# train.shape, train.columns, train.info()

In [3]:
# Sanity check: (rows, columns) of the training and testing frames
(train.shape, test.shape)

((1066, 31), (267, 31))

## Model Selection

This notebook should include preliminary and baseline modeling.
- Try as many different models as possible.
- Don't worry about hyperparameter tuning or cross validation here.
- Ideas include:
    - linear regression
    - support vector machines
    - random forest
    - xgboost

Consider what metrics you want to use to evaluate success.
- If you think about mean squared error, can we actually relate to the amount of error?
- Try root mean squared error so that error is closer to the original units (dollars)
- What does RMSE do to outliers?
- Is mean absolute error a good metric for this problem?
- What about R^2? Adjusted R^2?
- Briefly describe your reasons for picking the metrics you use

In [4]:
# Import metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [5]:
# Split each frame into predictors (X) and the target (y).
# The target column is named once so both splits are guaranteed to use the same one.
target_col = 'description_sold_price'

X_train, y_train = train.drop(columns=[target_col]), train[target_col]
X_test, y_test = test.drop(columns=[target_col]), test[target_col]

#### - Linear Regression

In [6]:
# Imports
from sklearn.linear_model import LinearRegression

# Baseline model: linear regression with default settings, fit on the training split.
# The fit call stays as the cell's last expression so the estimator is still displayed.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [7]:
# Predictions from the fitted linear model on the training and testing features
y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)

Linear Regression Metrics

In [8]:
# Score the current predictions on each split: MAE, RMSE (square root of MSE) and R²

# Train
train_mae, train_rmse, train_r2 = (
    mean_absolute_error(y_train, y_train_pred),
    np.sqrt(mean_squared_error(y_train, y_train_pred)),
    r2_score(y_train, y_train_pred),
)

# Test
test_mae, test_rmse, test_r2 = (
    mean_absolute_error(y_test, y_test_pred),
    np.sqrt(mean_squared_error(y_test, y_test_pred)),
    r2_score(y_test, y_test_pred),
)

# Print results as a single report; the text is the same as printing line by line
print("\n".join([
    "--- Train Set ---",
    f"Train MAE: {train_mae:.2f}",
    f"Train RMSE: {train_rmse:.2f}",
    f"Train R² Score: {train_r2:.4f}",
    "\n--- Test Set ---",
    f"Test MAE: {test_mae:.2f}",
    f"Test RMSE: {test_rmse:.2f}",
    f"Test R² Score: {test_r2:.4f}",
]))

--- Train Set ---
Train MAE: 0.30
Train RMSE: 0.43
Train R² Score: 0.6905

--- Test Set ---
Test MAE: 0.30
Test RMSE: 0.39
Test R² Score: 0.7447


This model performed reasonably well, with R^2 = 0.69 on the training data and R^2 = 0.74 on the test data. This means it explains about 69-74% of the variance in the sale price.

The test performance is also slightly better than train, meaning no overfitting happened and the model is able to generalize well to new data.

Trying RidgeCV, LassoCV and ElasticNetCV to see if we can make it better.

##### RidgeCV, LassoCV, and ElasticNetCV

In [9]:
# Imports
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import ElasticNetCV

# Define a range of alpha values to test.
# Candidate regularization strengths, one per power of ten from 1e-3 to 1e3.
# The same grid is passed to RidgeCV, LassoCV and ElasticNetCV in the cells below.
alpha_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

RidgeCV

In [10]:
# Ridge regression with alpha picked by RidgeCV from the shared grid.
# store_cv_results=True asks scikit-learn to keep the cross-validation results on the estimator.
ridge_cv = RidgeCV(alphas=alpha_values, store_cv_results=True).fit(X_train, y_train)

# Evaluate Ridge model on the test split (reported below as R²)
ridge_r2 = ridge_cv.score(X_test, y_test)

# Report the selected alpha, then the test score
print(f"Best alpha for Ridge Regression: {ridge_cv.alpha_}")
print(f"Ridge Test R² Score: {ridge_r2:.4f}")

Best alpha for Ridge Regression: 10.0
Ridge Test R² Score: 0.7408


> The Ridge R^2 is 0.7408, which is not very far from the vanilla LR.

>Ridge Regression did not significantly harm or improve the model. Let's try Lasso Regression.

LassoCV

In [11]:
# Lasso with alpha chosen by 5-fold cross-validation over the shared grid.
# max_iter is set to 10000 for the solver.
lasso_cv = LassoCV(alphas=alpha_values, cv=5, max_iter=10000).fit(X_train, y_train)

# Evaluate Lasso model on the test split (reported below as R²)
lasso_r2 = lasso_cv.score(X_test, y_test)

# Report the selected alpha, then the test score
print(f"Best alpha for Lasso Regression: {lasso_cv.alpha_}")
print(f"Lasso Test R² Score: {lasso_r2:.4f}")

Best alpha for Lasso Regression: 0.001
Lasso Test R² Score: 0.7430


> Same thing. ElasticNet next.

ElasticNetCV

In [12]:
# ElasticNet with 5-fold cross-validation over both the L1/L2 mix and the shared alpha grid
elastic_cv = ElasticNetCV(
    l1_ratio=[0.1, 0.5, 0.9],
    alphas=alpha_values,
    cv=5,
).fit(X_train, y_train)

# Report the selected hyperparameters, then the test-set score
print(f"Best alpha for ElasticNet: {elastic_cv.alpha_}")
print(f"Best l1_ratio for ElasticNet: {elastic_cv.l1_ratio_}")
print(f"ElasticNet Test R² Score: {elastic_cv.score(X_test, y_test):.4f}")

Best alpha for ElasticNet: 0.001
Best l1_ratio for ElasticNet: 0.9
ElasticNet Test R² Score: 0.7431


> Same thing. This mainly tells us that the dataset is not suffering from multicollinearity or high-dimensionality issues.

> The ElasticNet decided on 90% lasso and 10% ridge, implying that most features are relevant but some needed minor penalization.

#### - Support Vector Machine (SVM) for Regression

In [13]:
# Imports
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [14]:
# SVR is sensitive to feature scaling, so standardization and the regressor are
# chained in one pipeline and fit together on the training data.
svr = make_pipeline(
    StandardScaler(),
    SVR(kernel='rbf', C=1.0, epsilon=0.05),
).fit(X_train, y_train)

# Predict on both train and test sets
y_train_pred, y_test_pred = svr.predict(X_train), svr.predict(X_test)

SVM Metrics

In [15]:
# Score the current predictions on each split: MAE, RMSE (square root of MSE) and R²

# Train performance
train_mae, train_rmse, train_r2 = (
    mean_absolute_error(y_train, y_train_pred),
    np.sqrt(mean_squared_error(y_train, y_train_pred)),
    r2_score(y_train, y_train_pred),
)

# Test performance
test_mae, test_rmse, test_r2 = (
    mean_absolute_error(y_test, y_test_pred),
    np.sqrt(mean_squared_error(y_test, y_test_pred)),
    r2_score(y_test, y_test_pred),
)

# Print results as a single report; the text is the same as printing line by line
print("\n".join([
    "--- Train Set ---",
    f"Train MAE: {train_mae:.2f}",
    f"Train RMSE: {train_rmse:.2f}",
    f"Train R² Score: {train_r2:.4f}",
    "\n--- Test Set ---",
    f"Test MAE: {test_mae:.2f}",
    f"Test RMSE: {test_rmse:.2f}",
    f"Test R² Score: {test_r2:.4f}",
]))

--- Train Set ---
Train MAE: 0.15
Train RMSE: 0.31
Train R² Score: 0.8466

--- Test Set ---
Test MAE: 0.26
Test RMSE: 0.37
Test R² Score: 0.7683


> This model's Train MAE (0.15) and Test MAE (0.26) are lower than the previous models', meaning it makes better predictions. The RMSE supports this.

> It may need fine-tuning, as it seems to have slightly overfit the training data.

#### Random Forest

In [16]:
# Imports
from sklearn.ensemble import RandomForestRegressor

# Random forest settings, collected in one place; random_state is fixed for repeatable runs
rf_params = {
    'n_estimators': 100,
    'max_depth': 10,
    'min_samples_split': 6,
    'min_samples_leaf': 6,
    'random_state': 42,
}

# Initialize and train
rf = RandomForestRegressor(**rf_params).fit(X_train, y_train)

# Predictions
y_train_pred, y_test_pred = rf.predict(X_train), rf.predict(X_test)

Random Forest Metrics

In [17]:
# Score the current predictions on each split: MAE, RMSE (square root of MSE) and R²

# Train performance
train_mae, train_rmse, train_r2 = (
    mean_absolute_error(y_train, y_train_pred),
    np.sqrt(mean_squared_error(y_train, y_train_pred)),
    r2_score(y_train, y_train_pred),
)

# Test performance
test_mae, test_rmse, test_r2 = (
    mean_absolute_error(y_test, y_test_pred),
    np.sqrt(mean_squared_error(y_test, y_test_pred)),
    r2_score(y_test, y_test_pred),
)

# Print results as a single report; the text is the same as printing line by line
print("\n".join([
    "--- Train Set ---",
    f"Train MAE: {train_mae:.2f}",
    f"Train RMSE: {train_rmse:.2f}",
    f"Train R² Score: {train_r2:.4f}",
    "\n--- Test Set ---",
    f"Test MAE: {test_mae:.2f}",
    f"Test RMSE: {test_rmse:.2f}",
    f"Test R² Score: {test_r2:.4f}",
]))

--- Train Set ---
Train MAE: 0.21
Train RMSE: 0.33
Train R² Score: 0.8169

--- Test Set ---
Test MAE: 0.28
Test RMSE: 0.40
Test R² Score: 0.7293


> This model performed well, but not better than SVR.

- Random Forest's Test R² (0.7293) is lower than SVR's 0.7683, meaning SVR captured variance better.
- Random Forest has higher RMSE and MAE, meaning slightly worse predictions.

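> The train-test gap is comparable to SVR's: smaller on MAE, but slightly larger on R², so there is no clear evidence of less overfitting.
- Random Forest Train R²: 0.8169 vs Test R²: 0.7293 is a gap of about 0.088, slightly larger than SVR's gap of about 0.078 (0.8466 vs 0.7683). Only the MAE gap is smaller (0.07 vs 0.11), so RF does not clearly generalize better than SVR.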

> RF may benefit from finetuning as well.

### XGBoost

In [18]:
# Imports
from xgboost import XGBRegressor

In [19]:
# XGBoost settings, collected in one place: tree count and depth, learning rate,
# L2 (reg_lambda) and L1 (reg_alpha) penalties, and a fixed seed for repeatable runs
xgb_params = {
    'n_estimators': 100,
    'learning_rate': 0.1,
    'max_depth': 5,
    'reg_lambda': 12,
    'reg_alpha': 4,
    'random_state': 42,
}

# Initialize XGBoost Regressor and train it on the training split
xgb = XGBRegressor(**xgb_params).fit(X_train, y_train)

# Predict on both splits
y_train_pred, y_test_pred = xgb.predict(X_train), xgb.predict(X_test)

XGBoost Metrics

In [20]:
# Score the current predictions on each split: MAE, RMSE (square root of MSE) and R²

# Train performance
train_mae, train_rmse, train_r2 = (
    mean_absolute_error(y_train, y_train_pred),
    np.sqrt(mean_squared_error(y_train, y_train_pred)),
    r2_score(y_train, y_train_pred),
)

# Test performance
test_mae, test_rmse, test_r2 = (
    mean_absolute_error(y_test, y_test_pred),
    np.sqrt(mean_squared_error(y_test, y_test_pred)),
    r2_score(y_test, y_test_pred),
)

# Print results as a single report; the text is the same as printing line by line
print("\n".join([
    "--- Train Set ---",
    f"Train MAE: {train_mae:.2f}",
    f"Train RMSE: {train_rmse:.2f}",
    f"Train R² Score: {train_r2:.4f}",
    "\n--- Test Set ---",
    f"Test MAE: {test_mae:.2f}",
    f"Test RMSE: {test_rmse:.2f}",
    f"Test R² Score: {test_r2:.4f}",
]))

--- Train Set ---
Train MAE: 0.20
Train RMSE: 0.30
Train R² Score: 0.8494

--- Test Set ---
Test MAE: 0.27
Test RMSE: 0.37
Test R² Score: 0.7758


> This model (R^2 = 0.7758 on test) performed slightly better than SVR (R^2 = 0.7683) and RF (R^2 = 0.7293)

> This may be the best model to finetune based on the R^2 score.

### Conclusion

| Model              | Train MAE | Test MAE | Train RMSE | Test RMSE | Train R^2 | Test R^2 |
|--------------------|----------|----------|------------|-----------|----------|---------|
| **Linear Regression** | 0.30     | 0.30     | 0.43       | 0.39      | 0.6905   | 0.7447  |
| **SVR**           | 0.15     | 0.26     | 0.31       | 0.37      | 0.8466   | 0.7683  |
| **Random Forest**  | 0.21     | 0.28     | 0.33       | 0.40      | 0.8169   | 0.7293  |
| **XGBoost**       | 0.20     | 0.27     | 0.30       | 0.37      | 0.8494   | 0.7758  |


- **XGBoost performed the best overall** with the highest Test R² (0.7758) and balanced Train/Test error.

- **SVR also did well**: its MAE is the lowest of all models (its errors are smaller on average), but it had a slightly larger gap between Train and Test R^2, meaning it doesn't generalize quite as well.

- **Random Forest performed decently**, but it had a slightly lower test score than SVR and XGBoost.

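- **Linear Regression had the highest Test MAE (0.30) and the weakest fit on the training data**, although its Test R² (0.7447) is actually higher than Random Forest's (0.7293). The gains from SVR and XGBoost suggest that some relationships in the data are non-linear.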

## Feature Selection - STRETCH

> **This step doesn't need to be part of your Minimum Viable Product (MVP), but it's recommended you complete it if you have time!**

Even with all the preprocessing we did in Notebook 1, you probably still have a lot of features. Are they all important for prediction?

Investigate some feature selection algorithms (Lasso, RFE, Forward/Backward Selection)
- Perform feature selection to get a reduced subset of your original features
- Refit your models with this reduced dimensionality - how does performance change on your chosen metrics?
- Based on this, should you include feature selection in your final pipeline? Explain

Remember, feature selection often doesn't directly improve performance, but if performance remains the same, a simpler model is often preferable.



In [21]:
# perform feature selection 
# refit models
# gather evaluation metrics and compare to the previous step (full feature set)

##### OLS Regression

Checking for the features contributing the most to the model.

In [22]:
# Add intercept term.
# has_constant='add' forces a 'const' column onto both splits. With the default
# ('skip'), add_constant silently omits the intercept when the frame already
# contains an all-constant column (e.g. a dummy feature that is all zeros in the
# smaller test split), which would leave the train and test design matrices
# with different columns.
X_train_sm = sm.add_constant(X_train, has_constant='add')
X_test_sm = sm.add_constant(X_test, has_constant='add')

# Fit OLS (Ordinary Least Squares) model on the training split
ols_model = sm.OLS(y_train, X_train_sm).fit()

# Print summary (includes R² and Adjusted R²)
print(ols_model.summary())

                              OLS Regression Results                              
Dep. Variable:     description_sold_price   R-squared:                       0.691
Model:                                OLS   Adj. R-squared:                  0.682
Method:                     Least Squares   F-statistic:                     76.97
Date:                    Fri, 07 Feb 2025   Prob (F-statistic):          2.02e-239
Time:                            12:16:46   Log-Likelihood:                -623.86
No. Observations:                    1066   AIC:                             1310.
Df Residuals:                        1035   BIC:                             1464.
Df Model:                              30                                         
Covariance Type:                nonrobust                                         
                                      coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------