# Case 1 - Ordinary Least Squares (OLS), Ridge, Lasso, and Elastic-Net Regression

1. **Importing Libraries**

2. **Loading Data**

3. **Ordinary Least Squares (OLS)**

4. **Ridge (L2) Regression**

5. **Lasso (L1) Regression**

6. **Elastic Net Regression**
- We use Elastic Net regression because we have many variables whose relevance is unknown. Elastic Net combines the strengths of Lasso (L1) and Ridge (L2) regression. Lasso can shrink parameters to exactly 0, which is useful for large datasets where some parameters might be useless. Ridge tends to perform better when most parameters are useful. Therefore, Elastic Net is well suited to this case, since we do not know in advance which parameters matter.
- Find the optimal model parameters, lambda_1 and lambda_2, by using 5-fold cross-validation.
- Get root mean squared error (RMSE) by applying the model with the optimal model parameters on the test data.

## 1. Importing Libraries

In [1]:
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set() # Apply seaborn's default theme to all subsequent plots

import scipy . linalg as lng

from sklearn import linear_model

from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV

from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import LinearRegression

from sklearn.model_selection import KFold

import warnings

# Seed Python's built-in `random` module for reproducibility.
# NOTE(review): this does not seed NumPy's global RNG. The KFold splitters and LassoCV
# in this notebook pass random_state=42 explicitly; confirm nothing else relies on
# unseeded randomness.
import random
random.seed(42)

## 2. Loading Data

In [2]:
# Read the comma-separated feature file into X and the response file into y
X = np.loadtxt(fname='../data/case1Data_X.csv', delimiter=',')
y = np.loadtxt(fname='../data/case1Data_y.csv', delimiter=',')

In [None]:
# ElasticNetCV is not among the notebook's top-level imports, so bring it into
# scope here; without this the cell fails with a NameError on a fresh kernel
from sklearn.linear_model import ElasticNetCV

# Hyperparameter grid searched by the inner cross-validation
alphas = np.logspace(-4, 1, 100)  # Regularization strength, from 1e-4 (weak) to 10 (strong)
l1_ratios = np.concatenate(([0], np.logspace(-10, 0, 100)))  # L1/L2 mix, from 0 (pure L2) to 1 (pure L1)

# Outer 5-fold cross-validation: each fold is held out once to score the tuned model
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)
rmse_scores = []
alpha_scores = []
l1_ratio_scores = []

for train_idx, test_idx in outer_cv.split(X, y):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Inner 5-fold CV (run by ElasticNetCV itself) picks alpha and l1_ratio
    # using the outer training data only
    with warnings.catch_warnings():  # Suppress convergence warnings
        warnings.simplefilter("ignore")
        model = ElasticNetCV(cv=5, l1_ratio=l1_ratios, alphas=alphas, fit_intercept=False).fit(X_train, y_train)

    # Evaluate the tuned model on the held-out outer fold
    y_pred = model.predict(X_test)

    # Calculate the RMSE
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    rmse_scores.append(rmse)

    # Record the hyperparameters selected for this outer fold
    alpha_scores.append(model.alpha_)
    l1_ratio_scores.append(model.l1_ratio_)

    # Keep the model with the lowest outer-fold RMSE seen so far
    # (rmse was just appended, so it equals the minimum exactly when it is the best)
    if rmse == min(rmse_scores):
        best_model = model

    # Print the results for this outer fold
    print(f'Fold RMSE: {rmse:.4f}')
    print(f'Optimal alpha: {model.alpha_}')
    print(f'Optimal l1_ratio: {model.l1_ratio_}\n')

# Final performance
print(f'Average RMSE across outer folds: {np.mean(rmse_scores):.4f}')
print(f'Average alpha: {np.mean(alpha_scores)}')
print(f'Average l1_ratio: {np.mean(l1_ratio_scores)}')
# Standard deviation of the RMSE tells us how much the RMSE varies between the folds (i.e., how stable the model is)
print(f'Standard deviation of RMSE: {np.std(rmse_scores):.4f}')


# Models

## 3. Ordinary Least Squares (OLS)

In [None]:
def ols_solver(X, y):
    """Solve the least-squares problem for X and y with scipy.linalg.lstsq.

    Returns the 4-tuple produced by lstsq unchanged: the coefficient
    estimates, the residues, the effective rank of X, and the singular
    values of X.
    """
    return lng.lstsq(X, y)

# Nested cross-validation for OLS; both levels use identically configured,
# seeded 5-fold splitters
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)
inner_cv = KFold(n_splits=5, shuffle=True, random_state=42)
rmse_scores = []

for train_idx, test_idx in outer_cv.split(X, y):
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    # OLS has no hyperparameter to tune: the inner 5-fold loop instead keeps the
    # coefficient vector that scores the lowest RMSE on its own validation split
    with warnings.catch_warnings():  # Silence any warnings raised while fitting
        warnings.simplefilter("ignore")
        rmse_scores_inner = []
        for train_idx_inner, test_idx_inner in inner_cv.split(X_train, y_train):
            X_train_inner, y_train_inner = X_train[train_idx_inner], y_train[train_idx_inner]
            X_test_inner, y_test_inner = X_train[test_idx_inner], y_train[test_idx_inner]

            # Only the coefficients are needed from the solver's 4-tuple
            betas = ols_solver(X_train_inner, y_train_inner)[0]

            # Score the fit on the inner validation split (no intercept term is added)
            y_pred = X_test_inner @ betas
            rmse_inner = np.sqrt(mean_squared_error(y_test_inner, y_pred))
            rmse_scores_inner.append(rmse_inner)

            # The score was just appended, so it is <= the minimum only when it is the minimum
            if rmse_inner <= min(rmse_scores_inner):
                best_betas_inner = betas

    # Score the selected inner coefficients on the held-out outer fold
    y_pred = X_test @ best_betas_inner
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    rmse_scores.append(rmse)

    # Remember the coefficients with the lowest outer-fold RMSE seen so far
    if rmse <= min(rmse_scores):
        best_betas = best_betas_inner

    # Report this outer fold's score
    print(f'Fold RMSE: {rmse:.4f}')

# Summary over the outer folds; the standard deviation shows how much the RMSE
# varies from fold to fold (i.e., how stable the model is)
print(f'Average RMSE across outer folds: {np.mean(rmse_scores):.4f}')
print(f'Standard deviation of RMSE: {np.std(rmse_scores):.4f}')


Fold RMSE: 51.7609
Fold RMSE: 41.5011
Fold RMSE: 36.1615
Fold RMSE: 58.5193
Fold RMSE: 45.9665
Average RMSE across outer folds: 46.7819
Standard deviation of RMSE: 7.7962


## 4. Ridge (L2) Regression

In [8]:
# Setting k for the number of lambdas to test
k = 1000

# Candidate regularization strengths, log-spaced from 1e-1 to 1e4
lambdas = np.logspace(-1, 4, k)

# Outer 5-fold cross-validation: each fold is held out once to score the tuned model
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)
rmse_scores = []

for train_idx, test_idx in outer_cv.split(X, y):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Inner 5-fold CV (run by RidgeCV itself) selects the best lambda on the outer training data.
    # The former `store_cv_values=False` argument is omitted: False is the default, and the
    # keyword is deprecated in recent scikit-learn releases in favour of `store_cv_results`.
    with warnings.catch_warnings():  # Suppress any warnings raised while fitting
        warnings.simplefilter("ignore")
        model = RidgeCV(alphas=lambdas, cv=5).fit(X_train, y_train)

    # Evaluate the tuned model on the held-out outer fold
    y_pred = model.predict(X_test)

    # Calculate the RMSE
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    rmse_scores.append(rmse)

    # Keep the model with the lowest outer-fold RMSE seen so far
    # (rmse was just appended, so it equals the minimum exactly when it is the best)
    if rmse == min(rmse_scores):
        best_model = model

    # Print the result for this outer fold
    print(f'Fold RMSE: {rmse:.4f}')

# Final performance
print(f'Average RMSE across outer folds: {np.mean(rmse_scores):.4f}')
# Standard deviation of the RMSE tells us how much the RMSE varies between the folds (i.e., how stable the model is)
print(f'Standard deviation of RMSE: {np.std(rmse_scores):.4f}')

Fold RMSE: 33.5101
Fold RMSE: 29.7820
Fold RMSE: 28.9537
Fold RMSE: 34.3106
Fold RMSE: 29.9738
Average RMSE across outer folds: 31.3060
Standard deviation of RMSE: 2.1687


## 5. Lasso (L1) Regression

In [10]:
# Nested cross-validation for Lasso: a seeded 5-fold outer split, with LassoCV
# running its own 5-fold search on each outer training set
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)
rmse_scores = []
models = []

for train_idx, test_idx in outer_cv.split(X, y):
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    # Fit with warnings silenced (e.g. convergence warnings from the solver)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        model = LassoCV(cv=5, random_state=42).fit(X_train, y_train)

    # Score the tuned model on the held-out outer fold
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    rmse_scores.append(rmse)

    # Every fold's model is kept so the fitted models can be averaged later
    models.append(model)

    # The score was just appended, so it is <= the minimum only when it is the minimum
    if rmse <= min(rmse_scores):
        best_model = model

    # Report this outer fold's score
    print(f'Fold RMSE: {rmse:.4f}')

# Summary over the outer folds; the standard deviation shows how much the RMSE
# varies from fold to fold (i.e., how stable the model is)
print(f'Average RMSE across outer folds: {np.mean(rmse_scores):.4f}')
print(f'Standard deviation of RMSE: {np.std(rmse_scores):.4f}')

Fold RMSE: 28.5733
Fold RMSE: 21.1610
Fold RMSE: 24.4641
Fold RMSE: 24.5089
Fold RMSE: 28.4905
Average RMSE across outer folds: 25.4396
Standard deviation of RMSE: 2.8019


## 6. Predicting $\hat{y}$ in the new data set (case1Data_Xnew.csv)

### Loading case1Data_Xnew_wrangled.csv

In [11]:
# Read the wrangled new-observation features and report the array's dimensions
X_new = np.loadtxt(fname='../data/case1Data_Xnew_wrangled.csv', delimiter=',')
print("X_new: ", X_new.shape)


X_new:  (1000, 116)


### Predicting and saving predictions in a new file

In [None]:
# Predict y_hat for the new observations with whichever fitted estimator
# `best_model` refers to at this point (the last cell that assigned it)
y_hat_new = best_model.predict(X_new)

# Show the shape of the prediction vector as a sanity check
print(y_hat_new.shape)

# Write the predictions to a csv file.
# NOTE: the ensemble cell in this notebook writes to this same path.
np.savetxt(fname='../results/sample_predictions_s183220_s225001.csv', X=y_hat_new, delimiter='\n')

(1000,)


In [12]:
# Ensemble prediction for the new observations: add up the predictions of every
# stored fold model, starting from a zero vector, then divide by the number of models
y_hat_new_ensemble = sum(
    (fold_model.predict(X_new) for fold_model in models),
    np.zeros(X_new.shape[0]),
) / len(models)

# Write the averaged predictions to a csv file
np.savetxt(fname='../results/sample_predictions_s183220_s225001.csv', X=y_hat_new_ensemble, delimiter='\n')

### Writing the Estimated RMSE to File

In [None]:
# The mean RMSE over the outer folds serves as the expected RMSE;
# it is printed and written to file with two decimals of precision
expected_rmse = np.mean(rmse_scores)
print(f'Expected RMSE: {expected_rmse:.2f}')
np.savetxt('../results/sample_estimatedRMSE_s183220_s225001.csv', [expected_rmse], fmt='%.2f')

Expected RMSE: 25.73
