In [1]:
import matplotlib.pyplot as plt
import pandas as pd

import numpy as np # numpy library
import scipy . linalg as lng # linear algebra from scipy library
from scipy . spatial import distance # load distance function
from sklearn import preprocessing as preproc # load preprocessing function
from scipy import stats


from sklearn.linear_model import ElasticNetCV       
from sklearn.datasets import make_regression        
from sklearn.model_selection import train_test_split



##### Data loading

In [7]:

# Loading the data into numpy arrays
X_train = np.loadtxt('../data/case1Data_Xtrain.csv', delimiter=',')
X_test = np.loadtxt('../data/case1Data_Xtest.csv', delimiter=',')
y_train = np.loadtxt('../data/case1Data_ytrain.csv', delimiter=',')
y_test = np.loadtxt('../data/case1Data_ytest.csv', delimiter=',')
X_new = np.loadtxt('../data/case1Data_Xnew_wrangled.csv', delimiter=',')

# Printing the shape of the data
print("X_train: ", X_train.shape)
print("X_test: ", X_test.shape)
print("y_train: ", y_train.shape)
print("y_test: ", y_test.shape)
print("X_new: ", X_new.shape)

# Size of the training and test data
n_train = X_train.shape[0]
n_test = X_test.shape[0]
p = X_train.shape[1]

# Printing the size of the training and test data
print("n_train: ", n_train) # number of training samples
print("n_test: ", n_test) # number of test samples
print("p: ", p) # number of features/variables/columns/parameters

# Checking for missing values in the wrangled data
missing_values_X_train = np.isnan(X_train)
print("Number of missing values in X_train: ", np.sum(missing_values_X_train))
missing_values_X_test = np.isnan(X_test)
print("Number of missing values in X_test: ", np.sum(missing_values_X_test))
missing_values_y_train = np.isnan(y_train)
print("Number of missing values in y_train: ", np.sum(missing_values_y_train))
missing_values_y_test = np.isnan(y_test)
print("Number of missing values in y_test: ", np.sum(missing_values_y_test))

X_train:  (80, 116)
X_test:  (20, 116)
y_train:  (80,)
y_test:  (20,)
X_new:  (1000, 116)
n_train:  80
n_test:  20
p:  116
Number of missing values in X_train:  0
Number of missing values in X_test:  0
Number of missing values in y_train:  0
Number of missing values in y_test:  0


In [None]:
from sklearn.linear_model import ElasticNetCV
from sklearn.utils import resample
import numpy as np

# Assuming you have the following:
# X_train, y_train (training data with 100 observations)
# X_new (new dataset with 1000 observations and no y values)

# Step 1: Train the model on the initial 100 observations
model = ElasticNetCV(cv=10)  # ElasticNetCV with 10-fold cross-validation
model.fit(X_train, y_train)

# Step 2: Predict on the new dataset (1000 observations)
y_pred_new = model.predict(X_new)  # Predictions for the new dataset

# Now, we'll calculate the EPE using the following components:

# Step 3: Estimate the Bias using cross-validation
from sklearn.model_selection import cross_val_predict

cv_predictions = cross_val_predict(model, X_train, y_train, cv=10)
bias = np.mean(cv_predictions) - np.mean(y_train)  # Bias is the difference between predicted and true mean
bias_squared = bias ** 2

# Step 4: Estimate the Variance using bootstrap resampling
n_iterations = 100  # Number of bootstrap iterations
predictions = []

for _ in range(n_iterations):
    # Resample data (bootstrap)
    X_resample, y_resample = resample(X_train, y_train, n_samples=len(X_train), random_state=42)
    
    # Train model on resampled data and predict on X_new
    model.fit(X_resample, y_resample)
    predictions.append(model.predict(X_new))  # Collect predictions for X_new

# Calculate variance of predictions
variance = np.var(predictions, axis=0).mean()

# Step 5: Estimate the Irreducible Error (σ²ₑ)
# This can be approximated as the variance in y_train
irreducible_error = np.var(y_train)

# Step 6: Calculate the Total EPE
EPE = irreducible_error + bias_squared + variance

# Print the result
print(f"Total EPE for the new dataset = {EPE}")



In [10]:
np.sqrt(EPE)

np.float64(69.54715769233908)

In [9]:

# Print the best alpha and l1_ratio found                
print("Optimal alpha:", model.alpha_)                    
print("Optimal l1_ratio:", model.l1_ratio_)              
# Evaluate the model on the test set                     
print("Test set R^2 score:", model.score(X_test, y_test))


#predict the target values
y_pred = model.predict(X_test)
# Calculates RMSE 
rmse = np.sqrt(np.mean((y_test - y_pred)**2))
print("RMSE:", rmse)


Optimal alpha: 0.11916449907914212
Optimal l1_ratio: 0.5
Test set R^2 score: 0.7441530121544222
RMSE: 40.86024028863906
