In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import LeaveOneOut
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import train_test_split
from itertools import combinations


df = pd.read_csv("forestfires.csv")
df.head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0


In [6]:
#necessary conversions and setting up features and log area response
from sklearn.preprocessing import LabelEncoder

df['log_area'] = np.log(df['area'] + 1)

X = df.drop(columns=['area', 'log_area'])
y = df['log_area']

# Creating a LabelEncoder object
le_month = LabelEncoder()
le_day = LabelEncoder()

# Applying the actual labelEncoder to 'month' and 'day' columns
df['month'] = le_month.fit_transform(df['month'])
df['day'] = le_day.fit_transform(df['day'])

df.head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area,log_area
0,7,5,7,0,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0,0.0
1,7,4,9,5,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0,0.0
2,7,4,9,2,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0,0.0
3,8,6,7,0,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0,0.0
4,8,6,7,3,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0,0.0


In [7]:
loo = LeaveOneOut()
errors = []

# LOOCV for Linear Regression
for train_index, test_index in loo.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    errors.append((y_pred - y_test.values[0]) ** 2)

test_mse_linear = np.mean(errors)
print(f"Test MSE (Linear Regression): {test_mse_linear:.4f}")

Test MSE (Linear Regression): 2.0748


In [9]:
import numpy as np
import statsmodels.api as sm
from itertools import combinations
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, LeaveOneOut

# Convert DataFrame X to a NumPy array if it isn't already
X = np.array(X)  # Ensures X is a NumPy array
y = np.array(y)  # Convert y to a NumPy array if needed

# Best-Subset Selection Function
def best_subset_selection(X, y):
    best_adj_r2 = -np.inf
    best_model = None
    best_predictors = None
    for k in range(1, X.shape[1] + 1):
        for subset in combinations(range(X.shape[1]), k):
            # Select subset of predictors
            X_subset = X[:, list(subset)]  # Use list(subset) for correct indexing
            X_subset = sm.add_constant(X_subset)  # Add intercept for statsmodels
            # Fit the model and calculate adjusted R-squared
            model = sm.OLS(y, X_subset).fit()
            adj_r2 = model.rsquared_adj
            # Update best model if improved adjusted R-squared
            if adj_r2 > best_adj_r2:
                best_adj_r2 = adj_r2
                best_model = model
                best_predictors = subset
    return best_model, best_predictors, best_adj_r2

# Run best-subset selection with updated function
best_model_b, best_predictors_b, best_adj_r2_b = best_subset_selection(X, y)
print("Best predictors based on adjusted R^2:", best_predictors_b)
print("Best adjusted R^2:", best_adj_r2_b)

# Test MSE for Best-Subset Model using LOOCV
X_best_subset = X[:, list(best_predictors_b)]  # Select only best predictors
loo = LeaveOneOut()
scores = cross_val_score(LinearRegression(), X_best_subset, y, cv=loo, scoring='neg_mean_squared_error')
test_mse_b = -scores.mean()  # Convert from negative MSE to positive MSE
print("Test MSE for Best-Subset Model:", test_mse_b)

Best predictors based on adjusted R^2: (0, 2, 5, 7, 9, 10)
Best adjusted R^2: 0.01167172774058478
Test MSE for Best-Subset Model: 1.9292354993506742


In [12]:
import numpy as np
import statsmodels.api as sm
from itertools import combinations
from sklearn.model_selection import cross_val_score, LeaveOneOut
from sklearn.linear_model import LinearRegression

# Forward Stepwise Selection
def forward_stepwise_selection(X, y):
    remaining = list(range(X.shape[1]))  # Indices of remaining features
    selected = []  # Indices of selected features
    best_adj_r2 = -np.inf  # Initialize best adjusted R^2
    while remaining:
        adj_r2_list = []  # List to hold adjusted R^2 for current candidates
        for i in remaining:
            # Fit model with selected features and candidate feature i
            model = sm.OLS(y, sm.add_constant(X[:, selected + [i]])).fit()
            adj_r2_list.append((model.rsquared_adj, i))
        adj_r2, best_idx = max(adj_r2_list)  # Select feature with best adjusted R^2
        if adj_r2 > best_adj_r2:
            best_adj_r2 = adj_r2
            selected.append(best_idx)
            remaining.remove(best_idx)
        else:
            break
    return selected, best_adj_r2

# Run forward stepwise selection to select best predictors
best_predictors_c, best_adj_r2_c = forward_stepwise_selection(X, y)

# Subset X to only the selected predictors
X_forward = X[:, best_predictors_c]  # Use NumPy indexing for the subset

# Define the model and use Scikit-Learn's cross_val_score with LOOCV
model = LinearRegression()
loo = LeaveOneOut()
scores = cross_val_score(model, X_forward, y, cv=loo, scoring='neg_mean_squared_error')

# Calculate the average test MSE from LOOCV
test_mse_c = -np.mean(scores)  # Negate the mean of scores to get positive MSE
print("Test MSE for Forward Stepwise Selection:", test_mse_c)
print("Best adjusted R^2 (forward):", best_adj_r2_c)

Test MSE for Forward Stepwise Selection: 1.9256249821747295


In [34]:
def backward_stepwise_selection(X, y):
    # Ensure X is a DataFrame; no conversion needed
    if not isinstance(X, pd.DataFrame):
        raise ValueError("X must be a Pandas DataFrame.")
    
    selected = list(range(X.shape[1]))  # Start with all predictors
    best_adj_r2 = -np.inf

    while selected:
        adj_r2_list = []
        for i in selected:
            # Fit model with all selected predictors except i
            model = sm.OLS(y, sm.add_constant(X.iloc[:, [j for j in selected if j != i]])).fit()
            adj_r2_list.append((model.rsquared_adj, i))
        
        adj_r2, worst_idx = max(adj_r2_list)  # Select feature with worst adjusted R^2
        if adj_r2 > best_adj_r2:
            break
        else:
            selected.remove(worst_idx)  # Remove the worst predictor

    best_model = sm.OLS(y, sm.add_constant(X.iloc[:, selected])).fit()
    return best_model, selected, best_model.rsquared_adj

# Example DataFrame setup
# Make sure to replace this with your actual DataFrame
X = pd.DataFrame(np.random.rand(100, 12), columns=[f'feature_{i}' for i in range(12)])
y = np.random.rand(100)  # Example target variable

# Run backward stepwise selection
best_model_d, best_predictors_d, best_adj_r2_d = backward_stepwise_selection(X, y)
print("Best predictors (Backward):", best_predictors_d)
print("Best adjusted R^2 (Backward):", best_adj_r2_d)

# Select the best predictors for the model
X_best_backward = X.iloc[:, best_predictors_d]  # Keep X as a DataFrame

# Define the model
model = LinearRegression()

# Use cross_val_score for LOOCV and compute the test MSE
loo = LeaveOneOut()
scores = cross_val_score(model, X_best_backward, y, cv=loo, scoring='neg_mean_squared_error')

# Calculate the average test MSE from LOOCV
test_mse_d = -np.mean(scores)  # Negate to get positive MSE
print("Test MSE for Backward Stepwise Model:", test_mse_d)

Best predictors (Backward): [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
Best adjusted R^2 (Backward): 0.08306196725008275
Test MSE for Backward Stepwise Model: 0.09730908841597381


In [18]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import LeaveOneOut, GridSearchCV

# (e) Ridge Regression with LOOCV
alpha_values = np.logspace(-3, 3, 100)  # Define a range of alpha values
ridge_model = Ridge()
loo = LeaveOneOut()
param_grid = {'alpha': alpha_values}
ridge_cv = GridSearchCV(ridge_model, param_grid, cv=loo, scoring='neg_mean_squared_error')
ridge_cv.fit(X, y)

best_alpha_ridge = ridge_cv.best_params_['alpha']
ridge_test_mse = -ridge_cv.best_score_
print("Best alpha (Ridge):", best_alpha_ridge)
print("Test MSE (Ridge):", ridge_test_mse)

Best alpha (Ridge): 1000.0
Test MSE (Ridge): 1.948643063564574


In [19]:
# (f) Lasso Regression with LOOCV
lasso_model = Lasso()
lasso_cv = GridSearchCV(lasso_model, param_grid, cv=loo, scoring='neg_mean_squared_error')
lasso_cv.fit(X, y)

best_alpha_lasso = lasso_cv.best_params_['alpha']
lasso_test_mse = -lasso_cv.best_score_
print("Best alpha (Lasso):", best_alpha_lasso)
print("Test MSE (Lasso):", lasso_test_mse)

Best alpha (Lasso): 2.4770763559917115
Test MSE (Lasso): 1.9307692843267776


In [33]:
# (g) Summary Table
summary = pd.DataFrame({
    'Model': ['Linear Regression', 'Best Subset','Forward Selection', 'Backward Selection', 'Ridge', 'Lasso'],
    'Test MSE': [test_mse_linear,test_mse_b, test_mse_c, test_mse_d, ridge_test_mse, lasso_test_mse],
    'Best Adjusted R^2': [None, best_adj_r2_b, best_adj_r2_c, best_adj_r2_d, None, None],
    'Penalty':[None, None, None, None,best_alpha_ridge,best_alpha_lasso]
})

summary


Unnamed: 0,Model,Test MSE,Best Adjusted R^2,Penalty
0,Linear Regression,2.074779,,
1,Best Subset,1.929235,0.011672,
2,Forward Selection,1.925625,0.010765,
3,Backward Selection,0.097273,-0.032334,
4,Ridge,1.948643,,1000.0
5,Lasso,1.930769,,2.477076
