In [5]:
# Had issues with SSL Certification, imported ssl and urllib.request
import ssl
import urllib.request
import numpy as np
from sklearn.model_selection import KFold, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.datasets import fetch_california_housing

# Had issues with SSL Certification, added these lines to bypass SSL verification:
ssl_context = ssl._create_unverified_context()
original_urlopen = urllib.request.urlopen
def patched_urlopen(url, *args, **kwargs):
    if 'context' not in kwargs:
        kwargs['context'] = ssl_context
    return original_urlopen(url, *args, **kwargs)
urllib.request.urlopen = patched_urlopen

def polynomial_regression(degree, X, y, folds, test_size=0.25, random_state=None):
    # Define number of folds for cross-validation
    kf = KFold(n_splits=folds)
    
    # Initialize lists to store results
    variances = []
    bias2s = []
    total_errors = []
    models = []
    
    # Set the polynomial degree of the model
    poly_features = PolynomialFeatures(degree)
    X_poly = poly_features.fit_transform(X)
    
    # Perform cross-validation
    for train_index, test_index in kf.split(X_poly):
        # Split data into training and testing sets for this fold
        X_train, X_test = X_poly[train_index], X_poly[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        # Fit polynomial regression model
        model = LinearRegression()
        model.fit(X_train, y_train)
        
        # Make predictions on the test set
        y_pred = model.predict(X_test)
        
        # Calculate variance and bias for this fold *
        variance = np.var(y_pred)
        bias2 = np.mean((y_test - y_pred)**2)
        
        # Append results to lists *
        variances.append(variance)
        bias2s.append(bias2)
        total_errors.append(variance + bias2)
        models.append(model)
        
        # Print results for this fold
        print(f"Variance: {variance:.4f}, Bias2: {bias2:.4f}, Total error: {variance + bias2:.4f}")
    
    # print the total_error of the best model
    min_error_index = np.argmin(total_errors)
    best_model = models[min_error_index]
    print(f"Best model total error: {total_errors[min_error_index]:.4f}")
    
    # Testing the final model on the test data *
    X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=test_size, random_state=random_state)
    
    # Obtain the predictions on the test data *
    y_pred_test = best_model.predict(X_test)
    
    # store mse score of the model applied on the test data *
    mse = mean_squared_error(y_test, y_pred_test)
    
    return mse, best_model

# Load the California Housing Dataset
housing = fetch_california_housing()

# Select input attributes 1, 3, and 4 as input features
X = housing.data[:, [0, 2, 3]]

# Set the target variable
y = housing.target

degrees = range(1, 6)  # Try polynomial degrees from 1 to 5

for degree in degrees:
    print(f"\nDegree: {degree}")
    mse, _ = polynomial_regression(degree, X, y, folds=5)
    print(f"MSE: {mse:.4f}")


Degree: 1
Variance: 0.6159, Bias2: 0.5454, Total error: 1.1613
Variance: 0.6857, Bias2: 0.6744, Total error: 1.3602
Variance: 0.8001, Bias2: 0.7296, Total error: 1.5296
Variance: 0.4341, Bias2: 0.7385, Total error: 1.1726
Variance: 0.7587, Bias2: 0.6649, Total error: 1.4236
Best model total error: 1.1613
MSE: 0.6641

Degree: 2
Variance: 1.4212, Bias2: 1.0627, Total error: 2.4839
Variance: 0.6013, Bias2: 0.6217, Total error: 1.2230
Variance: 0.8027, Bias2: 0.7064, Total error: 1.5091
Variance: 0.5257, Bias2: 0.7215, Total error: 1.2472
Variance: 0.8059, Bias2: 0.6307, Total error: 1.4366
Best model total error: 1.2230
MSE: 0.6166

Degree: 3
Variance: 11.4957, Bias2: 11.5136, Total error: 23.0094
Variance: 0.5895, Bias2: 0.6084, Total error: 1.1978
Variance: 0.9781, Bias2: 0.7873, Total error: 1.7654
Variance: 0.5374, Bias2: 0.7130, Total error: 1.2503
Variance: 0.8490, Bias2: 0.6016, Total error: 1.4506
Best model total error: 1.1978
MSE: 0.6013

Degree: 4
Variance: 1749.0026, Bias2: 1