**The following code is an implementation of linear regression. The Boston house prices dataset from scikit-learn is used, and the goal is to estimate the price of a house in Boston using 13 attributes.**

*A closed-form solution, a ridge regression model, a polynomial transformation
of degree 2 on the features, Lasso regression, and elastic net are implemented using k-fold cross-validation and gradient descent.*

In [None]:
from sklearn.datasets import load_boston
import numpy as np
import matplotlib.pyplot as plt
import warnings
from sklearn.model_selection import KFold
from sklearn.preprocessing import PolynomialFeatures

The following part loads the data.

In [152]:
# Load the Boston housing data as a (features, target) pair; every warning
# raised inside the `with` block is suppressed.
# NOTE(review): load_boston has been deprecated/removed in recent scikit-learn
# releases — confirm the installed version still provides it.
with warnings.catch_warnings():
     warnings.filterwarnings("ignore")
     X, y = load_boston(return_X_y=True)

In [None]:
# Show the raw feature matrix and target vector as the cell output.
X,y

In [None]:
# Print a quick sanity summary of the loaded arrays: shapes, the first row of
# each array, and the per-feature mean and standard deviation.
_summary_rows = (
    ("Feature Matrix Shape:", X.shape),
    ("Target Vector Shape:", y.shape),
    ("First 1 rows of Features:\n", X[:1]),
    ("First 1 target values:\n", y[:1]),
    ("Mean of each feature:\n", np.mean(X, axis=0)),
    ("Standard deviation of each feature:\n", np.std(X, axis=0)),
)
for _label, _value in _summary_rows:
    print(_label, _value)

In [None]:
# Histogram of the target values `y`, split into 30 bins.
plt.figure(figsize=(8, 5))
plt.hist(y, bins=30, color='blue', edgecolor='black', alpha=0.7)
plt.xlabel("House Price ($1000s)")
plt.ylabel("Frequency")
plt.title("Distribution of House Prices")
# Only horizontal grid lines, so bar heights are easy to read off.
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

In [None]:
# Scatter plot of feature column 5 (labelled on the axis as RM, the average
# number of rooms) against the target `y`.
plt.figure(figsize=(8, 5))
plt.scatter(X[:,5], y, color='red', alpha=0.5)
plt.xlabel("Average Number of Rooms (RM)")
plt.ylabel("House Price ($1000s)")
plt.title("Number of Rooms vs. House Price")
plt.grid(True, linestyle='--', alpha=0.7)
plt.show()

In [157]:
def find_theta(X,y):
    """Return the ordinary least-squares coefficients for ``X @ theta ~= y``.

    Parameters
    ----------
    X : array of shape (m, n)
        Design matrix (include a bias column if an intercept is wanted).
    y : array of shape (m,) or (m, 1)
        Target values.

    Returns
    -------
    theta : array of shape (n,) or (n, 1)
        Least-squares solution, with the same trailing shape as ``y``.
    """
    # lstsq solves the least-squares problem directly instead of forming and
    # inverting X.T @ X, so it stays well-defined (minimum-norm solution) when
    # the columns of X are linearly dependent, where np.linalg.inv would raise.
    theta, _residuals, _rank, _singular_values = np.linalg.lstsq(X, y, rcond=None)
    return theta

In [158]:
def predict1(X,theta):
    """Return the linear-model predictions, i.e. the product of X and theta."""
    return np.dot(X, theta)

In [159]:
def mean_squared_error(y_true, y_pred):
    """Return the mean of the squared element-wise differences of the inputs."""
    residuals = y_true - y_pred
    return np.mean(residuals * residuals)

In [160]:
def add_bias_term(X):
    """Return a copy of X with a column of ones appended as the LAST column."""
    ones_column = np.ones((X.shape[0], 1))
    return np.concatenate((X, ones_column), axis=1)

In [161]:
def k_fold(X,y,k=5):
    """Cross-validate the closed-form linear regression with k shuffled folds.

    For every fold, theta is fitted on the training part with ``find_theta``
    and the MSE is measured on both the training and the held-out part.

    Returns
    -------
    (mean_train_mse, mean_test_mse) : the fold-averaged errors.
    """
    # Fixed random_state keeps the fold assignment identical between runs.
    splitter = KFold(n_splits=k, shuffle=True, random_state=42)
    fold_train_mse, fold_test_mse = [], []

    for fit_idx, held_idx in splitter.split(X):
        X_fit, y_fit = X[fit_idx], y[fit_idx]
        X_held, y_held = X[held_idx], y[held_idx]

        theta = find_theta(X_fit, y_fit)

        fold_train_mse.append(mean_squared_error(y_fit, predict1(X_fit, theta)))
        fold_test_mse.append(mean_squared_error(y_held, predict1(X_held, theta)))

    return np.mean(fold_train_mse), np.mean(fold_test_mse)

In [162]:
# Append the bias column (last column of X_b) and run 5-fold cross-validation
# of the closed-form linear regression.
X_b = add_bias_term(X)
avg_train_score,avg_test_score = k_fold(X_b,y,5)

In [None]:
# Report the fold-averaged errors of the closed-form linear regression.
print(f"Average Training MSE: {avg_train_score:.4f}")
print(f"Average Test MSE: {avg_test_score:.4f}")

The following section implements the ridge regression model.

In [164]:
def ridge_regression(X,y,lambda_):
    """Return the closed-form ridge regression coefficients.

    Solves ``(X.T @ X + lambda_ * I) @ theta = X.T @ y``.

    Parameters
    ----------
    X : array of shape (m, n)
        Design matrix.
    y : array of shape (m,) or (m, 1)
        Target values.
    lambda_ : float
        L2 penalty strength; it is applied to every column of X.

    Returns
    -------
    theta : array of shape (n,) or (n, 1), matching the shape of ``y``.
    """
    n_features = X.shape[1]
    gram = X.T @ X + lambda_ * np.eye(n_features)
    # Solving the linear system is numerically more accurate (and cheaper)
    # than forming the explicit inverse and multiplying by it.
    return np.linalg.solve(gram, X.T @ y)

In [165]:
def predict(X,theta):
    """Evaluate the linear model: the dot product of the design matrix and theta."""
    return np.dot(X, theta)

In [166]:
def mean_squared_error(y_true, y_pred):
    """Average squared difference between y_true and y_pred."""
    squared_errors = np.square(y_true - y_pred)
    return np.mean(squared_errors)

In [167]:
def k_fold_cross_validation(X, y, lambdas, k=5):
    """Evaluate ridge regression for each candidate penalty with k-fold CV.

    Parameters
    ----------
    X, y : design matrix and targets, indexable by integer index arrays.
    lambdas : iterable of penalty strengths to try.
    k : number of folds.

    Returns
    -------
    list of (lambda_, mean_train_mse, mean_test_mse) tuples, one per entry of
    ``lambdas`` and in the same order.
    """
    # Same seed for every lambda, so all candidates are scored on the same folds.
    splitter = KFold(n_splits=k, shuffle=True, random_state=42)
    results = []

    for lambda_ in lambdas:
        fold_errors = []
        for fit_idx, held_idx in splitter.split(X):
            X_fit, y_fit = X[fit_idx], y[fit_idx]
            X_held, y_held = X[held_idx], y[held_idx]

            theta = ridge_regression(X_fit, y_fit, lambda_)

            fold_errors.append((
                mean_squared_error(y_fit, predict(X_fit, theta)),
                mean_squared_error(y_held, predict(X_held, theta)),
            ))

        train_errors = [train_err for train_err, _ in fold_errors]
        test_errors = [test_err for _, test_err in fold_errors]
        results.append((lambda_, np.mean(train_errors), np.mean(test_errors)))

    return results

In [None]:
# Candidate penalties: 13 values spaced evenly on a log scale from 1e1 to 1e7.
lambdas = np.logspace(1, 7, num=13)
lambda_mse = k_fold_cross_validation(X_b, y, lambdas, k=5)
# Each entry is (lambda, avg train MSE, avg test MSE); pick the entry with the
# lowest average test MSE.
best_lambda, best_train_mse, best_test_mse = min(lambda_mse, key=lambda x: x[2])
print(f"Best λ: {best_lambda:.4f}")
print(f"Training MSE with best λ: {best_train_mse:.4f}")
print(f"Test MSE with best λ: {best_test_mse:.4f}")

The following section implements the closed-form solution on polynomial features (question 6).

In [169]:
def poly_closed_form(X, y, lambda_=1e-6): 
    """Return closed-form least-squares coefficients with a small ridge term.

    Solves ``(X.T @ X + lambda_ * I) @ theta = X.T @ y``. The default
    ``lambda_`` is tiny, so it mainly keeps the system solvable rather than
    acting as a real penalty.

    Parameters
    ----------
    X : array of shape (m, n)
        Design matrix (e.g. polynomial features plus a bias column).
    y : array of shape (m,) or (m, 1)
        Target values.
    lambda_ : float, default 1e-6
        Value added to the diagonal of ``X.T @ X``.

    Returns
    -------
    theta : array of shape (n,) or (n, 1), matching the shape of ``y``.
    """
    n_features = X.shape[1]
    gram = X.T @ X + lambda_ * np.eye(n_features)
    # Polynomial feature matrices are typically badly conditioned; solving the
    # system avoids the extra error introduced by an explicit matrix inverse.
    return np.linalg.solve(gram, X.T @ y)

In [170]:
def poly_kfold(X,y,k=5):
    """Cross-validate ``poly_closed_form`` on (X, y) with k shuffled folds.

    Returns
    -------
    (mean_train_mse, mean_test_mse) : errors averaged over the k folds.
    """
    folds = KFold(n_splits=k, shuffle=True, random_state=42)
    per_fold = []

    for fit_idx, held_idx in folds.split(X):
        theta = poly_closed_form(X[fit_idx], y[fit_idx])
        fit_error = mean_squared_error(y[fit_idx], X[fit_idx] @ theta)
        held_error = mean_squared_error(y[held_idx], X[held_idx] @ theta)
        per_fold.append((fit_error, held_error))

    train_scores = [fit_error for fit_error, _ in per_fold]
    test_scores = [held_error for _, held_error in per_fold]
    return np.mean(train_scores), np.mean(test_scores)

In [171]:
# Split X_b back into the bias column (the last column, kept 2-D via the list
# index) and the raw feature columns.
X_bias = X_b[:,[-1]]
X_feature = X_b[:,:-1]
# Expand the raw features to degree 2; include_bias=False because the bias
# column is re-attached separately below.
poly = PolynomialFeatures(degree=2,include_bias=False)
X_poly_features = poly.fit_transform(X_feature)
X_poly = np.hstack((X_poly_features,X_bias))

In [172]:
# Cross-validate the closed-form fit on the degree-2 feature matrix (default k=5).
avg_train_score_ply,avg_test_score_ply = poly_kfold(X_poly,y)

In [None]:
# Report the fold-averaged errors of the polynomial closed-form model.
print(f"Average Training MSE: {avg_train_score_ply:.4f}")
print(f"Average Test MSE: {avg_test_score_ply:.4f}")

The following part implements gradient descent for multivariate linear regression (question 7).

In [174]:
def gradient_descend(X,y,learning_rate=0.01,epoch=1000):
    """Fit linear-regression weights by batch gradient descent on the MSE.

    Parameters
    ----------
    X : array of shape (m, n)
        Design matrix (include a bias column if an intercept is wanted).
    y : array of shape (m,) or (m, 1)
        Target values. A 1-D vector is treated as a single column.
    learning_rate : float
        Step size of each update.
    epoch : int
        Number of full-batch updates.

    Returns
    -------
    theta : array of shape (n, 1)
        Weights after ``epoch`` updates, starting from small random values.
    """
    m, n = X.shape
    y = np.asarray(y)
    if y.ndim == 1:
        # Without this, (m,) - (m, 1) broadcasts to an (m, m) residual matrix
        # and theta silently grows to shape (n, m).
        y = y.reshape(-1, 1)

    theta = np.random.randn(n, 1) * 0.01
    for _ in range(epoch):
        residual = y - X.dot(theta)
        # Gradient of mean((y - X @ theta) ** 2) with respect to theta.
        gradient_d = -2 / m * X.T.dot(residual)
        theta = theta - learning_rate * gradient_d
    return theta

In [175]:
def poly_kfold_desc(X,y,k=5):
    """Cross-validate the gradient-descent linear regression with k folds.

    Each fold fits theta with ``gradient_descend`` (default hyperparameters) on
    the training part and scores it on both parts.

    Returns
    -------
    (mean_train_mse, mean_test_mse) : errors averaged over the k folds.
    """
    cv = KFold(n_splits=k, shuffle=True, random_state=42)
    train_scores, test_scores = [], []

    for fit_idx, held_idx in cv.split(X):
        X_fit, X_held = X[fit_idx], X[held_idx]
        y_fit, y_held = y[fit_idx], y[held_idx]

        weights = gradient_descend(X_fit, y_fit)

        train_scores.append(mean_squared_error(y_fit, X_fit @ weights))
        test_scores.append(mean_squared_error(y_held, X_held @ weights))

    return np.mean(train_scores), np.mean(test_scores)

In [176]:
# Standardize every raw feature column (zero mean, unit standard deviation);
# the 1e-8 guards against division by zero for a constant column.
X_feature_desc = (X_feature - np.mean(X_feature, axis=0)) / (np.std(X_feature, axis=0) + 1e-8)
# Re-attach the (unscaled) bias column as the last column.
X_poly_desc = np.hstack((X_feature_desc,X_bias))
# Make y a column vector to match the (n, 1) theta used by gradient_descend.
# NOTE: this rebinds the notebook-level `y` for every cell executed afterwards.
y = y.reshape(-1,1)
avg_train_score_plydesc,avg_test_score_plydesc = poly_kfold_desc(X_poly_desc,y)

In [None]:
# Report the fold-averaged errors of the gradient-descent linear regression.
print(f"Average Training MSE: {avg_train_score_plydesc:.4f}")
print(f"Average Test MSE: {avg_test_score_plydesc:.4f}")

The following section implements Lasso regression using gradient descent.

In [178]:
def lasso_model(X,y,learning_rate=0.01,l1_rate=0.1,epoch=1000):
    """Fit L1-regularized (Lasso) linear regression by (sub)gradient descent.

    Parameters
    ----------
    X : array of shape (m, n)
        Design matrix (include a bias column if an intercept is wanted).
    y : array of shape (m,) or (m, 1)
        Target values. A 1-D vector is treated as a single column.
    learning_rate : float
        Step size of each update.
    l1_rate : float
        Weight of the L1 penalty term; it is applied to every coefficient.
    epoch : int
        Number of full-batch updates.

    Returns
    -------
    theta : array of shape (n, 1)
        Weights after ``epoch`` updates, starting from small random values.
    """
    m, n = X.shape
    y = np.asarray(y)
    if y.ndim == 1:
        # Without this, (m,) - (m, 1) broadcasts to an (m, m) residual matrix
        # and theta silently grows to shape (n, m).
        y = y.reshape(-1, 1)

    theta = np.random.randn(n, 1) * 0.01
    for _ in range(epoch):
        residual = y - X.dot(theta)
        gradient_d = (-2 / m) * X.T @ residual
        # Subgradient of l1_rate * sum(|theta|); np.sign gives 0 at theta == 0.
        gradient_d += l1_rate * np.sign(theta)
        theta = theta - learning_rate * gradient_d
    return theta

In [179]:
def lasso_kfold(X,y,k=5):
    """Cross-validate ``lasso_model`` (default hyperparameters) with k folds.

    Returns
    -------
    (mean_train_mse, mean_test_mse) : errors averaged over the k folds, in
    that order.
    """
    splitter = KFold(n_splits=k, shuffle=True, random_state=42)
    train_scores = []
    test_scores = []

    for fit_idx, held_idx in splitter.split(X):
        X_fit, y_fit = X[fit_idx], y[fit_idx]
        X_held, y_held = X[held_idx], y[held_idx]

        coefficients = lasso_model(X_fit, y_fit)

        fit_predictions = X_fit @ coefficients
        held_predictions = X_held @ coefficients
        train_scores.append(mean_squared_error(y_fit, fit_predictions))
        test_scores.append(mean_squared_error(y_held, held_predictions))

    return np.mean(train_scores), np.mean(test_scores)

In [None]:
# Standardize every feature column; the 1e-8 guards against division by zero.
X_scale = (X - np.mean(X, axis=0)) / (np.std(X, axis=0) + 1e-8)
# Re-attach the (unscaled) bias column as the last column.
X_scale = np.hstack((X_scale,X_bias))
# lasso_kfold returns (train, test) in that order, so unpack in the same order
# to keep each variable name consistent with the value it holds.
avg_lasso_train,avg_lasso_test = lasso_kfold(X_scale,y)
print(f"Average Training MSE: {avg_lasso_train:.4f}")
print(f"Average Test MSE: {avg_lasso_test:.4f}")

The following section implements elastic net regression.

In [184]:
def elastic_net_model(X, y, epoch=1000, learning_rate=0.01, alpha=0.1, r=0.7):
    """Fit elastic-net linear regression by (sub)gradient descent.

    Parameters
    ----------
    X : array of shape (m, n)
        Design matrix (include a bias column if an intercept is wanted).
    y : array of shape (m,) or (m, 1)
        Target values. A 1-D vector is treated as a single column.
    epoch : int
        Number of full-batch updates.
    learning_rate : float
        Step size of each update.
    alpha : float
        Overall regularization strength.
    r : float
        Mixing ratio: the L1 term is weighted by ``r * alpha`` and the L2 term
        by ``(1 - r) * alpha``.

    Returns
    -------
    W : array of shape (n, 1)
        Weights after ``epoch`` updates, starting from small random values.
    """
    m, n = X.shape
    y = np.asarray(y)
    if y.ndim == 1:
        # Without this, (m,) - (m, 1) broadcasts to an (m, m) residual matrix
        # and W silently grows to shape (n, m).
        y = y.reshape(-1, 1)

    W = np.random.randn(n, 1) * 0.01
    for _ in range(epoch):
        residual = y - X.dot(W)
        gradient_d = (-2 / m) * X.T @ residual
        l1_term = r * alpha * np.sign(W)   # subgradient of the L1 part
        l2_term = (1 - r) * alpha * W      # gradient-style shrinkage of the L2 part
        gradient_d += l1_term + l2_term
        W = W - learning_rate * gradient_d
    return W

In [185]:
def elastic_net_kfold(X, y, k=5, learning_rate=0.01, alpha=0.1, r=0.7):
    """Cross-validate ``elastic_net_model`` with k shuffled folds.

    Parameters
    ----------
    X, y : design matrix and targets, indexable by integer index arrays.
    k : int
        Number of folds.
    learning_rate, alpha, r : float
        Hyperparameters forwarded to ``elastic_net_model`` for every fold.

    Returns
    -------
    (mean_train_mse, mean_test_mse) : errors averaged over the k folds.
    """
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    test_scores = []
    train_scores = []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Forward the caller's hyperparameters; previously they were accepted
        # but ignored, so the model always trained with its own defaults.
        theta = elastic_net_model(
            X_train,
            y_train,
            learning_rate=learning_rate,
            alpha=alpha,
            r=r,
        )

        y_train_pred = X_train @ theta
        y_test_pred = X_test @ theta

        train_scores.append(mean_squared_error(y_train, y_train_pred))
        test_scores.append(mean_squared_error(y_test, y_test_pred))
    return np.mean(train_scores), np.mean(test_scores)

In [None]:
# Standardize every feature column; the 1e-8 guards against division by zero.
X_scaled = (X - np.mean(X, axis=0)) / (np.std(X, axis=0) + 1e-8)
# Re-attach the (unscaled) bias column as the last column.
X_scaled = np.hstack((X_scaled,X_bias))
# elastic_net_kfold returns (train, test) averages, using its default hyperparameters here.
avg_train_mseelas, avg_test_mseelas = elastic_net_kfold(X_scaled, y)
print(f"Average Training MSE: {avg_train_mseelas:.4f}")
print(f"Average Test MSE: {avg_test_mseelas:.4f}")

Question 10: I would choose gradient descent for linear regression with learning_rate = 0.01 and epoch = 1000, because it has the smallest average test MSE.