In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import math
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# mse sklearn function

In [2]:
X = pd.read_csv('lung_cancer_data.csv').to_numpy()
y = pd.read_csv('lung_cancer_label.csv').to_numpy()

In [3]:
# Split data into train validation and test
[X_train, X_rest, y_train, y_rest] = train_test_split(X, y, test_size=0.2, random_state=0)
[X_validation, X_test, y_validation, y_test] = train_test_split(X_rest, y_rest, test_size=0.2, random_state=0)

## Kernel Regression

In [4]:
# Forming Kernel Matrix using Gaussian Kernel
def k(x_i, x_j, γ):
    norm_sq = (np.linalg.norm(x_i - x_j))**2
    kernel = math.e ** (-γ * norm_sq)
    return kernel

kernel_matrix_train = np.array([])
for x_i in X_train:
    row = []
    for x_j in X_train:
        k_x_x = k(x_i , x_j, 0.005)
        row.append(k_x_x)
    if len(kernel_matrix_train) == 0:
        kernel_matrix_train = np.array([row])
    else:
        kernel_matrix_train = np.vstack((kernel_matrix_train, row))

In [6]:
alpha = np.linalg.inv(np.dot(kernel_matrix_train.T,kernel_matrix_train)).dot(kernel_matrix_train.T.dot(y_train))

In [16]:
# X_validation Kernel Matrix for predictions

kernel_matrix_validation = np.array([])
for x_i in X_validation:
    row = []
    for x_j in X_train:
        k_x_x = k(x_i , x_j, 0.005)
        row.append(k_x_x)
    if len(kernel_matrix_validation) == 0:
        kernel_matrix_validation = np.array([row])
    else:
        kernel_matrix_validation = np.vstack((kernel_matrix_validation, row))


# X_test Kernel Matrix for predictions

kernel_matrix_test = np.array([])
for x_i in X_test:
    row = []
    for x_j in X_train:
        k_x_x = k(x_i , x_j, 0.005)
        row.append(k_x_x)
    if len(kernel_matrix_test) == 0:
        kernel_matrix_test = np.array([row])
    else:
        kernel_matrix_test = np.vstack((kernel_matrix_test, row))

In [17]:
# Dot with k matrix for predictions
y_pred_train = np.dot(kernel_matrix_train, alpha)
y_pred_validation = np.dot(kernel_matrix_validation, alpha)
y_pred_test = np.dot(kernel_matrix_test, alpha)


In [19]:
mse_train = mean_squared_error(y_train, y_pred_train)
mse_validation = mean_squared_error(y_validation, y_pred_validation)
mse_test = mean_squared_error(y_test, y_pred_test)

print(f"Final MSE on Training Data: {mse_train}")
print(f"Final MSE on Validation Data: {mse_validation}")
print(f"Final MSE on Test Data: {mse_test}")

Final MSE on Training Data: 23.568607399454038
Final MSE on Validation Data: 23.699433201757387
Final MSE on Test Data: 24.390900358877346


## Kernel Ridge Regression

In [28]:
# X_validation Kernel Matrix for predictions

kernel_matrix_validation = np.array([])
for x_i in X_validation:
    row = []
    for x_j in X_validation:
        k_x_x = k(x_i , x_j, 0.005)
        row.append(k_x_x)
    if len(kernel_matrix_validation) == 0:
        kernel_matrix_validation = np.array([row])
    else:
        kernel_matrix_validation = np.vstack((kernel_matrix_validation, row))


# X_test Kernel Matrix for predictions

kernel_matrix_test = np.array([])
for x_i in X_test:
    row = []
    for x_j in X_test:
        k_x_x = k(x_i , x_j, 0.005)
        row.append(k_x_x)
    if len(kernel_matrix_test) == 0:
        kernel_matrix_test = np.array([row])
    else:
        kernel_matrix_test = np.vstack((kernel_matrix_test, row))

In [31]:
λ = 0.01
kernel_ridge_y_pred_train = kernel_matrix_train.dot(np.linalg.inv(kernel_matrix_train + λ*np.eye(399))).dot(y_train)
kernel_ridge_y_pred_test = kernel_matrix_test.dot(np.linalg.inv(kernel_matrix_test + λ*np.eye(20))).dot(y_test)
kernel_ridge_y_pred_validation = kernel_matrix_validation.dot(np.linalg.inv(kernel_matrix_validation + λ*np.eye(80))).dot(y_validation)

In [32]:
mse_train = mean_squared_error(y_train, kernel_ridge_y_pred_train)
mse_validation = mean_squared_error(y_validation, kernel_ridge_y_pred_validation)
mse_test = mean_squared_error(y_test, kernel_ridge_y_pred_test)

print(f"Final MSE on Training Data: {mse_train}")
print(f"Final MSE on Validation Data: {mse_validation}")
print(f"Final MSE on Test Data: {mse_test}")

Final MSE on Training Data: 0.0094983842413568
Final MSE on Validation Data: 0.012422415275365981
Final MSE on Test Data: 0.13438291755141912
