In [1]:
import numpy as np
import pandas as pd

from bokeh.io import output_notebook, push_notebook, curdoc
from bokeh.plotting import figure, show
from bokeh.themes import Theme

In [2]:
# Configure Bokeh so that subsequent show() calls render inline in this notebook.
output_notebook()

In [3]:
# Shared Bokeh theme loaded from a YAML file next to the notebook; it is assigned
# to the current document before each figure below is shown.
plot_theme = Theme("./theme.yml")

In [4]:
def f(x, c):
    """Noise-free target curve: ``sin(4 * x)`` shifted vertically by ``c``.

    Parameters
    ----------
    x : scalar or array-like
        Point(s) at which to evaluate the curve.
    c : scalar
        Constant offset added to the sine wave.

    Returns
    -------
    Same shape as ``x`` (NumPy broadcasting applies).
    """
    wave = np.sin(4 * x)
    return wave + c

In [27]:
# Number of synthetic sample points generated for the regression experiment.
data_size = 1000

In [28]:
# Seed NumPy's global RNG so the noisy sample (and every later shuffle) is
# reproducible under Restart Kernel -> Run All.
np.random.seed(42)

# Evenly spaced inputs on [0, pi/4].
x = np.linspace(0, np.pi/4, data_size)
# Observations: the true curve f(x, 2) plus zero-mean Gaussian noise (std = 1/3).
y = f(x, 2) + np.random.normal(0, 1 / 3.0, data_size)

In [73]:
# Scatter the noisy observations and overlay the noise-free curve f(x, 2).
p = figure(plot_width=600, plot_height=600)
p.circle(x, y, size=10, alpha=0.2, color="#66D9EF", legend="y")
p.line(x, f(x, 2), color="#F92672", line_width=3, legend="Actual", line_dash="dashed")
# Attach the shared theme to the current document and register the figure with it
# before displaying.
doc = curdoc()
doc.theme = plot_theme
doc.add_root(p)
show(p)

In [30]:
def gaussian_kernel(x_, x0_, h_):
    """Unnormalised Gaussian kernel weight of ``x_`` relative to centre ``x0_``.

    Computes ``exp(-0.5 * ((x_ - x0_) / h_) ** 2)``.

    Parameters
    ----------
    x_ : scalar or ndarray
        Point(s) to weight.
    x0_ : scalar
        Kernel centre (query point).
    h_ : scalar
        Bandwidth; larger values give a wider, flatter kernel.

    Returns
    -------
    Weights with the broadcast shape of the inputs; equal to 1 where
    ``x_ == x0_``.
    """
    scaled_distance = (x_ - x0_) / h_
    exponent = -0.5 * np.power(scaled_distance, 2)
    return np.exp(exponent)

# One Dimensional Kernel Smoother

In [39]:
def predict(x_test, x_train, y_train, bandwidth, kernel_func=gaussian_kernel):
    """Kernel-smoothed prediction at each point of ``x_test``.

    For every query point ``x0`` the prediction is the kernel-weighted average
    of the training targets::

        sum_i K(x_train[i], x0, bandwidth) * y_train[i]
        -----------------------------------------------
              sum_i K(x_train[i], x0, bandwidth)

    Parameters
    ----------
    x_test : iterable of scalars
        Query points; iterated one at a time.
    x_train : ndarray
        Training inputs, passed as the first argument to ``kernel_func``.
    y_train : ndarray
        Training targets, aligned with ``x_train``.
    bandwidth : scalar
        Kernel bandwidth, passed as the third argument to ``kernel_func``.
    kernel_func : callable, optional
        ``kernel_func(x_train, x0, bandwidth)`` must return an ndarray of
        weights. Defaults to ``gaussian_kernel``.

    Returns
    -------
    ndarray
        One prediction per element of ``x_test``. If all weights for a query
        point are zero the division yields ``nan`` for that entry.
    """
    def _weighted_mean(x0):
        # Evaluate the kernel once and reuse it for numerator and denominator.
        weights = kernel_func(x_train, x0, bandwidth)
        return weights.dot(y_train) / weights.sum()

    return np.array([_weighted_mean(x0) for x0 in x_test])

In [40]:
# Bandwidths to compare visually, from very narrow to very wide.
h_values = [0.01, 0.1, 0.5]
# One line colour per bandwidth above (matched by position).
colors = ["#A6E22E", "#FD971F", "#AE81FF"]

In [42]:
# Noisy observations plus the noise-free curve, as a backdrop for the fits.
p = figure(plot_width=600, plot_height=600)
p.circle(x, y, size=10, alpha=0.2, color="#66D9EF", legend="y")
p.line(x, f(x, 2), color="#F92672", line_width=3, legend="Actual", line_dash="dashed")

# Overlay the in-sample kernel-regression fit for each bandwidth in h_values;
# colors[idx] pairs each bandwidth with its line colour.
for idx, h in enumerate(h_values):
    p.line(x, predict(x, x, y, h), color=colors[idx], line_width=1, legend="y_hat (h={})".format(h))
    
p.title.text = "Kernel Regression (Gaussian)"
p.xaxis.axis_label = "x"
p.yaxis.axis_label = "f(x)"

# Attach the shared theme to the current document and register the figure with it
# before displaying.
doc = curdoc()
doc.theme = plot_theme
doc.add_root(p)
show(p)

In [58]:
# Grid of 20 candidate bandwidths between 0.01 and 0.5.
h_range = np.linspace(0.01, 0.5, 20)
# In-sample mean squared error for each bandwidth (fit and evaluated on the same
# data). The mean, not the sum, is used so the values really are MSEs.
mses = [np.mean(np.power(y - predict(x, x, y, h), 2)) for h in h_range]

In [59]:
# Plot the in-sample error stored in `mses` against the bandwidth grid.
p = figure(plot_width=600, plot_height=300)
p.circle(x=h_range, y=mses, size=10, color="#66D9EF")
p.line(x=h_range, y=mses, color="#66D9EF", line_width=3)

# Labels previously read "Backwidth"; the quantity on the x-axis is the bandwidth.
p.title.text = "MSE vs Bandwidth"
p.xaxis.axis_label = "Bandwidth"
p.yaxis.axis_label = "MSE"

# Attach the shared theme to the current document and register the figure with it
# before displaying.
doc = curdoc()
doc.theme = plot_theme
doc.add_root(p)
show(p)

# Cross Validation

## Leave One Out Cross Validation (LOOCV)

## k-Fold Cross Validation 

In [78]:
def split_k_fold(x, y, folds):
    """Randomly partition paired samples into ``folds`` equally sized folds.

    The sample order is shuffled with NumPy's global RNG and the same
    permutation is applied to ``x`` and ``y``, so pairs stay aligned. Each fold
    holds ``len(x) // folds`` samples; any leftover samples that do not fill a
    complete fold are dropped so the result is rectangular.

    Parameters
    ----------
    x, y : array-like
        Inputs and targets of equal length (split along the first axis).
    folds : int
        Number of folds; must satisfy ``1 <= folds <= len(x)``.

    Returns
    -------
    (x_splits, y_splits) : tuple of ndarray
        Arrays whose first axis indexes the fold, i.e. ``x_splits[k]`` and
        ``y_splits[k]`` are the samples of fold ``k``.

    Raises
    ------
    ValueError
        If ``x`` and ``y`` differ in length, or ``folds`` is out of range.
    """
    if len(x) != len(y):
        raise ValueError("X and Y Should have same length")
    if folds < 1 or folds > len(x):
        raise ValueError("folds must be between 1 and len(x)")

    x = np.asarray(x)
    y = np.asarray(y)

    indices = np.arange(len(x))
    np.random.shuffle(indices)

    # Integer division: a float size would break slicing/reshaping on Python 3.
    split_size = len(x) // folds
    # Row k holds the (shuffled) sample indices that belong to fold k.
    fold_members = indices[:split_size * folds].reshape(folds, split_size)
    return x[fold_members], y[fold_members]

In [91]:
# Number of folds used by the k-fold cross-validation below.
num_folds = 4

In [92]:
# Number of times the k-fold procedure is repeated (and averaged) per bandwidth.
num_tries = 10

In [93]:
# Repeated k-fold cross-validation over the bandwidth grid `h_range`.
# For each bandwidth: repeat `num_tries` times, each time splitting the data into
# `num_folds` folds, holding each fold out in turn, and scoring the prediction
# on the held-out fold. `mse_values[i]` is the error for `h_range[i]`, averaged
# over folds and then over trials.
fold_indices = np.arange(num_folds)
mse_values = []

for h in h_range:
    print("h = {}".format(h))
    trial_mses = []
    for _ in np.arange(num_tries):
        # Re-split the data for every trial.
        x_splits, y_splits = split_k_fold(x, y, num_folds)
        # Kept separate from the notebook-level `mses` (in-sample errors) so this
        # cell does not overwrite what the earlier plot cell reads.
        fold_mses = []
        for test_idx in fold_indices:
            # Every fold except the held-out one forms the training set.
            train_idx = np.setdiff1d(fold_indices, [test_idx])
            train_x = np.concatenate(x_splits[train_idx])
            train_y = np.concatenate(y_splits[train_idx])
            test_x = x_splits[test_idx]
            test_y = y_splits[test_idx]
            test_y_hat = predict(test_x, train_x, train_y, h)
            fold_mses.append(np.mean(np.power(test_y_hat - test_y, 2)))
        trial_mses.append(np.mean(fold_mses))
    mse_values.append(np.mean(trial_mses))

h = 0.01
h = 0.0357894736842
h = 0.0615789473684
h = 0.0873684210526
h = 0.113157894737
h = 0.138947368421
h = 0.164736842105
h = 0.190526315789
h = 0.216315789474
h = 0.242105263158
h = 0.267894736842
h = 0.293684210526
h = 0.319473684211
h = 0.345263157895
h = 0.371052631579
h = 0.396842105263
h = 0.422631578947
h = 0.448421052632
h = 0.474210526316
h = 0.5


In [94]:
# Plot the cross-validated error stored in `mse_values` against the bandwidth grid.
p = figure(plot_width=600, plot_height=300)
p.circle(x=h_range, y=mse_values, size=10, color="#66D9EF")
p.line(x=h_range, y=mse_values, color="#66D9EF", line_width=3)

# Labels previously read "Backwidth"; the quantity on the x-axis is the bandwidth.
p.title.text = "MSE vs Bandwidth"
p.xaxis.axis_label = "Bandwidth"
p.yaxis.axis_label = "MSE"

# Attach the shared theme to the current document and register the figure with it
# before displaying.
doc = curdoc()
doc.theme = plot_theme
doc.add_root(p)
show(p)