In [1]:
import pandas as pd
import numpy as np
import random
import math

from sklearn import preprocessing
from sklearn import linear_model
from sklearn import neighbors

# Read the whitespace-separated data file. It has no header row, so pandas
# labels the columns with integers.
df = pd.read_csv(
    'airfoil_self_noise.dat',
    sep=r"\s+",
    header=None,
)

In [2]:
frac = 0.2  # The portion of data to set aside as the test set
seed = 0    # Fixed seed so that every run produces the same train/test split

n, p = df.shape

# Draw the test rows (without replacement) from a dedicated, seeded generator.
# An unseeded draw gives a different split -- and therefore different errors --
# on every Restart & Run All. Using a private Random instance also leaves the
# module-level `random` state untouched.
split_rng = random.Random(seed)
row_test = split_rng.sample(range(n), math.floor(frac * n))
# Every remaining row is a training row. Sorting makes the row order explicit
# instead of relying on the iteration order of a set.
row_train = sorted(set(range(n)) - set(row_test))

# Columns 0-4 are the features, column 5 is the regression target.
# normalize(..., axis=0) rescales each feature column (not each row).
# NOTE(review): the scaling is computed over all rows, test rows included --
# confirm that this is acceptable for the comparison being made.
X = preprocessing.normalize(df.values[:, :5], axis=0)
y = df.values[:, 5]
X_train, X_test = X[row_train], X[row_test]
y_train, y_test = y[row_train], y[row_test]

In [3]:
# Linear-regression baseline fitted on the training rows.
regr_lin = linear_model.LinearRegression().fit(X_train, y_train)

# Mean squared error of its predictions on the held-out rows.
error_lin = np.mean(np.square(regr_lin.predict(X_test) - y_test))

error_lin

23.019854822456388

In [4]:
# Neighbourhood sizes to try: k = 1, 2, ..., 39.
k_range = np.arange(1, 40, dtype="int32")
n_k = k_range.size
error_knn = np.empty(k_range.shape, dtype="float64")

# One estimator object is reused; only n_neighbors changes between refits.
regr_knn = neighbors.KNeighborsRegressor()
for i_k, k in enumerate(k_range):
    regr_knn.set_params(n_neighbors=k)
    regr_knn.fit(X_train, y_train)

    # Held-out mean squared error for this k.
    # NOTE(review): k is compared on the same rows used to report the error.
    error_knn[i_k] = np.mean(np.square(regr_knn.predict(X_test) - y_test))

error_knn

array([  7.38201846,   4.87730786,   5.79717721,   6.82806412,
         7.47128062,   8.29356822,   8.56738896,   8.61574721,
         8.79053942,   9.02319889,   9.52727474,   9.87724606,
        10.13052051,  10.61791904,  10.97849758,  11.31216465,
        11.7739808 ,  12.18672542,  12.24575304,  12.5101948 ,
        12.47674699,  12.66038275,  12.89347818,  13.08861046,
        13.3622567 ,  13.43983634,  13.67087334,  13.83028573,
        13.95561273,  14.18175032,  14.53323671,  14.7676174 ,
        14.6857516 ,  14.55885528,  14.59007247,  14.61042498,
        14.79983476,  14.78535414,  14.77865713])

In [5]:
class KernelSmootherBoxcar:
    """Kernel smoother with a boxcar (uniform) window.

    The prediction at a query point is the unweighted mean of the training
    responses whose inputs lie strictly within ``bandwidth`` (Euclidean
    distance) of the query. If no training point is that close, the response
    of the single nearest training point is used instead.

    Parameters
    ----------
    bandwidth : float, default 1.0
        Radius of the averaging window.
    """

    # Parameter names that set_params is allowed to change.
    _param_names = ("bandwidth",)

    def __init__(self, bandwidth = 1.0):
        self.bandwidth = bandwidth

    def set_params(self, **kargs):
        """Update estimator parameters by keyword.

        Parameters
        ----------
        **kargs
            Parameter names mapped to their new values (currently only
            ``bandwidth``).

        Returns
        -------
        self
            The estimator itself, so calls can be chained.

        Raises
        ------
        ValueError
            If a keyword is not a known parameter name.
        """
        for key, value in kargs.items():
            if key not in self._param_names:
                raise ValueError(
                    "Invalid parameter %r for KernelSmootherBoxcar; valid "
                    "parameters are %r." % (key, list(self._param_names))
                )
            # setattr writes the attribute *named by* key. The previous
            # `self.key = ...` created an attribute literally called "key"
            # and never touched `bandwidth`.
            setattr(self, key, value)
        return self

    def fit(self, X, y):
        """Store the training data; no other computation happens here.

        Parameters
        ----------
        X : array-like, 2-D
            Training inputs, one row per sample.
        y : array-like
            Training responses, aligned with the rows of ``X``.

        Returns
        -------
        self
            The estimator itself.
        """
        self.X = np.asarray(X)
        self.y = np.asarray(y)
        return self

    def predict(self, X_new):
        """Predict the response for each row of ``X_new``.

        Parameters
        ----------
        X_new : array-like, 2-D
            Query inputs, one row per sample, with the same number of columns
            as the training inputs.

        Returns
        -------
        numpy.ndarray
            float64 array with one prediction per query row.
        """
        X_new = np.asarray(X_new)
        y_new = np.empty(len(X_new), dtype="float64")
        for i, x_new in enumerate(X_new):
            distances = np.linalg.norm(x_new - self.X, axis=1)
            # Use the instance's own bandwidth. Reading a module-level
            # `bandwidth` only worked when a global of that name happened to
            # exist, and ignored the constructor / set_params value.
            in_window = distances < self.bandwidth

            if in_window.any():  # Average among the neighbors
                y_new[i] = self.y[in_window].mean(axis=0)
            else:  # No neighbor within the bandwidth: use the nearest one
                y_new[i] = self.y[np.argmin(distances)]

        return y_new

# Window radii to try: from 0.0 up to (but excluding) 0.02 in steps of 0.0004.
bandwidth_range = np.arange(0.0, 0.02, 0.0004)
n_bandwidth = bandwidth_range.size
error_ksb = np.empty(bandwidth_range.shape, dtype="float64")

regr_ksb = KernelSmootherBoxcar()
for i_bandwidth, bandwidth in enumerate(bandwidth_range):
    regr_ksb.set_params(bandwidth=bandwidth)
    regr_ksb.fit(X_train, y_train)

    # Held-out mean squared error for this bandwidth.
    error_ksb[i_bandwidth] = np.mean(np.square(regr_ksb.predict(X_test) - y_test))

error_ksb

array([  7.38201846,   7.38021671,   7.10707938,   7.17372619,
         7.44747095,   8.2109912 ,   8.16803063,   8.90486894,
         9.2721564 ,   9.47775774,   9.5710692 ,   9.46088355,
        10.16971126,  10.9037823 ,  10.98890284,  11.26948464,
        11.03685425,  11.12877276,  11.1131264 ,  11.67512153,
         9.37754541,   9.9067364 ,   9.897622  ,  10.39238509,
        10.57590169,  10.70760606,  10.64263252,  11.01663705,
        11.06067867,  11.22463515,  11.30687898,  11.33201813,
        11.91867624,  12.37164944,  12.78906378,  13.03249362,
        13.28677367,  13.46305178,  13.86506495,  13.74446207,
        13.89484004,  14.11600054,  14.55730582,  14.9685549 ,
        15.21220033,  15.39540318,  15.70801289,  16.18328754,
        16.41244789,  16.66709295])

In [12]:
import matplotlib.pyplot as plt
%matplotlib qt

axis_font = {'size':'20'}
fig, axs = plt.subplots(2, 1)

axs[0].plot(k_range, error_knn, "+-", lw=2, markersize=5, markeredgewidth=2, label="KNN")
axs[0].plot(k_range, error_lin * np.ones(n_k), 'g', lw=2, label="Lin. Reg.")
axs[0].set_xlabel("$k$", **axis_font)
axs[0].set_ylabel("MSE", **axis_font)
axs[0].set_ylim([0, 25])
axs[0].grid(True)
axs[0].legend(loc=0, fontsize=12)

axs[1].plot(bandwidth_range, error_ksb, "r+-", lw=2, markersize=5, markeredgewidth=2, label="KSB")
axs[1].plot(bandwidth_range, error_lin * np.ones(n_bandwidth), 'g', lw=2, label="Lin. Reg.")
axs[1].set_xlabel("Bandwidth", **axis_font)
axs[1].set_ylabel("MSE", **axis_font)
axs[1].set_ylim([0, 25])
axs[1].grid(True)
axs[1].legend(loc=0, fontsize=12)

fig.tight_layout()