In [1]:
import seaborn as sns
import numpy as np
import math
from bokeh.layouts import gridplot
from bokeh.plotting import show
from sklearn.linear_model import LinearRegression, Ridge, SGDRegressor
from soydata.visualize import use_notebook, scatterplot


use_notebook()

In [2]:
def transform_to_poly(x, degree=5):
    """Expand a single feature into polynomial features (no bias column).

    Column ``j`` of the result holds ``x ** (j + 1)``, so ``degree=3`` yields
    the columns ``[x, x^2, x^3]``.

    Parameters
    ----------
    x : array-like of shape (n,) or (n, 1)
        Input values. A single-column 2-D array is reduced to 1-D first.
    degree : int, default=5
        Highest power to generate, i.e. the number of output columns.

    Returns
    -------
    numpy.ndarray of shape (n, degree)
        Array allocated with ``np.zeros`` and filled column by column.

    Raises
    ------
    ValueError
        If ``x`` is neither 1-D nor a single-column 2-D array.
    """
    x = np.asarray(x)
    if x.ndim == 2 and x.shape[1] == 1:
        # Column vectors such as ``x.reshape(-1, 1)`` cannot be assigned into
        # a single output column unless they are flattened first.
        x = x[:, 0]
    elif x.ndim != 1:
        raise ValueError(f'x must be 1-D or of shape (n, 1), got shape {x.shape}')

    x_poly = np.zeros((x.shape[0], degree))
    for j in range(degree):
        x_poly[:, j] = x ** (j + 1)
    return x_poly

def make_quadratic_data():
    """Generate 100 noisy samples of ``y = x^2`` on the interval [0, 2].

    Seeds NumPy's global random generator with 0 before drawing the noise, so
    repeated calls return the same values. The noise is uniform, scaled by 0.3
    and centred on zero.

    Returns
    -------
    tuple
        ``(x, x_train, y_train)`` where ``x`` is the 1-D grid, ``x_train`` is
        the same grid reshaped to a single column, and ``y_train`` holds the
        noisy targets.
    """
    np.random.seed(0)
    grid = np.linspace(0, 2, 100)
    noise = 0.3 * (np.random.random_sample(grid.shape[0]) - 0.5)
    targets = grid ** 2 + noise
    return grid, grid.reshape(-1, 1), targets

# Build the noisy quadratic dataset once; the following cells read x, x_train
# and y_train as notebook-level globals.
x, x_train, y_train = make_quadratic_data()

In [3]:
def train_model(x, x_train, y_train, model, title):
    """Fit ``model``, print its coefficients and plot data against predictions.

    Parameters
    ----------
    x : values used for the horizontal axis of both scatter layers.
    x_train : feature matrix passed to ``model.fit`` and ``model.predict``.
    y_train : targets passed to ``model.fit`` and drawn as the data layer.
    model : estimator exposing ``fit``, ``predict`` and ``coef_``; it is
        fitted in place.
    title : title given to the plot.

    Returns
    -------
    The plot object returned by ``scatterplot`` after the red prediction
    layer has been added (presumably a Bokeh figure; confirm against soydata).
    """
    model.fit(x_train, y_train)
    fitted_values = model.predict(x_train)

    print(f'Coef : {model.coef_}')

    # First layer: the observed data. Second layer: predictions on the same
    # x positions, drawn smaller and in red on top of the first plot.
    figure = scatterplot(x, y_train, title=title, height=400, width=400, size=3, show_inline=False)
    return scatterplot(x, fitted_values, p=figure, color='red', size=1, show_inline=False)

# Baseline: fit a straight line to the quadratic data and display the result.
title = 'linear regression'
model = LinearRegression()
p = train_model(x, x_train, y_train, model=model, title=title)
show(p)

Coef : [1.9624822]


In [4]:
# Same estimator, two feature sets: the raw column versus the [x, x^2] expansion.
model = LinearRegression()
x_train2 = transform_to_poly(x, degree=2)
p0 = train_model(x, x_train, y_train, model, title='linear regression')
p1 = train_model(x, x_train2, y_train, model, title='quadratic regression')

# Place both fits side by side in a single row.
gp0 = gridplot([[p0, p1]])
show(gp0)

Coef : [1.9624822]
Coef : [-0.08212841  1.0223053 ]


In [5]:
def make_noisy_linear_data(a=0.5, b=2, n_data=8):
    """Draw a small noisy sample from the line ``y = a * x + b``.

    Seeds NumPy's global random generator with 6, draws ``n_data`` inputs
    from ``np.random.random_sample`` and then draws the noise, which is
    uniform, scaled by 0.2 and centred on zero.

    Parameters
    ----------
    a : slope of the underlying line.
    b : intercept of the underlying line.
    n_data : number of samples to draw.

    Returns
    -------
    tuple
        ``(x, y_train)``, both 1-D arrays of length ``n_data``.
    """
    np.random.seed(6)
    inputs = np.random.random_sample(n_data)
    jitter = 0.2 * (np.random.random_sample(n_data) - 0.5)
    targets = a * inputs + b + jitter
    return inputs, targets

# Rebind the notebook-level x and y_train to the small noisy linear sample;
# x_train from the quadratic dataset is not reassigned here.
x, y_train = make_noisy_linear_data()

In [6]:
def train_overfitted_model(x, y_train, model, prefix, max_degree=6):
    """Fit ``model`` on polynomial expansions of ``x`` for degrees 1..max_degree.

    For each degree ``d`` the function expands ``x`` with
    ``transform_to_poly``, refits ``model`` in place, prints its coefficients,
    and plots the training points together with the fitted curve evaluated on
    a 1000-point grid spanning ``[x.min(), x.max()]``.

    Parameters
    ----------
    x : 1-D numpy array of input values.
    y_train : targets matching ``x``.
    model : estimator exposing ``fit``, ``predict`` and ``coef_``; it is
        refitted once per degree, so it ends up fitted on the last degree.
    prefix : label placed in front of ``, d=<degree>`` in each plot title.
    max_degree : int, default=6
        Highest polynomial degree to fit.

    Returns
    -------
    list
        One plot object per degree, in increasing degree order.
    """
    # The evaluation grid depends only on the range of ``x``, so build it once
    # instead of once per degree.
    x_ = np.linspace(x.min(), x.max(), 1000)

    figures = []
    for d in range(1, max_degree + 1):
        title = f'{prefix}, d={d}'

        x_train = transform_to_poly(x, d)
        model.fit(x_train, y_train)

        print(f'Coef (d={d}): {model.coef_}')

        x_test = transform_to_poly(x_, d)
        y_test = model.predict(x_test)

        # Training points first, then the fitted curve in red on the same plot.
        p = scatterplot(x, y_train, height=300, width=300, size=5,
            title=title, show_inline=False)
        p = scatterplot(x_, y_test, p=p, color='red', size=1, show_inline=False)
        figures.append(p)

    return figures

# Unregularised least squares on polynomial features of degree 1 through 8.
prefix = 'Linear'
model = LinearRegression()
figures = train_overfitted_model(x, y_train, model, prefix, max_degree=8)

Coef (d=1): [0.43307945]
Coef (d=2): [ 0.68020152 -0.26550572]
Coef (d=3): [ 0.58080279  0.00966002 -0.19596377]
Coef (d=4): [-0.61519563  5.59746788 -9.40998657  4.9265763 ]
Coef (d=5): [ -2.18525029  16.51080822 -39.47691884  40.55677045 -15.17050138]
Coef (d=6): [   1.79278013  -27.25212441  151.96435721 -348.46612558  353.57127515
 -131.90707495]
Coef (d=7): [   22.395192    -298.38631952  1686.79426763 -4753.32208068
  7079.04131693 -5333.93657778  1601.50759659]
Coef (d=8): [   15.49168051  -197.58596895  1009.7778105  -2295.04228351
  1932.60239394   878.63071322 -2412.03424012  1073.39805088]


In [7]:
n_cols = 3
# Size the grid from the number of figures actually produced, not from the
# number of data points in x; the two only match here because the sample size
# and max_degree are both 8.
n_rows = math.ceil(len(figures) / n_cols)

grid1 = [figures[i*n_cols:(i+1)*n_cols] for i in range(n_rows)]
gp1 = gridplot(grid1)
show(gp1)

In [8]:
# Same experiment with a small L2 penalty on the coefficients.
model = Ridge(alpha=0.0001)
prefix = 'Ridge'
figures = train_overfitted_model(x, y_train, model, prefix, max_degree=8)

# Chunk by the number of figures produced in this cell so the layout does not
# depend on a row count computed in another cell.
grid2 = [figures[i:i + n_cols] for i in range(0, len(figures), n_cols)]
gp2 = gridplot(grid2)
show(gp2)

Coef (d=1): [0.43301362]
Coef (d=2): [ 0.67822964 -0.26349794]
Coef (d=3): [ 0.57691797  0.01833673 -0.20131424]
Coef (d=4): [ 0.51991112  0.39106343 -0.95800161  0.4616992 ]
Coef (d=5): [ 0.42614703  0.70757923 -0.97944353 -0.40683216  0.7001062 ]
Coef (d=6): [ 0.38780145  0.77694396 -0.85069055 -0.52086057  0.19020787  0.49084293]
Coef (d=7): [ 0.38460465  0.77211646 -0.81927388 -0.50584638  0.14818769  0.37598434
  0.12584715]
Coef (d=8): [ 0.38324096  0.79415008 -0.85290649 -0.54638744  0.1530479   0.45345707
  0.28156133 -0.19665223]


In [9]:
# Same experiment with stochastic gradient descent. random_state is fixed so
# the printed coefficients are reproducible on Restart & Run All; without it
# every run gives different numbers.
model = SGDRegressor(alpha=0.0001, random_state=0)
prefix = 'SGD'
figures = train_overfitted_model(x, y_train, model, prefix, max_degree=8)

# Chunk by the number of figures produced in this cell so the layout does not
# depend on a row count computed in another cell.
grid3 = [figures[i:i + n_cols] for i in range(0, len(figures), n_cols)]
gp3 = gridplot(grid3)
show(gp3)

Coef (d=1): [0.77202502]
Coef (d=2): [0.686825   0.42025865]
Coef (d=3): [0.65318051 0.39002102 0.26571578]
Coef (d=4): [0.6349212  0.37270989 0.25016031 0.18215283]
Coef (d=5): [0.62071835 0.36160005 0.24159565 0.17533807 0.13474961]
Coef (d=6): [0.61734989 0.35486028 0.23380279 0.16763038 0.1275815  0.10125389]
Coef (d=7): [0.61368747 0.351646   0.23075989 0.16468866 0.12477318 0.09862296
 0.08028466]
Coef (d=8): [0.6094477  0.34716972 0.22670947 0.16110078 0.12158765 0.09577325
 0.07771683 0.06437181]
