# DS-SF-38 | 11 | Regularization | Codealong | Starter Code

In [None]:
import os

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 10)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 10)

from sklearn import preprocessing, linear_model, model_selection, metrics

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

### Truth

In [None]:
def f(x):
    """Return the noise-free target value sin(2 * pi * x)."""
    angle = 2 * np.pi * x
    return np.sin(angle)

In [None]:
# Evenly spaced grid on [0, 1] with the noise-free curve evaluated at each point
truth_df = pd.DataFrame(index=range(100))
truth_df['x'] = np.linspace(0, 1, len(truth_df))
truth_df['y'] = truth_df['x'].apply(f)

### Training/testing sets

- (http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html)

In [None]:
# Seed NumPy's global generator so the draws below are repeatable
np.random.seed(0)

df = pd.DataFrame(index=range(100))

# Draw order matters for reproducibility: x first, then the noise term
df['x'] = np.random.uniform(low=0, high=1, size=len(df))
df['Noise'] = np.random.normal(size=len(df))

# The noise is multiplicative: it rescales the true signal f(x)
df['y'] = df['x'].apply(f) * (1 + .5 * df['Noise'])

In [None]:
# Feature matrix (one-column DataFrame) and response (Series)
X = df[['x']]
y = df['y']

In [None]:
# Keep a quarter of the rows for training
train_X, test_X, train_y, test_y = model_selection.train_test_split(
    X,
    y,
    train_size=.25,
)

In [None]:
# Noise-free curve with the training sample drawn on top
plt.figure()
plt.xlim(truth_df['x'].min(), truth_df['x'].max())
plt.ylim(-2, 2)

plt.plot(truth_df['x'], truth_df['y'], color='green', label='truth')
plt.scatter(train_X['x'], train_y, s=10, label='train')

plt.legend()

### Polynomial Models

- (http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html)
- (http://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html)

In [None]:
class Polynomial(object):
    """Unregularized least-squares regression on polynomial features.

    The inputs are expanded with ``preprocessing.PolynomialFeatures`` up to
    degree ``n`` and a ``linear_model.LinearRegression`` is fitted on the
    expanded features.
    """

    def __init__(self, n):
        """Create an unfitted model using polynomial terms up to degree ``n``."""
        self.n = n

        self.features = preprocessing.PolynomialFeatures(degree=self.n)
        self.model = linear_model.LinearRegression()

    def fit(self, X, y):
        """Fit the expansion and the regression on ``X``/``y``.

        Returns ``self`` so that construction and fitting can be chained.
        """
        self.features.fit(X)

        X = self.features.transform(X)
        self.model.fit(X, y)

        return self

    def predict(self, X):
        """Return the predictions of the fitted model for the raw features ``X``."""
        X = self.features.transform(X)
        return self.model.predict(X)

    def score(self, X, y):
        """Return the underlying regression's ``score`` on ``X``/``y``."""
        X = self.features.transform(X)
        return self.model.score(X, y)

    def mean_squared_error(self, X, y):
        """Return the mean squared error between ``y`` and the predictions for ``X``."""
        y_hat = self.predict(X)
        return metrics.mean_squared_error(y, y_hat)

In [None]:
# Polynomial degrees to compare
ns = [0, 1, 2, 3, 5, 7, 11, 13, 17, 19]

# One fitted model per degree, indexed by that degree
models = pd.Series(
    [Polynomial(degree).fit(train_X, train_y) for degree in ns],
    index=ns,
)

In [None]:
# Overlay each fitted polynomial on the truth curve and the training points
plt.figure(figsize=(12, 12))
plt.xlim(truth_df['x'].min(), truth_df['x'].max())
plt.ylim(-2, 2)

plt.plot(truth_df['x'], truth_df['y'], color='green', label='truth')
plt.scatter(train_X['x'], train_y, s=10, label='train')

for n in ns:
    # Evaluate each model on the dense truth grid
    X = truth_df[['x']]
    y_hat = models.loc[n].predict(X)
    plt.plot(X['x'], y_hat, label='n = {}'.format(n))

plt.legend()

### $R^2$

In [None]:
# Score of every fitted model on the training and testing sets
metrics_df = pd.DataFrame(index=models.index)

metrics_df['R^2 (train)'] = [fitted.score(train_X, train_y) for fitted in models]
metrics_df['R^2 (test)'] = [fitted.score(test_X, test_y) for fitted in models]

metrics_df

### Bias-Variance Trade-off

In [None]:
# Training and testing mean squared error of every fitted model
metrics_df = pd.DataFrame(index=models.index)

metrics_df['MSE (train)'] = [
    fitted.mean_squared_error(train_X, train_y) for fitted in models
]
metrics_df['MSE (test)'] = [
    fitted.mean_squared_error(test_X, test_y) for fitted in models
]

# Log-scaled y axis
metrics_df.plot(logy=True)

### Regularization

- (http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html)
- (http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html)

#### Ridge

In [None]:
# Polynomial degree used by the regularized models
n = 19

# Regularization strengths to compare, listed from largest to smallest
alphas = [
    1e-1, 1e-2, 1e-3, 1e-5, 1e-7,
    1e-11, 1e-13, 1e-17, 1e-19,
]

In [None]:
class PolynomialRidge(object):
    """Ridge regression on polynomial features of degree ``n``.

    After ``fit`` the instance exposes ``coefficients`` (a Series of the
    fitted ``coef_`` values labelled ``beta_0`` .. ``beta_n``) and
    ``complexity`` (the Euclidean norm of those coefficients).
    """

    @staticmethod
    def coefficients_names(n):
        """Return the labels ``['beta_0', ..., 'beta_n']`` as a list.

        A concrete list (rather than a lazy ``map`` object) can be reused,
        measured with ``len`` and passed safely as an index or as columns.
        """
        return ['beta_{}'.format(i) for i in range(n + 1)]

    def __init__(self, alpha, n):
        """Create an unfitted model with penalty ``alpha`` and degree ``n``."""
        self.alpha = alpha
        self.n = n

        self.features = preprocessing.PolynomialFeatures(degree=self.n)
        self.model = linear_model.Ridge(alpha=self.alpha)

    def fit(self, X, y):
        """Fit the expansion and the ridge model; return ``self`` for chaining."""
        # The raw training features are kept on the instance
        self.X = X
        self.features.fit(X)

        X = self.features.transform(X)
        self.model.fit(X, y)

        # NOTE(review): coef_ does not include the separately fitted
        # intercept_, so beta_0 labels the coefficient of the first expanded
        # feature column rather than the intercept -- confirm this is intended.
        self.coefficients = pd.Series(
            self.model.coef_,
            index=self.__class__.coefficients_names(self.n),
        )
        # L2 norm of the coefficient vector
        self.complexity = (self.coefficients ** 2).sum() ** .5

        return self

    def predict(self, X):
        """Return the predictions of the fitted model for the raw features ``X``."""
        X = self.features.transform(X)
        return self.model.predict(X)

    def score(self, X, y):
        """Return the underlying ridge model's ``score`` on ``X``/``y``."""
        X = self.features.transform(X)
        return self.model.score(X, y)

    def mean_squared_error(self, X, y):
        """Return the mean squared error between ``y`` and the predictions for ``X``."""
        y_hat = self.predict(X)
        return metrics.mean_squared_error(y, y_hat)

In [None]:
# One fitted ridge model per regularization strength, indexed by alpha
models = pd.Series(
    [PolynomialRidge(penalty, n).fit(train_X, train_y) for penalty in alphas],
    index=alphas,
)

In [None]:
# Overlay each ridge fit on the truth curve and the training points
plt.figure(figsize=(12, 12))
plt.xlim(truth_df['x'].min(), truth_df['x'].max())
plt.ylim(-2, 2)

plt.plot(truth_df['x'], truth_df['y'], color='green', label='truth')
plt.scatter(train_X['x'], train_y, s=10, label='train')

for alpha in alphas:
    # Evaluate each model on the dense truth grid
    X = truth_df[['x']]
    y_hat = models.loc[alpha].predict(X)
    plt.plot(X['x'], y_hat, label='alpha = {}'.format(alpha))

plt.legend()

In [None]:
# Train/test scores and coefficient norm of every ridge model
metrics_df = pd.DataFrame(index=models.index)

metrics_df['R^2 (train)'] = [fitted.score(train_X, train_y) for fitted in models]
metrics_df['R^2 (test)'] = [fitted.score(test_X, test_y) for fitted in models]
metrics_df['Complexity'] = [fitted.complexity for fitted in models]

metrics_df

In [None]:
# Coefficient table: one row per ridge model, labelled by that model's complexity
coefficients_df = pd.DataFrame(
    columns=PolynomialRidge.coefficients_names(n),
)

for alpha in alphas:
    model = models.loc[alpha]
    # Models that share a complexity value write to the same row label
    coefficients_df.loc[model.complexity, :] = model.coefficients

In [None]:
# Display the coefficient table
coefficients_df

In [None]:
# Plot each coefficient column against the row index on a log-scaled x axis
coefficients_df.plot(
    logx=True,
    legend=False,
    figsize=(12, 12),
)

#### Lasso

In [None]:
class PolynomialLasso(object):
    """Lasso regression on polynomial features of degree ``n``.

    After ``fit`` the instance exposes ``coefficients`` (a Series of the
    fitted ``coef_`` values labelled ``beta_0`` .. ``beta_n``) and
    ``complexity`` (the Euclidean norm of those coefficients).
    """

    @staticmethod
    def coefficients_names(n):
        """Return the labels ``['beta_0', ..., 'beta_n']`` as a list.

        A concrete list (rather than a lazy ``map`` object) can be reused,
        measured with ``len`` and passed safely as an index or as columns.
        """
        return ['beta_{}'.format(i) for i in range(n + 1)]

    def __init__(self, alpha, n):
        """Create an unfitted model with penalty ``alpha`` and degree ``n``."""
        self.alpha = alpha
        self.n = n

        self.features = preprocessing.PolynomialFeatures(degree=self.n)
        self.model = linear_model.Lasso(alpha=self.alpha)

    def fit(self, X, y):
        """Fit the expansion and the lasso model; return ``self`` for chaining."""
        # The raw training features are kept on the instance
        self.X = X
        self.features.fit(X)

        X = self.features.transform(X)
        self.model.fit(X, y)

        # NOTE(review): coef_ does not include the separately fitted
        # intercept_, so beta_0 labels the coefficient of the first expanded
        # feature column rather than the intercept -- confirm this is intended.
        self.coefficients = pd.Series(
            self.model.coef_,
            index=self.__class__.coefficients_names(self.n),
        )
        # L2 norm of the coefficient vector
        self.complexity = (self.coefficients ** 2).sum() ** .5

        return self

    def predict(self, X):
        """Return the predictions of the fitted model for the raw features ``X``."""
        X = self.features.transform(X)
        return self.model.predict(X)

    def score(self, X, y):
        """Return the underlying lasso model's ``score`` on ``X``/``y``."""
        X = self.features.transform(X)
        return self.model.score(X, y)

    def mean_squared_error(self, X, y):
        """Return the mean squared error between ``y`` and the predictions for ``X``."""
        y_hat = self.predict(X)
        return metrics.mean_squared_error(y, y_hat)

In [None]:
# One fitted lasso model per regularization strength, indexed by alpha
models = pd.Series(
    [PolynomialLasso(penalty, n).fit(train_X, train_y) for penalty in alphas],
    index=alphas,
)

In [None]:
# Overlay each lasso fit on the truth curve and the training points
plt.figure(figsize=(12, 12))
plt.xlim(truth_df['x'].min(), truth_df['x'].max())
plt.ylim(-2, 2)

plt.plot(truth_df['x'], truth_df['y'], color='green', label='truth')
plt.scatter(train_X['x'], train_y, s=10, label='train')

for alpha in alphas:
    # Evaluate each model on the dense truth grid
    X = truth_df[['x']]
    y_hat = models.loc[alpha].predict(X)
    plt.plot(X['x'], y_hat, label='alpha = {}'.format(alpha))

plt.legend()

In [None]:
# Train/test scores and coefficient norm of every lasso model
metrics_df = pd.DataFrame(index=models.index)

metrics_df['R^2 (train)'] = [fitted.score(train_X, train_y) for fitted in models]
metrics_df['R^2 (test)'] = [fitted.score(test_X, test_y) for fitted in models]
metrics_df['Complexity'] = [fitted.complexity for fitted in models]

metrics_df

In [None]:
# Coefficient table for the lasso models: one row per model, labelled by that
# model's complexity. The column names come from PolynomialLasso (the class
# whose models are tabulated here) and are materialised as a list so they do
# not depend on a lazy iterator.
coefficients_df = pd.DataFrame(columns = list(PolynomialLasso.coefficients_names(n)))

for alpha in alphas:
    model = models[alpha]
    # Models that share a complexity value write to the same row label
    coefficients_df.loc[model.complexity, :] = model.coefficients

In [None]:
# Display the coefficient table
coefficients_df

In [None]:
# Plot each coefficient column against the row index on a log-scaled x axis
coefficients_df.plot(
    logx=True,
    legend=False,
    figsize=(12, 12),
)