In [85]:
%matplotlib widget

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import math
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Multivariate Regression

We have a given dataset 

$$
    (x,y) \in \mathbb{R}^{m \times (n+1)},
$$

where $m$ is the number of samples and $n$ is the number of features.

In the dataset we have the features

$$
    x \in \mathbb{R}^{m \times n}
$$

and the targets 

$$
    y \in \mathbb{R}^m.
$$

We say that 

$$
    (x^{(i)}, y^{(i)}) \in \mathbb{R}^{1 \times (n+1)}
$$

is the $i$-th example ($i = 1, \dots, m$) and 

$$
    x_j \in \mathbb{R}^m
$$ 

is the $j$-th feature vector ($j = 1, \dots, n$), such that 

$$
    x^{(i)}_j \in \mathbb{R}  
$$

is the $j$-th feature of the $i$-th example.

## Linear Regression

In [86]:
# Plot data
def plot_data(X, y):
    """Scatter a two-feature regression problem in 3d.

    The first two columns of ``X`` span the horizontal axes and the
    labels ``y`` are drawn on the vertical axis.

    Parameter
    ---------
        X : ndarray of shape (n_samples, 2)
            Features.
        y : ndarray of shape (n_samples,)
            Labels.

    Return
    ------
        fig : matplotlib figure
            The newly created figure (4 x 4 inches).
        ax : matplotlib 3d axes
            The axes containing the scatter plot.
    """
    first_feature, second_feature = X[:, 0], X[:, 1]
    fig = plt.figure(figsize=(4, 4))
    ax = fig.add_subplot(1, 1, 1, projection='3d')
    ax.scatter(first_feature, second_feature, y)
    return fig, ax

In [87]:
# Make simple example: 100 samples, 2 features (both informative) and a
# noise level of 100; random_state=0 fixes the generated data
X, y = make_regression(n_samples=100, n_features=2, n_informative=2, noise=100,
                       random_state=0)
# Plot data (the returned figure/axes pair is not needed here)
plot_data(X, y)
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [88]:
# Split data in training (70 %) and test (30 %) set.
# The fixed random_state makes the shuffled split -- and with it every
# weight and score computed from it -- identical on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=True, random_state=0)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((70, 2), (30, 2), (70,), (30,))

### Hypothesis

The hypothesis is given by

$$
    h_{w, b}(X) = \sum_{j=1}^n w_j X_j + b = w_1 X_1 + w_2 X_2 + \dots + w_n X_n + b, 
$$

where $w = (w_1, w_2, \dots, w_n) \in \mathbb{R}^n$ are the weights, $n$ is the number of features, and $b \in \mathbb{R}$ is the bias.

We can also write

$$
    h_{w}(X) = \sum_{j=0}^n w_j X_j = w_0 + w_1 X_1 + w_2 X_2 + \dots + w_n X_n, 
$$

if we set $w_0 := b$ and $X_0 := 1$.

In [89]:
def hypo(X, w, b):
    """Compute the hypothesis and return as shape (n_samples,).

    Parameter
    ---------
        X : ndarray of shape (n_samples, n_features)
            Features.
        w : ndarray of shape (n_features,)
            Weights; multiplied element-wise with every row of X.
        b : float
            Bias.

    Return
    ------
        ndarray of shape (n_samples,)
            w_1 * X_1 + ... + w_n * X_n + b for every sample.
    """
    # Sum the weighted features per sample (axis=1), then shift by the bias
    return np.sum(w * X, axis=1) + b

### Cost function

The Mean Squared Error is given by

$$
\begin{align*}
    MSE(w,b) &= \frac{1}{m} \sum\limits_{i=1}^m (y^{(i)} - h_{w, b}(X^{(i)}))^2\\
             &= \frac{1}{m} \sum\limits_{i=1}^m \Big(y^{(i)} - \sum_{j=1}^n w_j X_j^{(i)} - b\Big)^2,
\end{align*}
$$

where $m$ is the number of samples and $n$ is the number of features.

### Gradient descent

#### Goal: 

$$
    \underset{w \in \mathbb{R}^n, ~ b \in \mathbb{R}}{\textbf{minimize}} ~ MSE(w,b)
$$

#### Idea: 

$$
\begin{align*}
    w &= w - \alpha \cdot \frac{\partial}{\partial w} MSE(w,b) \\
    b &= b - \alpha \cdot \frac{\partial}{\partial b} MSE(w,b)
\end{align*}
$$

#### Partial derivatives:

$$
\begin{align*}
    \frac{\partial}{\partial w} MSE(w,b) &= \frac{2}{m} \sum\limits_{i=1}^m X^{(i)} \big(h_{w,b}(X^{(i)}) - y^{(i)}\big) \\
    \frac{\partial}{\partial b} MSE(w,b) &= \frac{2}{m} \sum\limits_{i=1}^m \big(h_{w,b}(X^{(i)}) - y^{(i)}\big)
\end{align*}
$$

In [90]:
def gradient_descent(X, y, w, b, alpha, num_iters):
    """Simple (batch) gradient descent on the mean squared error.

    TODO: Plot cost function.

    Parameter
    ---------
        X : ndarray of shape (n_samples, n_features)
            Features.
        y : ndarray of shape (n_samples,)
            Labels.
        w : ndarray of shape (n_features,)
            Initial weights.
        b : float
            Initial bias.
        alpha : float
            Stepsize.
        num_iters : int
            Number of iterations.

    Return
    ------
        w : ndarray of shape (n_features,)
            Updated weights.
        b : float
            Updated bias.
    """
    # Iteratively update the weights and bias; every step uses all samples
    for _ in range(num_iters):

        # Compute predictions (for all samples)
        predictions = hypo(X, w, b)

        # Compute residuals (for all samples); the sign "prediction - label"
        # is what makes the subtraction below a descent step
        residuals = predictions - y

        # Compute partial derivatives: the column reshape multiplies each
        # residual with the feature row of the same sample before averaging
        w_gradient = 2 * np.mean(residuals.reshape(-1, 1) * X, axis=0)
        b_gradient = 2 * np.mean(residuals)

        # Update weights and bias
        w = w - alpha * w_gradient
        b = b - alpha * b_gradient

    return w, b

In [91]:
def linear_regression(X, y, alpha, num_iters):
    """Fit a linear model by gradient descent, starting from all zeros.

    Parameter
    ---------
        X : ndarray of shape (n_samples, n_features)
            Features.
        y : ndarray of shape (n_samples,)
            Labels.
        alpha : float
            Stepsize.
        num_iters : int
            Number of iterations.

    Return
    ------
        w : ndarray of shape (n_features,)
            Weights after the last iteration.
        b : float
            Bias after the last iteration.
    """
    # One zero weight per feature column and a zero bias as starting point
    n_features = X.shape[1]
    initial_w = np.zeros(n_features)
    initial_b = 0.0
    return gradient_descent(X, y, initial_w, initial_b, alpha, num_iters)

In [92]:
# Apply linear regression with gradient descent
# (stepsize alpha = 0.0001, 10000 iterations, weights and bias start at zero)
# NOTE(review): the resulting weights differ noticeably from the
# normal-equation solution shown later in the notebook -- presumably the
# descent has not fully converged with these settings; confirm.
w, b = linear_regression(X_train, y_train, 0.0001, 10000)
w, b

(array([ 8.86580537, 97.08947463]), -3.9879246863309175)

In [93]:
def plot_data_reg_plane(X, y, w, b, poly_degree=1):
    """Plot 3d regression problem and regression surface.

    The surface is evaluated on a 100 x 100 grid spanning the range of the
    first two columns of ``X``. The grid points are expanded with
    ``PolynomialFeatures(poly_degree, include_bias=False)`` before the
    hypothesis is applied, so ``w`` must match that expansion.

    Parameter
    ---------
        X : ndarray of shape (n_samples, >= 2)
            Features; only the first two columns are plotted.
        y : ndarray of shape (n_samples,)
            Labels.
        w : ndarray of shape (n_expanded_features,)
            Weights, one per polynomial feature of the two plotted columns
            (2 entries for ``poly_degree=1``).
        b : float
            Bias.
        poly_degree : int, default=1
            Polynomial degree.

    Return
    ------
        fig : matplotlib figure
            Figure created by ``plot_data``.
        ax : matplotlib 3d axes
            Axes holding the scatter plot and the surface.
    """
    fig, ax = plot_data(X, y)

    # Regular grid over the observed range of the two plotted features
    first, second = X[:, 0], X[:, 1]
    xx, yy = np.meshgrid(np.linspace(first.min(), first.max(), 100),
                         np.linspace(second.min(), second.max(), 100))

    # Flatten the grid to (n_points, 2) and expand it like the training data
    grid = np.column_stack([xx.ravel(), yy.ravel()])
    grid = PolynomialFeatures(poly_degree, include_bias=False).fit_transform(grid)

    # Evaluate the model and fold the predictions back into the grid shape
    zz = hypo(grid, w, b)
    ax.plot_surface(xx, yy, zz.reshape(xx.shape), color='r', alpha=.3)
    return fig, ax

In [94]:
# Plot training set and regression plane of the gradient-descent fit
# (poly_degree keeps its default of 1, so the surface is a flat plane)
plot_data_reg_plane(X_train, y_train, w, b)
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [95]:
# Plot test set and regression plane: same w, b as above, drawn against the
# samples that were held out from training
plot_data_reg_plane(X_test, y_test, w, b)
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [96]:
# Compute training and test MSE of the gradient-descent fit;
# the cell displays the pair (training MSE, test MSE)
(mean_squared_error(y_train, hypo(X_train, w, b)),
 mean_squared_error(y_test, hypo(X_test, w, b)))

(9739.291151365584, 8055.034961372016)

In [97]:
# Compute training and test R2 of the gradient-descent fit;
# the cell displays the pair (training R2, test R2)
r2_score(y_train, hypo(X_train, w, b)), r2_score(y_test, hypo(X_test, w, b))

(0.5714267338738095, 0.5551803374565513)

### Normal equation

In [98]:
def normal_equation(X, y):
    """Closed-form least-squares solution for linear regression.

    Solves the problem characterised by the normal equation
    ``theta = (X_hat^T X_hat)^{-1} X_hat^T y``, where ``X_hat`` is ``X``
    with a leading column of ones. The system is handed to a least-squares
    solver instead of inverting ``X_hat^T X_hat`` explicitly, which is
    numerically more stable and still returns a (minimum-norm) solution
    when features are collinear and the inverse does not exist.

    Parameter
    ---------
        X : ndarray of shape (n_samples, n_features)
            Features.
        y : ndarray of shape (n_samples,)
            Labels.

    Return
    ------
        theta : ndarray of shape (n_features + 1,)
            Bias in ``theta[0]`` followed by the weight(s) in ``theta[1:]``.
    """
    # Prepend a column of ones so that theta[0] plays the role of the bias
    X_hat = np.column_stack([np.ones(X.shape[0]), X])
    # rcond=None selects NumPy's machine-precision based cut-off
    theta, *_ = np.linalg.lstsq(X_hat, y, rcond=None)
    return theta

In [103]:
# Apply the normal equation
theta = normal_equation(X_train, y_train)
# theta[0] belongs to the prepended column of ones (bias), the rest are weights
w, b = theta[1:], theta[0]
w, b

(array([  9.79488969, 109.8894111 ]), -7.211176719046206)

In [104]:
# Plot training set and regression plane of the normal-equation fit
plot_data_reg_plane(X_train, y_train, w, b)
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [105]:
# Plot test set and regression plane of the normal-equation fit
plot_data_reg_plane(X_test, y_test, w, b)
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [106]:
# Compute training and test MSE of the normal-equation fit;
# the cell displays the pair (training MSE, test MSE)
(mean_squared_error(y_train, hypo(X_train, w, b)),
 mean_squared_error(y_test, hypo(X_test, w, b)))

(9556.393293716474, 8138.52818247132)

In [107]:
# Compute training and test R2 of the normal-equation fit;
# the cell displays the pair (training R2, test R2)
r2_score(y_train, hypo(X_train, w, b)), r2_score(y_test, hypo(X_test, w, b))

(0.5794750744564987, 0.5505696279298813)

## Polynomial regression

### Hypothesis

TODO: Update

The hypothesis for (univariate) *polynomial regression* is given by

$$
    h_{w, b}(X) = \sum_{i=1}^h w_i X^i + b = w_1 X + w_2 X^2 + \dots + w_h X^h + b, 
$$

where $h$ is the degree of the polynomial.

In [108]:
# Create polynomial features of degree 2 without the constant column;
# the expansion is fitted on the training set and applied to both sets
poly = PolynomialFeatures(2, include_bias=False).fit(X_train)
# NOTE(review): X_train / X_test are overwritten with their expanded
# versions, so this cell is not idempotent -- re-running it would feed the
# already expanded features into a transformer fitted on the original ones.
X_train = poly.transform(X_train)
X_test = poly.transform(X_test)

# Show some samples
X_train[:5]

array([[ 0.53924919, -0.76991607,  0.29078969, -0.41517662,  0.59277076],
       [ 0.12167502,  0.76103773,  0.01480481,  0.09259928,  0.57917842],
       [ 1.45427351,  0.14404357,  2.11491143,  0.20947875,  0.02074855],
       [-0.43782004, -1.14746865,  0.19168639,  0.50238478,  1.31668431],
       [-1.60205766,  0.62523145,  2.56658873, -1.00165683,  0.39091437]])

In [109]:
# Train linear regression on the polynomial features (normal equation)
theta = normal_equation(X_train, y_train)
# theta[0] is the bias, theta[1:] holds one weight per polynomial feature
w, b = theta[1:], theta[0]
w, b

(array([8.82544018e+00, 1.01167739e+02, 5.09069654e-03, 2.30228501e+00,
        2.43514531e+01]),
 -32.963612410449706)

In [110]:
# Plot training set and regression surface of the degree-2 fit
# (poly_degree=2 must match the expansion the weights were trained on)
plot_data_reg_plane(X_train, y_train, w, b, 2)
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [111]:
# Plot test set and regression surface of the degree-2 fit
# (the first two columns of the expanded X_test are the original features)
plot_data_reg_plane(X_test, y_test, w, b, 2)
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [112]:
# Compute training and test MSE of the polynomial fit
# (X_train / X_test hold the degree-2 features at this point)
(mean_squared_error(y_train, hypo(X_train, w, b)),
 mean_squared_error(y_test, hypo(X_test, w, b)))

(8526.753129079561, 7872.476713596308)

In [113]:
# Compute training and test R2 of the polynomial fit
# (X_train / X_test hold the degree-2 features at this point)
r2_score(y_train, hypo(X_train, w, b)), r2_score(y_test, hypo(X_test, w, b))

(0.6247839415429167, 0.5652616714991141)