In [1]:
%matplotlib widget

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import math
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Multivariate Regression

We have a given dataset 

$$
    (X, y) \in \mathbb{R}^{m \times (n+1)},
$$

where $m$ is the number of samples and $n$ is the number of features.

In the dataset we have the features

$$
    X \in \mathbb{R}^{m \times n}
$$

and the targets 

$$
    y \in \mathbb{R}^m.
$$

We say that 

$$
    (X^{(i)}, y^{(i)}) \in \mathbb{R}^{1 \times (n+1)}
$$

is the $i$-th example ($i = 1, \dots, m$) and 

$$
    X_j \in \mathbb{R}^m
$$ 

is the $j$-th feature vector ($j = 1, \dots, n$), such that 

$$
    X^{(i)}_j \in \mathbb{R}  
$$

is the $j$-th feature of the $i$-th example.

## Linear Regression

In [2]:
def plot_reg_3d(X, y, theta=None, poly_degree=1):
    """Scatter a two-feature regression problem in 3d, optionally with the fit.

    Parameter
    ---------
        X : ndarray of shape (n_samples, n_columns) with n_columns >= 2
            Features. Only the first two columns are used as the x- and
            y-axis of the plot.
        y : ndarray of shape (n_samples,)
            Labels.
        theta : ndarray of shape (n_poly_features + 1,), default=None
            Bias ``theta[0]`` followed by one weight per polynomial feature
            generated from the two plotted columns (three entries in total
            for ``poly_degree=1``). If None no regression surface is plotted.
        poly_degree : int, default=1
            Degree of the polynomial features the weights in `theta` refer to.
    """
    fig = plt.figure(figsize=(4, 4))
    ax = fig.add_subplot(projection='3d')
    ax.scatter(X[:, 0], X[:, 1], y)
    ax.set_xlabel('$X_1$')
    ax.set_ylabel('$X_2$')
    ax.set_zlabel('$y$')
    if theta is not None:
        theta = np.asarray(theta)
        # Regular 100 x 100 grid spanning the range of the two plotted features
        xx = np.linspace(X[:, 0].min(), X[:, 0].max(), 100)
        yy = np.linspace(X[:, 1].min(), X[:, 1].max(), 100)
        xx, yy = np.meshgrid(xx, yy)
        XX = np.column_stack([xx.ravel(), yy.ravel()])
        # Expand the grid the same way the training data was expanded
        poly = PolynomialFeatures(poly_degree, include_bias=False)
        features = poly.fit_transform(XX)
        # Weighted sum of the features plus bias, evaluated inline so the
        # function does not rely on a helper defined in a later cell
        Z = np.sum(theta[1:] * features, axis=1) + theta[0]
        ax.plot_surface(xx, yy, Z.reshape(xx.shape), color='r', alpha=.3)
    plt.show()

In [3]:
# Generate a small, noisy synthetic regression problem with two features,
# both of which carry information about the target
X, y = make_regression(
    n_samples=100,
    n_features=2,
    n_informative=2,
    noise=100,
    random_state=0,
)
# Scatter the raw samples (no regression plane yet)
plot_reg_3d(X, y)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [4]:
# Split data in training and test set.
# The split is shuffled, so fix the seed: otherwise every run produces a
# different partition and the metrics below cannot be reproduced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=True, random_state=0)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((70, 2), (30, 2), (70,), (30,))

### Hypothesis

The hypothesis is given by

$$
    h_{w, b}(X) = \sum_{j=1}^n w_j X_j + b = w_1 X_1 + w_2 X_2 + \dots + w_n X_n + b, 
$$

where $w_1, w_2, \dots, w_n \in \mathbb{R}$ are the weights (one per feature, with $n$ the number of features) and $b \in \mathbb{R}$ is the bias.

We can also write

$$
    h_{w}(X) = \sum_{j=0}^n w_j X_j = w_0 + w_1 X_1 + w_2 X_2 + \dots + w_n X_n, 
$$

if we set $w_0 := b$ and $X_0 := 1$.

In [5]:
def hypo(X, w, b):
    """Evaluate the linear hypothesis for every sample.

    Parameter
    ---------
        X : ndarray of shape (n_samples, n_features)
            Features.
        w : ndarray of shape (n_features,)
            Weights.
        b : float
            Bias.

    Return
    ------
        predictions : ndarray of shape (n_samples,)
            Weighted sum of the features of each sample plus the bias.
    """
    # w broadcasts across the rows of X; summing over axis 1 gives one value
    # per sample
    return np.sum(w * X, axis=1) + b

### Cost function

The Mean Squared Error is given by

$$
\begin{align*}
    MSE(w,b) &= \frac{1}{m} \sum\limits_{i=1}^m (y^{(i)} - h_{w, b}(X^{(i)}))^2\\
             &= \frac{1}{m} \sum\limits_{i=1}^m \Big(y^{(i)} - \Big(\sum_{j=1}^n w_j X_j^{(i)} + b\Big)\Big)^2,
\end{align*}
$$

where $m$ is the number of samples and $n$ is the number of features.

### Gradient descent

#### Goal: 

$$
    \underset{w \in \mathbb{R}^n, ~ b \in \mathbb{R}}{\textbf{minimize}} MSE(w,b)
$$

#### Idea: 

$$
\begin{align*}
    w &= w - \alpha \cdot \frac{\partial}{\partial w} MSE(w,b) \\
    b &= b - \alpha \cdot \frac{\partial}{\partial b} MSE(w,b)
\end{align*}
$$

#### Partial derivatives:

$$
\begin{align*}
    \frac{\partial}{\partial w} MSE(w,b) &= \frac{2}{m} \sum\limits_{i=1}^m \big(h_{w,b}(X^{(i)}) - y^{(i)}\big) \, X^{(i)} \\
    \frac{\partial}{\partial b} MSE(w,b) &= \frac{2}{m} \sum\limits_{i=1}^m \big(h_{w,b}(X^{(i)}) - y^{(i)}\big)
\end{align*}
$$

In [6]:
def gradient_descent(X, y, w, b, alpha, num_iters):
    """Batch gradient descent on the mean squared error of the linear model.

    TODO: Plot cost function.

    Parameter
    ---------
        X : ndarray of shape (n_samples, n_features)
            Features.
        y : ndarray of shape (n_samples,)
            Labels.
        w : ndarray of shape (n_features,)
            Initial weights.
        b : float
            Initial bias.
        alpha : float
            Stepsize.
        num_iters : int
            Number of iterations.

    Return
    ------
        w : ndarray of shape (n_features,)
            Updated weights.
        b : float
            Updated bias.
    """
    # Iteratively update the weights and the bias
    for _ in range(num_iters):

        # Compute predictions (for all samples)
        predictions = hypo(X, w, b)

        # Compute residuals (for all samples)
        residuals = predictions - y

        # Compute partial derivatives; the residual column broadcasts across
        # the feature columns, the mean over axis 0 averages over the samples
        w_gradient = 2 * np.mean(residuals.reshape(-1, 1) * X, axis=0)
        b_gradient = 2 * np.mean(residuals)

        # Step against the gradient
        w = w - alpha * w_gradient
        b = b - alpha * b_gradient

    return w, b

In [7]:
def linear_regression(X, y, alpha, num_iters):
    """Fit a linear model by gradient descent, starting from all-zero parameters.

    Parameter
    ---------
        X : ndarray of shape (n_samples, n_features)
            Features.
        y : ndarray of shape (n_samples,)
            Labels.
        alpha : float
            Stepsize.
        num_iters : int
            Number of iterations.

    Return
    ------
        w : ndarray of shape (n_features,)
            Fitted weights.
        b : float
            Fitted bias.
    """
    n_features = X.shape[1]
    initial_weights = np.zeros(n_features)
    initial_bias = 0.0
    return gradient_descent(X, y, initial_weights, initial_bias, alpha,
                            num_iters)

In [8]:
# Fit the linear model with gradient descent
# NOTE(review): the recorded result is still noticeably off from the
# normal-equation solution computed further down, so this stepsize /
# iteration count has presumably not converged yet - confirm and tune.
w, b = linear_regression(X_train, y_train, alpha=0.0001, num_iters=10000)
# Stack the parameters as [b, w_1, ..., w_n], the layout plot_reg_3d expects
theta = np.concatenate(([b], w))
w, b

(array([13.89911192, 96.48599232]), 2.14099390223388)

In [9]:
# Training samples together with the plane fitted by gradient descent
plot_reg_3d(X_train, y_train, theta=theta)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [10]:
# Held-out test samples together with the plane fitted by gradient descent
plot_reg_3d(X_test, y_test, theta=theta)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [11]:
# Mean squared error of the gradient-descent fit on (train, test)
tuple(
    mean_squared_error(targets, hypo(features, w, b))
    for features, targets in ((X_train, y_train), (X_test, y_test))
)

(6992.743228798179, 14196.71422917607)

In [12]:
# Coefficient of determination (R2) of the gradient-descent fit on (train, test)
(
    r2_score(y_true=y_train, y_pred=hypo(X_train, w, b)),
    r2_score(y_true=y_test, y_pred=hypo(X_test, w, b)),
)

(0.6524664292520834, 0.3939626456684733)

### Normal equation

In [13]:
def normal_equation(X, y):
    """Closed-form least-squares solution for linear regression.

    Solves the same problem as the normal equation
    ``theta = (X^T X)^{-1} X^T y`` on the bias-augmented design matrix, but
    with a least-squares solver instead of an explicit matrix inverse. This is
    numerically more stable and still returns a (minimum-norm) solution when
    features are collinear and ``X^T X`` is singular.

    Parameter
    ---------
        X : ndarray of shape (n_samples, n_features)
            Features.
        y : ndarray of shape (n_samples,)
            Labels.

    Return
    ------
        theta : ndarray of shape (n_features + 1,)
            Bias ``theta[0]`` followed by the weights ``theta[1:]``.
    """
    # Prepend a column of ones so the bias is learned as theta[0]
    X_hat = np.column_stack([np.ones(X.shape[0]), X])
    # lstsq also returns residuals, rank and singular values; only the
    # solution vector is needed here
    theta, *_ = np.linalg.lstsq(X_hat, y, rcond=None)
    return theta

In [14]:
# Solve for the parameters in closed form, then split them into bias / weights
theta = normal_equation(X_train, y_train)
b, w = theta[0], theta[1:]
w, b

(array([ 18.03914973, 108.23897766]), -0.30764114094373207)

In [15]:
# Training samples together with the closed-form regression plane
plot_reg_3d(X_train, y_train, theta=theta)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [16]:
# Held-out test samples together with the closed-form regression plane
plot_reg_3d(X_test, y_test, theta=theta)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [17]:
# Mean squared error of the closed-form fit on (train, test)
tuple(
    mean_squared_error(targets, hypo(features, w, b))
    for features, targets in ((X_train, y_train), (X_test, y_test))
)

(6824.376355546086, 13924.170927444296)

In [18]:
# Coefficient of determination (R2) of the closed-form fit on (train, test)
(
    r2_score(y_true=y_train, y_pred=hypo(X_train, w, b)),
    r2_score(y_true=y_test, y_pred=hypo(X_test, w, b)),
)

(0.6608341239810972, 0.4055971280462938)

## Polynomial regression

### Hypothesis

TODO: Update

The hypothesis for (univariate) *polynomial regression* is given by

$$
    h_{w, b}(X) = \sum_{i=1}^h w_i X^i + b =  w_1 X + w_2 X^2 + \dots + w_h X^h + b, 
$$

where $h$ is the degree of the polynomial.

In [19]:
# Create polynomial features.
# X_train / X_test are overwritten below, so first slice back to the two
# original columns: PolynomialFeatures keeps the degree-1 terms as the leading
# columns, which makes this cell safe to re-run without expanding data that
# has already been expanded.
X_train_raw, X_test_raw = X_train[:, :2], X_test[:, :2]
poly = PolynomialFeatures(2, include_bias=False).fit(X_train_raw)
X_train = poly.transform(X_train_raw)
X_test = poly.transform(X_test_raw)

# Show some samples
X_train[:5]

array([[-6.51025593e-01,  8.56830612e-01,  4.23834323e-01,
        -5.57818657e-01,  7.34158697e-01],
       [-6.89549778e-01, -8.03409664e-01,  4.75478896e-01,
         5.53990955e-01,  6.45467088e-01],
       [ 4.00157208e-01,  1.76405235e+00,  1.60125791e-01,
         7.05898262e-01,  3.11188068e+00],
       [-1.79924836e-01,  1.17877957e+00,  3.23729465e-02,
        -2.12091721e-01,  1.38952128e+00],
       [-1.87183850e-01,  4.57585173e-02,  3.50377937e-02,
        -8.56525544e-03,  2.09384191e-03]])

In [20]:
# Fit the closed-form solution on the polynomial features and split the
# result into bias / weights
theta = normal_equation(X_train, y_train)
b, w = theta[0], theta[1:]
w, b

(array([ 15.11875246, 100.88800819,  -5.96802409,  -1.86176959,
         18.95112233]),
 -14.347459224554276)

In [21]:
# Plot training set and regression surface
# (plot_reg_3d already calls plt.show(), so no extra call is needed here)
plot_reg_3d(X_train, y_train, theta, 2)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [22]:
# Plot test set and regression surface
plot_reg_3d(X_test, y_test, theta, 2)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [23]:
# Mean squared error of the polynomial fit on (train, test)
tuple(
    mean_squared_error(targets, hypo(features, w, b))
    for features, targets in ((X_train, y_train), (X_test, y_test))
)

(6111.20370302288, 13365.99308990835)

In [24]:
# Coefficient of determination (R2) of the polynomial fit on (train, test)
(
    r2_score(y_true=y_train, y_pred=hypo(X_train, w, b)),
    r2_score(y_true=y_test, y_pred=hypo(X_test, w, b)),
)

(0.6962782165756068, 0.4294249387950356)