# 1. Supervised learning

## 1.1. Linear Models

$$
\hat{y}(w,x) = w_0 + w_1 x_1 + \cdots + w_p x_p
$$

### 1.1.1. Ordinary Least Squares

$$
\min_{w} \|Xw - y\|_2^2
$$

In [1]:
from sklearn import linear_model

# Ordinary least squares on three samples whose two features are identical.
reg = linear_model.LinearRegression()
reg.fit(
    X=[[0, 0], [1, 1], [2, 2]],
    y=[0, 1, 2],
)

LinearRegression()

In [2]:
# Coefficients learned by the LinearRegression model fitted above.
reg.coef_

array([0.5, 0.5])

### 1.1.2. Ridge regression and classification

$$
    \min_{w} \| Xw - y \|_2^2 + \alpha \| w \|_2^2
$$

In [3]:
from sklearn import linear_model

# Ridge: least squares with an L2 penalty on the weights, scaled by alpha.
reg = linear_model.Ridge(alpha=0.5)
reg.fit(
    X=[[0, 0], [0, 0], [1, 1]],
    y=[0, 0.1, 1],
)


Ridge(alpha=0.5)

In [4]:
# Coefficients of the fitted Ridge model.
reg.coef_

array([0.34545455, 0.34545455])

In [5]:
# Intercept (bias term) of the fitted Ridge model.
reg.intercept_

0.1363636363636364

In [6]:
import numpy as np
from sklearn import linear_model

# Candidate penalties: 13 log-spaced values from 1e-6 up to 1e6.
reg = linear_model.RidgeCV(alphas=np.logspace(start=-6, stop=6, num=13))
reg.fit(
    [[0, 0], [0, 0], [1, 1]],
    [0, 0.1, 1],
)


RidgeCV(alphas=array([1.e-06, 1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01,
       1.e+02, 1.e+03, 1.e+04, 1.e+05, 1.e+06]))

In [7]:
# Regularization strength selected by RidgeCV from the candidate alphas.
reg.alpha_

0.01

### 1.1.3. Lasso

$$
    \min_w \frac{1}{2 n_{\text{samples}}} \| Xw - y \|_2^2 + \alpha \| w \|_1
$$

In [8]:
from sklearn import linear_model

# Lasso: least squares with an L1 penalty on the weights, scaled by alpha.
reg = linear_model.Lasso(alpha=0.1)
reg.fit(
    [[0, 0], [1, 1]],
    [0, 1],
)


Lasso(alpha=0.1)

In [9]:
# Predict the target for the single sample [1, 1] with the fitted Lasso model.
reg.predict([[1,1]])

array([0.8])

### 1.1.4. Multi-task Lasso

$$
    \min_{W} \frac{1}{2n_{\text{samples}}} \| XW-Y \|_{\operatorname{Frob}}^{2} + \alpha \| W \|_{21} \\
    \| A \|_{\operatorname{Frob}} = \sqrt{\sum_{ij} a_{ij}^2} \\
    \| A \|_{21} = \sum_{i} \sqrt{\sum_{j} a_{ij}^2 }
$$

### 1.1.5. ElasticNet

$$
    \min_{w} \frac{1}{2n_{\text{samples}}} \| Xw-y \|_{2}^{2} + \alpha \rho \| w \|_{1} + \frac{\alpha(1 - \rho)}{2} \| w \|_2^2
$$

### 1.1.6. Multi-task ElasticNet

$$
        \min_{W} \frac{1}{2n_{\text{samples}}} \| XW-Y \|_{\operatorname{Frob}}^{2} + \alpha \rho \| W \|_{21} + \frac{\alpha(1 - \rho)}{2} \| W \|_{\operatorname{Frob}}^{2}
$$

### 1.1.7. Least Angle Regression

### 1.1.8. LARS Lasso

In [10]:
from sklearn import linear_model

# Lasso solved with the LARS algorithm, on the same toy data as the Lasso cell.
reg = linear_model.LassoLars(alpha=0.1)
reg.fit(
    [[0, 0], [1, 1]],
    [0, 1],
)


LassoLars(alpha=0.1)

In [11]:
# Coefficients of the fitted LassoLars model.
reg.coef_

array([0.71715729, 0.        ])

### 1.1.9. Orthogonal Matching Pursuit (OMP)

$$
    \underset{w}{\operatorname{arg min}} \| y - Xw \|_2^2 \quad \text{subject to} \quad \|w \|_0 \leq n_{\text{nonzero\_coefs}} \\
    \underset{w}{\operatorname{arg min}} \| w \|_0 \quad \text{subject to} \quad \|y - Xw \|_2^2 \leq \text{tol} \\
$$

### 1.1.10. Bayesian Regression

$$
    p(y | X,w,\alpha) = \mathcal{N}(y | Xw, \alpha^{-1}) \\
    p(w|\lambda) = \mathcal{N}(w | 0, \lambda^{-1} \mathbf{I}_p)
$$

In [12]:
from sklearn import linear_model

# Four samples on the line y = x1 = x2, stored as floats.
X = [[float(i), float(i)] for i in range(4)]
Y = [float(i) for i in range(4)]

reg = linear_model.BayesianRidge()
reg.fit(X, Y)

BayesianRidge()

In [13]:
# Predict for one new sample, [1, 0], with the fitted BayesianRidge model.
reg.predict([[1, 0.]])

array([0.50000013])

In [14]:
# Coefficients of the fitted BayesianRidge model.
reg.coef_

array([0.49999993, 0.49999993])

$$
    p(w | \lambda) = \mathcal{N}(w|0,A^{-1})
$$

### 1.1.11. Logistic regression

$$
    \min_{w,c} \frac{1}{2} w^{\top}w + C \sum_{i=1}^{n} \log(\exp(-y_i (X_i^{\top}w + c)) + 1) \\
    \min_{w,c} \|w \|_1 + C \sum_{i=1}^{n} \log(\exp(-y_i (X_i^{\top}w + c)) + 1) \\
    \min_{w,c} \frac{1 - \rho}{2} w^{\top}w + \rho\|w \|_1 + C \sum_{i=1}^{n} \log(\exp(-y_i (X_i^{\top}w + c)) + 1)
$$

### 1.1.12. Generalized Linear Regression

$$
    \hat{y}(w,X)=h(Xw) \\
    \min_w \frac{1}{2n_{\text{samples}}} \sum_{i} d(y_i, \hat{y}_i) + \frac{\alpha}{2} \|w \|_2^2
$$

In [15]:
from sklearn.linear_model import TweedieRegressor

# power=1 selects the Poisson member of the Tweedie family; alpha is the L2 penalty strength.
reg = TweedieRegressor(
    power=1,
    alpha=0.5,
    link='log',
)
reg.fit(
    [[0, 0], [0, 1], [2, 2]],
    [0, 1, 2],
)


TweedieRegressor(alpha=0.5, link='log', power=1)

In [16]:
# Coefficients of the fitted TweedieRegressor.
reg.coef_

array([0.24631611, 0.43370317])

In [17]:
# Intercept of the fitted TweedieRegressor.
reg.intercept_

-0.7638091359123443

### 1.1.13. Stochastic Gradient Descent - SGD

### 1.1.14. Perceptron

### 1.1.15. Passive Aggressive Algorithms

### 1.1.16. Robustness regression: outliers and modelling errors

$$
    \min_{w,\sigma} \sum_{i=1}^n \left( \sigma + H_{\epsilon} \left( \frac{X_i w - y_i}{\sigma} \right) \sigma \right) + \alpha \| w \|_2^2 \\
    H_{\epsilon}(z) = \begin{cases} z^2 & \text{if } |z| < \epsilon, \\ 2\epsilon |z| - \epsilon^2 & \text{otherwise} \end{cases}
$$

### 1.1.17. Polynomial regression: extending linear models with basis functions

$$
    \hat{y}(w, x) = w_0 + w_1 x_1 + w_2 x_2 \\
    \hat{y}(w, x) = w_0 + w_1 x_1 + w_2 x_2 + w_3 x_1 x_2 + w_4 x_1^2 + w_5 x_2^2 \\
    z = [x_1, x_2, x_1 x_2, x_1^2, x_2^2] \\
    \hat{y}(w, x) = w_0 + w_1 z_1 + w_2 z_2 + w_3 z_3 + w_4 z_4 + w_5 z_5 \\
$$

In [18]:
from sklearn.preprocessing import PolynomialFeatures
import numpy as np

# 3 samples x 2 features holding the integers 0..5 in row-major order.
X = np.reshape(np.arange(6), (3, 2))
X

array([[0, 1],
       [2, 3],
       [4, 5]])

In [19]:
# Degree-2 expansion: each row [a, b] becomes [1, a, b, a^2, a*b, b^2].
poly = PolynomialFeatures(degree=2)
poly.fit_transform(X)

array([[ 1.,  0.,  1.,  0.,  0.,  1.],
       [ 1.,  2.,  3.,  4.,  6.,  9.],
       [ 1.,  4.,  5., 16., 20., 25.]])

In [20]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
import numpy as np

# Cubic feature expansion followed by a linear fit. The intercept is disabled
# because the polynomial features already include the constant column.
model = Pipeline(
    [
        ('poly', PolynomialFeatures(degree=3)),
        ('linear', LinearRegression(fit_intercept=False)),
    ]
)

# Noise-free targets generated from 3 - 2x + x^2 - x^3.
x = np.arange(5)
y = 3 - 2 * x + x ** 2 - x ** 3

# The estimator expects a 2-D input, so turn x into a single-column matrix.
model = model.fit(x.reshape(-1, 1), y)
model.named_steps['linear'].coef_

array([ 3., -2.,  1., -1.])

In [21]:
from sklearn.linear_model import Perceptron
from sklearn.preprocessing import PolynomialFeatures
import numpy as np

# All four combinations of two binary inputs; the target is their XOR.
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = np.bitwise_xor(X[:, 0], X[:, 1])
y

array([0, 1, 1, 0])

In [22]:
# Add the bias column and the x1*x2 interaction term to the inputs.
# NOTE: this overwrites X in place, so re-running this cell alone expands the
# already-expanded matrix again; re-run the cell that defines X first.
X = (
    PolynomialFeatures(interaction_only=True)
    .fit_transform(X)
    .astype(int)
)
X

array([[1, 0, 0, 0],
       [1, 0, 1, 0],
       [1, 1, 0, 0],
       [1, 1, 1, 1]])

In [23]:
# Linear classifier trained on the expanded features. No intercept is fitted
# because X already carries a constant column; shuffling is off and the run
# uses a fixed 10 iterations with no tolerance-based early stop.
clf = Perceptron(
    fit_intercept=False,
    max_iter=10,
    tol=None,
    shuffle=False,
).fit(X, y)

In [24]:
# Predictions on the training inputs; compare with the XOR targets in y.
clf.predict(X)

array([0, 1, 1, 0])

In [25]:
# Mean accuracy of the classifier on the training data.
clf.score(X, y)

1.0