In [1]:
import numpy as np

In [2]:
# Number of synthetic bread samples, i.e. rows in every generated array.
num_bread = 1000

In [3]:
# Making imaginary bread.

# Seeded generator: without it every "Restart & Run All" draws different
# data, so fitted coefficients and metrics would change from run to run.
rng = np.random.default_rng(42)

# Weights: 50 grams, with a standard deviation of 3 grams.
weights = 50 + 3*rng.standard_normal((num_bread, 1))

# Temperature: 22 degrees, with a standard deviation of 5 degrees.
temps = 22 + 5*rng.standard_normal((num_bread, 1))

# Volume in ml: 3ml per gram of weight, plus 4ml per degree above 22,
# plus some noise (standard deviation of 5 ml).
volume = 3*weights + 4*(temps - 22) + 5*rng.standard_normal((num_bread, 1))

In [4]:
# Feature matrix: one column per feature (weight, temperature).
X = np.c_[weights, temps]
# Target: the bread volume.
y = volume

In [5]:
# Sanity check: expect (num_bread, 2) for the features and (num_bread, 1) for the target.
X.shape, y.shape

((1000, 2), (1000, 1))

In [6]:
# Splitting the data into training and test sets.
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical between runs.
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

In [7]:
from sklearn.linear_model import LinearRegression

# Linear regression model; the fitted intercept_ and coef_ are inspected in the next cell.
model = LinearRegression()

# Here is where the model learns its optimal parameters.
model.fit(X_train, y_train)

In [8]:
# Learned parameters: intercept (theta_0) and one coefficient per feature
# (weight, temperature). The data was generated with slopes 3 and 4.
model.intercept_, model.coef_

(array([-88.19874467]), array([[3.02088605, 3.96619432]]))

The final model is:

$$
\hat{y} = \overset{\theta_0}{-88.20} + \overset{\theta_1}{3.02} x_1 + \overset{\theta_2}{3.97} x_2
$$

In [9]:
# Predict volumes for the held-out test rows.
y_pred = model.predict(X_test)

In [10]:
from sklearn.metrics import root_mean_squared_error

# Test-set RMSE, in the same units as the target (ml). It should be close to
# the standard deviation of the noise injected into `volume` (5 ml).
root_mean_squared_error(y_test, y_pred)

np.float64(4.792218551955046)

Equivalently, I could use the *normal equation* for fitting: 

In [11]:
# Prepend a column of ones so the first entry of theta acts as the intercept.
X_train_augmented = np.c_[np.ones((X_train.shape[0], 1)), X_train]

In [12]:
# Normal equation: theta = (A^T A)^{-1} A^T y, with A the augmented matrix.
theta_opt = (
    np.linalg.inv(X_train_augmented.T @ X_train_augmented)
    @ X_train_augmented.T
    @ y_train
)

In [13]:
# Rows are [intercept, weight coefficient, temperature coefficient].
theta_opt

array([[-88.19874467],
       [  3.02088605],
       [  3.96619432]])

In [14]:
# Shown again for a side-by-side comparison with theta_opt.
model.intercept_, model.coef_

(array([-88.19874467]), array([[3.02088605, 3.96619432]]))

Suppose the baker made a mistake and duplicated a data column:

In [15]:
# The weight column appears twice, so two columns of X are identical
# (perfectly collinear).
X = np.c_[weights, weights, temps]
y = volume

In [16]:
# First five rows: columns 0 and 1 are identical.
X[:5,:]

array([[48.81505624, 48.81505624, 23.91401116],
       [47.83720967, 47.83720967, 24.52005226],
       [48.51814048, 48.51814048, 22.19029558],
       [54.51997014, 54.51997014, 25.21261495],
       [47.71147745, 47.71147745, 22.74758811]])

In [17]:
# Same 80/20 split settings as before, now applied to the matrix with the
# duplicated column.
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

In [18]:
# Refit on the data containing the duplicated weight column.
model = LinearRegression()

model.fit(X_train, y_train)

In [19]:
# With two identical columns only the SUM of their coefficients is determined,
# so the individual values can be huge and of opposite sign while cancelling out.
model.intercept_, model.coef_

(array([-88.2777002]),
 array([[ 1.97145025e+12, -1.97145025e+12,  3.96612549e+00]]))

In [20]:
# Augmented matrix (column of ones first) for the duplicated-column data.
X_train_augmented = np.c_[np.ones((X_train.shape[0], 1)), X_train]

In [21]:
# The duplicated column makes A^T A singular, so the explicit inverse used by
# the normal equation does not exist. The failure is the point of this cell:
# catch it and print it so "Restart & Run All" can continue past this cell.
try:
    theta_singular = (
        np.linalg.inv(X_train_augmented.T @ X_train_augmented)
        @ X_train_augmented.T
        @ y_train
    )
except np.linalg.LinAlgError as err:
    print(f"LinAlgError: {err}")
else:
    # Only reached if rounding hides the singularity; show the (unreliable) result.
    print(theta_singular)

LinAlgError: Singular matrix

In [22]:
# Gram matrix A^T A of the augmented training data (the matrix the normal
# equation tries to invert).
M = X_train_augmented.T @ X_train_augmented

In [23]:
# A zero determinant means M has no inverse.
np.linalg.det(M)

np.float64(0.0)

In [24]:
# Singular value decomposition of the augmented matrix: A = U @ diag(S) @ Vt.
# With NumPy's default settings U is square (n_rows x n_rows).
U, S, Vt = np.linalg.svd(X_train_augmented)

In [25]:
# (training rows, 1 + number of feature columns)
X_train_augmented.shape

(800, 4)

In [26]:
# Shapes of the SVD factors; S holds one singular value per column of the matrix.
U.shape, S.shape, Vt.shape

((800, 800), (4,), (4, 4))

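Another example: a feature that is an affine function of another one (the same temperature expressed on a second scale). Together with the intercept column of ones, this again makes the columns of the design matrix linearly dependent.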

In [27]:
# Same temperatures on a second scale (the formula is the Celsius -> Fahrenheit
# conversion). temps_F is an affine function of temps, so together with an
# intercept column it carries no new information.
temps_F = 1.8*temps + 32

In [28]:
# Features: weight plus both temperature columns. `y` is not reassigned here
# and is still `volume` from the earlier cell.
X = np.c_[weights, temps, temps_F]

In [29]:
# Same 80/20 split settings, now applied to the matrix with both temperature columns.
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

In [30]:
# Augmented matrix: [1, weight, temps, temps_F]. The last column equals
# 32 * (column of ones) + 1.8 * temps, so the columns are linearly dependent.
X_train_augmented = np.c_[np.ones((X_train.shape[0], 1)), X_train]

In [31]:
# First five rows of the augmented training matrix.
X_train_augmented[:5,:]

array([[ 1.        , 51.76487205, 21.04527698, 69.88149856],
       [ 1.        , 54.73684571, 19.54116158, 67.17409085],
       [ 1.        , 54.46645565, 18.8973496 , 66.01522928],
       [ 1.        , 53.40249575, 31.8879145 , 89.39824609],
       [ 1.        , 49.12533358, 31.59603989, 88.87287181]])

In [32]:
# Small ridge term added to the diagonal of A^T A so that the matrix becomes
# invertible despite the linearly dependent columns.
alpha = 1e-3
M = (
    X_train_augmented.T @ X_train_augmented
    + alpha * np.eye(X_train_augmented.shape[1])
)

In [33]:
# Check that the regularized matrix can be inverted: inv(M) @ M should be the
# identity (rounded to 2 decimals to hide floating-point noise).
(np.linalg.inv(M) @ M).round(2)

array([[ 1.,  0.,  0.,  0.],
       [-0.,  1., -0., -0.],
       [ 0., -0.,  1., -0.],
       [-0., -0., -0.,  1.]])

In [34]:
# SVD of the augmented matrix with both temperature columns.
# NOTE(review): U, S and Vt are not used by any later cell visible here;
# consider displaying S (a near-zero singular value reveals the collinearity)
# or removing this cell.
U, S, Vt = np.linalg.svd(X_train_augmented)

In [35]:
# Ridge-regularized normal equation: theta = (A^T A + alpha*I)^{-1} A^T y.
# The identity covers every column of the augmented matrix, so the intercept
# (column of ones) is penalized together with the feature coefficients.
theta = (
    np.linalg.inv(
        X_train_augmented.T @ X_train_augmented
        + alpha * np.eye(X_train_augmented.shape[1])
    )
    @ X_train_augmented.T
    @ y_train
)

In [36]:
# Rows are [intercept, weight, temps, temps_F]. Only the weight coefficient is
# pinned down by the data; the others depend on how the collinearity is resolved.
theta

array([[-0.58586906],
       [ 3.02088126],
       [ 8.89440196],
       [-2.7378939 ]])

In [37]:
# Unregularized least squares on the data with both temperature columns.
model = LinearRegression()

model.fit(X_train, y_train)

In [38]:
# Without regularization the intercept and the two temperature coefficients can
# take enormous values that cancel each other; the weight coefficient is unaffected.
model.intercept_, model.coef_

(array([-7.19918514e+11]),
 array([[ 3.02088605e+00, -4.04954164e+10,  2.24974535e+10]]))

In [39]:
from sklearn.linear_model import Ridge

# Ridge regression with the same alpha as the manual computation of `theta`.
# NOTE(review): scikit-learn's Ridge does not penalize the intercept, whereas
# the manual version adds alpha to every diagonal entry (intercept included),
# so the two solutions are not expected to coincide. Confirm against the docs.
model = Ridge(alpha=1e-3)

model.fit(X_train, y_train)

model.intercept_, model.coef_

(array([-142.0790966]), array([[3.02088561, 0.93542313, 1.68376173]]))