# Linear Regression Code

In [None]:
import numpy as np
import time

TODO: Make code nicer

In [None]:
class LinearRegression:
    """
    Simple linear regression implementation from scratch.

    The model is ``y_pred = X @ weights + bias``. It can be trained either
    iteratively with batch gradient descent (``fit``) or in closed form with
    a least-squares solve (``fast_fit``).
    """
    def __init__(self, lr=0.01, num_iters=1000):
        """
        Initialize the linear regression model.

        Parameters
        ----------
        lr : float
            Learning rate (step size) for gradient descent.
        num_iters : int
            Number of iterations for gradient descent.
        """
        self.lr = lr
        self.num_iters = num_iters
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        """
        Fit the linear regression model to the input data using gradient descent.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data.
        y : array-like, shape (n_samples,)
            Target values.
        """
        # Accept lists / integer arrays as promised by "array-like".
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)
        n_samples, n_features = X.shape

        # initialize weights and bias
        self.weights = np.zeros(n_features)
        self.bias = 0.0

        # gradient descent
        for _ in range(self.num_iters):
            # residuals of the current model; shared by both gradients
            residuals = np.dot(X, self.weights) + self.bias - y

            # gradients of the (halved) mean squared error
            dw = (1 / n_samples) * np.dot(X.T, residuals)
            db = (1 / n_samples) * np.sum(residuals)

            # step against the gradient
            self.weights -= self.lr * dw
            self.bias -= self.lr * db

    def fast_fit(self, X, y):
        """
        Fit the linear regression model to the input data using linear algebra.

        Weights and bias are estimated jointly by a least-squares solve on the
        design matrix ``[X, 1]``. ``np.linalg.lstsq`` is used instead of
        explicitly inverting ``X.T @ X`` so that rank-deficient data (e.g.
        collinear features) yields the minimum-norm solution rather than
        numerical garbage or a ``LinAlgError``.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data.
        y : array-like, shape (n_samples,)
            Target values.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)

        # Append a column of ones so the intercept is part of the solve;
        # fitting weights without it and patching the bias in afterwards
        # does not give the least-squares solution.
        design = np.column_stack([X, np.ones(X.shape[0])])
        solution, *_ = np.linalg.lstsq(design, y, rcond=None)

        # Last coefficient belongs to the ones column, i.e. the intercept.
        self.weights = solution[:-1]
        self.bias = solution[-1]

    def predict(self, X):
        """
        Predict target values for input data.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Input data.

        Returns
        -------
        y_pred : array-like, shape (n_samples,)
            Predicted target values.
        """
        return np.dot(X, self.weights) + self.bias

In [None]:
# Toy dataset: three samples with three features each.
# X holds the integers 1..9 laid out row by row:
#   [[1, 2, 3],
#    [4, 5, 6],
#    [7, 8, 9]]
X = np.arange(1, 10).reshape(3, 3)
# One target value per sample (row of X).
y = np.asarray([3, 7, 11])

In [None]:
# create an instance of the linear regression model


In [None]:
# Time how long training takes and show the learned parameters.
# NOTE(review): `lr` is not defined anywhere in the visible source; it is
# presumably the model instance created in the preceding cell — confirm.
start = time.time()
# train the model on the data
# NOTE(review): no training call is present here, so as written the timer
# measures nothing — the fit call presumably belongs on the next line.

# elapsed wall-clock time since `start`, in seconds
print(f"{time.time() - start} seconds")
# learned coefficients and intercept of the model
print(lr.weights, lr.bias)

0.23621892929077148 seconds
[0.33333333 0.44444444 0.55555556] 0.11111111111111449


In [None]:
# Create another model with a learning rate of 0.01

In [None]:
# Time how long training the second model takes and show its parameters.
# NOTE(review): `lr2` is not defined anywhere in the visible source; it is
# presumably the second model instance created in the preceding cell — confirm.
start = time.time()
# train the model on the data
# NOTE(review): no training call is present here, so as written the timer
# measures nothing — the fit call presumably belongs on the next line.

# elapsed wall-clock time since `start`, in seconds
print(f"{time.time() - start} seconds")
# learned coefficients and intercept of the second model
print(lr2.weights, lr2.bias)

0.0010671615600585938 seconds
[-0.671875  0.        1.265625] 2.09375


In [None]:
# Unseen inputs to predict on: two samples continuing the 1..9 pattern
# of the training data, i.e. [[10, 11, 12], [13, 14, 15]].
X_new = np.arange(10, 16).reshape(2, 3)


[15.00030711 19.00049755]


# Linear Regression Math Explanation

Given a linear equation $y_{predicted} = Xw + b$ that models the relationship between the input features $X$ and the target values $y$, we want to find the values of $w$ and $b$ that minimize the mean squared error (MSE) loss function:

$$L(w,b) = \frac{1}{m} \sum_{i=1}^m (y_{predicted}^{(i)} - y^{(i)})^2$$

where $m$ is the number of samples in the dataset.

To minimize the loss function, we can use gradient descent to iteratively update the values of $w$ and $b$ in the direction of the negative gradient of the loss function with respect to these parameters. The update rule for each iteration of gradient descent is given by:

$$w := w - \alpha \frac{\partial L}{\partial w}$$

$$b := b - \alpha \frac{\partial L}{\partial b}$$

where $\alpha$ is the learning rate.



To compute the partial derivatives, we can use the chain rule:

$$\frac{\partial L}{\partial w} = \frac{\partial L}{\partial y_{predicted}} \frac{\partial y_{predicted}}{\partial w}$$

$$\frac{\partial L}{\partial b} = \frac{\partial L}{\partial y_{predicted}} \frac{\partial y_{predicted}}{\partial b}$$

where

$$\frac{\partial L}{\partial y_{predicted}} = \frac{2}{m} (y_{predicted} - y)$$

is the derivative of the MSE loss function with respect to $y_{predicted}$.

To compute the derivatives of $y_{predicted}$ with respect to $w$ and $b$, we can take the partial derivatives of the linear equation:

$$\frac{\partial y_{predicted}}{\partial w} = X$$

$$\frac{\partial y_{predicted}}{\partial b} = 1$$



Substituting these expressions into the chain rule gives us:

$$\frac{\partial L}{\partial w} = \frac{2}{m} X^T (Xw + b - y)$$

$$\frac{\partial L}{\partial b} = \frac{2}{m} \sum_{i=1}^m (y_{predicted}^{(i)} - y^{(i)})$$

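The constant factor of 2 only rescales the gradient, so it can be absorbed into the learning rate $\alpha$ (equivalently, we minimize $\frac{1}{2}L$ instead of $L$). Dropping it gives the final formulas for dw and db used in the code: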

$$dw = \frac{1}{m} X^T(y_{predicted} - y)$$

$$db = \frac{1}{m} \sum_{i=1}^m (y_{predicted}^{(i)} - y^{(i)})$$

where $m$ is the number of samples, $X$ is the matrix of input features, $y$ is the vector of target values, and $y_{predicted}$ is the vector of predicted values given by the linear equation $y_{predicted} = Xw + b$.