In [1]:
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
import numpy as np
import random
import pandas as pd
from tqdm import tqdm

[dataset specification](https://scikit-learn.org/1.5/modules/generated/sklearn.datasets.load_iris.html)
## IRIS Dataset
**class:** [setosa, versicolour, virginica]
**Number of Instances**: `150` (50 in each of three classes)
**Number of Attributes**: `4` numeric, predictive attributes and the class
**Attribute Information**: `sepal length`, `sepal width`, `petal length`, `petal width`

In [2]:
# Load the Iris data; later cells read its `.data` (features) and `.target` (labels).
iris_set = load_iris()

In [3]:
class Dataset:
  """Index-based view over a dataset object that exposes `.data` and `.target`.

  Only the rows listed in `indices` are visible through the view. Optional
  callables are applied on access: `transform` to the feature and `encoder`
  to the label.
  """

  def __init__(self, dataset, indices, transform=None, encoder=None):
    self.dataset = dataset
    self.indices = indices
    self.transform = transform
    self.encoder = encoder

  def __len__(self):
    """Return the number of rows exposed by this view."""
    return len(self.indices)

  def __getitem__(self, item: int):
    """Return the `(feature, label)` pair at position `item` of the view."""
    # The lookup in `indices` is what fails for an out-of-range position; with a
    # list that is an IndexError, which is also what ends a plain `for` loop.
    row = self.indices[item]
    feature = self.dataset.data[row]
    label = self.dataset.target[row]
    if self.transform:
      feature = self.transform(feature)
    if self.encoder:
      label = self.encoder(label)
    return feature, label

In [4]:
# Draw the sample from a dedicated, seeded generator so the train/test split
# (and every number computed from it) is identical on each Restart & Run All,
# without touching the state of the global `random` module.
SPLIT_SEED = 42
N_TRAIN, N_TEST = 50, 50
indices = random.Random(SPLIT_SEED).sample(range(len(iris_set.data)), N_TRAIN + N_TEST)

# init Datasets
trainset = Dataset(iris_set, indices[:N_TRAIN])  # first N_TRAIN sampled rows
testset = Dataset(iris_set, indices[N_TRAIN:])   # remaining N_TEST sampled rows, disjoint from trainset

## Linear Regression
Linear regression models the relationship between one or more explanatory variables and a scalar response. It uses a linear approach called a "linear model". The key restriction is that the conditional mean of the response must be expressed as an affine function of the explanatory variables. The most common algorithms for estimating the parameters are least squares and Newton's method.
When the fitted model does not adequately describe the data, this is called "lack of fit" (LOF); addressing it has motivated many optimization techniques and much research.

### The Variants of Linear Regression
Linear regression models fall into two distinct categories based on their purpose:
- If the model is used for understanding and analyzing the relationship between explanatory variables and dependent variables, it is called regression analysis.
- If the model is used for prediction and forecasting, it is called a predictive model.

These models can also be classified by their mathematical attributes:
- Simple Linear Regression: A model with a single explanatory variable.
- Multiple Linear Regression: A model with two or more explanatory variables.
- Multivariate Linear Regression: A model with multiple dependent variables.

**Loss Function Definition:**
* $\text{MSE} = \frac{1}{n} \sum_{i=1}^{n} (y_i - \hat{y}_i)^2$
* $\text{MAE} = \frac{1}{n} \sum_{i=1}^{n} |y_i - \hat{y}_i|$

In [5]:
# define loss functions(MSE, MAE)
def mean_squared_error(independent, dependent, weight):
  """Return the mean squared difference between `independent . weight` and `dependent`."""
  residual = np.dot(independent, weight) - dependent
  return np.mean(np.square(residual))
# mean_squared_error

def mean_absolute_error(independent, dependent, weight):
  """Return the mean absolute difference between `independent . weight` and `dependent`."""
  residual = np.dot(independent, weight) - dependent
  return np.mean(np.abs(residual))
# mean_absolute_error

### Formulation
$$y_i = \beta_{0} + \beta_{1} \cdot x_{(i,1)} + ... + \beta_{p} \cdot x_{(i, p)} + \epsilon_i = \sum_{j=0}^{p} \beta_{j} \cdot x_{(i,j)} + \epsilon_i, \quad x_{(i,0)} = 1$$
- $x, y$ represents a vector of observations, which can be a multi-dimensional matrix.
- $\beta$ represents the model parameters, which have a dimension of $p + 1$.
- $\epsilon$ represents possible error.

In [6]:
# define LinearRegression
class LinearRegression:
  """Linear model without an intercept term: `forward(x) = x . weight`.

  `weight` starts as a zero vector of length `n_inpt` and is updated in place
  by `gdr`.
  """

  def __init__(self, n_inpt):
    self.weight = np.zeros(shape=(n_inpt))

  def forward(self, x):
    """Return the prediction(s) `x . weight` for one sample or a batch of samples."""
    return np.dot(x, self.weight)

  def gdr(self, x, y, lr):
    """Apply one gradient-descent step for `(x, y)` with learning rate `lr`.

    `x` is either one sample of shape `(n_inpt,)` or a batch of shape
    `(n_samples, n_inpt)`; `y` is the matching scalar or 1-D array of targets.
    The update direction is `x.T . (prediction - y)`, averaged over the
    number of samples.
    """
    # Promote a single sample to a one-row batch so that `x.shape[0]` really is
    # the sample count. For a 1-D `x` it would otherwise be the feature count,
    # which silently shrinks the learning rate by that factor.
    x = np.atleast_2d(x)
    y = np.atleast_1d(y)
    residual = self.forward(x) - y
    self.weight -= (lr / x.shape[0]) * np.dot(x.T, residual)
  # gdr

  def train(self, dataset, iters: int, lr=0.01):
    """Run `iters` passes over `dataset`, calling `gdr` once per `(feature, label)` pair."""
    for _ in range(iters):
      for feature, label in dataset:
        self.gdr(feature, label, lr=lr)
  # train
# LinearRegression

### Understanding the Learning Rule: Fitting the Model with GDR (Gradient Descent Rule)

**GDR (Gradient Descent Rule)** is a learning rule and optimization technique for linear regression that helps fit the model to the problem. It minimizes the **Cost Function** by updating weights. This approach has become the fundamental workflow for optimization in modern machine learning and deep learning.

- Initialize weight $\theta$ as $0$ or random number.
- Calculate the relationship between the model and real-world observations using cost function $J(\theta)$.
- Until $J(\theta)$ converges to a minimum, the algorithm repeats the update $\theta' = \theta - \alpha \cdot \nabla{J(\theta)}$, where $\theta'$ is the newly updated weight, $\theta$ is the previous weight, and $\alpha$ is the learning rate.

In [7]:
def GDR(model, lr):
  """Build a gradient-descent step function bound to `model` and learning rate `lr`.

  The returned callable takes `(x, y)`, computes `model.forward(x)`, and
  subtracts `lr * x.T . (prediction - y)` from `model.weight` in place.
  It returns nothing.
  """
  def _step(x, y):
    residual = model.forward(x) - y
    gradient = np.dot(x.T, residual)
    model.weight -= lr * gradient
  return _step

In [10]:
# Training configuration, kept in one place instead of inline literals.
N_EPOCHS = 10
LEARNING_RATE = 0.001

# init and train a model
model = LinearRegression(iris_set.data.shape[1])  # one weight per input attribute
optimizer = GDR(model, LEARNING_RATE)

progress_bar = tqdm(range(N_EPOCHS))
for _ in progress_bar:
  loss = 0.
  for feature, label in trainset:
    optimizer(feature, label)
    # Scored right after the update, so this is the post-step error on the sample.
    loss += mean_squared_error(feature, label, model.weight)
  # Show the epoch's mean per-sample loss next to the progress bar.
  progress_bar.set_postfix(loss=loss/len(trainset))

100%|██████████| 10/10 [00:00<00:00, 424.43it/s, loss=0.0818]


### Key Concepts and Limitations

While deep learning and other advanced machine learning methods have largely superseded linear regression, it remains more cost-effective in certain cases.

- **Exogeneity** means that the explanatory variables are not correlated with the model's error term.
    - **Strict Exogeneity:** The model maintains exogeneity over an extended period.
    - **Weak Exogeneity:** The model only maintains exogeneity over the current period.
    - **Deterministic:** The model maintains exogeneity for past periods but not for current and future periods
- **Linearity** means the relationship between parameters and explanatory variables can be measured through linear combinations.
- **Constant Variance** means the model's error range remains independent of the predicted value. For example, if the model predicts an individual's income as 1000, their actual income might range from `800~1200`.
- **Independence of Errors** means that errors are not correlated with each other. This is one of the major limitations of linear regression, though it can be addressed through data regularization or Bayesian linear regression.

In [12]:
# Score the model on the test split: a prediction counts as correct when it
# rounds to the sample's label.
n_samples = len(testset)
count = 0
for feature, label in testset:
  pred = model.forward(feature)
  count += int(round(pred) == label)
print(f"accuracy: {count / n_samples:.2f}({count}/{n_samples})")

accuracy: 0.88(44/50)
