<a href="https://colab.research.google.com/github/joaomarcosmb/ml-ufc/blob/main/lista-02/AMA_Lista_02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [82]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

## Métodos utilitários

In [83]:
def _add_bias(X):
  return np.hstack((np.ones((X.shape[0], 1)), X))

In [84]:
def evaluate_fold(y_true, y_pred, classes):
  acc_global = np.mean(y_pred == y_true)

  per_class = []
  for c in classes:
      mask = (y_true == c)
      per_class.append(np.mean(y_pred[mask] == c))
  return acc_global, np.array(per_class, dtype=float)

In [85]:
def summarize_cv_metrics(accs_global, accs_per_class_list, classes):
  accs_global = np.array(accs_global, dtype=float)
  acc_mean, acc_std = np.mean(accs_global), np.std(accs_global)
  print(f'Valor médio da acurácia global: {acc_mean:.4f}')
  print(f'Desvio padrão da acurácia global: {acc_std:.4f}')

  # Empilha [n_folds, n_classes]
  A = np.vstack(accs_per_class_list)
  per_cls_mean = np.mean(A, axis=0)
  per_cls_std = np.std(A, axis=0)

  for c, mean, std in zip(classes, per_cls_mean, per_cls_std):
    print(f'Acurácia média da classe {c}: {mean:.4f}')
    print(f'Desvio padrão da acurácia da classe {c}: {std:.4f}')

# Questão 1

In [86]:
df_bc = pd.read_csv('data/breastcancer.csv')
df_bc

Unnamed: 0,1.798999999999999844e+01,1.038000000000000078e+01,1.227999999999999972e+02,1.001000000000000000e+03,1.184000000000000052e-01,2.776000000000000134e-01,3.000999999999999779e-01,1.471000000000000085e-01,2.419000000000000039e-01,7.871000000000000218e-02,...,1.732999999999999829e+01,1.845999999999999943e+02,2.019000000000000000e+03,1.622000000000000108e-01,6.655999999999999694e-01,7.118999999999999773e-01,2.654000000000000248e-01,4.601000000000000090e-01,1.189000000000000057e-01,1.000000000000000000e+00
0,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,1.0
1,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,1.0
2,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,1.0
3,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,1.0
4,12.45,15.70,82.57,477.1,0.12780,0.17000,0.15780,0.08089,0.2087,0.07613,...,23.75,103.40,741.6,0.17910,0.52490,0.5355,0.1741,0.3985,0.12440,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
563,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,1.0
564,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,1.0
565,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,1.0
566,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,1.0


In [87]:
X_original_bc = df_bc.iloc[:, 1:-1].to_numpy(dtype=float)
y_original_bc = df_bc.iloc[:, -1].to_numpy()

### Implementação da regressão logística

Um modelo de regressão logística é definido como

$$\hat{y}_i = \sigma(\boldsymbol{w}^\top \boldsymbol{x}_i).$$

Por sua vez, a função sigmóide $\sigma(z)$ é calculada como
$$\sigma(z) = \frac{1}{1 + \text{exp}(-z)}.$$

Considerando a função de otimização como a de Gradiente Descendente Estocástico (SGD), temos que a atualização dos parâmetros para uma única amostra selecionada aleatoriamente é dada por:
$$\boldsymbol{w} \gets \boldsymbol{w} - \alpha(y_i - \hat{y}_i)\boldsymbol{x}_i$$
De forma que o erro $e_i = y_i - \hat{y}_i$.

In [88]:
class LogisticRegressionSGD:

  def _sigmoid(self, z):
    return 1 / (1 + np.exp(-z))


  def fit(self, X, y, learning_rate=0.01, n_iterations=1000, lambda_=1e-3):
    # Adiciona o termo de viés
    X = _add_bias(X)

    N, D = X.shape

    # Inicializa os parâmetros bem pequenos
    self.w = np.random.randn(D) * 0.01

    # Cria uma máscara de regularização e exclui o termo de viés
    reg_mask = np.ones_like(self.w)
    reg_mask[0] = 0.0

    for _ in range(n_iterations):
      # Seleciona uma amostra aleatória
      i = np.random.randint(N)
      xi = X[i]
      yi = y[i]

      # Computa predição
      y_hat = self._sigmoid(np.dot(xi, self.w))

      # Calcula o gradiente estocástico
      grad = (y_hat - yi) * xi + lambda_ * reg_mask * self.w

      # Atualiza os pesos
      self.w -= learning_rate * grad


  def predict(self, X):
    X = _add_bias(X)
    return (self._sigmoid(np.dot(X, self.w)) >= 0.5).astype(int)

### Implementação do algoritmo de ADG

Considerando um problema de classificação binária, o modelo de ADG pode ser descrito como:
$$p(\boldsymbol{x}|C_k)=\mathcal{N}(\boldsymbol{x} \mid \boldsymbol{\mu}_k,\boldsymbol{\Sigma}_k), \forall k \in \{1, 2\}$$

Onde, para cada classe:
* O vetor de médias $\hat{\boldsymbol{\mu}}_k$ é dado por
$$\hat{\boldsymbol{\mu}}_k = \frac{1}{N_k} \sum_{\boldsymbol{x}_i \in C_k} \boldsymbol{x}_i.$$
* A matriz de covariância $\hat{\boldsymbol{\Sigma}}_k$ é tida por
$$\hat{\boldsymbol{\Sigma}}_k = \frac{1}{N_k-1} \sum_{\boldsymbol{x}_i \in C_k} (\boldsymbol{x}_i - \hat{\boldsymbol{\mu}}_k)(\boldsymbol{x}_i - \hat{\boldsymbol{\mu}}_k)^\top.$$

Para este caso, vamos considerar o cáculo empírico das prioris, isto é, a partir dos dados que temos, logo
$$p(C_k) = \frac{N_k}{N}.$$

Para cada novo padrão $\boldsymbol{x}_*$, deve-se escolher a classe com maior log-posteriori, computando, para cada uma,
$$\log(p(C_k | \boldsymbol{x}_*)) \propto -\frac{1}{2}\log|\hat{\boldsymbol{\Sigma}}_k| -\frac{1}{2}(\boldsymbol{x}_*-\boldsymbol{\mu}_k)^\top \hat{\boldsymbol{\Sigma}}_k^{-1}(\boldsymbol{x}_*-\boldsymbol{\mu}_k) + \log p(C_k)$$

In [89]:
class GaussianDiscriminant:

  def fit(self, X, y):
    self.classes = np.unique(y)

    N, D = X.shape

    # Médias por classe
    self.means = {}
    # Covariâncias por classe
    self.covs = {}
    # Contagem de elementos por classe
    counts = {}

    # Prioris
    self.priors = {}

    for c in self.classes:
      # Seleciona os dados da classe C
      X_c = X[y == c]
      # Cacula a média deles
      self.means[c] = np.mean(X_c, axis=0)
      # Faz a contagem de elementos
      counts[c] = len(X_c)
      # Computa a matriz de covariância
      Sigma = (X_c - self.means[c]).T @ (X_c - self.means[c]) / (counts[c] - 1)
      self.covs[c] = Sigma

      self.priors[c] = counts[c] / N


  def _log_gaussian(self, X, mu, Sigma):
    return -0.5 * np.log(np.linalg.det(Sigma)) - 0.5 * (X - mu) @ np.linalg.inv(Sigma) @ (X - mu).T


  def predict(self, X):
    # Inicializa um vetor vazio de predições
    preds = []

    for x in X:
      # Scores de cada classe
      scores = {}

      for k in self.classes:
        log_prior = np.log(self.priors[k])
        log_likelihood = self._log_gaussian(x, self.means[k], self.covs[k])
        scores[k] = log_prior + log_likelihood

      # Escolhe a classe com maior score
      preds.append(max(scores, key=scores.get))

    return np.array(preds)

### Implementação do Naive Bayes Gaussiano

O algoritmo de Naive Bayes Gaussiano considera distribuições Gaussianas para $p(\boldsymbol{x}_d|C_k)$:
$$p(C_k|\boldsymbol{x}) \propto p(C_k) \prod_{d = 1}^{D}\mathcal{N}(\boldsymbol{x}_d|\boldsymbol{\mu}_{dk}, \sigma^2_{dk}), \forall k.$$

Para cada novo padrão $\boldsymbol{x}_*$,
$$\hat{y}_* = \arg \max_{k} \left[ \log p(C_k) - \frac{1}{2} \sum^D_{d = 1} \log 2\pi\sigma^2_{dk} - \frac{1}{2} \sum^D_{d = 1} \frac{(\boldsymbol{x}_{*d} - \boldsymbol{\mu}_{dk})^2}{\sigma_{dk}^2} \right].$$

In [90]:
class GaussianNB:

  def fit(self, X, y):
    X = np.asarray(X, dtype=float)
    y = np.asarray(y)
    self.classes = np.unique(y)

    self.means = {}
    # Variâncias por classe
    self.vars = {}

    self.priors = {}

    for c in self.classes:
      X_c = X[y == c]
      self.means[c] = np.mean(X_c, axis=0)
      self.vars[c] = np.var(X_c, axis=0)
      self.priors[c] = len(X_c) / len(X)


  def predict(self, X):
    X = np.asarray(X, dtype=float)
    if X.ndim == 1:
        X = X.reshape(1, -1)

    preds = []

    for x in X:
      scores = {}

      for c in self.classes:
        prior = np.log(self.priors[c])
        likelihood = - 0.5 * np.sum(np.log(2 * np.pi * self.vars[c])) - 0.5 * np.sum((x - self.means[c]) ** 2 / self.vars[c])
        scores[c] = prior + likelihood

      preds.append(max(scores, key=scores.get))

    return np.array(preds, dtype=self.classes.dtype)

### Implementação do K-Fold Cross Validation

Para tarefas de classificação binária, o versão estratificada do K-Fold é recomendada, uma vez que ela leva em consideração as classes para evitar a criação de partições desbalanceadas.

O algoritmo contém os seguintes passos:
1. Separar os índices das amostras por classe.
2. Embaralhar os índices de cada classe separadamente.
3. Dividir os índices de cada classe em $K$ partes aproximadamente iguais.
4. Para cada fold:
    * O fold de teste é formado pegando uma parte de cada classe.
    * O fold de treino é formado com as $K - 1$ partições restantes.

In [91]:
def stratified_kfold(X, y, k=10, seed=None):
  np.random.seed(seed)

  X = np.asarray(X)
  y = np.asarray(y)

  classes = np.unique(y)

  folds_per_class = []

  for c in classes:
    # Separa os índices da classe da iteração atual
    idx = np.where(y == c)[0]
    # Embaralha eles
    np.random.shuffle(idx)
    # Divide em K partes
    folds_per_class.append(np.array_split(idx, k))

  for i in range(k):
    # Inicia pegando os índices do primeiro fold como teste
    test_idx = np.concatenate([folds[i] for folds in folds_per_class])
    # O restante é usado como treino
    train_idx = np.concatenate([
            np.concatenate(folds[:i] + folds[i+1:]) for folds in folds_per_class
    ])
    yield train_idx, test_idx

In [92]:
scaler = StandardScaler()

In [93]:
classes = np.unique(y_original_bc)

accs_lr, accs_adg, accs_nb = [], [], []
accs_lr_per_class, accs_adg_per_class, accs_nb_per_class = [], [], []

for train_idx, test_idx in stratified_kfold(X_original_bc, y_original_bc, k=10, seed=42):
  X_train, y_train = X_original_bc[train_idx], y_original_bc[train_idx]
  X_test, y_test = X_original_bc[test_idx], y_original_bc[test_idx]

  # Normalização dos dados
  X_train_scaled = scaler.fit_transform(X_train)
  X_test_scaled = scaler.transform(X_test)

  # Treinamento do modelo de Regressão Logística
  lr = LogisticRegressionSGD()
  lr.fit(X_train_scaled, y_train)
  y_pred_lr = lr.predict(X_test_scaled)

  acc, per_cls = evaluate_fold(y_test, y_pred_lr, classes)
  accs_lr.append(acc)
  accs_lr_per_class.append(per_cls)

  # Treinamento do modelo de ADG
  adg = GaussianDiscriminant()
  adg.fit(X_train_scaled, y_train)
  y_pred_adg = adg.predict(X_test_scaled)

  acc, per_cls = evaluate_fold(y_test, y_pred_adg, classes)
  accs_adg.append(acc)
  accs_adg_per_class.append(per_cls)

  # Treinamento do modelo de Naive Bayes
  nb = GaussianNB()
  nb.fit(X_train_scaled, y_train)
  y_pred_nb = nb.predict(X_test_scaled)

  acc, per_cls = evaluate_fold(y_test, y_pred_nb, classes)
  accs_nb.append(acc)
  accs_nb_per_class.append(per_cls)

In [94]:
print('======= Avaliação do modelo de Regressão Logística =======')
summarize_cv_metrics(accs_lr, accs_lr_per_class, classes)
print('\n======= Avaliação do modelo de ADG =======')
summarize_cv_metrics(accs_adg, accs_adg_per_class, classes)
print('\n======= Avaliação do modelo de Naive Bayes =======')
summarize_cv_metrics(accs_nb, accs_nb_per_class, classes)

Valor médio da acurácia global: 0.9754
Desvio padrão da acurácia global: 0.0160
Acurácia média da classe 0.0: 0.9889
Desvio padrão da acurácia da classe 0.0: 0.0184
Acurácia média da classe 1.0: 0.9526
Desvio padrão da acurácia da classe 1.0: 0.0369

Valor médio da acurácia global: 0.9490
Desvio padrão da acurácia global: 0.0298
Acurácia média da classe 0.0: 0.9609
Desvio padrão da acurácia da classe 0.0: 0.0255
Acurácia média da classe 1.0: 0.9288
Desvio padrão da acurácia da classe 1.0: 0.0574

Valor médio da acurácia global: 0.9262
Desvio padrão da acurácia global: 0.0405
Acurácia média da classe 0.0: 0.9470
Desvio padrão da acurácia da classe 0.0: 0.0339
Acurácia média da classe 1.0: 0.8905
Desvio padrão da acurácia da classe 1.0: 0.0879


# Questão 2

In [95]:
df_v = pd.read_csv('data/vehicle.csv')
df_v

Unnamed: 0,9.500000000000000000e+01,4.800000000000000000e+01,8.300000000000000000e+01,1.780000000000000000e+02,7.200000000000000000e+01,1.000000000000000000e+01,1.620000000000000000e+02,4.200000000000000000e+01,2.000000000000000000e+01,1.590000000000000000e+02,1.760000000000000000e+02,3.790000000000000000e+02,1.840000000000000000e+02,7.000000000000000000e+01,6.000000000000000000e+00,1.600000000000000000e+01,1.870000000000000000e+02,1.970000000000000000e+02,3.000000000000000000e+00
0,91.0,41.0,84.0,141.0,57.0,9.0,149.0,45.0,19.0,143.0,170.0,330.0,158.0,72.0,9.0,14.0,189.0,199.0,3.0
1,104.0,50.0,106.0,209.0,66.0,10.0,207.0,32.0,23.0,158.0,223.0,635.0,220.0,73.0,14.0,9.0,188.0,196.0,2.0
2,93.0,41.0,82.0,159.0,63.0,9.0,144.0,46.0,19.0,143.0,160.0,309.0,127.0,63.0,6.0,10.0,199.0,207.0,3.0
3,85.0,44.0,70.0,205.0,103.0,52.0,149.0,45.0,19.0,144.0,241.0,325.0,188.0,127.0,9.0,11.0,180.0,183.0,0.0
4,107.0,57.0,106.0,172.0,50.0,6.0,255.0,26.0,28.0,169.0,280.0,957.0,264.0,85.0,5.0,9.0,181.0,183.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
840,93.0,39.0,87.0,183.0,64.0,8.0,169.0,40.0,20.0,134.0,200.0,422.0,149.0,72.0,7.0,25.0,188.0,195.0,2.0
841,89.0,46.0,84.0,163.0,66.0,11.0,159.0,43.0,20.0,159.0,173.0,368.0,176.0,72.0,1.0,20.0,186.0,197.0,3.0
842,106.0,54.0,101.0,222.0,67.0,12.0,222.0,30.0,25.0,173.0,228.0,721.0,200.0,70.0,3.0,4.0,187.0,201.0,2.0
843,86.0,36.0,78.0,146.0,58.0,7.0,135.0,50.0,18.0,124.0,155.0,270.0,148.0,66.0,0.0,25.0,190.0,195.0,2.0


In [96]:
X_original_v = df_v.iloc[:, 1:-1].to_numpy(dtype=float)
y_original_v = df_v.iloc[:, -1].to_numpy()

### Implementação da Regressão Softmax

Para o caso multiclasse, considerando uma coluna $\boldsymbol{w}_k$ da matriz de parâmetros $\boldsymbol{W}$ associada à classe $k$, a saída da regressão logística multinomial é:
$$\hat{y}_{ik} = \frac{\exp(\boldsymbol{w}_k^\top \boldsymbol{x}_i)}{\sum_{j = 1}^K \exp(\boldsymbol{w}_j^\top \boldsymbol{x}_i)}, \ 1 \le k \le K.$$

A regra de atualização para o SGD pode ser computada como:
$$\boldsymbol{w}_k \gets \boldsymbol{w}_k + \alpha e_{ik} \boldsymbol{x}_i$$

Onde
$$e_{ik} = y_{ik} - \hat{y}_{ik} = y_{ik} - \frac{\exp(\boldsymbol{w}_k^\top \boldsymbol{x}_i)}{\sum_{j = 1}^K \exp(\boldsymbol{w}_j^\top \boldsymbol{x}_i)}.$$

In [97]:
class SoftmaxRegressionSGD:

  def _softmax(self, z):
    z = z - np.max(z, axis=-1, keepdims=True)
    exp_z = np.exp(z)
    return exp_z / np.sum(exp_z, axis=-1, keepdims=True)


  def fit(self, X, y, learning_rate=0.01, n_iterations=1000, lambda_=1e-3):
    # Adiciona o termo de viés
    X = _add_bias(X)

    N, D = X.shape
    self.classes_ = np.unique(y)
    n_classes = len(self.classes_)
    y_idx = np.searchsorted(self.classes_, y)

    # Inicializa os parâmetros bem pequenos
    self.w = np.random.randn(D, n_classes) * 0.01    # (D, K)

    # Cria uma máscara de regularização e exclui o termo de viés
    reg_mask = np.ones_like(self.w)
    reg_mask[0] = 0.0

    for _ in range(n_iterations):
      # Seleciona uma amostra aleatória
      i = np.random.randint(N)
      xi = X[i]                                      # (D,)
      yi = y_idx[i]

      logits = np.dot(xi, self.w)                    # (K,)
      # Computa predição
      y_hat = self._softmax(logits)                  # (K,)

      # One-Hot Enconding
      y_onehot = np.zeros(n_classes)                 # (K,)
      y_onehot[yi] = 1

      # Calcula o gradiente estocástico
      grad = np.outer(xi, (y_hat - y_onehot)) + lambda_ * reg_mask * self.w

      # Atualiza os pesos
      self.w -= learning_rate * grad


  def predict(self, X):
    X = _add_bias(X)
    logits = np.dot(X, self.w)
    probs = self._softmax(logits)
    idx = np.argmax(probs, axis=1)
    return self.classes_[idx]

In [98]:
classes = np.unique(y_original_v)

accs_sm, accs_adg, accs_nb = [], [], []
accs_sm_per_class, accs_adg_per_class, accs_nb_per_class = [], [], []

for train_idx, test_idx in stratified_kfold(X_original_v, y_original_v, k=10, seed=42):
  X_train, y_train = X_original_v[train_idx], y_original_v[train_idx]
  X_test, y_test = X_original_v[test_idx], y_original_v[test_idx]

  # Normalização dos dados
  X_train_scaled = scaler.fit_transform(X_train)
  X_test_scaled = scaler.transform(X_test)

  # Treinamento do modelo de Regressão Softmax
  sm = SoftmaxRegressionSGD()
  sm.fit(X_train_scaled, y_train)
  y_pred_sm = sm.predict(X_test_scaled)

  acc, per_cls = evaluate_fold(y_test, y_pred_sm, classes)
  accs_sm.append(acc)
  accs_sm_per_class.append(per_cls)

  # Treinamento do modelo de ADG
  adg = GaussianDiscriminant()
  adg.fit(X_train_scaled, y_train)
  y_pred_adg = adg.predict(X_test_scaled)

  acc, per_cls = evaluate_fold(y_test, y_pred_adg, classes)
  accs_adg.append(acc)
  accs_adg_per_class.append(per_cls)

  # Treinamento do modelo de Naive Bayes
  nb = GaussianNB()
  nb.fit(X_train_scaled, y_train)
  y_pred_nb = nb.predict(X_test_scaled)

  acc, per_cls = evaluate_fold(y_test, y_pred_nb, classes)
  accs_nb.append(acc)
  accs_nb_per_class.append(per_cls)

In [99]:
print('======= Avaliação do modelo de Regressão Softmax =======')
summarize_cv_metrics(accs_sm, accs_sm_per_class, classes)
print('\n======= Avaliação do modelo de ADG =======')
summarize_cv_metrics(accs_adg, accs_adg_per_class, classes)
print('\n======= Avaliação do modelo de Naive Bayes =======')
summarize_cv_metrics(accs_nb, accs_nb_per_class, classes)

Valor médio da acurácia global: 0.6330
Desvio padrão da acurácia global: 0.0496
Acurácia média da classe 0.0: 0.8245
Desvio padrão da acurácia da classe 0.0: 0.1335
Acurácia média da classe 1.0: 0.3152
Desvio padrão da acurácia da classe 1.0: 0.2078
Acurácia média da classe 2.0: 0.4569
Desvio padrão da acurácia da classe 2.0: 0.1777
Acurácia média da classe 3.0: 0.9542
Desvio padrão da acurácia da classe 3.0: 0.0359

Valor médio da acurácia global: 0.8402
Desvio padrão da acurácia global: 0.0302
Acurácia média da classe 0.0: 0.9818
Desvio padrão da acurácia da classe 0.0: 0.0223
Acurácia média da classe 1.0: 0.7314
Desvio padrão da acurácia da classe 1.0: 0.0548
Acurácia média da classe 2.0: 0.6734
Desvio padrão da acurácia da classe 2.0: 0.0993
Acurácia média da classe 3.0: 0.9847
Desvio padrão da acurácia da classe 3.0: 0.0233

Valor médio da acurácia global: 0.4450
Desvio padrão da acurácia global: 0.0310
Acurácia média da classe 0.0: 0.1742
Desvio padrão da acurácia da classe 0.0: 