<a href="https://colab.research.google.com/github/joaomarcosmb/ml-ufc/blob/main/lista-03/AMA_Lista_03.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv('kc2.csv')

In [3]:
df.head()

Unnamed: 0,4.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00.1,1.000000000000000000e+00.2,4.000000000000000000e+00.1,8.000000000000000000e+00,6.700000000000000400e-01,1.500000000000000000e+00,5.330000000000000071e+00,1.200000000000000000e+01,...,2.000000000000000000e+00,0.000000000000000000e+00.1,0.000000000000000000e+00.2,0.000000000000000000e+00.3,3.000000000000000000e+00,1.000000000000000000e+00.3,3.000000000000000000e+00.1,1.000000000000000000e+00.4,1.000000000000000000e+00.5,0.000000000000000000e+00.4
0,39.0,4.0,1.0,2.0,105.0,520.19,0.07,13.89,37.44,7227.91,...,29.0,1.0,4.0,2.0,12.0,19.0,61.0,44.0,7.0,0.0
1,1.0,1.0,1.0,1.0,6.0,15.51,0.4,2.5,6.2,38.77,...,0.0,0.0,0.0,0.0,5.0,1.0,5.0,1.0,1.0,0.0
2,15.0,1.0,1.0,1.0,55.0,224.81,0.17,5.73,39.25,1287.55,...,12.0,0.0,1.0,0.0,6.0,11.0,34.0,21.0,1.0,0.0
3,12.0,2.0,1.0,2.0,15.0,45.0,0.17,6.0,7.5,270.0,...,8.0,1.0,0.0,0.0,6.0,2.0,11.0,4.0,3.0,0.0
4,8.0,1.0,1.0,1.0,13.0,43.19,0.27,3.75,11.52,161.94,...,6.0,0.0,0.0,0.0,6.0,4.0,8.0,5.0,1.0,0.0


In [4]:
X_original = df.iloc[:, :-1].to_numpy(dtype=float)
y_original = df.iloc[:, -1].to_numpy(dtype=int)

In [5]:
def confusion_metrics(y_true, y_pred):
  # Assume que 1 é a classe positiva
  TP = np.sum((y_true == 1) & (y_pred == 1))
  TN = np.sum((y_true == 0) & (y_pred == 0))
  FP = np.sum((y_true == 0) & (y_pred == 1))
  FN = np.sum((y_true == 1) & (y_pred == 0))

  return TP, TN, FP, FN

$$\text{Accuracy} = \frac{\text{TP}}{\text{TP} + \text{TN} + \text{FP} + \text{FN}}$$

$$\text{Recall} = \frac{\text{TP}}{\text{TP} + \text{FN}}$$

$$\text{Precision} = \frac{\text{TP}}{\text{TP} + \text{FP}}$$

$$\text{F1-score} = 2 \times \frac{\text{recall} \times \text{precision}}{\text{recall} + \text{precision}}$$

In [6]:
def calc_metrics(y_true, y_pred):
  TP, TN, FP, FN = confusion_metrics(y_test, y_pred)
  total = TP + TN + FP + FN

  accuracy = (TP + TN) / total if total > 0 else 0.0
  recall = TP / (TP + FN) if (TP + FN) > 0 else 0.0
  precision = TP / (TP + FP) if (TP + FP) > 0 else 0.0
  f1_score = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0.0

  return accuracy, recall, precision, f1_score

In [7]:
def stratified_kfold(X, y, k=10, seed=None):
  np.random.seed(seed)

  X = np.asarray(X)
  y = np.asarray(y)

  classes = np.unique(y)

  folds_per_class = []

  for c in classes:
    # Separa os índices da classe da iteração atual
    idx = np.where(y == c)[0]
    # Embaralha eles
    np.random.shuffle(idx)
    # Divide em K partes
    folds_per_class.append(np.array_split(idx, k))

  for i in range(k):
    # Inicia pegando os índices do primeiro fold como teste
    test_idx = np.concatenate([folds[i] for folds in folds_per_class])
    # O restante é usado como treino
    train_idx = np.concatenate([
            np.concatenate(folds[:i] + folds[i+1:]) for folds in folds_per_class
    ])
    yield train_idx, test_idx

Para calcular a distância euclidiana entre dois padrões $\boldsymbol{x}_i$ e $\boldsymbol{x}_j$, calculamos:
$$d(\boldsymbol{x}_i, \boldsymbol{x}_j) = \left\| \boldsymbol{x}_i - \boldsymbol{x}_j \right\|_2 = \sqrt{\sum_{d = 1}^{D}(x_{id} - x_{jd})^{2}}.$$

Já a distância de Mahalonobis pode ser computada como
$$d_M(\boldsymbol{x}_i, \boldsymbol{x}_j) = \sqrt{(\boldsymbol{x}_i - \boldsymbol{x}_j)^\top \boldsymbol{\Sigma}^{-1} (\boldsymbol{x}_i - \boldsymbol{x}_j)},$$
Onde $\boldsymbol{\Sigma}$ é a matriz de covariância dos dados de treinamento.

In [8]:
class KNN:
  def __init__(self, k=5, metric="euclidean"):
    self.k = k
    self.metric = metric
    self.X_train = None
    self.y_train = None
    self.cov_inv = None
    self.metrics = {
      "acc": [],
      "rec": [],
      "prec": [],
      "f1": []
    }

  def _euclidean(self, x):
    # x tem shape (D,)
    diffs = self.X_train - x          # (N, D)
    return np.sqrt(np.sum(diffs**2, axis=1))

  def _mahalonobis(self, x):
    diffs = self.X_train - x          # (N, D): cada linha é (x_i - x)
    left = diffs @ self.cov_inv       # (N, D)
    # left * diffs faz o broadcasting (produto elemento a elemento)
    # e o sum soma ao longo das colunas, retornando um vetor (N,)
    return np.sqrt(np.sum(left * diffs, axis=1))

  def fit(self, X, y):
    self.X_train = np.asarray(X)
    self.y_train = np.asarray(y)

    if self.metric == 'euclidean':
      self.cov_inv = None
      self.distance = self._euclidean
    else:
      self.cov_inv = np.linalg.inv(np.cov(self.X_train, rowvar=False)) # rowvar=False indica cada coluna é uma variável
      self.distance = self._mahalonobis

  def _distances_to_point(self, x):
    return self.distance(x)

  def _predict_point(self, x):
    # Calcula as distâncias do ponto até todos os outros pontos no conjunto de treino
    dists = self._distances_to_point(x)

    # Seleciona os índices dos k vizinhos mais próximos
    k_idx = np.argsort(dists)[:self.k]
    k_labels = self.y_train[k_idx]

    # Conta quantas vezes cada classe aparece
    classes, counts = np.unique(k_labels, return_counts=True)

    # Se obtivermos uma única classe no ranking, retornamos ela
    if len(classes) == 1:
      return classes[0]

    # Verifica se houve empate
    max_count = np.max(counts)
    ties = classes[counts == max_count]
    if len(ties) == 1:
      return ties[0]

    # Se houve empate, escolhemos a classe do vizinho mais próximo
    for idx in k_idx:
      if self.y_train[idx] in ties:
        return self.y_train[idx]

    # Fallback
    return classes[0]

  def predict(self, X):
    X = np.asarray(X)
    preds = [self._predict_point(x) for x in X]
    return np.array(preds, dtype=int)

  def calc_metrics(self, y_true, y_pred):
    acc, rec, prec, f1 = calc_metrics(y_true, y_pred)
    self.metrics["acc"].append(acc)
    self.metrics["rec"].append(rec)
    self.metrics["prec"].append(prec)
    self.metrics["f1"].append(f1)

In [9]:
class DecisionTree:
  def __init__(self, criterion="gini", min_samples_leaf=3, max_features='sqrt', random_state=42):
    self.criterion = criterion
    self.clf = DecisionTreeClassifier(criterion=criterion,
                                      min_samples_leaf=min_samples_leaf,
                                      max_features=max_features,
                                      random_state=random_state)
    self.metrics = {
      "acc": [],
      "rec": [],
      "prec": [],
      "f1": []
    }

  def fit(self, X, y):
    self.clf.fit(X, y)

  def predict(self, X):
    return self.clf.predict(X)

  def calc_metrics(self, y_true, y_pred):
    acc, rec, prec, f1 = calc_metrics(y_true, y_pred)
    self.metrics["acc"].append(acc)
    self.metrics["rec"].append(rec)
    self.metrics["prec"].append(prec)
    self.metrics["f1"].append(f1)

In [10]:
knn_k1_euclidean = KNN(k=1, metric="euclidean")
knn_k1_mahalonobis = KNN(k=1, metric="mahalonobis")
knn_k5_euclidean = KNN(k=5, metric="euclidean")
knn_k5_mahalonobis = KNN(k=5, metric="mahalonobis")

In [11]:
dt_gini = DecisionTree(criterion="gini")
dt_entropy = DecisionTree(criterion="entropy")

In [12]:
scaler = StandardScaler()

In [13]:
for train_idx, test_idx in stratified_kfold(X_original, y_original, k=10, seed=42):
  X_train, y_train = X_original[train_idx], y_original[train_idx]
  X_test, y_test = X_original[test_idx], y_original[test_idx]

  # Normalização dos dados
  X_train_scaled = scaler.fit_transform(X_train)
  X_test_scaled = scaler.transform(X_test)

  # Treinamento do KNN
  knn_k1_euclidean.fit(X_train_scaled, y_train)
  knn_k1_mahalonobis.fit(X_train_scaled, y_train)
  knn_k5_euclidean.fit(X_train_scaled, y_train)
  knn_k5_mahalonobis.fit(X_train_scaled, y_train)

  # Predição
  y_pred_k1e = knn_k1_euclidean.predict(X_test_scaled)
  y_pred_k1m = knn_k1_mahalonobis.predict(X_test_scaled)
  y_pred_k5e = knn_k5_euclidean.predict(X_test_scaled)
  y_pred_k5m = knn_k5_mahalonobis.predict(X_test_scaled)

  # Salva as métricas do fold
  knn_k1_euclidean.calc_metrics(y_test, y_pred_k1e)
  knn_k1_mahalonobis.calc_metrics(y_test, y_pred_k1m)
  knn_k5_euclidean.calc_metrics(y_test, y_pred_k5e)
  knn_k5_mahalonobis.calc_metrics(y_test, y_pred_k5m)

  # Treinamento da árvore de decisão
  dt_gini.fit(X_train, y_train)
  dt_entropy.fit(X_train, y_train)

  # Predição
  y_pred_gini = dt_gini.predict(X_test)
  y_pred_entropy = dt_entropy.predict(X_test)

  # Salva as métricas do fold
  dt_gini.calc_metrics(y_test, y_pred_gini)
  dt_entropy.calc_metrics(y_test, y_pred_entropy)

In [14]:
def mean_std(values):
  arr = np.asarray(values, dtype=float)
  mean = np.mean(arr)
  std = np.std(arr, ddof=1) if len(arr) > 1 else 0.0 # ddof=1 é para o cálculo amostral
  return mean, std

In [15]:
def print_results(model_name, metrics_dict):
  acc_mean, acc_std = mean_std(metrics_dict["acc"])
  rec_mean, rec_std = mean_std(metrics_dict["rec"])
  prec_mean, prec_std = mean_std(metrics_dict["prec"])
  f1_mean, f1_std = mean_std(metrics_dict["f1"])

  print(f"\n===== {model_name} =====")
  print(f"Acurácia:  {acc_mean*100:.2f}% ± {acc_std*100:.2f}%")
  print(f"Recall:    {rec_mean*100:.2f}% ± {rec_std*100:.2f}%")
  print(f"Precisão:  {prec_mean*100:.2f}% ± {prec_std*100:.2f}%")
  print(f"F1-score:  {f1_mean*100:.2f}% ± {f1_std*100:.2f}%")

In [16]:
print_results("KNN k=1 (Euclidiana)", knn_k1_euclidean.metrics)
print_results("KNN k=1 (mahalonobis)", knn_k1_mahalonobis.metrics)
print_results("KNN k=5 (Euclidiana)", knn_k5_euclidean.metrics)
print_results("KNN k=5 (mahalonobis)", knn_k5_mahalonobis.metrics)
print_results("Árvore de Decisão (Gini)", dt_gini.metrics)
print_results("Árvore de Decisão (Entropia)", dt_entropy.metrics)


===== KNN k=1 (Euclidiana) =====
Acurácia:  76.26% ± 8.21%
Recall:    73.09% ± 13.94%
Precisão:  79.49% ± 11.70%
F1-score:  75.24% ± 9.32%

===== KNN k=1 (mahalonobis) =====
Acurácia:  71.46% ± 11.07%
Recall:    65.64% ± 12.55%
Precisão:  77.89% ± 18.35%
F1-score:  69.96% ± 10.59%

===== KNN k=5 (Euclidiana) =====
Acurácia:  80.30% ± 6.87%
Recall:    81.45% ± 7.20%
Precisão:  81.73% ± 11.88%
F1-score:  80.86% ± 5.26%

===== KNN k=5 (mahalonobis) =====
Acurácia:  74.26% ± 5.94%
Recall:    64.55% ± 12.08%
Precisão:  82.61% ± 11.91%
F1-score:  71.21% ± 7.34%

===== Árvore de Decisão (Gini) =====
Acurácia:  73.74% ± 7.28%
Recall:    65.45% ± 14.76%
Precisão:  82.80% ± 14.49%
F1-score:  71.12% ± 7.95%

===== Árvore de Decisão (Entropia) =====
Acurácia:  71.90% ± 8.50%
Recall:    66.27% ± 11.72%
Precisão:  78.99% ± 15.00%
F1-score:  70.51% ± 6.30%
