## Projeto 02 - Árvores de Decisão
**Estudante:** João Rodrigues

### Importando recursos necessários

In [None]:
from sklearn import datasets
import pandas as pd
import numpy as np
from typing import Callable

### Criando as funções com as árvores de decisão:
Cada função também descreve o tipo de entrada e o tipo de saída.

Note que eu decidi receber um dicionário como entrada das árvores; essa escolha
foi baseada puramente em achar mais fácil trabalhar com dicionários.

Entretanto, cada dicionário deve conter apenas uma linha (row) do dataframe.

In [None]:
def iris_decision_tree_one(dict: dict) -> float:
  """
  Classify one iris sample with the deeper, hand-coded decision tree.

      Arguments:
        dict (dict): Exactly one dataframe row, keyed by column name. Reads
          'petal length (cm)', 'petal width (cm)' and, on a single branch,
          'sepal width (cm)'. Keys are only looked up on the branch that
          needs them.

      Returns:
        float: The predicted species
          0.0: Iris-setosa
          1.0: Iris-versicolor
          2.0: Iris-virginica
  """
  petal_length = dict['petal length (cm)']

  # Root split: short petals are always classified as setosa.
  if petal_length <= 2.45:
    return 0.0

  petal_width = dict['petal width (cm)']

  if petal_width <= 1.75:
    # Narrow petals: the width threshold and its meaning flip with the length.
    if petal_length <= 4.95:
      return 1.0 if petal_width <= 1.65 else 2.0
    return 2.0 if petal_width <= 1.55 else 1.0

  # Wide petals: only the shorter ones need the sepal width to decide.
  if petal_length <= 4.85:
    return 2.0 if dict['sepal width (cm)'] <= 3.1 else 1.0

  return 2.0


In [None]:
def iris_decision_tree_two(dict: dict) -> float:
  """
  Classify one iris sample with the shallow, two-split decision tree.

      Arguments:
        dict (dict): Exactly one dataframe row, keyed by column name. Reads
          'petal length (cm)' and, when that is above 2.45,
          'petal width (cm)'.

      Returns:
        float: The predicted species
          0.0: Iris-setosa
          1.0: Iris-versicolor
          2.0: Iris-virginica
  """
  # First split isolates setosa by petal length alone.
  if dict['petal length (cm)'] <= 2.45:
    return 0.0

  # Second split separates the remaining two species by petal width.
  return 1.0 if dict['petal width (cm)'] <= 1.75 else 2.0

### Criando as funções de avaliação do desempenho

Com as árvores prontas, é hora de construir as funções que vão avaliar o desempenho.

Para isso, decidi construir cada etapa da forma mais abstrata que consegui,
podendo assim passar uma função como argumento de outra.

In [None]:
def check_answer(model_result, data: dict) -> tuple[(bool, float)]:
  """
  Pair a prediction with whether it matches the row's actual species.

      Arguments:
        model_result (float): The predicted species
        data (dict): The row passed to the model; its 'target' entry holds
          the actual species

      Returns:
        tuple (bool, float):
          Bool: whether data['target'] equals the predicted species
          Float: the predicted species, normalised to 0.0, 1.0 or 2.0
        None when model_result is not one of the supported species.

      Species:
        0.0: Iris-setosa
        1.0: Iris-versicolor
        2.0: Iris-virginica
  """
  for species in (0.0, 1.0, 2.0):
    if model_result == species:
      return (data['target'] == species, species)

  # Unsupported prediction: there is nothing to compare against.
  return None

In [None]:
def confusion_matrix(model: Callable, dataframe: pd.DataFrame) -> list[list[int]]:
  """
  Returns a 3x3 confusion matrix (list of lists) for the given model.

  matrix[actual][predicted] is the number of rows whose 'target' is `actual`
  and for which the model returned `predicted`.

      Arguments:
        model (function): A function that takes one row as a dictionary
          (column name -> value) and returns the predicted class as a float
        dataframe (pandas.DataFrame): The rows to evaluate; must contain a
          'target' column with the actual class (0.0, 1.0 or 2.0)

      Returns:
        matrix list[list[int]]: The confusion matrix. Rows whose target, or
        whose prediction, is not 0.0, 1.0 or 2.0 are not counted.
  """
  classes = (0.0, 1.0, 2.0)
  matrix = [[0] * len(classes) for _ in classes]

  # Iterate rows by position rather than by index label, so dataframes with
  # a non-default or duplicated index (e.g. after filtering or splitting)
  # are handled correctly. Each row is converted to a dict exactly once.
  for row in dataframe.to_dict('records'):
    actual = row['target']
    if actual not in classes:
      continue

    predicted = model(row)
    if predicted in classes:
      matrix[int(actual)][int(predicted)] += 1

  return matrix

In [None]:
def accuracy(confusion_matrix: list[list[int]]) -> float:
  """
  Returns the accuracy of the model

      Arguments:
          confusion_matrix (list): Square confusion matrix (list of lists)
            whose diagonal holds the correctly classified samples

      Returns:
          accuracy (float): Fraction of samples on the diagonal, or 0.0 when
          the matrix contains no samples at all

      Formula: accuracy = correct predictions / all predictions
  """
  total = sum(sum(row) for row in confusion_matrix)
  if total == 0:
    # No samples were counted: avoid dividing by zero.
    return 0.0

  # Sized from the matrix itself, so any number of classes is supported.
  correct = sum(confusion_matrix[i][i] for i in range(len(confusion_matrix)))
  return correct / total

In [None]:
def precision(matrix: list, class_index: int) -> float:
  """
  Returns the precision of the model for the given class (iris species)

  The matrix is read as matrix[actual][predicted], which is the layout built
  by confusion_matrix in this file. Precision therefore divides the true
  positives by the column total: every sample predicted as the class.

      Arguments:
          matrix (list): Confusion matrix (list of lists)
          class_index (int): Index of the class (0, 1, 2)
            0: Iris-setosa
            1: Iris-versicolor
            2: Iris-virginica

      Returns:
          precision (float): Precision of the model for the given class
          if the class is not in the supported list [0, 1, 2] returns -1
          if nothing was predicted as the class returns 0.0

      Formula: precision = TP / (TP + FP)
  """

  if class_index not in [0, 1, 2]:
    return -1

  # TP + FP: everything the model labelled as this class (column total).
  predicted_as_class = sum(row[class_index] for row in matrix)
  if predicted_as_class == 0:
    return 0.0

  return matrix[class_index][class_index] / predicted_as_class

In [None]:
def recall(matrix: list, class_index: int) -> float:
  """
  Returns the recall of the model for the given class (iris species)

  The matrix is read as matrix[actual][predicted], which is the layout built
  by confusion_matrix in this file. Recall therefore divides the true
  positives by the row total: every sample that actually belongs to the class.

      Arguments:
          matrix (list): Confusion matrix (list of lists)
          class_index (int): Index of the class (0, 1, 2)
            0: Iris-setosa
            1: Iris-versicolor
            2: Iris-virginica

      Returns:
          recall (float): Recall of the model for the given class
          if no sample belongs to the class returns 0.0

      Formula: recall = TP / (TP + FN)
  """

  # TP + FN: every sample whose actual class is this one (row total).
  actual_in_class = sum(matrix[class_index])
  if actual_in_class == 0:
    return 0.0

  return matrix[class_index][class_index] / actual_in_class

In [None]:
def f1_score(matrix: list, class_index: int) -> float:
  """
  Returns the f1 score of the model for the given class (iris species)

      Arguments:
          matrix (list): Confusion matrix (list of lists)
          class_index (int): Index of the class (0, 1, 2)
            0: Iris-setosa
            1: Iris-versicolor
            2: Iris-virginica

      Returns:
          f1_score (float): F1 score of the model for the given class
          if the class has no samples and no predictions returns 0.0

      Formula: f1_score = 2 * (precision * recall) / (precision + recall)
      which, written in terms of the matrix, is
               f1_score = 2 * TP / (row total + column total)
  """

  true_positives = matrix[class_index][class_index]
  row_total = sum(matrix[class_index])
  column_total = sum(row[class_index] for row in matrix)

  # Row total + column total equals 2*TP + FP + FN. Working from the counts
  # avoids the 0/0 that the precision/recall form hits when TP is 0, and the
  # result does not depend on which axis holds the actual classes.
  denominator = row_total + column_total
  if denominator == 0:
    return 0.0

  return 2 * true_positives / denominator

### Carregando o dataset Iris
Agora vamos carregar o dataset Iris que virá do SKLearn

In [None]:
# Load the Iris dataset from scikit-learn and assemble one dataframe: the
# feature columns followed by a 'target' column holding the class index.
# np.c_ joins features and target column-wise into a single array.
iris = datasets.load_iris()
dados = pd.DataFrame(
  data=np.c_[iris['data'], iris['target']],
  columns=iris['feature_names'] + ['target'],
)


### Hora de testar!

Por fim, vamos rodar o código e avaliar os dois modelos primeiramente com
uma matriz de confusão.

In [None]:
# Build the confusion matrix of each decision tree over the whole dataset.
model1_matrix, model2_matrix = (
  confusion_matrix(tree, dados)
  for tree in (iris_decision_tree_one, iris_decision_tree_two)
)

print(f'''
Confusion Matrix Modelo 1:
{model1_matrix}
Confusion Matrix Modelo 2:
{model2_matrix}
''')

Agora que temos as matrizes de confusão prontas, vamos avaliar os modelos
por acurácia.

Note que a partir daqui, todos os resultados serão arredondados até a segunda
casa decimal.

In [None]:
# Accuracy of models 1 and 2.
model1_accuracy = accuracy(model1_matrix)
model2_accuracy = accuracy(model2_matrix)

# ':.2f' rounds to two decimal places. A bare ':.2' means two significant
# digits, which prints 1.0 as '1.0' and 0.05 as '0.05' but 0.005 as '0.005'.
print(f'''
Acurácia Modelo 1:
{model1_accuracy:.2f}
Acurácia Modelo 2:
{model2_accuracy:.2f}
''')

Agora, vamos avaliar cada modelo com base em sua precisão.

In [None]:
# Per-class precision of models 1 and 2.
model1_precision_0 = precision(model1_matrix, 0)
model1_precision_1 = precision(model1_matrix, 1)
model1_precision_2 = precision(model1_matrix, 2)

model2_precision_0 = precision(model2_matrix, 0)
model2_precision_1 = precision(model2_matrix, 1)
model2_precision_2 = precision(model2_matrix, 2)

# ':.2f' rounds to two decimal places (a bare ':.2' is two significant
# digits). '^N' centres each value in a field as wide as its column header,
# so the table stays aligned whatever the value is.
print(f'''
Precisão do Modelo 1:
Iris Setosa | Iris Versicolor | Iris Virginica
{model1_precision_0:^11.2f} | {model1_precision_1:^15.2f} | {model1_precision_2:^14.2f}

Precisão do Modelo 2:
Iris Setosa | Iris Versicolor | Iris Virginica
{model2_precision_0:^11.2f} | {model2_precision_1:^15.2f} | {model2_precision_2:^14.2f}
''')

Aproveitando que já estamos aqui, por que não calcular o recall de cada modelo
para cada classe (espécie)?

In [None]:
# Per-class recall of models 1 and 2.
model1_recall_0 = recall(model1_matrix, 0)
model1_recall_1 = recall(model1_matrix, 1)
model1_recall_2 = recall(model1_matrix, 2)

model2_recall_0 = recall(model2_matrix, 0)
model2_recall_1 = recall(model2_matrix, 1)
model2_recall_2 = recall(model2_matrix, 2)

# ':.2f' rounds to two decimal places (a bare ':.2' is two significant
# digits). '^N' centres each value in a field as wide as its column header,
# so the table stays aligned whatever the value is.
print(f'''
Recall do Modelo 1:
Iris Setosa | Iris Versicolor | Iris Virginica
{model1_recall_0:^11.2f} | {model1_recall_1:^15.2f} | {model1_recall_2:^14.2f}

Recall do Modelo 2:
Iris Setosa | Iris Versicolor | Iris Virginica
{model2_recall_0:^11.2f} | {model2_recall_1:^15.2f} | {model2_recall_2:^14.2f}
''')

E, para fechar com chave de ouro, vamos calcular o F1-score de cada modelo
para cada classe (espécie).

In [None]:
# Per-class F1 score of models 1 and 2.
model1_f1score_0 = f1_score(model1_matrix, 0)
model1_f1score_1 = f1_score(model1_matrix, 1)
model1_f1score_2 = f1_score(model1_matrix, 2)

model2_f1score_0 = f1_score(model2_matrix, 0)
model2_f1score_1 = f1_score(model2_matrix, 1)
model2_f1score_2 = f1_score(model2_matrix, 2)

# ':.2f' rounds to two decimal places (a bare ':.2' is two significant
# digits). '^N' centres each value in a field as wide as its column header,
# so the table stays aligned whatever the value is.
print(f'''
F1-Score do Modelo 1:
Iris Setosa | Iris Versicolor | Iris Virginica
{model1_f1score_0:^11.2f} | {model1_f1score_1:^15.2f} | {model1_f1score_2:^14.2f}

F1-Score do Modelo 2:
Iris Setosa | Iris Versicolor | Iris Virginica
{model2_f1score_0:^11.2f} | {model2_f1score_1:^15.2f} | {model2_f1score_2:^14.2f}
''')