# Import

## Setup

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import display, Markdown

sns.set_theme(style="darkgrid")
pd.set_option('display.max_columns', None)  

import sys, os, yaml

DATASET = "Apple-Quality"
COLAB = 'google.colab' in sys.modules

DEBUG = False
SEED = 666

In [6]:
COLAB = 'google.colab' in sys.modules

if COLAB:
  from google.colab import drive
  if not os.path.isdir("/content/gdrive"):
    drive.mount("/content/gdrive")
    d = "/content/gdrive/MyDrive/datasets"
    if not os.path.isdir(ROOT+d): os.makedirs(ROOT+d)
  ROOT = f"/content/gdrive/MyDrive/datasets/{DATASET.replace(' ','_')}/"
  if not os.path.isdir(ROOT): os.makedirs(ROOT)
else:
  ROOT = "./"

def makedirs(d):
  if COLAB:
    if not os.path.isdir(ROOT+d): os.makedirs(ROOT+d)
  else:
    if not os.path.isdir(ROOT+d): os.makedirs(ROOT+d, mode=0o777, exist_ok=True)

for d in ['orig','data','output']: makedirs(d)

## Load Dataset

In [7]:
df = pd.read_pickle(f"{ROOT}/data/apple.pickle")

In [8]:
print(df.shape)
df.head()

(4000, 8)


Unnamed: 0,Size,Weight,Sweetness,Crunchiness,Juiciness,Ripeness,Acidity,Target
0,-1.798424,-0.950373,2.993421,-1.42415,0.690545,-0.089872,-0.269415,0
1,-0.35906,-1.154404,2.127698,0.429746,0.176767,0.19702,-0.378997,0
2,0.109445,-0.225759,-0.652507,-0.946892,1.205422,-0.286156,1.206044,1
3,-0.079977,-0.800146,0.923916,-0.772399,1.619575,-2.08732,0.338315,0
4,0.968573,-0.19164,0.044164,-1.096894,1.305025,-0.961548,0.201472,0


# Baseline Model

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import roc_curve, auc

## Split

In [10]:
from sklearn.model_selection import train_test_split

X = df.drop('Target', axis=1)  
y = df['Target']  

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

In [20]:
from sklearn.preprocessing import StandardScaler

classifiers = {
    "KNN": KNeighborsClassifier(),
    "KNN(3)": KNeighborsClassifier(n_neighbors=3),
    "RF": RandomForestClassifier(),
    "SVM": SVC(probability=True),
    "LGBM": LGBMClassifier(verbose=-1),
    "XGB": XGBClassifier()
}

scaler = StandardScaler()

## Train

In [21]:
X_train_scaled = scaler.fit_transform(X_train)

train_results = []

for name, model in classifiers.items():
    model.fit(X_train, y_train)
    train_accuracy = accuracy_score(y_train, model.predict(X_train)) * 100
    train_results.append([name, 'Before Scaling', f"{train_accuracy:.2f}%"])

for name, model in classifiers.items():
    model.fit(X_train_scaled, y_train)
    train_accuracy = accuracy_score(y_train, model.predict(X_train_scaled)) * 100
    train_results.append([name, 'After Scaling', f"{train_accuracy:.2f}%"])

train_df = pd.DataFrame(train_results, columns=['Model', 'Scaling', 'Train Accuracy'])

print("\nComparação Train Accuracy:\n")
print(train_df.to_string(index=False))



Comparação Train Accuracy:

 Model        Scaling Train Accuracy
   KNN Before Scaling         92.81%
KNN(3) Before Scaling         94.03%
    RF Before Scaling        100.00%
   SVM Before Scaling         90.22%
  LGBM Before Scaling         99.12%
   XGB Before Scaling        100.00%
   KNN  After Scaling         92.81%
KNN(3)  After Scaling         94.06%
    RF  After Scaling        100.00%
   SVM  After Scaling         90.16%
  LGBM  After Scaling         98.84%
   XGB  After Scaling        100.00%


## Test

In [23]:
X_test_scaled = scaler.transform(X_test)

test_results = []

for name, model in classifiers.items():
    model.fit(X_train, y_train)
    test_accuracy = accuracy_score(y_test, model.predict(X_test)) * 100
    test_results.append([name, 'Before Scaling', f"{test_accuracy:.2f}%"])

for name, model in classifiers.items():
    model.fit(X_train_scaled, y_train)
    test_accuracy = accuracy_score(y_test, model.predict(X_test_scaled)) * 100
    test_results.append([name, 'After Scaling', f"{test_accuracy:.2f}%"])

test_df = pd.DataFrame(test_results, columns=['Model', 'Scaling', 'Test Accuracy'])

print("\nComparação Test Accuracy:\n")
print(test_df.to_string(index=False))


Comparação Test Accuracy:

 Model        Scaling Test Accuracy
   KNN Before Scaling        91.50%
KNN(3) Before Scaling        89.75%
    RF Before Scaling        91.12%
   SVM Before Scaling        90.25%
  LGBM Before Scaling        89.62%
   XGB Before Scaling        90.75%
   KNN  After Scaling        91.25%
KNN(3)  After Scaling        90.00%
    RF  After Scaling        90.75%
   SVM  After Scaling        90.50%
  LGBM  After Scaling        90.62%
   XGB  After Scaling        90.75%


## Resultados

#### **Comparação Train Accuracy**
##### Antes da normalização:

- **KNN** = 92.69%         
- **KNN(3)** = 94.38%         
- **RF** = 100.00%        
- **SVM** = 90.16%         
- **LGBM** = 98.84%         
- **XGB** = 100.00%   

##### Depois da normalização:

- **KNN** = 92.81%        
- **KNN(3)** = 94.06%        
- **RF** = 100.00%        
- **SVM** = 90.16%         
- **LGBM** = 98.84%         
- **XGB** = 100.00%        

#### **Comparação Test Accuracy**
##### Antes da normalização:

- **KNN** = 91.12%   
- **KNN(3)** = 90.25%         
- **RF** = 90.50%         
- **SVM** = 90.50%         
- **LGBM** = 90.62%         
- **XGB** = 90.75%         

##### Depois da normalização:
- **KNN** = 91.25%         
- **KNN(3)** = 90.00%         
- **RF** = 90.50%         
- **SVM** = 90.50%         
- **LGBM** = 90.62%         
- **XGB** = 90.75%         

#### **Observações:**

- **KNN e KNN(3):** Observou-se uma leve melhoria na acurácia de teste após a normalização para o KNN básico, enquanto o KNN com 3 vizinhos teve uma ligeira queda.
- **Random Forest (RF):** Manteve uma acurácia de treino perfeita e uma acurácia de teste estável.
- **SVM:** A acurácia foi consistente antes e depois da normalização.
- **LGBM e XGB:** Ambos mostraram alta acurácia em treino e teste, com resultados consistentes antes e depois da normalização.


# Questões

**Pergunta:** Qual é a diferença entre dados de treinamento e dados de teste ao treinar um modelo de classificação?


Dados de treinamento são usados para ajustar o modelo e aprender padrões, enquanto dados de teste são usados para avaliar a performance do modelo em dados não vistos, garantindo que ele generalize bem para novas amostras.

**Pergunta:** Com base no dataset fornecido, a abordagem do treinamento do modelo deve ser `supervisionada` ou `não-supervisionada`? Por quê?


A abordagem do treinamento do modelo deve ser **supervisionada**. Isso porque o modelo está sendo treinado para prever um alvo (`Target`) com base em características (`X`), o que implica que o dataset inclui rótulos conhecidos para aprendizado supervisionado.