# <h3>Importação dos dados</h3>

In [1]:
import pickle
import numpy as np
import pandas as pd

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the raw dataset into a DataFrame.
# NOTE(review): the path lives under /content/drive, so this presumably expects
# Google Drive to be mounted already — no mount step is visible here; confirm.
csv_path = '/content/drive/MyDrive/Colab/data.csv'
df = pd.read_csv(csv_path)

# <h3>Pré-processamento dos dados</h3>

In [3]:
# Count the missing (NaN) values in each column.
df.isnull().sum()

id                 0
Area               0
MajorAxisLength    0
MinorAxisLength    0
Eccentricity       0
ConvexArea         0
EquivDiameter      0
Extent             0
Perimeter          0
Roundness          0
AspectRation       0
Class              0
dtype: int64

In [4]:
# Preview the first five rows of the dataset.
df.head(n=5)

Unnamed: 0,id,Area,MajorAxisLength,MinorAxisLength,Eccentricity,ConvexArea,EquivDiameter,Extent,Perimeter,Roundness,AspectRation,Class
0,1,4537,92.229316,64.012769,0.719916,4677,76.004525,0.657536,273.085,0.76451,1.440796,jasmine
1,2,2872,74.691881,51.400454,0.725553,3015,60.471018,0.713009,208.317,0.831658,1.453137,jasmine
2,3,3048,76.293164,52.043491,0.731211,3132,62.296341,0.759153,210.012,0.868434,1.46595,jasmine
3,4,3073,77.033628,51.928487,0.738639,3157,62.5513,0.783529,210.657,0.870203,1.483456,jasmine
4,5,3693,85.124785,56.374021,0.749282,3802,68.571668,0.769375,230.332,0.874743,1.51,jasmine


In [5]:
# Drop the unneeded `id` column.
# Reassigning (instead of `inplace=True`) together with `errors='ignore'` keeps
# this cell idempotent: re-running it after `id` is already gone is a no-op
# rather than a KeyError.
df = df.drop(columns=['id'], errors='ignore')

In [6]:
# Rename the label column from `Class` to `target`.
df = df.rename(columns={'Class': 'target'})

In [7]:
# Check how balanced the classes in the target column are.
df['target'].value_counts()

jasmine    9985
Gonen      8200
Name: target, dtype: int64

*As classes da coluna target estão apenas levemente desbalanceadas (9985 amostras de jasmine contra 8200 de Gonen); por isso, optou-se por não aplicar técnicas de balanceamento, mantendo assim os dados originais.*

In [8]:
# Split the frame into the feature matrix X and the label vector y.
# Features are selected by dropping the label column by name, so the result
# does not depend on `target` being the last column of the CSV (a positional
# slice would silently leak the label into X if the column order changed).
X = df.drop(columns=['target'])
y = df['target']

In [9]:
# Hold out 30% of the samples for validation; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): the split is not stratified on y — class proportions in the two
# parts may differ slightly from the full dataset; confirm this is intended.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

# <h3>Algoritmos classificadores</h3>

> Naive Bayes

In [10]:
# Fit a Gaussian Naive Bayes classifier on the training split, then predict
# the labels of the validation split.
nb_model = GaussianNB().fit(X_train, y_train)
nb_predict = nb_model.predict(X_val)

> KNN

In [11]:
# Fit a 10-nearest-neighbours classifier on the training split, then predict
# the labels of the validation split.
# NOTE(review): the features are passed in unscaled; the data preview shows
# Area in the thousands while Eccentricity stays below 1, so distances are
# presumably dominated by the large-magnitude columns — consider standardizing
# the features before KNN and re-checking the reported metrics.
knn_model = KNeighborsClassifier(n_neighbors=10).fit(X_train, y_train)
knn_predict = knn_model.predict(X_val)

> Random Forest

In [12]:
# Fit a Random Forest (500 trees, depth capped at 10, fixed seed) on the
# training split, then predict the labels of the validation split.
forest_model = RandomForestClassifier(
    n_estimators=500,
    max_depth=10,
    random_state=42,
).fit(X_train, y_train)
forest_predict = forest_model.predict(X_val)

# <h3>Métricas</h3>

> Naive Bayes

In [13]:
# Per-class precision, recall and F1 of Naive Bayes on the validation split.
nb_report = classification_report(y_val, nb_predict, digits=4)
print(nb_report)

              precision    recall  f1-score   support

       Gonen     0.9962    0.9600    0.9777      2447
     jasmine     0.9684    0.9970    0.9825      3009

    accuracy                         0.9804      5456
   macro avg     0.9823    0.9785    0.9801      5456
weighted avg     0.9808    0.9804    0.9803      5456



> KNN

In [14]:
# Per-class precision, recall and F1 of KNN on the validation split.
knn_report = classification_report(y_val, knn_predict, digits=4)
print(knn_report)

              precision    recall  f1-score   support

       Gonen     0.9933    0.9714    0.9822      2447
     jasmine     0.9771    0.9947    0.9858      3009

    accuracy                         0.9842      5456
   macro avg     0.9852    0.9830    0.9840      5456
weighted avg     0.9844    0.9842    0.9842      5456



> Random Forest

In [15]:
# Per-class precision, recall and F1 of the Random Forest on the validation split.
forest_report = classification_report(y_val, forest_predict, digits=4)
print(forest_report)

              precision    recall  f1-score   support

       Gonen     0.9886    0.9881    0.9884      2447
     jasmine     0.9904    0.9907    0.9905      3009

    accuracy                         0.9896      5456
   macro avg     0.9895    0.9894    0.9894      5456
weighted avg     0.9896    0.9896    0.9896      5456



# <h3>Considerações finais</h3>



> Todos os algoritmos de classificação aplicados ao contexto apresentaram ótimo desempenho nas métricas de precisão e recall, bem como na acurácia. Entretanto, observa-se uma pequena vantagem do RandomForestClassifier (acurácia de 0,9896, contra 0,9842 do KNN e 0,9804 do Naive Bayes); portanto, o modelo a ser utilizado nas demais etapas da atividade será o obtido por esse algoritmo.

In [16]:
# Serialize the fitted Random Forest to `modelo.pkl` in the working directory.
with open('modelo.pkl', mode='wb') as model_file:
    pickle.dump(forest_model, model_file)

In [17]:
# Hand the pickled model to the browser as a file download.
# The `google.colab` helper is only importable inside a Google Colab runtime.
from google.colab import files
model_filename = 'modelo.pkl'
files.download(model_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>