In [None]:
import kagglehub
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns

## Dados

In [None]:
# Download the pumpkin seeds dataset through kagglehub and remember the local
# directory it reports, then show that directory to the reader.
dataset_handle = "muratkokludataset/pumpkin-seeds-dataset"
path = kagglehub.dataset_download(dataset_handle)
print("Caminho para os arquivos do dataset:", path)

Using Colab cache for faster access to the 'pumpkin-seeds-dataset' dataset.
Caminho para os arquivos do dataset: /kaggle/input/pumpkin-seeds-dataset


In [None]:
# Search the whole download directory for Excel workbooks instead of assuming
# that the first entry returned by os.listdir() is the folder holding them:
# listdir order is arbitrary and that first entry may be a plain file.
excel_paths = sorted(
    os.path.join(root, name)
    for root, _dirs, names in os.walk(path)
    for name in names
    if name.lower().endswith('.xlsx')
)

if not excel_paths:
    raise FileNotFoundError(f"Nenhum arquivo .xlsx encontrado em {path}")

# Sorting makes the choice deterministic when several workbooks are present.
full_path = excel_paths[0]
actual_data_path, excel_file_name = os.path.split(full_path)

# Names defined by the previous version of this cell, kept so that any other
# cell still referring to them keeps working.
sub_dir = os.path.relpath(actual_data_path, path)
files_in_sub_dir = os.listdir(actual_data_path)
excel_files = [f for f in files_in_sub_dir if f.lower().endswith('.xlsx')]

In [None]:
# Read the workbook located in the previous cell into a DataFrame, report
# which file was used and preview the first five rows.
df = pd.read_excel(io=full_path)

print("Arquivo carregado:", excel_file_name)
display(df.head(n=5))

Arquivo carregado: Pumpkin_Seeds_Dataset.xlsx


Unnamed: 0,Area,Perimeter,Major_Axis_Length,Minor_Axis_Length,Convex_Area,Equiv_Diameter,Eccentricity,Solidity,Extent,Roundness,Aspect_Ration,Compactness,Class
0,56276,888.242,326.1485,220.2388,56831,267.6805,0.7376,0.9902,0.7453,0.8963,1.4809,0.8207,Çerçevelik
1,76631,1068.146,417.1932,234.2289,77280,312.3614,0.8275,0.9916,0.7151,0.844,1.7811,0.7487,Çerçevelik
2,71623,1082.987,435.8328,211.0457,72663,301.9822,0.8749,0.9857,0.74,0.7674,2.0651,0.6929,Çerçevelik
3,66458,992.051,381.5638,222.5322,67118,290.8899,0.8123,0.9902,0.7396,0.8486,1.7146,0.7624,Çerçevelik
4,66107,998.146,383.8883,220.4545,67117,290.1207,0.8187,0.985,0.6752,0.8338,1.7413,0.7557,Çerçevelik


## Normalização dos Dados

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Feature columns are everything except the target. Excluding the target by
# name (rather than by position) keeps this correct if the column order of the
# sheet ever changes.
numerical_cols = df.columns.drop('Class')

# Fit the scaler on training rows only. Fitting it on the whole frame would let
# the min/max of validation and test rows leak into the preprocessing step.
# This preliminary split uses the same test_size, random_state and stratify
# labels as the train/temporary split performed later on the normalized data,
# so it selects the same training rows -- keep the two calls in sync.
train_rows, _ = train_test_split(
    df, test_size=0.3, random_state=42, stratify=df['Class']
)

scaler = MinMaxScaler()
scaler.fit(train_rows[numerical_cols])

# Apply the training-set scaling to every row. Values of non-training rows may
# fall slightly outside [0, 1]; that is expected.
df_normalized = df.copy()
df_normalized[numerical_cols] = scaler.transform(df[numerical_cols])

print("DataFrame com colunas numéricas normalizadas:")
display(df_normalized.head())

DataFrame com colunas numéricas normalizadas:


Unnamed: 0,Area,Perimeter,Major_Axis_Length,Minor_Axis_Length,Convex_Area,Equiv_Diameter,Eccentricity,Solidity,Extent,Roundness,Aspect_Ration,Compactness,Class
0,0.09406,0.028593,0.015551,0.443011,0.094037,0.121346,0.538377,0.944591,0.766869,0.887532,0.166458,0.755304,Çerçevelik
1,0.32371,0.28896,0.282492,0.534065,0.321202,0.384261,0.735526,0.963061,0.683352,0.751688,0.316881,0.546062,Çerçevelik
2,0.267208,0.310438,0.337143,0.383178,0.269913,0.323187,0.839474,0.885224,0.752212,0.552727,0.459187,0.3839,Çerçevelik
3,0.208936,0.178831,0.178027,0.457938,0.208314,0.257917,0.702193,0.944591,0.751106,0.763636,0.28356,0.585876,Çerçevelik
4,0.204975,0.187652,0.184843,0.444415,0.208303,0.25339,0.716228,0.875989,0.573009,0.725195,0.296938,0.566405,Çerçevelik


# Naive Bayes

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target column from the feature columns of the normalized frame
# and report the dimensions of both.
target_column = 'Class'
y = df_normalized[target_column]
X = df_normalized.drop(columns=[target_column])

print(f"Shape das features (X): {X.shape}")
print(f"Shape do target (y): {y.shape}")

Shape das features (X): (2500, 12)
Shape do target (y): (2500,)


In [None]:
# Stratified hold-out: 70% of the rows go to training, the remaining 30% are
# kept aside as a temporary set that a later cell divides further.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Report the dimensions of each resulting piece.
for caption, subset in (
    ("Shape das features de treino:", X_train),
    ("Shape do target de treino:", y_train),
    ("Shape das features temporárias:", X_temp),
    ("Shape do target temporário:", y_temp),
):
    print(caption, subset.shape)

Shape das features de treino: (1750, 12)
Shape do target de treino: (1750,)
Shape das features temporárias: (750, 12)
Shape do target temporário: (750,)


In [None]:
# Divide the temporary set in half, stratified by class, to obtain the
# validation and test sets.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
    stratify=y_temp,
)

# Report the dimensions of each resulting piece.
for caption, subset in (
    ("Shape das features de validação:", X_val),
    ("Shape do target de validação:", y_val),
    ("Shape das features de teste:", X_test),
    ("Shape do target de teste:", y_test),
):
    print(caption, subset.shape)

Shape das features de validação: (375, 12)
Shape do target de validação: (375,)
Shape das features de teste: (375, 12)
Shape do target de teste: (375,)


In [None]:
from sklearn.naive_bayes import GaussianNB

# Create a Gaussian Naive Bayes model with default settings and fit it on the
# training split; fit() returns the estimator itself, so both steps chain.
gnb_classifier = GaussianNB().fit(X_train, y_train)

print("Classificador Gaussian Naive Bayes inicializado e treinado com sucesso.")

Classificador Gaussian Naive Bayes inicializado e treinado com sucesso.


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predict on the test split and score the predictions. Precision, recall and
# F1 use the 'weighted' average across classes.
y_pred_test = gnb_classifier.predict(X_test)

weighted_avg = {'average': 'weighted'}
accuracy_test = accuracy_score(y_test, y_pred_test)
precision_test = precision_score(y_test, y_pred_test, **weighted_avg)
recall_test = recall_score(y_test, y_pred_test, **weighted_avg)
f1_test = f1_score(y_test, y_pred_test, **weighted_avg)

# One metric per line, four decimal places each.
print("\n".join([
    f"Acurácia de Teste: {accuracy_test:.4f}",
    f"Precisão de Teste: {precision_test:.4f}",
    f"Recall de Teste: {recall_test:.4f}",
    f"F1-score de Teste: {f1_test:.4f}",
]))

Acurácia de Teste: 0.8693
Precisão de Teste: 0.8703
Recall de Teste: 0.8693
F1-score de Teste: 0.8691


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predict on the validation split and score the predictions. Precision, recall
# and F1 use the 'weighted' average across classes.
y_pred_val = gnb_classifier.predict(X_val)

weighted_avg = {'average': 'weighted'}
accuracy_val = accuracy_score(y_val, y_pred_val)
precision_val = precision_score(y_val, y_pred_val, **weighted_avg)
recall_val = recall_score(y_val, y_pred_val, **weighted_avg)
f1_val = f1_score(y_val, y_pred_val, **weighted_avg)

# One metric per line, four decimal places each.
print("\n".join([
    f"Acurácia de Validação: {accuracy_val:.4f}",
    f"Precisão de Validação: {precision_val:.4f}",
    f"Recall de Validação: {recall_val:.4f}",
    f"F1-score de Validação: {f1_val:.4f}",
]))

Acurácia de Validação: 0.8533
Precisão de Validação: 0.8542
Recall de Validação: 0.8533
F1-score de Validação: 0.8530
