# Clusterização

In [None]:
!pip install kagglehub[pandas-datasets]



In [1]:
import kagglehub
from kagglehub import KaggleDatasetAdapter

import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

In [2]:
# Name of the file inside the Kaggle dataset that will be downloaded
file_path = "wine-clustering.csv"

In [3]:
# Fetch the file from the Kaggle dataset and load it into a pandas DataFrame.
# `dataset_load` is the current kagglehub entry point; the older
# `load_dataset` alias is deprecated and emits a warning when called.
df = kagglehub.dataset_load(
    KaggleDatasetAdapter.PANDAS,
    "harrywang/wine-dataset-for-clustering",
    file_path,
)

  df = kagglehub.load_dataset(KaggleDatasetAdapter.PANDAS,"harrywang/wine-dataset-for-clustering",file_path)


Using Colab cache for faster access to the 'wine-dataset-for-clustering' dataset.


In [4]:
# Dimensions of the loaded frame as (rows, columns)
df.shape

(178, 13)

In [5]:
# Column names, non-null counts and dtypes of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Alcohol               178 non-null    float64
 1   Malic_Acid            178 non-null    float64
 2   Ash                   178 non-null    float64
 3   Ash_Alcanity          178 non-null    float64
 4   Magnesium             178 non-null    int64  
 5   Total_Phenols         178 non-null    float64
 6   Flavanoids            178 non-null    float64
 7   Nonflavanoid_Phenols  178 non-null    float64
 8   Proanthocyanins       178 non-null    float64
 9   Color_Intensity       178 non-null    float64
 10  Hue                   178 non-null    float64
 11  OD280                 178 non-null    float64
 12  Proline               178 non-null    int64  
dtypes: float64(11), int64(2)
memory usage: 18.2 KB


In [6]:
# Preview the first rows of the loaded frame
df.head()

Unnamed: 0,Alcohol,Malic_Acid,Ash,Ash_Alcanity,Magnesium,Total_Phenols,Flavanoids,Nonflavanoid_Phenols,Proanthocyanins,Color_Intensity,Hue,OD280,Proline
0,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [7]:
# Keep only the numeric columns and collect their names in a plain list
numeric_columns = list(
    df.select_dtypes(include="number").columns
)

In [8]:
# Feature matrix for clustering: every row, restricted to the numeric columns
X = df.loc[:, numeric_columns]

In [9]:
# Standardize the features: learn the per-column statistics from X,
# then apply the scaling to the same data
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [10]:
# Project the standardized features onto their first two principal components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

In [11]:
# Scan candidate cluster counts and score each K-Means partition with the
# silhouette coefficient, computed on the standardized features.
k_range = range(2, 9)  # candidate values of k: 2 through 8
silhouette_scores = []

for k in k_range:
    # n_init is set explicitly: scikit-learn's default changed from 10 to
    # "auto" (a single k-means++ run), so leaving it implicit makes the
    # scores depend on the installed version and on one initialization.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    labels = kmeans.fit_predict(X_scaled)
    score = silhouette_score(X_scaled, labels)
    silhouette_scores.append(score)
    print(f"k={k}: Silhouette Score = {score:.3f}")

k=2: Silhouette Score = 0.265
k=3: Silhouette Score = 0.285
k=4: Silhouette Score = 0.254
k=5: Silhouette Score = 0.184
k=6: Silhouette Score = 0.169
k=7: Silhouette Score = 0.173
k=8: Silhouette Score = 0.163


In [12]:
# Pick the k with the highest silhouette score (first one wins on ties)
best_index = int(np.argmax(silhouette_scores))
best_k = k_range[best_index]
best_score = silhouette_scores[best_index]
print(best_score)

0.2848589191898987


In [13]:
# Fit the final K-Means model with the selected number of clusters and
# assign every sample to a cluster.
# n_init is explicit so the result does not depend on the scikit-learn
# version's default (10 in older releases, a single run under "auto" in
# newer ones).
kmeans_final = KMeans(n_clusters=best_k, n_init=10, random_state=42)
clusters = kmeans_final.fit_predict(X_scaled)

In [14]:
# Report how many samples landed in each cluster and their share of the total.
# The header is a plain string: it has no placeholders, so no f-prefix is needed.
print("Distribuição dos clusters:")

cluster_counts = pd.Series(clusters).value_counts().sort_index()
total_samples = len(clusters)  # loop-invariant, computed once
for cluster_id, count in cluster_counts.items():
    percentage = count / total_samples * 100
    print(f"  Cluster {cluster_id}: {count} amostras ({percentage:.1f}%)")

Distribuição dos clusters:
  Cluster 0: 65 amostras (36.5%)
  Cluster 1: 51 amostras (28.7%)
  Cluster 2: 62 amostras (34.8%)
