In [None]:
# -*- coding: utf-8 -*-
"""Pertemuan9_Clustering_StudiKasus.ipynb
Studi Kasus: Segmentasi Pelanggan Mall
"""

# Import Library
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings('ignore')

print("Library berhasil diimport!")

In [None]:
# Load the dataset
# Data source: https://www.kaggle.com/datasets/vjchoudhary7/customer-segmentation-tutorial-in-python?resource=download
# The CSV location can be overridden through the MALL_CUSTOMERS_CSV environment
# variable; when it is unset the original Colab path is used.
data_path = os.environ.get("MALL_CUSTOMERS_CSV", "/content/dataset/Mall_Customers.csv")
df = pd.read_csv(data_path)

# Fail early, with a clear message, if the columns used by the later cells
# (clustering features and cluster profiles) are not present in the file.
required_columns = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(f"Kolom wajib tidak ditemukan di {data_path}: {missing_columns}")

print("Shape Dataset:", df.shape)
print("\n 5 Data Teratas:")
# Last expression of the cell, so the notebook renders it as a table.
df.head()

In [None]:
# Data exploration
print("Info Dataset:")
df.info()  # info() prints its report itself and returns None

print("\n Statistik Deskriptif:")
# A notebook only auto-displays the LAST expression of a cell. describe() sits
# in the middle of this cell, so its result must be printed explicitly or it
# is silently discarded.
print(df.describe())

# Scatter plot of the two features that are clustered later on
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='Annual Income (k$)', y='Spending Score (1-100)', ax=ax)
ax.set_title('Sebaran Data Pelanggan: Annual Income vs Spending Score')
ax.grid(True)
plt.show()

In [None]:
# Preprocessing: build the feature matrix used by both clustering models
feature_cols = ['Annual Income (k$)', 'Spending Score (1-100)']
X = df[feature_cols]

# Standardise the features (zero mean, unit variance); both K-Means and DBSCAN
# are distance based, so the features should share a common scale.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

print("Shape data setelah scaling:", X_scaled.shape)
print("Contoh 5 data setelah scaling:")
print(X_scaled[:5])

# X_scaled is now ready for clustering.

In [None]:
# MODEL 1: K-Means clustering
print("--- MODEL 1: K-MEANS CLUSTERING ---")

# Elbow method: fit K-Means for a range of K and record the within-cluster
# sum of squares (WCSS, exposed by scikit-learn as `inertia_`).
# The candidate range is defined once so the loop and the plot cannot drift.
k_values = range(1, 11)
wcss = []  # Within-Cluster Sum of Squares, one entry per K

for k in k_values:
    # n_init is pinned explicitly: its default differs between scikit-learn
    # releases, and the notebook silences the warning that announces it.
    kmeans = KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=42)
    kmeans.fit(X_scaled)
    wcss.append(kmeans.inertia_)

# Plot the elbow curve
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(k_values, wcss, marker='o', linestyle='--')
ax.set_xticks(list(k_values))  # K is an integer, so show integer ticks only
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('WCSS')
ax.set_title('Elbow Method untuk Menentukan Jumlah Cluster Optimal')
ax.grid(True)
plt.show()

In [None]:
# Fit and visualise the final K-Means model
# K=5 is chosen from the elbow plot.
kmeans_final = KMeans(n_clusters=5, init='k-means++', random_state=42)
y_kmeans = kmeans_final.fit_predict(X_scaled)

# Attach the cluster label to the original dataframe
df['Cluster_KMeans'] = y_kmeans

# The model was fitted on X_scaled, so cluster_centers_ are in scaled units;
# inverse_transform maps them back to the original units.
centers_scaled = kmeans_final.cluster_centers_
centers_original = scaler.inverse_transform(centers_scaled)

fig, (ax_scaled, ax_original) = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: scaled feature space. The points must come from X_scaled so that
# they share a coordinate system with the scaled centroids (plotting the
# unscaled dataframe columns here would put the centroids in the wrong place).
sns.scatterplot(x=X_scaled[:, 0], y=X_scaled[:, 1], hue=y_kmeans,
                palette='viridis', s=60, ax=ax_scaled)
ax_scaled.scatter(centers_scaled[:, 0], centers_scaled[:, 1],
                  s=300, c='red', marker='X', label='Centroids')
ax_scaled.set_xlabel('Annual Income (scaled)')
ax_scaled.set_ylabel('Spending Score (scaled)')
ax_scaled.set_title('Hasil Clustering K-Means (Scaled Data)')
ax_scaled.legend(title='Cluster_KMeans')
ax_scaled.grid(True)

# Right panel: original units, easier to interpret
sns.scatterplot(data=df, x='Annual Income (k$)', y='Spending Score (1-100)',
                hue='Cluster_KMeans', palette='viridis', s=60, ax=ax_original)
ax_original.scatter(centers_original[:, 0], centers_original[:, 1],
                    s=300, c='red', marker='X', label='Centroids')
ax_original.set_title('Hasil Clustering K-Means (Original Data)')
ax_original.legend()
ax_original.grid(True)

fig.tight_layout()
plt.show()

In [None]:
# K-Means evaluation
# Both metrics are computed on the scaled features the model was fitted on.
davies_bouldin_kmeans = davies_bouldin_score(X_scaled, y_kmeans)
silhouette_kmeans = silhouette_score(X_scaled, y_kmeans)

print("--- EVALUASI K-MEANS ---")
print(f"Silhouette Score: {silhouette_kmeans:.3f}")
print(f"Davies-Bouldin Index: {davies_bouldin_kmeans:.3f}")

# Per-cluster profile: mean of the selected columns within each cluster
profile_columns = ['Annual Income (k$)', 'Spending Score (1-100)', 'Age']
grouped_by_cluster = df.groupby('Cluster_KMeans')
cluster_profile_kmeans = grouped_by_cluster[profile_columns].mean()

print("\nProfil Cluster K-Means (Rata-rata):")
print(cluster_profile_kmeans)

In [None]:
# MODEL 2: DBSCAN clustering
print("\n--- MODEL 2: DBSCAN CLUSTERING ---")

# Fit DBSCAN; eps and min_samples can be tuned.
dbscan = DBSCAN(eps=0.5, min_samples=5).fit(X_scaled)
y_dbscan = dbscan.labels_

# Attach the cluster label to the dataframe
df['Cluster_DBSCAN'] = y_dbscan

# DBSCAN marks noise points with the label -1; count clusters without it.
is_noise = y_dbscan == -1
n_noise = int(np.count_nonzero(is_noise))
n_clusters_dbscan = len(np.unique(y_dbscan[~is_noise]))

print(f"Jumlah cluster yang terbentuk: {n_clusters_dbscan}")
print(f"Jumlah titik yang dianggap noise: {n_noise}")
print(f"Label cluster unik: {np.unique(y_dbscan)}")

In [None]:
# Visualise the DBSCAN result in scaled and in original units
fig, axes = plt.subplots(1, 2, figsize=(12, 6))

# One entry per panel: (x values, y values, x label, y label, title)
panels = [
    (X_scaled[:, 0], X_scaled[:, 1],
     'Annual Income (scaled)', 'Spending Score (scaled)',
     'Hasil Clustering DBSCAN (Scaled Data)'),
    (df['Annual Income (k$)'], df['Spending Score (1-100)'],
     'Annual Income (k$)', 'Spending Score (1-100)',
     'Hasil Clustering DBSCAN (Original Data)'),
]

for ax, (x_values, y_values, x_label, y_label, panel_title) in zip(axes, panels):
    # Points are coloured by DBSCAN label
    scatter = ax.scatter(x_values, y_values, c=y_dbscan, cmap='plasma', s=60)
    fig.colorbar(scatter, ax=ax)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.set_title(panel_title)
    ax.grid(True)

fig.tight_layout()
plt.show()

In [None]:
# DBSCAN evaluation, computed on non-noise points only (label -1 is noise)
non_noise_mask = y_dbscan != -1
X_non_noise = X_scaled[non_noise_mask]
y_dbscan_non_noise = y_dbscan[non_noise_mask]

# Both metrics need at least two clusters. This single check also covers the
# "no cluster at all" case, because the non-noise label set is then empty.
if np.unique(y_dbscan_non_noise).size >= 2:
    silhouette_dbscan = silhouette_score(X_non_noise, y_dbscan_non_noise)
    davies_bouldin_dbscan = davies_bouldin_score(X_non_noise, y_dbscan_non_noise)
else:
    # Undefined metrics are reported as NaN. Numeric placeholders such as -1
    # or 10 are indistinguishable from genuine scores once printed.
    silhouette_dbscan = np.nan
    davies_bouldin_dbscan = np.nan

print("--- EVALUASI DBSCAN ---")
print(f"Silhouette Score: {silhouette_dbscan:.3f}")
print(f"Davies-Bouldin Index: {davies_bouldin_dbscan:.3f}")

# DBSCAN cluster profile (column means per cluster, noise excluded)
if n_clusters_dbscan > 0:
    print("\n📈 Profil Cluster DBSCAN (Rata-rata - Exclude Noise):")
    profile_columns = ['Annual Income (k$)', 'Spending Score (1-100)', 'Age']
    clustered_rows = df[df['Cluster_DBSCAN'] != -1]
    cluster_profile_dbscan = clustered_rows.groupby('Cluster_DBSCAN')[profile_columns].mean()
    print(cluster_profile_dbscan)
else:
    print("Tidak ada cluster yang terbentuk selain noise.")

In [None]:
# Comparison of results and interpretation
# Summary of both models
print("=== PERBANDINGAN HASIL CLUSTERING ===\n")

print("K-MEANS:")
# Read the cluster count from the fitted model instead of repeating the
# literal, so the summary stays correct if K is changed.
print(f"  - Jumlah Cluster: {kmeans_final.n_clusters}")
print(f"  - Silhouette Score: {silhouette_kmeans:.3f}")
print(f"  - Davies-Bouldin Index: {davies_bouldin_kmeans:.3f}")

print("\nDBSCAN:")
print(f"  - Jumlah Cluster: {n_clusters_dbscan}")
print(f"  - Jumlah Noise: {n_noise}")
print(f"  - Silhouette Score: {silhouette_dbscan:.3f}")
print(f"  - Davies-Bouldin Index: {davies_bouldin_dbscan:.3f}")

# Example interpretation of the K-Means clusters.
# NOTE(review): these label-to-segment descriptions are hard-coded and are not
# derived from the fitted model; confirm each line against the printed
# cluster_profile_kmeans table before relying on them.
print("\n--- INTERPRETASI SEGMEN PELANGGAN (K-MEANS) ---")
print("Cluster 0: Income Rendah, Spending Rendah -> HEMAT")
print("Cluster 1: Income Tinggi, Spending Tinggi -> TARGET PRIMER")
print("Cluster 2: Income Menengah, Spending Sedang -> BIASA SAJA")
print("Cluster 3: Income Tinggi, Spending Rendah -> PERHITUNGAN")
print("Cluster 4: Income Rendah, Spending Tinggi -> CEROBOH")