In [None]:
# SENTENCE-BERT - Preprocessing & Generate Embeddings
import pandas as pd
import numpy as np
import re
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
import umap
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import silhouette_score
import warnings
warnings.filterwarnings('ignore')

# Read the review export (first CSV column is used as the index), discard rows
# that have no review text, and renumber the remaining rows from zero.
df = (
    pd.read_csv('adaKami-reviews.csv', index_col=0)
    .dropna(subset=['review'])
    .reset_index(drop=True)
)
print(f"Total reviews: {len(df)}")

# LIGHT preprocessing for Sentence-BERT.
# IMPORTANT: do not clean too aggressively - Sentence-BERT needs sentence context.
def light_preprocess(text):
    """
    Lightly clean one review before it is embedded with Sentence-BERT.

    Steps, applied in this order:
    - remove URLs, e-mail addresses and @mentions
    - collapse repeated punctuation (``!!!`` -> ``!``, ``???`` -> ``?``,
      ``...`` -> ``.``)
    - collapse whitespace runs into a single space and trim both ends

    Stopwords are NOT removed and the text is NOT lowercased: both carry
    context that the sentence encoder can use.

    Parameters
    ----------
    text : str or any scalar
        The raw review. Missing values (``None``, ``NaN``, ``pd.NA``) yield
        an empty string. Any other non-string value is converted with
        ``str()`` instead of crashing inside ``re.sub``.

    Returns
    -------
    str
        The cleaned review, possibly empty.
    """
    if not isinstance(text, str):
        # Only scalars can be "missing"; pd.isna on a list-like would return
        # an array whose truth value is ambiguous.
        if text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
            return ""
        # e.g. a digits-only review that the CSV reader parsed as a number.
        text = str(text)

    # (pattern, replacement) pairs. Order matters: e-mail addresses must be
    # removed before bare @mentions, otherwise "@domain" would be stripped
    # first and leave the local part of the address behind.
    substitutions = (
        (r'http\S+|www\.\S+', ''),   # URLs
        (r'\S+@\S+', ''),            # e-mail addresses
        (r'@\w+', ''),               # mentions
        (r'[!]{2,}', '!'),           # !!!! -> !
        (r'[?]{2,}', '?'),           # ???? -> ?
        (r'\.{2,}', '.'),            # .... -> .
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    # Normalise whitespace last so gaps left by the removals are closed too.
    return re.sub(r'\s+', ' ', text).strip()

# Run the light cleaning routine over every review.
print("\nPreprocessing text...")
df['processed_review'] = df['review'].apply(light_preprocess)

# Keep only reviews whose cleaned text is longer than 10 characters; this
# drops reviews that became empty as well as very short ones.
has_enough_text = df['processed_review'].str.len() > 10
df = df.loc[has_enough_text].reset_index(drop=True)
print(f"After preprocessing: {len(df)} reviews")

# Print the first (up to three) reviews before and after cleaning, each
# truncated to 150 characters.
banner = "=" * 80
print("\n" + banner)
print("CONTOH PREPROCESSING:")
print(banner)
for raw_text, cleaned_text in zip(df['review'].head(3), df['processed_review'].head(3)):
    print(f"\nORIGINAL:\n{raw_text[:150]}...")
    print(f"\nPROCESSED:\n{cleaned_text[:150]}...")
    print("-" * 80)

# Load the Sentence-BERT model (a multilingual checkpoint, chosen for Indonesian text).
print("\nLoading Sentence-BERT model...")
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

# Encode the cleaned reviews into embeddings, 32 sentences per batch.
print("\nGenerating embeddings from preprocessed text...")
sentences = df['processed_review'].tolist()
embeddings = model.encode(sentences, batch_size=32, show_progress_bar=True)
print(f"Embeddings shape: {embeddings.shape}")
print("\n✓ Embeddings berhasil dibuat! Lanjut ke cell berikutnya untuk elbow method.")


In [None]:
# ELBOW METHOD - Find the optimal K, then cluster with it and project with UMAP
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

# Fit KMeans for every candidate K and record its inertia (for the elbow plot)
# and its silhouette score. Fitted models are kept so the best one can be
# reused afterwards instead of being trained a second time.
n_samples = len(embeddings)
# silhouette_score needs 2 <= n_clusters <= n_samples - 1, so cap the search
# for very small datasets rather than failing partway through the loop.
max_k = min(10, n_samples - 1)
if max_k < 2:
    raise ValueError(
        f"Need at least 3 embedded reviews to evaluate clustering, got {n_samples}."
    )
k_range = range(2, max_k + 1)
inertias = []
silhouette_scores = []
fitted_models = {}

print("Menghitung Elbow Method dan Silhouette Score...")
print("="*60)
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10, max_iter=300)
    labels = kmeans.fit_predict(embeddings)
    fitted_models[k] = kmeans
    inertias.append(kmeans.inertia_)
    sil_score = silhouette_score(embeddings, labels)
    silhouette_scores.append(sil_score)
    print(f"K={k}: Inertia={kmeans.inertia_:.2f}, Silhouette={sil_score:.4f}")

# Pick the K with the highest silhouette score (first one wins on ties).
best_score = max(silhouette_scores)
best_k = k_range[silhouette_scores.index(best_score)]

# Visualisation: elbow curve on the left, silhouette curve on the right.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Plot 1: Elbow Method
axes[0].plot(k_range, inertias, 'bo-', linewidth=2, markersize=8)
axes[0].set_xlabel('Number of Clusters (K)', fontsize=12, fontweight='bold')
axes[0].set_ylabel('Inertia (WCSS)', fontsize=12, fontweight='bold')
axes[0].set_title('Elbow Method - Sentence-BERT Embeddings', fontsize=14, fontweight='bold', pad=15)
axes[0].grid(True, alpha=0.3)
axes[0].set_xticks(k_range)

# Plot 2: Silhouette Score, with the best K marked by a vertical line
axes[1].plot(k_range, silhouette_scores, 'go-', linewidth=2, markersize=8)
axes[1].set_xlabel('Number of Clusters (K)', fontsize=12, fontweight='bold')
axes[1].set_ylabel('Silhouette Score', fontsize=12, fontweight='bold')
axes[1].set_title('Silhouette Score Analysis', fontsize=14, fontweight='bold', pad=15)
axes[1].grid(True, alpha=0.3)
axes[1].set_xticks(k_range)
axes[1].axvline(x=best_k, color='red', linestyle='--', linewidth=2, 
                label=f'Best K={best_k} (Score={best_score:.4f})')
axes[1].legend(fontsize=10)

plt.tight_layout()
plt.show()

print(f"\n{'='*60}")
print(f"Rekomendasi K berdasarkan Silhouette Score: K={best_k}")
print(f"{'='*60}")

# Final clustering with the optimal K.
# The model for this K was already fitted in the loop above with the same
# seed and hyperparameters (max_iter=300 is the KMeans default), so it is
# reused here; refitting and recomputing the silhouette score would repeat
# the most expensive steps for the same result.
optimal_k = best_k
print(f"\nMenggunakan K={optimal_k} untuk clustering...")
kmeans = fitted_models[optimal_k]
df['cluster'] = kmeans.labels_
silhouette = best_score
print(f"Silhouette Score: {silhouette:.4f}")
print(f"\nCluster Distribution:")
print(df['cluster'].value_counts().sort_index())

# Project the embeddings to two dimensions with UMAP so the clusters can be
# drawn on a scatter plot; the two coordinates are stored as df columns.
print("\nReducing dimensions with UMAP...")
reducer = umap.UMAP(
    n_components=2,
    n_neighbors=15,
    min_dist=0.1,
    random_state=42,
)
embeddings_2d = reducer.fit_transform(embeddings)
for axis, column in enumerate(('umap1', 'umap2')):
    df[column] = embeddings_2d[:, axis]


In [None]:
# CLUSTER VISUALIZATION
fig, axes = plt.subplots(1, 2, figsize=(18, 7))

# Plot 1: UMAP scatter, one colour per cluster.
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A', '#98D8C8', '#F7DC6F', '#BB8FCE', '#85C1E2']
# The K search can select more clusters than there are hand-picked colours.
# With the modulo lookup below, that would give two different clusters the
# same colour, so switch to a generated palette large enough for every label.
n_colors_needed = int(df['cluster'].max()) + 1
if n_colors_needed > len(colors):
    colors = sns.color_palette('husl', n_colors_needed).as_hex()

for cluster in sorted(df['cluster'].unique()):
    cluster_data = df[df['cluster'] == cluster]
    axes[0].scatter(cluster_data['umap1'], cluster_data['umap2'], 
                   c=colors[cluster % len(colors)], 
                   label=f'Cluster {cluster} (n={len(cluster_data)})',
                   alpha=0.7, s=60, edgecolors='black', linewidth=0.5)

axes[0].set_xlabel('UMAP 1', fontsize=13, fontweight='bold')
axes[0].set_ylabel('UMAP 2', fontsize=13, fontweight='bold')
axes[0].set_title(f'Sentence-BERT Clustering (K={optimal_k})\nSilhouette Score: {silhouette:.4f}', 
                  fontsize=15, fontweight='bold', pad=15)
axes[0].legend(fontsize=10, loc='best', framealpha=0.9)
axes[0].grid(True, alpha=0.3)

# Plot 2: Rating distribution per cluster, as stacked bars.
# Rows are clusters, columns are rating values, cells are review counts;
# cluster/rating combinations that never occur are filled with 0.
cluster_rating = df.groupby(['cluster', 'rating']).size().unstack(fill_value=0)
cluster_rating.plot(kind='bar', stacked=True, ax=axes[1], colormap='RdYlGn', width=0.7)
axes[1].set_xlabel('Cluster', fontsize=13, fontweight='bold')
axes[1].set_ylabel('Count', fontsize=13, fontweight='bold')
axes[1].set_title('Rating Distribution per Cluster', fontsize=15, fontweight='bold', pad=15)
axes[1].legend(title='Rating', fontsize=10, title_fontsize=11)
# Re-apply the tick labels with rotation=0 so cluster ids are drawn horizontally.
axes[1].set_xticklabels(axes[1].get_xticklabels(), rotation=0)
axes[1].grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

# Print up to three sampled reviews for every cluster. The sampling seed is
# fixed, so the same reviews are shown on every run. Texts are truncated to
# 180 characters.
heavy_rule = "=" * 80
print("\n" + heavy_rule)
print("SAMPLE REVIEWS DARI SETIAP CLUSTER:")
print(heavy_rule)
for cluster in sorted(df['cluster'].unique()):
    # Select the cluster's rows once and reuse them for the count and the sample.
    members = df.loc[df['cluster'] == cluster]
    member_count = len(members)

    print(f"\n{heavy_rule}")
    print(f"CLUSTER {cluster} - Total: {member_count} reviews")
    print(f"{heavy_rule}")

    samples = members.sample(n=min(3, member_count), random_state=42)
    for idx, (row_label, row) in enumerate(samples.iterrows(), start=1):
        print(f"\n{idx}. Rating: {row['rating']} ⭐")
        print(f"   Original : {row['review'][:180]}...")
        print(f"   Processed: {row['processed_review'][:180]}...")
    print()
