In [None]:
# 1. PREPROCESSING
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings('ignore')

# Fetch the NLTK resources ('punkt', 'stopwords', 'punkt_tab') one after the
# other; quiet=True suppresses the downloader's console output.
for nltk_resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(nltk_resource, quiet=True)

# Read the review dataset; the first CSV column is used as the DataFrame index.
df = pd.read_csv('adaKami-reviews.csv', index_col=0)

# Report how many reviews were loaded and preview the first rows.
print("Total reviews: {}".format(len(df)))
print("\nFirst few rows:")
print(df.head())

# Text preprocessing function
def preprocess_text(text):
    """Normalise one raw review into lowercase, letters-only text.

    Args:
        text: The raw review value. Missing values (``None``, ``NaN``,
            ``pd.NA``) yield an empty string. Any other non-string value
            (for example a number parsed from the CSV) is converted with
            ``str()`` first instead of raising ``AttributeError``.

    Returns:
        str: The review lowercased, with URLs, @mentions and #hashtags
        removed, every character other than ``a-z`` or whitespace replaced
        by a space, and runs of whitespace collapsed to a single space with
        no leading or trailing space.
    """
    if not isinstance(text, str):
        if pd.isna(text):
            return ""
        # Non-string cells have no .lower(); coerce them so one odd row
        # cannot abort the whole .apply() pass.
        text = str(text)
    # Convert to lowercase
    text = text.lower()
    # Remove URLs (anything starting with "http" or "www" up to the next space)
    text = re.sub(r'http\S+|www\S+', '', text)
    # Remove mentions and hashtags
    text = re.sub(r'@\w+|#\w+', '', text)
    # Replace digits, punctuation and any non a-z character with a space so
    # neighbouring words are not glued together
    text = re.sub(r'[^a-z\s]', ' ', text)
    # Collapse repeated whitespace and trim the ends
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Run the text normaliser over every raw review
df['cleaned_review'] = df['review'].map(preprocess_text)

# Stopword set = NLTK's Indonesian list + NLTK's English list + a few
# hand-picked Indonesian function words
custom_stopwords = {
    'yang', 'di', 'ke', 'dari', 'untuk', 'ini',
    'itu', 'dengan', 'dan', 'atau', 'pada',
}
stop_words = (
    set(stopwords.words('indonesian'))
    | set(stopwords.words('english'))
    | custom_stopwords
)

def remove_stopwords(text):
    """Drop stopwords and tokens shorter than three characters.

    The text is split on whitespace; every token that is in the module-level
    ``stop_words`` set or has at most two characters is discarded, and the
    surviving tokens are joined back with single spaces.
    """
    kept_tokens = []
    for token in text.split():
        if len(token) <= 2 or token in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Strip stopwords and very short tokens from each cleaned review
df['processed_review'] = df['cleaned_review'].map(remove_stopwords)

# Keep only the reviews that still contain text, renumbering rows from 0
has_text = df['processed_review'].str.len() > 0
df = df.loc[has_text].reset_index(drop=True)

print("\nAfter preprocessing: {} reviews".format(len(df)))
print("\nSample processed reviews:")
print(df[['review', 'processed_review']].head())

# TF-IDF Vectorization: vocabulary capped at 1000 terms; terms appearing in
# fewer than 2 reviews or in more than 80% of the reviews are ignored
tfidf_options = dict(max_features=1000, min_df=2, max_df=0.8)
vectorizer = TfidfVectorizer(**tfidf_options)
X = vectorizer.fit_transform(df['processed_review'])

print("\nTF-IDF matrix shape: {}".format(X.shape))
print("Number of features: {}".format(len(vectorizer.get_feature_names_out())))


In [None]:
# 2. WORDCLOUD
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Concatenate every processed review into one space-separated string
all_text = ' '.join(df['processed_review'])

# Configure the word cloud first, then populate it from the combined text
wordcloud = WordCloud(
    width=1200,
    height=600,
    background_color='white',
    colormap='viridis',
    max_words=100,
    relative_scaling=0.5,
    min_font_size=10,
)
wordcloud.generate(all_text)

# Render the cloud as an image without axes
plt.figure(figsize=(15, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title(
    'Word Cloud - AdaKami Reviews',
    fontsize=20,
    fontweight='bold',
    pad=20,
)
plt.tight_layout()
plt.show()

# Count whitespace-separated tokens and list the 20 most frequent ones
from collections import Counter
words = all_text.split()
word_freq = Counter(words).most_common(20)

print("\nTop 20 Most Frequent Words:")
for term, count in word_freq:
    print("{}: {}".format(term, count))


In [None]:
# 3. CLUSTERING
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Elbow method: fit one K-Means model per K in 2..10 and record its inertia
k_range = range(2, 11)
inertias = []

print("Training K-Means models...")
for n_clusters in k_range:
    # fit() returns the estimator itself, so the model can be bound directly
    kmeans = KMeans(
        n_clusters=n_clusters,
        random_state=42,
        n_init=10,
        max_iter=300,
    ).fit(X)
    inertias.append(kmeans.inertia_)
    print("K={}: Inertia = {:.2f}".format(n_clusters, kmeans.inertia_))

# Cluster with the chosen K (fixed at 4 here rather than derived from the
# elbow curve)
optimal_k = 4
banner = '=' * 50
print("\n" + banner)
print("Performing K-Means Clustering with K={}".format(optimal_k))
print(banner)

kmeans_final = KMeans(
    n_clusters=optimal_k,
    random_state=42,
    n_init=10,
    max_iter=300,
)
df['cluster'] = kmeans_final.fit_predict(X)

# Number of reviews assigned to each cluster, ordered by cluster id
print("\nCluster Distribution:")
print(df['cluster'].value_counts().sort_index())

# Print the first three reviews of every cluster, with their ratings
print("\n" + banner)
print("Sample Reviews from Each Cluster:")
print(banner)
for cluster in sorted(df['cluster'].unique()):
    print("\n--- CLUSTER {} ---".format(cluster))
    samples = df.loc[df['cluster'] == cluster].head(3)
    for review_text, rating in zip(samples['review'], samples['rating']):
        # Only the first 150 characters are shown; the ellipsis is always added
        print("Review: {}...".format(review_text[:150]))
        print("Rating: {}".format(rating))
        print()

# Project the TF-IDF vectors onto two principal components for plotting.
# The sparse matrix is converted to a dense array before being passed to PCA.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X.toarray())

# Column 0 / column 1 of the projection become the plot coordinates
df['pca1'], df['pca2'] = X_pca[:, 0], X_pca[:, 1]

print("\nPCA explained variance ratio: {}".format(pca.explained_variance_ratio_))
print("Total variance explained: {:.4f}".format(sum(pca.explained_variance_ratio_)))


In [None]:
# 4. VISUALIZATION (Elbow Method & Cluster Visualization)
import matplotlib.pyplot as plt
import seaborn as sns

# Global plot styling for this and later figures
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (16, 6)

# One figure, two side-by-side panels: elbow curve (left), PCA scatter (right)
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
elbow_ax, scatter_ax = axes

# Text styling shared by both panels
label_style = dict(fontsize=12, fontweight='bold')
title_style = dict(fontsize=14, fontweight='bold', pad=15)

# --- Plot 1: Elbow Method ---
elbow_ax.plot(k_range, inertias, 'bo-', linewidth=2, markersize=8)
elbow_ax.set_xlabel('Number of Clusters (K)', **label_style)
elbow_ax.set_ylabel('Inertia (Within-Cluster Sum of Squares)', **label_style)
elbow_ax.set_title('Elbow Method - Optimal K Selection', **title_style)
elbow_ax.grid(True, alpha=0.3)
elbow_ax.set_xticks(k_range)

# Mark the K that was used for the final clustering
elbow_ax.axvline(
    x=optimal_k,
    color='red',
    linestyle='--',
    linewidth=2,
    label='Optimal K = {}'.format(optimal_k),
)
elbow_ax.legend(fontsize=10)

# --- Plot 2: Cluster Visualization (2D PCA) ---
# Palette is indexed by cluster id, wrapping around if there are more
# clusters than colours
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A', '#98D8C8', '#F7DC6F', '#BB8FCE', '#85C1E2']
for cluster in sorted(df['cluster'].unique()):
    in_cluster = df['cluster'] == cluster
    cluster_data = df.loc[in_cluster]
    scatter_ax.scatter(
        cluster_data['pca1'],
        cluster_data['pca2'],
        c=colors[cluster % len(colors)],
        label='Cluster {}'.format(cluster),
        alpha=0.6,
        s=50,
        edgecolors='black',
        linewidth=0.5,
    )

scatter_ax.set_xlabel('First Principal Component', **label_style)
scatter_ax.set_ylabel('Second Principal Component', **label_style)
scatter_ax.set_title('K-Means Clustering Visualization (K={})'.format(optimal_k), **title_style)
scatter_ax.legend(fontsize=10, loc='best')
scatter_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Stacked bar chart: for every cluster, how many reviews carry each rating
fig, ax = plt.subplots(figsize=(12, 6))

# Rows = cluster id, columns = rating value, cells = review count
# (0 where a cluster/rating combination does not occur)
cluster_rating = (
    df.groupby(['cluster', 'rating'])
    .size()
    .unstack(fill_value=0)
)
cluster_rating.plot.bar(stacked=True, ax=ax, colormap='viridis', width=0.7)

axis_label_style = dict(fontsize=12, fontweight='bold')
ax.set_xlabel('Cluster', **axis_label_style)
ax.set_ylabel('Count', **axis_label_style)
ax.set_title('Rating Distribution per Cluster', fontsize=14, fontweight='bold', pad=15)
ax.legend(title='Rating', fontsize=10)
# Keep the cluster ids upright instead of the default rotated tick labels
ax.set_xticklabels(ax.get_xticklabels(), rotation=0)
plt.tight_layout()
plt.show()


In [None]:
# 5. SILHOUETTE SCORE
from sklearn.metrics import silhouette_score, silhouette_samples
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np

# Mean silhouette score for every candidate K in k_range
silhouette_scores = []
rule = "=" * 60

print("Calculating Silhouette Scores for different K values...")
print(rule)
for n_clusters in k_range:
    kmeans = KMeans(
        n_clusters=n_clusters,
        random_state=42,
        n_init=10,
        max_iter=300,
    )
    cluster_labels = kmeans.fit_predict(X)
    silhouette_avg = silhouette_score(X, cluster_labels)
    silhouette_scores.append(silhouette_avg)
    print("K={}: Silhouette Score = {:.4f}".format(n_clusters, silhouette_avg))

# The K whose mean silhouette score is highest
best_position = np.argmax(silhouette_scores)
print("\n" + rule)
print(
    "Best K based on Silhouette Score: K={} (Score: {:.4f})".format(
        k_range[best_position], max(silhouette_scores)
    )
)
print(rule)

# Visualize Silhouette Scores: left panel shows the mean score for each K,
# right panel shows the per-sample silhouette plot for the chosen K.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Plot 1: Silhouette Score vs K
axes[0].plot(k_range, silhouette_scores, 'go-', linewidth=2, markersize=8)
axes[0].set_xlabel('Number of Clusters (K)', fontsize=12, fontweight='bold')
axes[0].set_ylabel('Silhouette Score', fontsize=12, fontweight='bold')
axes[0].set_title('Silhouette Score Analysis', fontsize=14, fontweight='bold', pad=15)
axes[0].grid(True, alpha=0.3)
axes[0].set_xticks(k_range)
# Horizontal guide at the best score found across all K
axes[0].axhline(y=max(silhouette_scores), color='red', linestyle='--', alpha=0.5, 
                label=f'Max Score = {max(silhouette_scores):.4f}')
axes[0].legend(fontsize=10)

# Plot 2: Silhouette Plot for optimal K
# Refit at optimal_k so that cluster_labels, silhouette_avg and
# sample_silhouette_values describe the chosen K rather than whichever K
# was fitted last.
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10, max_iter=300)
cluster_labels = kmeans.fit_predict(X)
silhouette_avg = silhouette_score(X, cluster_labels)
sample_silhouette_values = silhouette_samples(X, cluster_labels)

# Only one colour per cluster is needed and it is computed inside the loop.
# The name `colors` is deliberately not rebound here: the cluster
# visualisation cell uses `colors` for its palette list, and overwriting it
# with an unused per-sample array would break a re-run of that cell.
y_lower = 10  # vertical offset of the first band; bands are separated by gaps of 10

for i in range(optimal_k):
    # Sorted coefficients of cluster i, so the band has a monotone edge
    ith_cluster_silhouette_values = np.sort(sample_silhouette_values[cluster_labels == i])

    size_cluster_i = ith_cluster_silhouette_values.shape[0]
    y_upper = y_lower + size_cluster_i

    color = cm.nipy_spectral(float(i) / optimal_k)
    # One horizontal band per cluster: band height = cluster size,
    # band width at each row = that sample's silhouette coefficient
    axes[1].fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

    # Cluster id label, placed left of zero at the band's vertical centre
    axes[1].text(-0.05, y_lower + 0.5 * size_cluster_i, str(i), fontweight='bold')
    y_lower = y_upper + 10

axes[1].set_xlabel('Silhouette Coefficient Values', fontsize=12, fontweight='bold')
axes[1].set_ylabel('Cluster Label', fontsize=12, fontweight='bold')
axes[1].set_title(f'Silhouette Plot for K={optimal_k}\n(Avg Score: {silhouette_avg:.4f})', 
                  fontsize=14, fontweight='bold', pad=15)
# Vertical guide at the mean silhouette score over all samples
axes[1].axvline(x=silhouette_avg, color='red', linestyle='--', linewidth=2, 
                label=f'Average Score = {silhouette_avg:.4f}')
# The y axis only stacks the bands, so its tick values carry no meaning
axes[1].set_yticks([])
axes[1].legend(fontsize=10)

plt.tight_layout()
plt.show()

# Overall statistics of the per-sample silhouette values for the chosen K
print("\nSilhouette Score Summary for K={}:".format(optimal_k))
print("=" * 60)
overall_stats = (
    ("Average", silhouette_avg),
    ("Min", sample_silhouette_values.min()),
    ("Max", sample_silhouette_values.max()),
    ("Std", sample_silhouette_values.std()),
)
for stat_name, stat_value in overall_stats:
    print("{} Silhouette Score: {:.4f}".format(stat_name, stat_value))

# Mean silhouette value and member count of every cluster
print("\nSilhouette Score per Cluster:")
for cluster_id in range(optimal_k):
    cluster_silhouette = sample_silhouette_values[cluster_labels == cluster_id]
    print("  Cluster {}: {:.4f} (size: {})".format(
        cluster_id, cluster_silhouette.mean(), len(cluster_silhouette)))
