# Text Clustering Analysis - adaKami Reviews
## Dataset: adaKami-reviews.csv

Analisis ini mencakup:
1. Preprocessing
2. WordCloud
3. Clustering dengan Sentence-Transformers Embeddings (all-mpnet-base-v2)
4. Visualisasi
5. Silhouette Score


## 1. Import Libraries dan Load Data


In [None]:
# Import libraries
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import warnings
warnings.filterwarnings('ignore')

# Text processing and embeddings
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, silhouette_samples

# For interactive plots
import plotly.express as px
import plotly.graph_objects as go

# Set style
# Global plotting defaults applied to every matplotlib/seaborn figure below.
# NOTE(review): the 'seaborn-v0_8-*' style names presumably only exist in
# newer Matplotlib releases — confirm against the installed version.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

print("Libraries imported successfully!")


In [None]:
# Load data
# The CSV path is relative to the notebook's working directory. Later cells
# read the columns 'review', 'rating', 'date' and 'userName' from this frame.
df = pd.read_csv('adaKami-reviews.csv')
print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
print(f"\nFirst few rows:")
# Trailing bare expression: rendered as the cell output in a notebook.
df.head()


In [None]:
# Check for missing values
# Per-column count of null cells, followed by the raw row count before any
# filtering is applied.
print("Missing values:")
print(df.isnull().sum())
print(f"\nTotal reviews: {len(df)}")


## 2. Text Preprocessing


In [None]:
# Indonesian stopwords
indonesian_stopwords = {
    'yang', 'di', 'dan', 'ini', 'itu', 'dengan', 'untuk', 'pada', 'adalah', 'dari',
    'ke', 'dalam', 'oleh', 'tidak', 'akan', 'dapat', 'ada', 'atau', 'juga', 'saya',
    'sudah', 'jika', 'saat', 'telah', 'seperti', 'bisa', 'semua', 'lebih', 'sangat',
    'hanya', 'karena', 'belum', 'masih', 'mereka', 'kita', 'dia', 'kami', 'anda',
    'nya', 'apa', 'siapa', 'kapan', 'dimana', 'bagaimana', 'mengapa', 'saja',
    'baru', 'punya', 'mau', 'pernah', 'setelah', 'selalu', 'lagi', 'sama', 'sebagai',
    'kalau', 'bila', 'tapi', 'tetapi', 'walaupun', 'sedang', 'antara', 'sekitar',
    'ketika', 'sebelum', 'maka', 'tersebut', 'bahkan', 'melalui', 'hingga', 'agar',
    'namun', 'lalu', 'kemudian', 'jadi', 'apakah', 'ia', 'ataupun',
}

# Patterns are compiled once here because preprocess_text runs once per review.
_URL_RE = re.compile(r'http\S+|www\S+')
_EMAIL_RE = re.compile(r'\S+@\S+')
_MENTION_HASHTAG_RE = re.compile(r'@\w+|#\w+')
# Applied after lowercasing, so anything outside a-z (digits, punctuation,
# whitespace, non-ASCII letters) is a separator.
_NON_LETTER_RE = re.compile(r'[^a-z]+')


def preprocess_text(text):
    """
    Normalise one review into a space-separated string of content words.

    Steps, in order:
    - Return "" for missing values (None / NaN)
    - Convert to string and lowercase
    - Remove URLs
    - Remove email addresses
    - Remove mentions and hashtags
    - Replace every run of non a-z characters (digits, punctuation,
      whitespace) with a single space
    - Drop stopwords and words of length <= 2

    Args:
        text: Raw review value; any scalar accepted by ``pd.isna``.

    Returns:
        The cleaned text, or "" when nothing survives cleaning.
    """
    if pd.isna(text):
        return ""

    text = str(text).lower()

    # URLs, emails and handles are deleted outright so their fragments
    # (domain names, user names) do not leak into the vocabulary.
    text = _URL_RE.sub('', text)
    text = _EMAIL_RE.sub('', text)
    text = _MENTION_HASHTAG_RE.sub('', text)

    # Digits are treated like punctuation (replaced by a space) rather than
    # deleted, so "bunga10persen" yields "bunga persen" instead of the glued
    # token "bungapersen".
    text = _NON_LETTER_RE.sub(' ', text)

    # str.split() already ignores leading/trailing/repeated whitespace.
    return ' '.join(
        word for word in text.split()
        if len(word) > 2 and word not in indonesian_stopwords
    )

print("Preprocessing function defined!")


In [None]:
# Apply preprocessing
print("Starting preprocessing...")
df['cleaned_review'] = df['review'].apply(preprocess_text)

# Remove empty reviews
# The index is reset so that row positions run 0..n-1; the embedding and
# clustering cells later pair rows with array rows by position.
df = df[df['cleaned_review'].str.len() > 0].reset_index(drop=True)

print(f"Preprocessing complete!")
print(f"Total reviews after cleaning: {len(df)}")
print("\nExample of original vs cleaned text:")
# Shows the first surviving row only; .iloc[0] raises IndexError if every
# review was filtered out above.
print(f"\nOriginal: {df['review'].iloc[0][:200]}...")
print(f"\nCleaned: {df['cleaned_review'].iloc[0][:200]}...")


## 3. WordCloud Visualization


In [None]:
# Combine all cleaned text
# One space-joined corpus of every cleaned review; the word-frequency cell
# that follows reuses this string.
all_text = ' '.join(df['cleaned_review'])

# Create wordcloud
wordcloud = WordCloud(
    width=1600, 
    height=800,
    background_color='white',
    colormap='viridis',
    max_words=100,
    relative_scaling=0.5,
    min_font_size=10
).generate(all_text)

# Plot wordcloud
plt.figure(figsize=(20, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('WordCloud - adaKami Reviews', fontsize=20, fontweight='bold', pad=20)
plt.tight_layout(pad=0)
# Save before show(): the figure is written to disk while it is still current.
plt.savefig('wordcloud.png', dpi=300, bbox_inches='tight')
plt.show()

print("WordCloud generated and saved as 'wordcloud.png'")


In [None]:
# Top 20 most frequent words
from collections import Counter

# Token counts over the whole cleaned corpus (all_text comes from the
# word-cloud cell). These names are module-level globals.
words = all_text.split()
word_freq = Counter(words)
top_words = word_freq.most_common(20)

# Create bar plot
words_df = pd.DataFrame(top_words, columns=['Word', 'Frequency'])

plt.figure(figsize=(12, 6))
# NOTE(review): recent seaborn releases reportedly deprecate `palette` without
# `hue`; any such warning is hidden by the filterwarnings('ignore') call at the
# top of the file — confirm against the installed seaborn version.
sns.barplot(data=words_df, x='Frequency', y='Word', palette='viridis')
plt.title('Top 20 Most Frequent Words', fontsize=16, fontweight='bold')
plt.xlabel('Frequency', fontsize=12)
plt.ylabel('Word', fontsize=12)
plt.tight_layout()
plt.savefig('top_words.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nTop 20 words:")
print(words_df)


## 4. Sentence-Transformers Embeddings


In [None]:
# Load Sentence-Transformer model
# NOTE(review): the stopword list in this file targets Indonesian text, while
# all-mpnet-base-v2 is, as far as I know, trained mainly on English data —
# confirm the model is appropriate or consider a multilingual checkpoint.
print("Loading Sentence-Transformer model: all-mpnet-base-v2...")
print("This may take a moment on first run as the model needs to be downloaded...")
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# Create embeddings
print("\nGenerating embeddings for all reviews...")
print("This may take a few minutes depending on your hardware and dataset size...")

# Use original review text for better semantic understanding
# Rows of `embeddings` follow the current row order of df, which is why the
# preprocessing cell resets the index after dropping empty reviews.
embeddings = model.encode(
    df['review'].tolist(),
    show_progress_bar=True,
    batch_size=32,
    convert_to_numpy=True
)

print(f"\nEmbeddings shape: {embeddings.shape}")
print(f"Embedding dimension: {embeddings.shape[1]}")
print(f"Number of reviews: {embeddings.shape[0]}")


## 5. Finding Optimal Number of Clusters (Elbow Method & Silhouette Score)


In [None]:
# Elbow method to find optimal k
# For each candidate k (2..10) fit KMeans on the embeddings and record both
# the inertia (elbow curve) and the mean silhouette score.
inertias = []
silhouette_scores = []
K_range = range(2, 11)

print("Computing elbow method and silhouette scores...")
for k in K_range:
    print(f"Testing k={k}...", end=" ")
    # Fixed random_state keeps the sweep reproducible between runs.
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10, max_iter=300)
    kmeans.fit(embeddings)
    inertias.append(kmeans.inertia_)
    # Silhouette is computed over every sample (no sample_size argument).
    score = silhouette_score(embeddings, kmeans.labels_)
    silhouette_scores.append(score)
    print(f"Silhouette Score: {score:.4f}")

# Plot elbow curve
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 5))

# Elbow plot
ax1.plot(K_range, inertias, 'bo-', linewidth=2, markersize=8)
ax1.set_xlabel('Number of Clusters (k)', fontsize=12)
ax1.set_ylabel('Inertia', fontsize=12)
ax1.set_title('Elbow Method For Optimal k', fontsize=14, fontweight='bold')
ax1.grid(True, alpha=0.3)

# Silhouette score plot
ax2.plot(K_range, silhouette_scores, 'ro-', linewidth=2, markersize=8)
ax2.set_xlabel('Number of Clusters (k)', fontsize=12)
ax2.set_ylabel('Silhouette Score', fontsize=12)
ax2.set_title('Silhouette Score For Different k', fontsize=14, fontweight='bold')
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('elbow_silhouette.png', dpi=300, bbox_inches='tight')
plt.show()

# Find optimal k
# Selection uses the silhouette score only; the inertia curve is plotted for
# visual inspection. list.index returns the first maximum, so ties resolve to
# the smallest k.
optimal_k = K_range[silhouette_scores.index(max(silhouette_scores))]
print(f"\nOptimal number of clusters based on silhouette score: {optimal_k}")
print(f"Best silhouette score: {max(silhouette_scores):.4f}")


## 6. K-Means Clustering with Optimal k


In [None]:
# Use optimal k (or you can change this manually)
# n_clusters is read by the keyword, silhouette and summary cells below.
n_clusters = optimal_k

print(f"Performing K-Means clustering with k={n_clusters}...")
# Rebinds `kmeans` (left over from the sweep) to the final model; its
# cluster_centers_ are projected in the PCA plots later on.
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10, max_iter=300)
df['cluster'] = kmeans.fit_predict(embeddings)

print(f"\nClustering complete!")
print(f"\nCluster distribution:")
print(df['cluster'].value_counts().sort_index())


## 7. Cluster Analysis & Top Terms


In [None]:
# Get top keywords for each cluster using word frequency
from collections import Counter

def get_top_keywords_by_frequency(n_terms=15, data=None):
    """Extract the most frequent words of each cluster.

    Args:
        n_terms: Maximum number of keywords returned per cluster.
        data: Optional DataFrame with a 'cluster' column and a
            'cleaned_review' column of whitespace-separated tokens. When
            omitted, the module-level ``df`` is used and clusters
            ``0 .. n_clusters - 1`` are reported, exactly as before this
            parameter existed.

    Returns:
        dict mapping cluster id (int) to a list of up to ``n_terms`` words,
        most frequent first.
    """
    if data is None:
        # Legacy path: rely on the notebook globals.
        frame = df
        cluster_ids = range(n_clusters)
    else:
        frame = data
        # Derive the labels from the data itself so the function does not
        # depend on any global when a frame is passed in.
        cluster_ids = sorted(int(label) for label in frame['cluster'].unique())

    cluster_keywords = {}
    for cluster_id in cluster_ids:
        reviews = frame.loc[frame['cluster'] == cluster_id, 'cleaned_review']

        # Count tokens review by review; equivalent to joining everything
        # with spaces and splitting again, without building the big string.
        word_freq = Counter(word for review in reviews for word in review.split())

        cluster_keywords[cluster_id] = [
            word for word, _ in word_freq.most_common(n_terms)
        ]

    return cluster_keywords

# Keywords per cluster; also reused by the keyword export cell near the end.
top_keywords = get_top_keywords_by_frequency(n_terms=15)

print("\n" + "="*80)
print("TOP KEYWORDS FOR EACH CLUSTER")
print("="*80)

for cluster_id, keywords in top_keywords.items():
    count = len(df[df['cluster'] == cluster_id])
    print(f"\nCluster {cluster_id} ({count} reviews):")
    print("-" * 40)
    print(", ".join(keywords))
    print("\nSample reviews:")
    # The samples are simply the first three rows of the cluster in frame
    # order, truncated to 150 characters.
    samples = df[df['cluster'] == cluster_id]['review'].head(3)
    for idx, review in enumerate(samples, 1):
        print(f"  {idx}. {review[:150]}...")
    print()


In [None]:
# Cluster statistics
# Named aggregation ties each output column to its source explicitly instead
# of renaming a MultiIndex result by position.
cluster_stats = (
    df.assign(_review_length=df['review'].str.len())
    .groupby('cluster')
    .agg(
        Avg_Rating=('rating', 'mean'),
        # 'size' counts every row of the cluster; 'count' would silently skip
        # reviews whose rating is missing and under-report the cluster size.
        Review_Count=('rating', 'size'),
        Avg_Length=('_review_length', 'mean'),
    )
    .round(2)
    .reset_index()
)

print("\nCluster Statistics:")
print(cluster_stats)

# Visualize cluster statistics
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# One bar panel per statistic: (column, y-axis label, title, bar colour).
stat_panels = [
    ('Review_Count', 'Number of Reviews', 'Review Count per Cluster', 'skyblue'),
    ('Avg_Rating', 'Average Rating', 'Average Rating per Cluster', 'lightcoral'),
    ('Avg_Length', 'Average Review Length', 'Average Review Length per Cluster', 'lightgreen'),
]

for panel_ax, (stat_column, y_label, panel_title, bar_color) in zip(axes, stat_panels):
    panel_ax.bar(cluster_stats['cluster'], cluster_stats[stat_column], color=bar_color, edgecolor='black')
    panel_ax.set_xlabel('Cluster', fontsize=12)
    panel_ax.set_ylabel(y_label, fontsize=12)
    panel_ax.set_title(panel_title, fontsize=14, fontweight='bold')
    # Cluster ids are discrete labels: pin the ticks to them so the axis does
    # not show fractional values such as 0.5.
    panel_ax.set_xticks(cluster_stats['cluster'])
    panel_ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig('cluster_statistics.png', dpi=300, bbox_inches='tight')
plt.show()


## 8. Dimensionality Reduction & Visualization (PCA)


In [None]:
# Reduce dimensions using PCA for visualization
# PCA is fitted on the full embedding matrix and used only for plotting;
# the clustering itself was done on the original embeddings.
print("Applying PCA for 2D visualization...")
pca_2d = PCA(n_components=2, random_state=42)
pca_2d_result = pca_2d.fit_transform(embeddings)

# Store the coordinates on df so the plotting and export cells can use them.
df['pca_1'] = pca_2d_result[:, 0]
df['pca_2'] = pca_2d_result[:, 1]

print(f"Explained variance ratio: {pca_2d.explained_variance_ratio_}")
print(f"Total variance explained: {sum(pca_2d.explained_variance_ratio_):.4f}")


In [None]:
# Static 2D visualization with matplotlib
# One point per review, coloured by its cluster label.
plt.figure(figsize=(14, 10))
scatter = plt.scatter(
    df['pca_1'], 
    df['pca_2'], 
    c=df['cluster'], 
    cmap='viridis', 
    alpha=0.6,
    s=50,
    edgecolors='black',
    linewidth=0.5
)

# Plot cluster centers
# Centroids live in embedding space, so they are projected with the same
# fitted PCA as the points. centers_2d is reused by the interactive 2D cell.
centers_2d = pca_2d.transform(kmeans.cluster_centers_)
plt.scatter(
    centers_2d[:, 0], 
    centers_2d[:, 1], 
    c='red', 
    marker='X', 
    s=300, 
    edgecolors='black',
    linewidth=2,
    label='Centroids'
)

plt.colorbar(scatter, label='Cluster')
plt.xlabel(f'PCA Component 1 ({pca_2d.explained_variance_ratio_[0]:.2%} variance)', fontsize=12)
plt.ylabel(f'PCA Component 2 ({pca_2d.explained_variance_ratio_[1]:.2%} variance)', fontsize=12)
plt.title('Text Clustering Visualization (PCA 2D)', fontsize=16, fontweight='bold')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('clustering_pca_2d.png', dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Interactive 2D visualization with Plotly
# NOTE(review): 'cluster' holds numeric labels, so Plotly presumably colours it
# with the continuous scale named below rather than discrete colours — confirm
# that a gradient legend is intended.
fig = px.scatter(
    df, 
    x='pca_1', 
    y='pca_2', 
    color='cluster',
    hover_data=['rating', 'review'],
    title='Interactive Text Clustering Visualization (PCA 2D)',
    labels={
        'pca_1': f'PCA 1 ({pca_2d.explained_variance_ratio_[0]:.2%})',
        'pca_2': f'PCA 2 ({pca_2d.explained_variance_ratio_[1]:.2%})',
        'cluster': 'Cluster'
    },
    color_continuous_scale='viridis',
    width=1000,
    height=700
)

# Add centroids
# centers_2d is computed in the static matplotlib cell, which must have been
# run before this one.
fig.add_trace(go.Scatter(
    x=centers_2d[:, 0],
    y=centers_2d[:, 1],
    mode='markers',
    marker=dict(size=15, color='red', symbol='x', line=dict(width=2, color='black')),
    name='Centroids',
    showlegend=True
))

fig.update_layout(
    font=dict(size=12),
    title_font=dict(size=18, family='Arial, bold')
)

# Standalone HTML copy of the figure, written next to the notebook.
fig.write_html('clustering_interactive_2d.html')
fig.show()

print("Interactive plot saved as 'clustering_interactive_2d.html'")


In [None]:
# 3D visualization with PCA
# A separate 3-component PCA fit on the same embeddings; the coordinates are
# stored under their own column names so the 2D columns stay untouched.
print("Applying PCA for 3D visualization...")
pca_3d = PCA(n_components=3, random_state=42)
pca_3d_result = pca_3d.fit_transform(embeddings)

df['pca_3d_1'] = pca_3d_result[:, 0]
df['pca_3d_2'] = pca_3d_result[:, 1]
df['pca_3d_3'] = pca_3d_result[:, 2]

print(f"3D Explained variance ratio: {pca_3d.explained_variance_ratio_}")
print(f"Total variance explained: {sum(pca_3d.explained_variance_ratio_):.4f}")


In [None]:
# Interactive 3D visualization
# Same layout as the interactive 2D figure, using the three PCA columns.
# NOTE(review): as in the 2D figure, numeric 'cluster' values are presumably
# drawn with a continuous colour scale — confirm that is intended.
fig = px.scatter_3d(
    df,
    x='pca_3d_1',
    y='pca_3d_2',
    z='pca_3d_3',
    color='cluster',
    hover_data=['rating', 'review'],
    title='Interactive 3D Text Clustering Visualization',
    labels={
        'pca_3d_1': f'PCA 1 ({pca_3d.explained_variance_ratio_[0]:.2%})',
        'pca_3d_2': f'PCA 2 ({pca_3d.explained_variance_ratio_[1]:.2%})',
        'pca_3d_3': f'PCA 3 ({pca_3d.explained_variance_ratio_[2]:.2%})',
        'cluster': 'Cluster'
    },
    color_continuous_scale='viridis',
    width=1200,
    height=800
)

# Add centroids in 3D
# Centroids are projected with the 3-component PCA fitted above.
centers_3d = pca_3d.transform(kmeans.cluster_centers_)
fig.add_trace(go.Scatter3d(
    x=centers_3d[:, 0],
    y=centers_3d[:, 1],
    z=centers_3d[:, 2],
    mode='markers',
    marker=dict(size=10, color='red', symbol='x', line=dict(width=2, color='black')),
    name='Centroids',
    showlegend=True
))

fig.update_layout(
    font=dict(size=12),
    title_font=dict(size=18, family='Arial, bold')
)

fig.write_html('clustering_interactive_3d.html')
fig.show()

print("Interactive 3D plot saved as 'clustering_interactive_3d.html'")


## 9. Silhouette Score Analysis


In [None]:
# Calculate overall silhouette score
# Scored on the original embeddings with the final cluster labels; the value
# is reused by the silhouette plot and the summary report.
overall_silhouette_score = silhouette_score(embeddings, df['cluster'])
print(f"Overall Silhouette Score: {overall_silhouette_score:.4f}")
print("\nInterpretation:")
# Rule-of-thumb bands chosen in this notebook: >0.7, >0.5, >0.3, otherwise poor.
if overall_silhouette_score > 0.7:
    print("Excellent clustering quality")
elif overall_silhouette_score > 0.5:
    print("Good clustering quality")
elif overall_silhouette_score > 0.3:
    print("Moderate clustering quality")
else:
    print("Poor clustering quality - clusters may be overlapping")


In [None]:
# Calculate silhouette scores for each sample
# One coefficient per review, stored on df so it is included in the CSV export.
sample_silhouette_values = silhouette_samples(embeddings, df['cluster'])
df['silhouette_score'] = sample_silhouette_values

# Silhouette score by cluster
print("\nAverage Silhouette Score per Cluster:")
for i in range(n_clusters):
    # Boolean mask over df rows selects the matching entries of the array.
    cluster_silhouette_values = sample_silhouette_values[df['cluster'] == i]
    avg_score = cluster_silhouette_values.mean()
    print(f"Cluster {i}: {avg_score:.4f}")


In [None]:
# Silhouette plot
# Draws one horizontal band per cluster: the sorted per-sample silhouette
# values of that cluster, stacked vertically with a gap between bands.
fig, ax = plt.subplots(figsize=(12, 8))

# Running vertical offset of the next band.
y_lower = 10
colors = plt.cm.viridis(np.linspace(0, 1, n_clusters))

for i in range(n_clusters):
    # Aggregate silhouette scores for samples in cluster i
    ith_cluster_silhouette_values = sample_silhouette_values[df['cluster'] == i]
    # Sorts the masked selection in place so the band has a smooth outline.
    ith_cluster_silhouette_values.sort()
    
    size_cluster_i = ith_cluster_silhouette_values.shape[0]
    y_upper = y_lower + size_cluster_i
    
    ax.fill_betweenx(
        np.arange(y_lower, y_upper),
        0,
        ith_cluster_silhouette_values,
        facecolor=colors[i],
        edgecolor=colors[i],
        alpha=0.7
    )
    
    # Label the silhouette plots with their cluster numbers
    ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i), fontsize=12, fontweight='bold')
    
    # Leave a 10-unit gap before the next cluster's band.
    y_lower = y_upper + 10

ax.set_xlabel('Silhouette Coefficient Values', fontsize=12)
ax.set_ylabel('Cluster Label', fontsize=12)
ax.set_title('Silhouette Plot for Each Cluster', fontsize=16, fontweight='bold')

# Add vertical line for average silhouette score
ax.axvline(x=overall_silhouette_score, color='red', linestyle='--', linewidth=2, label=f'Average: {overall_silhouette_score:.4f}')
ax.legend()

# The y axis only stacks samples, so its ticks carry no meaning.
ax.set_yticks([])
# Values below -0.1 fall outside the visible x range.
ax.set_xlim([-0.1, 1])
plt.tight_layout()
plt.savefig('silhouette_plot.png', dpi=300, bbox_inches='tight')
plt.show()

print("Silhouette plot saved as 'silhouette_plot.png'")


## 10. Summary and Export Results


In [None]:
# Create summary report
# Headline figures of the run, collected from the results of earlier cells.
summary = {
    'Total Reviews': len(df),
    'Number of Clusters': n_clusters,
    'Overall Silhouette Score': overall_silhouette_score,
    'Embedding Model': 'sentence-transformers/all-mpnet-base-v2',
    'Embedding Dimension': embeddings.shape[1],
    'PCA Variance Explained (2D)': sum(pca_2d.explained_variance_ratio_),
    'PCA Variance Explained (3D)': sum(pca_3d.explained_variance_ratio_)
}

print("\n" + "="*80)
print("CLUSTERING ANALYSIS SUMMARY")
print("="*80)
for key, value in summary.items():
    # NumPy scalars such as np.float32 are not subclasses of the built-in
    # float, so they are matched explicitly; otherwise those metrics would
    # skip the 4-decimal format and print at full precision.
    if isinstance(value, (float, np.floating)):
        print(f"{key:.<40} {value:.4f}")
    else:
        print(f"{key:.<40} {value}")
print("="*80)


In [None]:
# Export results to CSV
# One row per review with its cluster, silhouette value and 2D PCA
# coordinates. The column selection raises KeyError if the input CSV lacks
# 'date' or 'userName'. The 3D PCA columns are not exported.
output_df = df[['date', 'review', 'rating', 'userName', 'cleaned_review', 'cluster', 'silhouette_score', 'pca_1', 'pca_2']].copy()
output_df.to_csv('clustering_results.csv', index=False)
print("\nResults exported to 'clustering_results.csv'")

# Export cluster keywords
# One row per cluster; only the first 10 of the keywords collected earlier
# are written.
keywords_df = pd.DataFrame([
    {
        'Cluster': cluster_id, 
        'Count': len(df[df['cluster'] == cluster_id]),
        'Top_Keywords': ', '.join(keywords[:10])
    }
    for cluster_id, keywords in top_keywords.items()
])
keywords_df.to_csv('cluster_keywords.csv', index=False)
print("Cluster keywords exported to 'cluster_keywords.csv'")


In [None]:
# List all generated files
print("\n" + "="*80)
print("GENERATED FILES")
print("="*80)
# Hard-coded list of the output names used by the cells above; the files are
# not checked for existence here.
generated_files = [
    'wordcloud.png',
    'top_words.png',
    'elbow_silhouette.png',
    'cluster_statistics.png',
    'clustering_pca_2d.png',
    'clustering_interactive_2d.html',
    'clustering_interactive_3d.html',
    'silhouette_plot.png',
    'clustering_results.csv',
    'cluster_keywords.csv'
]

for idx, file in enumerate(generated_files, 1):
    print(f"{idx:2d}. {file}")
print("="*80)
print("\n✅ Analysis complete! All results have been saved.")
