# Review Sentiment and Rating Distribution Analysis

Analisis sentimen dan distribusi rating untuk memahami kepuasan pelanggan dan implikasi strategi bisnis.


In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Choose a plotting style. The seaborn style sheet is tried under two names
# (newer name first, then the legacy one); if neither is available fall back
# to matplotlib's default style.
for style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
    try:
        plt.style.use(style_name)
    except OSError:
        # plt.style.use raises OSError for an unknown style name. Catching
        # only that keeps unrelated errors (and KeyboardInterrupt) visible
        # instead of silently swallowing them like a bare `except:` would.
        continue
    break
else:
    plt.style.use('default')
sns.set_palette("husl")

# Load data
df = pd.read_csv('cleandata.csv')

# Quick overview of what was loaded
print(f"Total records: {len(df)}")
print(f"\nData shape: {df.shape}")
print(f"\nColumns: {list(df.columns)}")
# Last expression of the cell: the notebook renders the first rows
df.head()


In [None]:
# Sanity-check the two columns the analysis relies on: review_text and rating
divider = "\n" + "=" * 50

# Frequency of every distinct review text
print("Review text unique values:")
review_frequency = df['review_text'].value_counts()
print(review_frequency)
print(divider)

# Frequency of every rating, ordered by rating value
print("\nRating distribution:")
rating_frequency = df['rating'].value_counts()
print(rating_frequency.sort_index())
print(divider)

# Number of missing entries in each column
missing_review = df['review_text'].isna().sum()
print(f"\nMissing values in review_text: {missing_review}")
missing_rating = df['rating'].isna().sum()
print(f"Missing values in rating: {missing_rating}")


In [None]:
# Sentiment classifier driven by review_text, with rating as the fallback
def classify_sentiment(row):
    """
    Return 'positive', 'negative' or 'neutral' for a single record.

    ``row`` must support ``row['review_text']`` and ``row['rating']``
    (e.g. a DataFrame row or a dict).

    The review text is lower-cased and checked for keywords first, in this
    order: 'very good' -> positive, 'very bad' -> negative,
    'good' -> positive, 'bad' -> negative, 'average' -> neutral.
    Matching is by substring, and the first keyword found wins.

    Only when no keyword is present does the rating decide:
    rating >= 4 -> positive, rating <= 2 -> negative, anything else
    (including a NaN rating, for which both comparisons are False)
    -> neutral.
    """
    text = str(row['review_text']).lower()
    score = row['rating']

    # Ordered (keyword, label) pairs; the two-word phrases come first so they
    # are checked before the single words they contain.
    keyword_labels = (
        ('very good', 'positive'),
        ('very bad', 'negative'),
        ('good', 'positive'),
        ('bad', 'negative'),
        ('average', 'neutral'),
    )
    for keyword, label in keyword_labels:
        if keyword in text:
            return label

    # No keyword in the text: decide from the numeric rating
    if score >= 4:
        return 'positive'
    if score <= 2:
        return 'negative'
    return 'neutral'

# Label every row with its sentiment class (row-wise apply)
df['sentiment'] = df.apply(classify_sentiment, axis=1)

# Report the result: overall label counts, then a rating x sentiment table
# that includes row/column totals (margins=True)
section_break = "\n" + "=" * 50
print("Sentiment Distribution:")
sentiment_frequency = df['sentiment'].value_counts()
print(sentiment_frequency)
print(section_break)
print("\nSentiment by Rating:")
rating_by_sentiment = pd.crosstab(df['rating'], df['sentiment'], margins=True)
print(rating_by_sentiment)


In [None]:
# Visualization 1: rating distribution (bar chart on the left, pie on the right)
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
bar_ax, pie_ax = axes

# One count per rating value, ordered by rating
rating_counts = df['rating'].value_counts().sort_index()
# Palette applied by position to the sorted ratings (red -> green)
colors_rating = ['#d32f2f', '#f57c00', '#fbc02d', '#689f38', '#388e3c']

# Shared text styling for axis labels and titles
axis_label_style = {'fontsize': 12, 'fontweight': 'bold'}
title_style = {'fontsize': 14, 'fontweight': 'bold', 'pad': 20}

# Bar chart
bar_ax.bar(rating_counts.index, rating_counts.values, color=colors_rating)
bar_ax.set_xlabel('Rating', **axis_label_style)
bar_ax.set_ylabel('Frequency', **axis_label_style)
bar_ax.set_title('Rating Distribution', **title_style)
bar_ax.set_xticks(range(1, 6))
bar_ax.grid(axis='y', alpha=0.3)

# Write each count just above its bar
for rating_value, freq in zip(rating_counts.index, rating_counts.values):
    bar_ax.text(rating_value, freq + 20, str(freq), ha='center', fontweight='bold')

# Pie chart of the same counts, shown as percentages
pie_labels = [f'Rating {rating_value}' for rating_value in rating_counts.index]
pie_ax.pie(
    rating_counts.values,
    labels=pie_labels,
    autopct='%1.1f%%',
    startangle=90,
    colors=colors_rating,
    textprops={'fontsize': 11, 'fontweight': 'bold'},
)
pie_ax.set_title('Rating Distribution (Percentage)', **title_style)

plt.tight_layout()
plt.show()

# Rating statistics
banner = "=" * 50
print("\n" + banner)
print("RATING STATISTICS")
print(banner)
rating_column = df['rating']
print(f"Mean Rating: {rating_column.mean():.2f}")
print(f"Median Rating: {rating_column.median():.2f}")
print(f"Mode Rating: {rating_column.mode()[0]}")
print("\nRating Breakdown:")
total_rows = len(df)
for rating_value in sorted(rating_column.unique()):
    # Number of rows carrying this rating, and its share of all rows
    count = int((rating_column == rating_value).sum())
    percentage = (count / total_rows) * 100
    print(f"  Rating {rating_value}: {count:,} ({percentage:.2f}%)")


In [None]:
# Visualization 2: sentiment distribution (bar chart + pie chart)
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Count each label and put the counts in a fixed display order.
# fill_value=0 keeps the counts integer when a label never occurs; without
# it reindex() inserts NaN, which turns the counts into floats and breaks
# the bar labels.
sentiment_counts = df['sentiment'].value_counts()
sentiment_order = ['positive', 'neutral', 'negative']
sentiment_counts_ordered = sentiment_counts.reindex(sentiment_order, fill_value=0)

colors_sentiment = {'positive': '#4caf50', 'neutral': '#ff9800', 'negative': '#f44336'}
bar_colors = [colors_sentiment[s] for s in sentiment_counts_ordered.index]

# Bar plot sentiment distribution
axes[0].bar(sentiment_counts_ordered.index, sentiment_counts_ordered.values, color=bar_colors)
axes[0].set_xlabel('Sentiment', fontsize=12, fontweight='bold')
axes[0].set_ylabel('Frequency', fontsize=12, fontweight='bold')
axes[0].set_title('Sentiment Distribution', fontsize=14, fontweight='bold', pad=20)
axes[0].grid(axis='y', alpha=0.3)

# Write each count above its bar (categorical bars sit at x = 0, 1, 2)
for i, v in enumerate(sentiment_counts_ordered.values):
    axes[0].text(i, v + 20, str(v), ha='center', fontweight='bold')

# Pie chart sentiment distribution.
# A pandas Index has no .capitalize() method (calling it raises
# AttributeError), so capitalize each label string individually.
pie_labels = [label.capitalize() for label in sentiment_counts_ordered.index]
axes[1].pie(sentiment_counts_ordered.values, labels=pie_labels,
            autopct='%1.1f%%', startangle=90, colors=bar_colors, textprops={'fontsize': 11, 'fontweight': 'bold'})
axes[1].set_title('Sentiment Distribution (Percentage)', fontsize=14, fontweight='bold', pad=20)

plt.tight_layout()
plt.show()

# Sentiment statistics: reuse the counts computed above instead of
# filtering the DataFrame again for every label.
print("\n" + "="*50)
print("SENTIMENT STATISTICS")
print("="*50)
for sentiment in sentiment_order:
    count = int(sentiment_counts_ordered[sentiment])
    percentage = (count / len(df)) * 100
    print(f"{sentiment.capitalize()}: {count:,} ({percentage:.2f}%)")


In [None]:
# Visualization 3: relationship between rating and sentiment
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Rating x sentiment contingency table with the sentiment columns in a fixed
# order. reindex(columns=..., fill_value=0) is used instead of
# crosstab[[...]] so that a sentiment label that never occurs becomes an
# all-zero column rather than raising KeyError; the table also stays
# integer-typed, matching the heatmap's fmt='d' annotation format.
sentiment_columns = ['positive', 'neutral', 'negative']
crosstab = pd.crosstab(df['rating'], df['sentiment'])
crosstab_ordered = crosstab.reindex(columns=sentiment_columns, fill_value=0)

# Colors per sentiment label
colors_sentiment = {'positive': '#4caf50', 'neutral': '#ff9800', 'negative': '#f44336'}

# Stacked bar chart: one bar per rating, split by sentiment
crosstab_ordered.plot(kind='bar', stacked=True, ax=axes[0],
                      color=[colors_sentiment[label] for label in sentiment_columns])
axes[0].set_xlabel('Rating', fontsize=12, fontweight='bold')
axes[0].set_ylabel('Count', fontsize=12, fontweight='bold')
axes[0].set_title('Rating Distribution by Sentiment (Stacked)', fontsize=14, fontweight='bold', pad=20)
axes[0].legend(title='Sentiment', fontsize=10)
# Keep the rating tick labels horizontal
axes[0].set_xticklabels(axes[0].get_xticklabels(), rotation=0)
axes[0].grid(axis='y', alpha=0.3)

# Heatmap: Rating vs Sentiment, annotated with the raw counts
sns.heatmap(crosstab_ordered, annot=True, fmt='d', cmap='RdYlGn', ax=axes[1],
            cbar_kws={'label': 'Count'}, linewidths=0.5, linecolor='gray')
axes[1].set_xlabel('Sentiment', fontsize=12, fontweight='bold')
axes[1].set_ylabel('Rating', fontsize=12, fontweight='bold')
axes[1].set_title('Rating vs Sentiment Heatmap', fontsize=14, fontweight='bold', pad=20)

plt.tight_layout()
plt.show()

# Print the table, then each row as percentages of that rating's total
print("\n" + "="*50)
print("RATING vs SENTIMENT CROSSTAB")
print("="*50)
print(crosstab_ordered)
print("\nPercentage by Rating:")
print((crosstab_ordered.div(crosstab_ordered.sum(axis=1), axis=0) * 100).round(2))


In [None]:
# Visualization 4: review text distribution
fig, ax = plt.subplots(figsize=(12, 6))

# Count the review texts and arrange them from best to worst. Only the five
# values listed in review_order are shown; matching is exact (case-sensitive).
# fill_value=0 keeps the counts integer when one of the five values never
# occurs; without it reindex() inserts NaN, giving an undefined bar and
# float labels such as "123.0".
review_counts = df['review_text'].value_counts()
review_order = ['very good', 'good', 'average', 'bad', 'very bad']
review_counts_ordered = review_counts.reindex(review_order, fill_value=0)

colors_review = {'very good': '#2e7d32', 'good': '#4caf50', 'average': '#ff9800',
                 'bad': '#f57c00', 'very bad': '#d32f2f'}
bar_colors_review = [colors_review[r] for r in review_counts_ordered.index]

bars = ax.bar(review_counts_ordered.index, review_counts_ordered.values, color=bar_colors_review)
ax.set_xlabel('Review Text', fontsize=12, fontweight='bold')
ax.set_ylabel('Frequency', fontsize=12, fontweight='bold')
ax.set_title('Review Text Distribution', fontsize=14, fontweight='bold', pad=20)
ax.grid(axis='y', alpha=0.3)
plt.xticks(rotation=45, ha='right')

# Write each count above its bar (categorical bars sit at x = 0, 1, 2, ...)
for i, v in enumerate(review_counts_ordered.values):
    ax.text(i, v + 20, str(v), ha='center', fontweight='bold')

plt.tight_layout()
plt.show()

# Text summary: reuse the counts computed above instead of filtering the
# DataFrame again for every review value.
print("\n" + "="*50)
print("REVIEW TEXT DISTRIBUTION")
print("="*50)
for review in review_order:
    count = int(review_counts_ordered[review])
    percentage = (count / len(df)) * 100
    print(f"{review.capitalize()}: {count:,} ({percentage:.2f}%)")


In [None]:
# Visualization 5: sentiment analysis per product category
fig, ax = plt.subplots(figsize=(14, 8))

# Colors per sentiment label, defined here so the cell does not depend on
# another cell having been run first.
colors_sentiment = {'positive': '#4caf50', 'neutral': '#ff9800', 'negative': '#f44336'}
sentiment_columns = ['positive', 'neutral', 'negative']

# Category x sentiment contingency table with columns in a fixed order.
# reindex(columns=..., fill_value=0) adds an all-zero column for a label
# that never occurs, where category_sentiment[[...]] would raise KeyError.
category_sentiment = pd.crosstab(df['category'], df['sentiment'])
category_sentiment_ordered = category_sentiment.reindex(columns=sentiment_columns, fill_value=0)

# Horizontal stacked bars: one bar per category, split by sentiment
category_sentiment_ordered.plot(kind='barh', stacked=True, ax=ax,
                                color=[colors_sentiment[label] for label in sentiment_columns])
ax.set_xlabel('Count', fontsize=12, fontweight='bold')
ax.set_ylabel('Category', fontsize=12, fontweight='bold')
ax.set_title('Sentiment Distribution by Product Category', fontsize=14, fontweight='bold', pad=20)
ax.legend(title='Sentiment', fontsize=10)
ax.grid(axis='x', alpha=0.3)

plt.tight_layout()
plt.show()

# Per-category statistics
print("\n" + "="*50)
print("SENTIMENT BY CATEGORY")
print("="*50)
for category in df['category'].unique():
    cat_df = df[df['category'] == category]
    n_reviews = len(cat_df)
    print(f"\n{category}:")
    print(f"  Total Reviews: {n_reviews}")
    print(f"  Average Rating: {cat_df['rating'].mean():.2f}")
    for sentiment in sentiment_columns:
        count = int((cat_df['sentiment'] == sentiment).sum())
        # Guard against an empty selection to avoid dividing by zero
        percentage = (count / n_reviews) * 100 if n_reviews > 0 else 0
        print(f"  {sentiment.capitalize()}: {count} ({percentage:.1f}%)")


## Analisis dan Implikasi Strategi Bisnis

### 1. Distribusi Rating
- **Insight**: Distribusi rating relatif seimbang dengan sedikit variasi
- **Implikasi**: 
  - Perlu fokus pada peningkatan rating 1-2 menjadi 3-4
  - Program customer satisfaction untuk meningkatkan rating rata-rata

### 2. Distribusi Sentimen
- **Insight**: Analisis sentimen menunjukkan proporsi positive, neutral, dan negative
- **Implikasi**:
  - Fokus pada konversi sentimen negative menjadi neutral/positive
  - Program retensi untuk pelanggan dengan sentimen positive
  - Investigasi akar masalah untuk sentimen negative

### 3. Hubungan Rating vs Sentimen
- **Insight**: Korelasi antara rating numerik dan sentimen tekstual
- **Implikasi**:
  - Validasi konsistensi antara rating dan review_text
  - Identifikasi kasus di mana rating tidak sesuai dengan sentimen

### 4. Review Text Analysis
- **Insight**: Distribusi review text menunjukkan pola preferensi pelanggan
- **Implikasi**:
  - Identifikasi produk/kategori dengan review "very bad" atau "bad"
  - Program improvement untuk produk dengan sentimen negatif tinggi

### 5. Sentimen per Kategori
- **Insight**: Variasi sentimen antar kategori produk
- **Implikasi**:
  - Fokus improvement pada kategori dengan sentimen negatif tinggi
  - Replikasi best practice dari kategori dengan sentimen positif tinggi
  - Strategi marketing berbeda per kategori berdasarkan sentimen


In [None]:
# Summary statistics and recommendations
print("="*70)
print("RINGKASAN ANALISIS SENTIMEN DAN RATING")
print("="*70)

total_reviews = len(df)

print("\n1. OVERALL STATISTICS")
print("-"*70)
print(f"Total Reviews: {total_reviews:,}")
print(f"Average Rating: {df['rating'].mean():.2f} / 5.00")
print(f"Median Rating: {df['rating'].median():.2f} / 5.00")

print("\n2. SENTIMENT BREAKDOWN")
print("-"*70)
# Count every label once and reuse the counts below; fill_value=0 reports a
# label that never occurs as 0 instead of NaN.
sentiment_totals = df['sentiment'].value_counts().reindex(
    ['positive', 'neutral', 'negative'], fill_value=0)
for sentiment in ['positive', 'neutral', 'negative']:
    count = int(sentiment_totals[sentiment])
    percentage = (count / total_reviews) * 100
    print(f"{sentiment.capitalize():10s}: {count:5,} ({percentage:5.2f}%)")

print("\n3. RATING BREAKDOWN")
print("-"*70)
for rating in sorted(df['rating'].unique()):
    count = int((df['rating'] == rating).sum())
    percentage = (count / total_reviews) * 100
    print(f"Rating {rating}: {count:5,} ({percentage:5.2f}%)")

print("\n4. KEY INSIGHTS")
print("-"*70)
# Share of each sentiment label over all reviews, in percent
positive_pct = (int(sentiment_totals['positive']) / total_reviews) * 100
negative_pct = (int(sentiment_totals['negative']) / total_reviews) * 100
neutral_pct = (int(sentiment_totals['neutral']) / total_reviews) * 100

# The 40% / 30% thresholds below decide which verdict text is printed
print(f"• Sentimen Positif: {positive_pct:.2f}% - {'Baik' if positive_pct > 40 else 'Perlu Peningkatan'}")
print(f"• Sentimen Negatif: {negative_pct:.2f}% - {'Perlu Perhatian Serius' if negative_pct > 30 else 'Masih Dapat Diterima'}")
print(f"• Net Sentiment Score: {positive_pct - negative_pct:.2f}% - {'Positif' if (positive_pct - negative_pct) > 0 else 'Negatif'}")

# Category with the highest share of negative reviews. The mean of a boolean
# mask per category is that category's share, so one vectorized groupby
# yields both the category (idxmax) and its percentage (max), replacing
# groupby().apply(lambda ...) plus a second pass of boolean filtering.
negative_share = df['sentiment'].eq('negative').groupby(df['category']).mean()
worst_category = negative_share.idxmax()
worst_category_pct = negative_share.max() * 100
print(f"• Kategori dengan Sentimen Negatif Tertinggi: {worst_category} ({worst_category_pct:.2f}%)")

# Category with the highest share of positive reviews (same approach)
positive_share = df['sentiment'].eq('positive').groupby(df['category']).mean()
best_category = positive_share.idxmax()
best_category_pct = positive_share.max() * 100
print(f"• Kategori dengan Sentimen Positif Tertinggi: {best_category} ({best_category_pct:.2f}%)")

print("\n5. REKOMENDASI STRATEGI BISNIS")
print("-"*70)
print("1. Fokus pada peningkatan kualitas produk/kategori dengan sentimen negatif tinggi")
print("2. Implementasi program customer feedback untuk mengkonversi sentimen negatif")
print("3. Replikasi best practice dari kategori dengan sentimen positif tinggi")
print("4. Monitoring real-time sentimen untuk deteksi dini masalah")
print("5. Program loyalty untuk pelanggan dengan sentimen positif")
print("="*70)
