In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import warnings

warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8-whitegrid')

# Set up Vietnamese font (if available)
plt.rcParams['font.family'] = 'DejaVu Sans'
plt.rcParams['figure.figsize'] = (12, 6)

print("Libraries loaded successfully!")

In [None]:
# Load dataset
import sys
sys.path.append('..')

# Try to load from file first, otherwise create from script
try:
    df = pd.read_csv('../data/raw/initial_data.csv')
    print(f"Loaded dataset from file: {len(df)} samples")
except FileNotFoundError:
    print("File not found. Creating dataset from script...")
    from data.collect_data import create_initial_dataset, save_dataset_csv
    samples = create_initial_dataset()
    save_dataset_csv(samples, '../data/raw/initial_data.csv')
    df = pd.DataFrame(samples)
    print(f"Created dataset: {len(df)} samples")

# Display first few rows
print("\nDataset Preview:")
df.head(10)

In [None]:
# Basic statistics
print("=" * 50)
print("üìà DATASET OVERVIEW")
print("=" * 50)

print(f"\nüìä Total samples: {len(df)}")
print(f"üìù Columns: {list(df.columns)}")
print(f"\nüî¢ Data types:")
print(df.dtypes)

print(f"\n‚ùì Missing values:")
print(df.isnull().sum())

---
## 1. Emotion Distribution

In [None]:
# Emotion distribution
emotion_counts = df['primary_emotion'].value_counts()

print("Emotion Distribution:")
print(emotion_counts)
print(f"\nTotal unique emotions: {len(emotion_counts)}")

# Visualization
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Bar plot
colors = plt.cm.Set3(np.linspace(0, 1, len(emotion_counts)))
emotion_counts.plot(kind='bar', ax=axes[0], color=colors, edgecolor='black')
axes[0].set_title('Emotion Distribution (Bar Chart)', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Emotion')
axes[0].set_ylabel('Count')
axes[0].tick_params(axis='x', rotation=45)

# Add value labels
for i, v in enumerate(emotion_counts):
    axes[0].text(i, v + 0.5, str(v), ha='center', fontweight='bold')

# Pie chart
axes[1].pie(emotion_counts, labels=emotion_counts.index, autopct='%1.1f%%', 
            colors=colors, startangle=90, explode=[0.02]*len(emotion_counts))
axes[1].set_title('Emotion Distribution (Pie Chart)', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.savefig('../data/eda_emotion_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

---
## 2. Intensity Distribution

In [None]:
# Intensity distribution
intensity_counts = df['intensity'].value_counts().sort_index()

print("Intensity Distribution:")
print(intensity_counts)
print(f"\nMean intensity: {df['intensity'].mean():.2f}")
print(f"Median intensity: {df['intensity'].median():.2f}")

# Visualization
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Bar plot
intensity_labels = ['Very Weak', 'Weak', 'Medium', 'Strong', 'Very Strong']
colors = plt.cm.RdYlGn(np.linspace(0.2, 0.8, 5))
intensity_counts.plot(kind='bar', ax=axes[0], color=colors, edgecolor='black')
axes[0].set_title('Intensity Distribution', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Intensity Level')
axes[0].set_ylabel('Count')
axes[0].set_xticklabels(intensity_labels, rotation=45)

# Add value labels
for i, v in enumerate(intensity_counts):
    axes[0].text(i, v + 0.5, str(v), ha='center', fontweight='bold')

# Box plot by emotion
df.boxplot(column='intensity', by='primary_emotion', ax=axes[1])
axes[1].set_title('Intensity by Emotion', fontsize=14, fontweight='bold')
axes[1].set_xlabel('Emotion')
axes[1].set_ylabel('Intensity')
plt.suptitle('')  # Remove automatic title

plt.tight_layout()
plt.savefig('../data/eda_intensity_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

---
## 3. Text Length Analysis

In [None]:
# Add text length columns
df['text_length_chars'] = df['text'].apply(len)
df['text_length_words'] = df['text'].apply(lambda x: len(x.split()))

print("Text Length Statistics:")
print(f"\nCharacters:")
print(f"  Min: {df['text_length_chars'].min()}")
print(f"  Max: {df['text_length_chars'].max()}")
print(f"  Mean: {df['text_length_chars'].mean():.1f}")
print(f"  Median: {df['text_length_chars'].median():.1f}")

print(f"\nWords:")
print(f"  Min: {df['text_length_words'].min()}")
print(f"  Max: {df['text_length_words'].max()}")
print(f"  Mean: {df['text_length_words'].mean():.1f}")
print(f"  Median: {df['text_length_words'].median():.1f}")

# Visualization
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Histogram - Characters
axes[0].hist(df['text_length_chars'], bins=20, color='steelblue', edgecolor='black', alpha=0.7)
axes[0].axvline(df['text_length_chars'].mean(), color='red', linestyle='--', label=f"Mean: {df['text_length_chars'].mean():.1f}")
axes[0].set_title('Text Length Distribution (Characters)', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Number of Characters')
axes[0].set_ylabel('Frequency')
axes[0].legend()

# Histogram - Words
axes[1].hist(df['text_length_words'], bins=15, color='coral', edgecolor='black', alpha=0.7)
axes[1].axvline(df['text_length_words'].mean(), color='red', linestyle='--', label=f"Mean: {df['text_length_words'].mean():.1f}")
axes[1].set_title('Text Length Distribution (Words)', fontsize=14, fontweight='bold')
axes[1].set_xlabel('Number of Words')
axes[1].set_ylabel('Frequency')
axes[1].legend()

plt.tight_layout()
plt.savefig('../data/eda_text_length.png', dpi=150, bbox_inches='tight')
plt.show()

---
## 4. Emoji Analysis

In [None]:
# Collect all emojis
all_emojis = []
for col in ['emoji_1', 'emoji_2', 'emoji_3']:
    if col in df.columns:
        emojis = df[col].dropna().tolist()
        all_emojis.extend(emojis)

emoji_freq = Counter(all_emojis)

print(f"Total emoji occurrences: {len(all_emojis)}")
print(f"Unique emojis: {len(emoji_freq)}")
print(f"\nTop 20 Most Common Emojis:")
for emoji, count in emoji_freq.most_common(20):
    print(f"  {emoji}: {count}")

In [None]:
# Emoji visualization
top_20 = emoji_freq.most_common(20)
emojis = [e[0] for e in top_20]
counts = [e[1] for e in top_20]

fig, ax = plt.subplots(figsize=(14, 6))

colors = plt.cm.viridis(np.linspace(0, 0.8, len(emojis)))
bars = ax.barh(range(len(emojis)), counts, color=colors, edgecolor='black')

ax.set_yticks(range(len(emojis)))
ax.set_yticklabels(emojis, fontsize=16)
ax.invert_yaxis()  # Largest at top

ax.set_title('Top 20 Most Common Emojis', fontsize=16, fontweight='bold')
ax.set_xlabel('Frequency', fontsize=12)

# Add value labels
for i, v in enumerate(counts):
    ax.text(v + 0.5, i, str(v), va='center', fontweight='bold')

plt.tight_layout()
plt.savefig('../data/eda_emoji_frequency.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Emoji distribution by emotion
emoji_by_emotion = {}

for emotion in df['primary_emotion'].unique():
    emotion_df = df[df['primary_emotion'] == emotion]
    emojis = []
    for col in ['emoji_1', 'emoji_2', 'emoji_3']:
        if col in emotion_df.columns:
            emojis.extend(emotion_df[col].dropna().tolist())
    emoji_by_emotion[emotion] = Counter(emojis).most_common(5)

print("Top 5 Emojis by Emotion:")
print("=" * 50)
for emotion, top_emojis in sorted(emoji_by_emotion.items()):
    emoji_str = " ".join([f"{e}({c})" for e, c in top_emojis])
    print(f"{emotion.capitalize():15} ‚Üí {emoji_str}")

---
## 5. Emotion-Intensity Heatmap

In [None]:
# Create heatmap
pivot = pd.crosstab(df['primary_emotion'], df['intensity'])

fig, ax = plt.subplots(figsize=(10, 8))

sns.heatmap(pivot, annot=True, fmt='d', cmap='YlOrRd', 
            linewidths=0.5, ax=ax, cbar_kws={'label': 'Count'})

ax.set_title('Emotion vs Intensity Distribution', fontsize=14, fontweight='bold')
ax.set_xlabel('Intensity Level', fontsize=12)
ax.set_ylabel('Emotion', fontsize=12)

# Set intensity labels
intensity_labels = ['Very Weak', 'Weak', 'Medium', 'Strong', 'Very Strong']
ax.set_xticklabels(intensity_labels, rotation=45, ha='right')

plt.tight_layout()
plt.savefig('../data/eda_emotion_intensity_heatmap.png', dpi=150, bbox_inches='tight')
plt.show()

---
## 6. Sample Texts by Emotion

In [None]:
# Show sample texts for each emotion
print("Sample Texts by Emotion:")
print("=" * 60)

for emotion in sorted(df['primary_emotion'].unique()):
    print(f"\nüè∑Ô∏è {emotion.upper()}")
    print("-" * 40)
    samples = df[df['primary_emotion'] == emotion].sample(min(3, len(df[df['primary_emotion'] == emotion])))
    for _, row in samples.iterrows():
        emojis = f"{row['emoji_1']} {row.get('emoji_2', '')} {row.get('emoji_3', '')}".strip()
        print(f"  \"{row['text']}\" ‚Üí {emojis} (intensity: {row['intensity']})")

---
## 7. Summary Statistics

In [None]:
# Generate summary report
print("=" * 60)
print("üìä EDA SUMMARY REPORT")
print("=" * 60)

print(f"\nüìÅ Dataset Size: {len(df)} samples")
print(f"\nüé≠ Emotions: {len(df['primary_emotion'].unique())} unique")
for emotion, count in emotion_counts.items():
    pct = count / len(df) * 100
    print(f"   - {emotion}: {count} ({pct:.1f}%)")

print(f"\nüìà Intensity:")
print(f"   - Mean: {df['intensity'].mean():.2f}")
print(f"   - Mode: {df['intensity'].mode().values[0]}")

print(f"\nüìù Text Length:")
print(f"   - Avg words: {df['text_length_words'].mean():.1f}")
print(f"   - Avg chars: {df['text_length_chars'].mean():.1f}")

print(f"\nüòä Emojis:")
print(f"   - Unique emojis: {len(emoji_freq)}")
print(f"   - Top 5: {' '.join([e for e, _ in emoji_freq.most_common(5)])}")

print(f"\n‚úÖ Dataset Quality:")
missing = df.isnull().sum().sum()
print(f"   - Missing values: {missing}")
print(f"   - Data complete: {'Yes ‚úì' if missing == 0 else 'No ‚úó'}")

In [None]:
# Save summary to file
summary = {
    'total_samples': len(df),
    'unique_emotions': len(df['primary_emotion'].unique()),
    'emotion_counts': dict(emotion_counts),
    'intensity_mean': df['intensity'].mean(),
    'avg_text_words': df['text_length_words'].mean(),
    'avg_text_chars': df['text_length_chars'].mean(),
    'unique_emojis': len(emoji_freq),
    'top_10_emojis': dict(emoji_freq.most_common(10)),
}

import json
with open('../data/eda_summary.json', 'w', encoding='utf-8') as f:
    json.dump(summary, f, ensure_ascii=False, indent=2)

print("\n‚úì Summary saved to data/eda_summary.json")
print("‚úì Plots saved to data/eda_*.png")

---
## üìã Key Insights

### Observations:
1. **Dataset Balance**: Check if emotions are evenly distributed
2. **Text Length**: Most texts are short (< 10 words) - typical for chat messages
3. **Intensity**: Strong emotions (4-5) are common
4. **Top Emojis**: üòä, üéâ, üò¢, üíî are most frequent

### Recommendations:
1. May need to augment underrepresented emotions
2. Short texts = need efficient preprocessing
3. Top 20 emojis cover ~80% of use cases - focus on these first

### Next Steps:
- [ ] Expand dataset to 300 samples
- [ ] Implement baseline models
- [ ] Calculate inter-rater agreement