# Dataset Analysis and Exploration

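This notebook explores the SemEval sentiment analysis dataset (test split: SemEval-2017 Task 4, Subtask A, English). It covers the class distribution, text and token length distributions, sample texts per class, and how many texts would be truncated at common maximum sequence lengths.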

In [None]:
import sys
import os
from pathlib import Path

# Add project root to path.
# Path('.').parent is still '.', so the working directory must be resolved to
# an absolute path *before* stepping up one level. The notebook reads its data
# from '../dataset', so the project root is presumably the parent of the
# directory the notebook runs in.
project_root = Path('.').resolve().parent
# Guard against piling up duplicate entries when the cell is re-run.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoTokenizer

# Set style: apply Matplotlib's 'seaborn-v0_8' style sheet and make seaborn's
# "husl" palette the default colour cycle for the plots drawn below.
# NOTE(review): the 'seaborn-v0_8' style name presumably requires
# Matplotlib >= 3.6 — confirm against the project's pinned version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

## Load and Explore Dataset

In [10]:
# Build the train and test datasets with the project's SemEval reader.
from src.data.dataset_reader import SemEvalDataset

# Locations are relative to the directory the notebook is executed from.
TRAIN_PATH = '../dataset/train'
TEST_PATH = '../dataset/test/SemEval2017-task4-test.subtask-A.english.txt'

# A single tokenizer instance is shared by both splits.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
train_dataset = SemEvalDataset(TRAIN_PATH, tokenizer)
test_dataset = SemEvalDataset(TEST_PATH, tokenizer)

# Report the size of each split, one per line.
print(
    f"Training samples: {len(train_dataset)}",
    f"Test samples: {len(test_dataset)}",
    sep="\n",
)

ModuleNotFoundError: No module named 'src'

In [None]:
# Flatten each dataset into parallel lists of texts and labels.
# Every record in `.data` is indexed by the keys 'text' and 'label'.
train_texts, train_labels = [], []
for record in train_dataset.data:
    train_texts.append(record['text'])
    train_labels.append(record['label'])

test_texts, test_labels = [], []
for record in test_dataset.data:
    test_texts.append(record['text'])
    test_labels.append(record['label'])

# Position in this list is the integer label id used by the cells below.
class_names = ['negative', 'neutral', 'positive']

## Dataset Statistics

In [None]:
# Class distribution
def plot_class_distribution(ax, counts, title):
    """Draw a bar chart of per-class sample counts on ``ax``.

    Args:
        ax: Matplotlib Axes to draw on.
        counts: pandas Series of counts; its index values are used as
            positions into ``class_names`` to obtain the bar labels.
        title: Title of the subplot.
    """
    bars = ax.bar([class_names[i] for i in counts.index], counts.values, alpha=0.7)
    ax.set_title(title)
    ax.set_ylabel('Count')
    # bar_label pads in points rather than data units, so the annotation gap
    # looks the same regardless of how large the counts are (the previous
    # hard-coded +50 / +20 offsets only suited one particular scale).
    ax.bar_label(bars, labels=[str(v) for v in counts.values], padding=3)
    # Leave head-room so the label above the tallest bar is not clipped.
    ax.margins(y=0.1)


train_counts = pd.Series(train_labels).value_counts().sort_index()
test_counts = pd.Series(test_labels).value_counts().sort_index()

fig, axes = plt.subplots(1, 2, figsize=(12, 5))
plot_class_distribution(axes[0], train_counts, 'Training Set Class Distribution')
plot_class_distribution(axes[1], test_counts, 'Test Set Class Distribution')

plt.tight_layout()
plt.show()

## Text Length Analysis

In [None]:
# Analyze text lengths
# Tokenizing every training text is slow, so only the first N are tokenized.
TOKEN_SAMPLE_SIZE = 1000


def plot_length_histogram(ax, train_values, test_values, train_label, title, xlabel):
    """Overlay train and test length histograms (50 bins each) on ``ax``."""
    ax.hist(train_values, bins=50, alpha=0.7, label=train_label)
    ax.hist(test_values, bins=50, alpha=0.7, label='Test')
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')
    ax.legend()


def plot_length_boxplot(ax, train_values, test_values, train_label, title, ylabel):
    """Draw side-by-side train/test box plots of lengths on ``ax``."""
    ax.boxplot([train_values, test_values])
    # Set tick labels explicitly instead of boxplot(labels=...): that keyword
    # is deprecated since Matplotlib 3.9 (renamed to tick_labels), whereas
    # this form works on old and new versions alike.
    ax.set_xticks([1, 2])
    ax.set_xticklabels([train_label, 'Test'])
    ax.set_title(title)
    ax.set_ylabel(ylabel)


# Word counts (whitespace split) for every text
train_lengths = [len(text.split()) for text in train_texts]
test_lengths = [len(text.split()) for text in test_texts]

# Token lengths (with BERT tokenizer); training set is sampled for speed
train_token_lengths = [len(tokenizer.tokenize(text)) for text in train_texts[:TOKEN_SAMPLE_SIZE]]
test_token_lengths = [len(tokenizer.tokenize(text)) for text in test_texts]

fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Top row: histograms
plot_length_histogram(
    axes[0, 0], train_lengths, test_lengths, 'Train',
    'Distribution of Text Lengths (Words)', 'Number of Words',
)
plot_length_histogram(
    axes[0, 1], train_token_lengths, test_token_lengths, 'Train (sample)',
    'Distribution of Token Lengths (BERT)', 'Number of Tokens',
)

# Bottom row: box plots
plot_length_boxplot(
    axes[1, 0], train_lengths, test_lengths, 'Train',
    'Text Length Distribution (Words)', 'Number of Words',
)
plot_length_boxplot(
    axes[1, 1], train_token_lengths, test_token_lengths, 'Train (sample)',
    'Token Length Distribution (BERT)', 'Number of Tokens',
)

plt.tight_layout()
plt.show()

# Summary statistics, one line per series
for series_name, values in [
    ("Training text lengths", train_lengths),
    ("Test text lengths", test_lengths),
    ("Training token lengths", train_token_lengths),
    ("Test token lengths", test_token_lengths),
]:
    print(f"{series_name} - Mean: {np.mean(values):.1f}, Std: {np.std(values):.1f}")

## Sample Texts by Class

In [None]:
# Print a handful of example training texts for every sentiment class
rule = '=' * 50
for label_idx, class_name in enumerate(class_names):
    # Banner for this class
    print(f"\n{rule}")
    print(f"Sample {class_name.upper()} texts:")
    print(rule)

    # All training texts whose label equals this class's position in class_names
    class_samples = [
        sample
        for sample, sample_label in zip(train_texts, train_labels)
        if sample_label == label_idx
    ]

    # Only the first three are shown, numbered from 1 and separated by blank lines
    for rank, sample in enumerate(class_samples[:3], start=1):
        print(f"{rank}. {sample}")
        print()

## Tokenization Analysis

In [None]:
# Analyze how many texts would be truncated at different max lengths
max_lengths = [64, 128, 256, 512]


def count_truncated(token_lengths, max_len):
    """Return ``(count, percent)`` of entries in ``token_lengths`` above ``max_len``.

    An empty ``token_lengths`` yields ``(0, 0.0)`` instead of raising
    ``ZeroDivisionError``.
    """
    total = len(token_lengths)
    truncated = sum(1 for length in token_lengths if length > max_len)
    pct = (truncated / total) * 100 if total else 0.0
    return truncated, pct


truncation_stats = {}
for max_len in max_lengths:
    train_truncated, train_pct = count_truncated(train_token_lengths, max_len)
    test_truncated, test_pct = count_truncated(test_token_lengths, max_len)

    truncation_stats[max_len] = {
        'train_truncated': train_truncated,
        'train_pct': train_pct,
        'test_truncated': test_truncated,
        'test_pct': test_pct,
    }

# Display truncation statistics.
# Column widths are taken from the header cells so the rows line up with the
# header (the previous fixed widths were narrower than the header text).
columns = ["Max Length", "Train Truncated (%)", "Test Truncated (%)"]
widths = [len(column) for column in columns]
header = " | ".join(columns)

print("Truncation Analysis:")
print(header)
print("-" * len(header))
for max_len, stats in truncation_stats.items():
    print(
        f"{max_len:^{widths[0]}} | "
        f"{stats['train_pct']:^{widths[1]}.1f} | "
        f"{stats['test_pct']:^{widths[2]}.1f}"
    )