# In [None]:
# This would be a Jupyter notebook, but here's the Python code that would be in it:

"""
# Twitter Sentiment Analysis - Data Exploration Notebook

## 1. Import Libraries and Setup
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Plot appearance: start from matplotlib's stock style, then apply seaborn's
# "husl" colour palette. Keep this order -- the palette is applied on top of
# the style, so it has to come second.
BASE_STYLE = 'default'
PALETTE_NAME = "husl"
plt.style.use(BASE_STYLE)
sns.set_palette(PALETTE_NAME)

"""
## 2. Load Data
"""

# Read the tweet dataset (the path is relative to the working directory), then
# report its dimensions and show a preview of the leading rows.
data_path = '../Twitter_Data.csv'
df = pd.read_csv(data_path)
print("Dataset shape:", df.shape)
print("\nFirst 5 rows:")
preview = df.head()
print(preview)

"""
## 3. Basic Data Information
"""

# Overview of the frame: column summary, per-column missing counts, and the
# number of rows per sentiment label.
print("Dataset Info:")
# DataFrame.info() writes its report directly to stdout and returns None, so
# it is called bare; wrapping it in print() would emit a stray "None" line.
df.info()
print("\nMissing values:")
print(df.isnull().sum())
print("\nSentiment distribution:")
print(df['category'].value_counts())

"""
## 4. Data Quality Checks
"""

# Fully duplicated rows (every column identical to an earlier row)
n_duplicates = df.duplicated().sum()
print(f"Number of duplicate rows: {n_duplicates}")

# Character count of each tweet, stored as a new column that the later
# text-length sections read back.
df['text_length'] = df['clean_text'].str.len()
length_stats = df['text_length'].describe()
print("\nText length statistics:")
print(length_stats)

"""
## 5. Sentiment Distribution Visualization
"""

fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Per-class tweet counts, ordered by label value.
sentiment_counts = df['category'].value_counts().sort_index()
sentiment_names = {-1: 'Negative', 0: 'Neutral', 1: 'Positive'}
sentiment_colors = {-1: '#FF6B6B', 0: '#4ECDC4', 1: '#45B7D1'}

# Resolve the display name and colour for each category actually present.
# Keying by label (rather than by position) keeps each sentiment on its own
# colour even when a class is absent from the data, and an unexpected label is
# shown under its raw value in grey instead of raising KeyError.
labels = [sentiment_names.get(cat, str(cat)) for cat in sentiment_counts.index]
colors = [sentiment_colors.get(cat, '#B0B0B0') for cat in sentiment_counts.index]

# Bar plot
axes[0].bar(labels, sentiment_counts.values, color=colors, alpha=0.8)
axes[0].set_title('Sentiment Distribution')
axes[0].set_xlabel('Sentiment')
axes[0].set_ylabel('Count')

# Pie chart
axes[1].pie(sentiment_counts.values, labels=labels,
            autopct='%1.1f%%', colors=colors, startangle=90)
axes[1].set_title('Sentiment Proportions')

plt.tight_layout()
plt.show()

"""
## 6. Text Length Analysis
"""

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))
box_ax, hist_ax = axes

# Left panel: one box of tweet lengths per sentiment label
sns.boxplot(x='category', y='text_length', data=df, ax=box_ax)
box_ax.set_title('Text Length by Sentiment')
box_ax.set_xlabel('Sentiment')
box_ax.set_ylabel('Text Length (characters)')

# Right panel: overall length histogram, bars stacked by sentiment label
sns.histplot(x='text_length', hue='category', data=df,
             multiple="stack", ax=hist_ax)
hist_ax.set_title('Text Length Distribution')
hist_ax.set_xlabel('Text Length')

plt.tight_layout()
plt.show()

"""
## 7. Word Frequency Analysis
"""

import re
from collections import Counter

import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

# Combine all text. Missing tweets are dropped first: astype(str) would turn
# NaN into the literal string "nan", which would then be counted as a word.
all_text = ' '.join(df['clean_text'].dropna().astype(str))

# Tokenize and count words, skipping English stop words and tokens of one or
# two characters.
tokens = re.findall(r'\b\w+\b', all_text.lower())
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in tokens if word not in stop_words and len(word) > 2]

word_freq = Counter(filtered_words)
common_words = word_freq.most_common(20)

# Plot most common words. The guard avoids a ValueError from unpacking
# zip(*[]) when no word survives the filtering above.
if common_words:
    words, counts = zip(*common_words)
    plt.figure(figsize=(12, 8))
    plt.barh(words, counts, color='skyblue')
    plt.title('Top 20 Most Frequent Words')
    plt.xlabel('Frequency')
    # barh draws the first entry at the bottom; flip so the most frequent
    # word is at the top.
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()
else:
    print("No words left to plot after filtering.")

"""
## 8. Sample Texts by Sentiment
"""

# Show the first three tweets for each sentiment label, in the order
# negative (-1), neutral (0), positive (1).
for header, label in (
    ("\nSample Negative Texts:", -1),
    ("\nSample Neutral Texts:", 0),
    ("\nSample Positive Texts:", 1),
):
    matching_texts = df.loc[df['category'] == label, 'clean_text']
    print(header)
    print(matching_texts.head(3).tolist())

"""
## 9. Save Exploration Results
"""

# Headline figures gathered from the sections above. The summary is only
# printed here; nothing is written to disk.
category_col = df['category']
exploration_summary = {
    'total_samples': len(df),
    # Counting True values of the comparison gives the same number as the
    # length of the filtered frame.
    'negative_samples': int((category_col == -1).sum()),
    'neutral_samples': int((category_col == 0).sum()),
    'positive_samples': int((category_col == 1).sum()),
    'avg_text_length': df['text_length'].mean(),
    'duplicate_count': df.duplicated().sum(),
    'missing_values': df.isnull().sum().to_dict(),
}

print("\nExploration Summary:")
for metric, result in exploration_summary.items():
    print(f"{metric}: {result}")