# Exploratory Data Analysis

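This notebook provides an exploratory analysis of the stock tweets dataset: basic statistics, the distribution of tweets per stock, a text preprocessing example, and a sentiment analysis example.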

In [None]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Add parent directory to path so the `src` package imports below resolve.
# Notebook cells are routinely re-executed, so guard the append: without the
# membership check every re-run would add another duplicate entry to sys.path.
project_root = str(Path().resolve().parent)
if project_root not in sys.path:
    sys.path.append(project_root)

from src.utils.data_loader import load_data, prepare_data
from src.preprocessing.text_preprocessor import TextPreprocessor

## Load Data

In [None]:
# Load the tweets CSV (sample_size=10000 is handed to the loader) and pass
# the result straight through prepare_data.
df = prepare_data(
    load_data('../data/stock_tweets.csv', sample_size=10000)
)

# Report the frame's dimensions, then its column names.
print(f"Dataset shape: {df.shape}")
column_names = df.columns.tolist()
print(f"\nColumns: {column_names}")

# Kept as the final expression so the notebook renders the first rows.
df.head()

## Basic Statistics

In [None]:
# Corpus size, followed by the number of distinct values in the
# 'Stock Name' and 'Company Name' columns.
tweet_total = len(df)
print(f"Total tweets: {tweet_total}")

unique_stocks = df['Stock Name'].nunique()
print(f"Unique stocks: {unique_stocks}")

unique_companies = df['Company Name'].nunique()
print(f"Unique companies: {unique_companies}")

# Mean tweet size, measured in characters and in whitespace-separated words.
tweet_text = df['Tweet'].str

mean_chars = tweet_text.len().mean()
print(f"\nAverage tweet length: {mean_chars:.0f} characters")

mean_words = tweet_text.split().str.len().mean()
print(f"Average word count: {mean_words:.0f} words")

## Stock Distribution

In [None]:
# Tweet counts for the 20 most frequent values of 'Stock Name'
# (value_counts sorts in descending order, so head(20) keeps the largest).
stock_counts = df['Stock Name'].value_counts().head(20)

# Draw the bar chart on an explicitly created figure/axes pair.
fig, ax = plt.subplots(figsize=(12, 6))
stock_counts.plot.bar(ax=ax)

ax.set_title('Top 20 Stocks by Tweet Count')
ax.set_xlabel('Stock')
ax.set_ylabel('Count')

# `ax` is the current axes, so this rotates its x tick labels.
plt.xticks(rotation=45)

fig.tight_layout()
plt.show()

## Text Preprocessing Example

In [None]:
# Build a preprocessor with the remove_stopwords and lemmatize options enabled.
preprocessor = TextPreprocessor(lemmatize=True, remove_stopwords=True)

# Use the first tweet in the frame as the worked example.
sample_text = df['Tweet'].iloc[0]
print("Original:", sample_text, sep="\n")

# Default call: show whatever preprocess() returns for the raw text.
print("\nPreprocessed:")
cleaned_text = preprocessor.preprocess(sample_text)
print(cleaned_text)

# Same text again, this time requesting the return_tokens=True form.
print("\nTokens:")
tokens = preprocessor.preprocess(sample_text, return_tokens=True)
print(tokens)

## Sentiment Analysis Example

In [None]:
from src.models.sentiment_analyzer import SentimentAnalyzer

# Create the analyzer with use_bert=False.
analyzer = SentimentAnalyzer(use_bert=False)

# Score the first tweet with the 'vader' and 'textblob' models.
sample_text = df['Tweet'].iloc[0]
requested_models = ['vader', 'textblob']
results = analyzer.analyze(sample_text, models=requested_models)

print(f"Sample text: {sample_text}")
print("\nSentiment Analysis Results:")

# One section per model: the label first, then every numeric entry.
for model_name, scores in results.items():
    heading = model_name.upper()
    print(f"\n{heading}:")

    label = scores.get('label', 'N/A')
    print(f"  Label: {label}")

    for field, score in scores.items():
        # The label was printed above; non-numeric entries cannot take the
        # .4f format, so both are skipped.
        if field == 'label' or not isinstance(score, (int, float)):
            continue
        print(f"  {field}: {score:.4f}")