# EDA: VADER Baseline on Sample Financial Texts

## Imports & Paths

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from pathlib import Path
from src.preprocess.text_cleaner import clean_text
from src.models.sentiment import score_vader

# Relative paths, resolved against the current working directory.
# RAW is the input CSV read by the "Load & Clean" cell.
RAW = Path("data/raw/sample_texts.csv")
# OUT is where the scored DataFrame is written by the "Save Processed" cell.
OUT = Path("data/processed/sample_texts_scored.csv")

## Load & Clean

In [None]:
# Read the raw sample (parsing "timestamp" as datetimes) and append a
# "text_clean" column holding clean_text applied to every "text" entry.
df = pd.read_csv(RAW, parse_dates=["timestamp"]).assign(
    text_clean=lambda frame: frame["text"].map(clean_text)
)
df.head()

## Score with VADER

In [None]:
# Score the cleaned text column directly via `text_col`.
# Renaming "text_clean" to "text" while the raw "text" column is still present
# produces two columns both named "text"; selecting "text" would then return a
# DataFrame instead of a Series.
# NOTE(review): assumes score_vader only reads the column named by `text_col`
# and returns the input columns plus the sentiment scores — confirm against
# src.models.sentiment.
# A copy is passed so `df` stays untouched, as it did when the renamed frame
# was passed.
scored = score_vader(df.copy(), text_col="text_clean")
scored.head()

## Basic aggregates

In [None]:
# Mean compound score per ticker, ordered from highest to lowest.
ticker_means = scored.groupby("ticker")["compound"].mean()
agg_ticker = ticker_means.sort_values(ascending=False)
display(agg_ticker)

# Put rows in chronological order, then average the compound score of each
# ticker inside 15-minute bins on the timestamp index.
scored = scored.sort_values("timestamp")
compound_by_ticker = scored.set_index("timestamp").groupby("ticker")["compound"]
sent_time = compound_by_ticker.resample("15min").mean().reset_index()
sent_time.head()

## Plots

In [None]:
# Histogram of the compound scores over all scored rows.
hist_ax = scored["compound"].plot.hist(bins=20, title="Compound Score Distribution")
hist_ax.set_xlabel("compound")
plt.show()

# One line chart per ticker, drawn from the 15-minute averages.
for symbol in sent_time["ticker"].unique():
    per_ticker = sent_time.loc[sent_time["ticker"] == symbol]
    line_ax = per_ticker.plot(
        x="timestamp",
        y="compound",
        title=f"{symbol} Sentiment over Time",
    )
    line_ax.set_ylabel("compound")
    plt.show()

## Save Processed

In [None]:
# Create the destination folder first: to_csv does not create missing parent
# directories, so a fresh checkout without data/processed/ would fail here.
OUT.parent.mkdir(parents=True, exist_ok=True)
# Write the scored rows without the index column.
scored.to_csv(OUT, index=False)
# Echo the output path as the cell result.
OUT