### EDA: Sentiment vs Market Returns

This notebook explores relationships between daily sentiment and equity returns.

- Loads processed features for a chosen ticker
- Visualizes price vs sentiment, sentiment vs next-day return
- Computes correlations, cross-correlations, and Granger causality
- Saves figures to `reports/figs/` and summaries to `reports/`



In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from statsmodels.tsa.stattools import ccf, grangercausalitytests

# Input/output locations, resolved relative to the parent of the current working directory.
project_root = Path.cwd().parent
PROC_DIR = project_root / 'data' / 'processed'
REPORTS_DIR = project_root / 'reports'
FIGS_DIR = REPORTS_DIR / 'figs'
# figs/ is created first (parents=True also creates reports/); the second mkdir is then a no-op.
for out_dir in (FIGS_DIR, REPORTS_DIR):
    out_dir.mkdir(parents=True, exist_ok=True)

ticker = 'AAPL'  # change here
path = PROC_DIR / f'features_{ticker}.parquet'

if not path.exists():
    # No processed features on disk: build a small, seeded synthetic frame so
    # the rest of the notebook still runs end to end.
    n_days = 120
    bdays = pd.date_range('2024-01-01', periods=n_days, freq='B')
    gen = np.random.default_rng(42)
    # Draw order matters for reproducibility: sentiment first, then return noise.
    sentiment_walk = gen.normal(0, 0.1, size=n_days).cumsum() / 5
    return_noise = gen.normal(0, 0.01, size=n_days)
    # Day-over-day sentiment change (0 on the first day) feeds into the same day's return.
    sentiment_step = np.concatenate([[0], np.diff(sentiment_walk)])
    daily_ret = return_noise + 0.2 * sentiment_step
    ret_series = pd.Series(daily_ret, index=bdays)
    df = pd.DataFrame({
        'date': bdays,
        'adj_close': 100 * (1 + ret_series).cumprod(),
        'daily_mean_sentiment': pd.Series(sentiment_walk, index=bdays).rolling(3, min_periods=1).mean(),
        'return': daily_ret,
        'target_return_1d': ret_series.shift(-1),
    })
else:
    df = pd.read_parquet(path)

# Derive the next-day target when the file only ships same-day returns.
# Sorting by date first makes shift(-1) mean "the following row's return".
has_target = 'target_return_1d' in df.columns
if not has_target and 'return' in df.columns:
    df = (
        df.sort_values('date')
        .reset_index(drop=True)
        .assign(target_return_1d=lambda frame: frame['return'].shift(-1))
    )

df.head()


In [None]:
# Dual-axis plot: adjusted close on the left y-axis, daily sentiment on the
# right y-axis, both drawn against the shared date axis.
price_color = 'tab:blue'
sentiment_color = 'tab:orange'

fig, ax1 = plt.subplots(figsize=(10, 5))
ax2 = ax1.twinx()  # second y-axis sharing ax1's x-axis

ax1.plot(df['date'], df['adj_close'], color=price_color, label='Adj Close')
ax2.plot(
    df['date'],
    df['daily_mean_sentiment'],
    color=sentiment_color,
    alpha=0.7,
    label='Daily Sentiment',
)

# Colour each y-label like its line so the reader can tell the axes apart.
ax1.set_xlabel('Date')
ax1.set_ylabel('Adj Close', color=price_color)
ax2.set_ylabel('Daily Sentiment', color=sentiment_color)
ax1.set_title(f"{ticker}: Adjusted Close vs Daily Sentiment")
fig.tight_layout()

out = FIGS_DIR / f"{ticker}_dual_axis.png"
fig.savefig(out, dpi=150)
out


In [None]:
# Scatter of sentiment against next-day return, with a fitted regression line
# and the Pearson correlation reported in the title.
sentiment = df['daily_mean_sentiment']
next_day_ret = df['target_return_1d']

# Pearson needs complete pairs; with two or fewer of them, report NaN instead.
paired = sentiment.notna() & next_day_ret.notna()
if paired.sum() > 2:
    r, p = stats.pearsonr(sentiment[paired], next_day_ret[paired])
else:
    r, p = np.nan, np.nan

fig, ax = plt.subplots(figsize=(7, 5))
sns.regplot(x=sentiment, y=next_day_ret, ax=ax, scatter_kws={'alpha':0.6})
ax.set(
    xlabel='Daily Mean Sentiment',
    ylabel='Next-day Return',
    title=f"{ticker}: Sentiment vs Next-day Return\nPearson r={r:.3f}, p={p:.3g}",
)
fig.tight_layout()

out = FIGS_DIR / f"{ticker}_scatter_fit.png"
fig.savefig(out, dpi=150)

r, p, out


In [None]:
# Correlation heatmap over whichever of the expected columns are present in df.
wanted_cols = ('adj_close', 'daily_mean_sentiment', 'return', 'target_return_1d')
cols = [name for name in wanted_cols if name in df.columns]
corr_matrix = df[cols].corr()

fig, ax = plt.subplots(figsize=(6,5))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title(f"{ticker}: Correlation Heatmap")
fig.tight_layout()

out = FIGS_DIR / f"{ticker}_corr_heatmap.png"
fig.savefig(out, dpi=150)
out


In [None]:
# Pearson & Spearman correlations of sentiment against same-day and next-day
# returns, collected into one summary table (one row per return column found).
rows = []
for col in (name for name in ('return', 'target_return_1d') if name in df.columns):
    sentiment = df['daily_mean_sentiment']
    other = df[col]
    paired = sentiment.notna() & other.notna()
    if paired.sum() <= 2:
        # Too few complete pairs for a meaningful coefficient.
        r = p = rho = ps = np.nan
    else:
        r, p = stats.pearsonr(sentiment[paired], other[paired])
        rho, ps = stats.spearmanr(sentiment[paired], other[paired])
    rows.append(dict(
        metric=f'sentiment vs {col}',
        pearson_r=r,
        pearson_p=p,
        spearman_rho=rho,
        spearman_p=ps,
    ))

corr_df = pd.DataFrame(rows)
corr_df


In [None]:
# Cross-correlation function across lags -max_lag..+max_lag.
#
# statsmodels' ccf(a, b) returns only NON-NEGATIVE lags: element k is
# corr(a[t+k], b[t]), i.e. b leads a by k observations. A symmetric lag axis
# therefore has to be assembled from two calls:
#   ccf(returns, sentiment)[k] -> sentiment leads returns by k   (plotted at +k)
#   ccf(sentiment, returns)[k] -> returns lead sentiment by k    (plotted at -k)
# Slicing the first 2*max_lag+1 values of a single call and relabelling them
# as -max_lag..+max_lag would mislabel every bar.
max_lag = 10

# Missing values are replaced by 0 so both series keep the same length and
# alignment. ccf demeans its inputs itself, so no manual centering is needed.
x = df['daily_mean_sentiment'].fillna(0).values
if 'return' in df.columns:
    y = df['return'].fillna(0).values
else:
    # target_return_1d is built in this notebook as return.shift(-1); shift it
    # back one row so that lag 0 still means "same day". (Assumes a loaded
    # file follows the same definition -- confirm for external data.)
    y = df['target_return_1d'].shift(1).fillna(0).values

# Never request more lags than the series can provide.
n_lags = max(min(max_lag, len(x) - 1), 0)
sentiment_leads = ccf(y, x, adjusted=False)[: n_lags + 1]   # lags 0..+n_lags
returns_lead = ccf(x, y, adjusted=False)[1 : n_lags + 1]    # lags 1..n_lags (negated below)
vals = np.concatenate([returns_lead[::-1], sentiment_leads])
lags = np.arange(-n_lags, n_lags + 1)

# A constant series gives an all-NaN CCF, on which np.nanargmax would raise.
if np.isfinite(vals).any():
    best_idx = int(np.nanargmax(np.abs(vals)))
    best_lag = int(lags[best_idx])
else:
    best_lag = None

fig, ax = plt.subplots(figsize=(8,4))
ax.bar(lags, vals, color='tab:purple')
ax.axvline(0, color='black', linewidth=1)
ax.set_xlabel('Lag (sentiment leads +)')
ax.set_ylabel('Cross-correlation')
ax.set_title(f'{ticker}: CCF (max at lag {best_lag})')
fig.tight_layout()

out = FIGS_DIR / f"{ticker}_ccf.png"
fig.savefig(out, dpi=150)

best_lag, out


In [None]:
# Granger causality tests (lags 1..5)
# Column order matters: grangercausalitytests checks whether the SECOND column
# helps predict the FIRST, so returns go first and sentiment second.
return_col = 'return' if 'return' in df.columns else 'target_return_1d'
data = df[[return_col, 'daily_mean_sentiment']].dropna()
res = grangercausalitytests(data, maxlag=5, verbose=False)

# res[lag][0] maps each test name to a result tuple whose element 1 is the p-value.
lines = ["Granger Causality: daily_mean_sentiment -> returns"]
for lag in range(1, 6):
    pvalue_summary = ", ".join(
        f"{name}={result[1]:.3g}" for name, result in res[lag][0].items()
    )
    lines.append(f"lag={lag}: p-values: " + pvalue_summary)

text_path = REPORTS_DIR / f'granger_{ticker}.txt'
text_path.write_text("\n".join(lines) + "\n", encoding='utf-8')

text_path


#### Interpretation notes

- Correlations: Report both Pearson and Spearman; a small |r| (or |ρ|) with a high p-value suggests a weak linear (Pearson) or monotonic (Spearman) relationship.
- Cross-correlation: Positive lag peak suggests sentiment leads returns by that many days; negative lag means returns lead sentiment.
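- Granger causality: If p-values (ssr_ftest/ssr_chi2test) < 0.05 at some lags, we have evidence that past sentiment helps predict returns at those horizons beyond what past returns alone explain. This is evidence of incremental predictability, not of causation.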

