# Validate features.npy / prices.npy (Alpha Vantage sentiment)

This notebook performs quick sanity checks:

- file existence
- shapes & dtypes
- NaN/Inf checks
- basic distribution stats (incl. sentiment columns)
- per-ticker row counts from `build_debug.csv`


In [None]:
import os
import numpy as np
import pandas as pd

# Artifact locations. Point DATA_DIR somewhere else if the build wrote its outputs elsewhere.
DATA_DIR = "data"
FEATURES_PATH, PRICES_PATH, DEBUG_PATH = (
    os.path.join(DATA_DIR, _name)
    for _name in ("features.npy", "prices.npy", "build_debug.csv")
)

# Report, for each artifact, the resolved path and whether it exists on disk.
for _label, _path in (
    ("FEATURES_PATH:", FEATURES_PATH),
    ("PRICES_PATH  :", PRICES_PATH),
    ("DEBUG_PATH   :", DEBUG_PATH),
):
    print(_label, _path, "exists:", os.path.exists(_path))


In [None]:
# Read both arrays and the build log from the paths configured above.
features, prices = np.load(FEATURES_PATH), np.load(PRICES_PATH)
dbg = pd.read_csv(DEBUG_PATH)

# One summary line (shape + dtype) per array.
for _label, _arr in (("features shape:", features), ("prices   shape:", prices)):
    print(_label, _arr.shape, "dtype:", _arr.dtype)

print("\n=== build_debug.csv ===")
display(dbg)


In [None]:
# Hard sanity checks. The order matters: the row-count comparison relies on the
# dimensionality checks that precede it having passed.
assert len(features.shape) == 2, "features.npy must be 2D (N, D)"
assert len(prices.shape) == 1, "prices.npy must be 1D (N,)"
assert len(features) == len(prices), "Row mismatch between features and prices"

# Every entry must be a finite number (neither NaN nor +/-Inf).
assert np.all(np.isfinite(features)), "features contains NaN/Inf"
assert np.all(np.isfinite(prices)), "prices contains NaN/Inf"
print("✅ Basic shape and finite checks passed.")


In [None]:
# Per-column summary statistics, one row per feature column.
D = features.shape[1]

# A single quantile call returns one row per requested quantile, each of length D.
p01, p50, p99 = np.quantile(features, [0.01, 0.50, 0.99], axis=0)

stats = pd.DataFrame(
    {
        "col": range(D),
        "min": features.min(axis=0),
        "p01": p01,
        "p50": p50,
        "p99": p99,
        "max": features.max(axis=0),
        "mean": features.mean(axis=0),
        "std": features.std(axis=0),
    }
)
display(stats)


## Sentiment columns

Assumption: you appended 2 Alpha Vantage features at the end:

- `sentiment` (weighted ticker sentiment)
- `sentiment_mass` (sum of relevance scores for that day)

So, with `D` the number of feature columns, we interpret:
- `sentiment_col = D-2`
- `mass_col = D-1`

If you inserted them elsewhere, just change indices below.


In [None]:
# The two sentiment features are assumed to be the LAST two feature columns
# (sentiment, then sentiment mass); change the indices below if they live elsewhere.
# With fewer than two columns, D - 2 would be negative and silently wrap around to
# an unrelated column, so fail loudly instead.
assert D >= 2, f"Expected at least 2 feature columns for sentiment/mass, got D={D}"

sentiment_col = D - 2
mass_col = D - 1

sent = features[:, sentiment_col]
mass = features[:, mass_col]

print("sentiment col index:", sentiment_col)
print("mass col index     :", mass_col)

print("\nSentiment stats:")
print(pd.Series(sent).describe())

print("\nMass stats:")
print(pd.Series(mass).describe())

# Quick check: are they mostly zeros?
# The labels say "%", so scale the fraction of exact zeros (0..1) to a percentage (0..100).
print("\n% zeros (sentiment):", 100.0 * (sent == 0).mean())
print("% zeros (mass)     :", 100.0 * (mass == 0).mean())


In [None]:
# Simple relationship checks (not a trading claim).
#
# ret1[t] is the forward one-step return (prices[t+1] - prices[t]) / prices[t], so each
# row pairs the sentiment at row t with the price move that FOLLOWS it -- the "next-day"
# proxy. The last row has no successor and stays NaN; dropna() below removes it.
#
# NOTE: the dataset is stacked across tickers, so this return series crosses ticker
# boundaries. It's only a crude sanity check that values vary; don't use it for evaluation.

# float64 keeps the precision of the price differences; a float32 buffer would
# downcast them when prices are stored as float64.
ret1 = np.full(prices.shape, np.nan, dtype=np.float64)
# The small epsilon keeps the division defined if a price is exactly zero.
ret1[:-1] = (prices[1:] - prices[:-1]) / (prices[:-1] + 1e-9)

tmp = pd.DataFrame({
    "sentiment": sent,
    "mass": mass,
    "ret1": ret1,
}).dropna()

print("Corr(sentiment, ret1):", tmp["sentiment"].corr(tmp["ret1"]))
print("Corr(mass, ret1)     :", tmp["mass"].corr(tmp["ret1"]))


## Per-ticker row counts

`build_debug.csv` records the expected number of rows for each ticker.
Because `features.npy` is a concatenation across tickers, this does not reconstruct the ticker boundaries inside it,
but the per-ticker counts should sum to its total row count (e.g., 1490 rows per ticker × 5 = 7450).


In [None]:
# The per-ticker row counts logged in build_debug.csv should add up to the number of
# rows in the stacked features array. Check it explicitly instead of relying on the
# reader to compare two numbers by eye.
expected_rows = dbg["rows_features"].sum()
actual_rows = features.shape[0]
assert expected_rows == actual_rows, (
    f"build_debug.csv reports {expected_rows} total rows "
    f"but features.npy has {actual_rows}"
)
expected_rows, actual_rows


In [None]:
# Columns of the build log shown in the per-ticker overview.
ticker_summary_cols = ["ticker", "rows_features", "feature_dim", "first_date", "last_date"]

print("Per-ticker rows:")
display(dbg.loc[:, ticker_summary_cols])
