# 01 Data Sanity
Quick checks on healthcare ETF prices before building signals or backtests.

- **What is an ETF?** An exchange-traded fund tracks a basket of securities; here we use healthcare exposures (biotech, pharma, providers, equipment) plus SPY for context.
- **Why adjusted close?** Adjusted prices incorporate dividends and splits so total-return effects are captured and series stay comparable through corporate actions.
- **What does this sanity check look for?** Basic coverage and continuity: start/end dates, missing values per ticker, and quick visuals for obvious gaps or discontinuities.

In [None]:
import sys
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt

# Ensure repo root is on sys.path for src imports.
# Look for a 'src' folder in the working directory first and then in each
# ancestor, so the notebook still works when launched from a nested directory.
cwd = Path.cwd().resolve()
repo_root = next(
    (candidate for candidate in (cwd, *cwd.parents) if (candidate / 'src').exists()),
    cwd.parent,  # no 'src' found anywhere: keep the previous one-level-up fallback
)
# Append (rather than prepend) so already-importable packages keep priority.
if str(repo_root) not in sys.path:
    sys.path.append(str(repo_root))

from src.data.etf_loader import load_clean_prices

%matplotlib inline


In [None]:
# Load the price table through the project loader.
# NOTE(review): the code below assumes a DataFrame with a datetime-like index
# and one column per ticker -- confirm against load_clean_prices.
prices = load_clean_prices()

# Date coverage: earliest and latest index entries, reduced to calendar dates.
first_date, last_date = (
    stamp.date() for stamp in (prices.index.min(), prices.index.max())
)
print(f'First available date: {first_date}')
print(f'Last available date:  {last_date}')

# Per-ticker report: length of the column and how many of its entries are NaN.
print('\nRows and missing values per ticker:')
for ticker in prices.columns:
    column = prices[ticker]
    n_rows = len(column)
    n_missing = int(column.isna().sum())
    print(f'- {ticker}: rows={n_rows}, missing={n_missing}')


In [None]:
# Plot every column of `prices` on one set of axes with a logarithmic y-axis.
price_ax = prices.plot(
    title='Healthcare ETF Prices (Adj Close, log scale)',
    logy=True,
    figsize=(12, 6),
)
price_ax.set_ylabel('Price (log)')
# Tighten the layout of the figure that owns these axes, then render it.
price_ax.figure.tight_layout()
plt.show()


In [None]:
# Element-wise ratio of the XBI column to the XPH column, plotted on its own.
biotech, pharma = prices['XBI'], prices['XPH']
ratio = biotech.div(pharma)
ratio_ax = ratio.plot(
    title='Biotech / Pharma Ratio (XBI / XPH)',
    figsize=(12, 4),
)
ratio_ax.set_ylabel('Ratio')
# Tighten the layout of the figure that owns these axes, then render it.
ratio_ax.figure.tight_layout()
plt.show()
