# SPY Minute Bar Data — Exploration

First look at the data: what do we have, what does intraday activity look like,
and which days should we focus on for deeper analysis.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
sys.path.append('..')

from src.data_collection import load_bars, clean_bars, add_derived_columns, get_daily_summary

# Notebook-wide plot defaults: seaborn's whitegrid theme and a wide,
# 100-dpi canvas for every figure that does not pass its own figsize.
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.figsize': (14, 5),
    'figure.dpi': 100,
})

## Load and clean the data

In [None]:
# Load -> clean -> enrich, expressed as one pipeline; `bars` is the frame
# every later cell works from.
bars = add_derived_columns(clean_bars(load_bars()))

# Structural sanity check, then a peek at the first rows
print(f"\nShape: {bars.shape}")
print(f"Columns: {list(bars.columns)}")
bars.head(10)

In [None]:
# Summary statistics of the prepared bars — a quick sanity check on value
# ranges before plotting anything.
bars.describe()

## Daily summary

Get a feel for how days differ — volume, range, returns.

In [None]:
# Aggregate the minute bars into a per-day table (presumably one row per
# trading day — confirm against get_daily_summary). Later cells read columns
# such as date, close_price, total_volume and day_range_pct from it.
summary = get_daily_summary(bars)
summary.head(10)

In [None]:
# Three stacked panels sharing one date axis: close, volume, daily range
fig, (price_ax, volume_ax, range_ax) = plt.subplots(
    3, 1, figsize=(14, 10), sharex=True
)
trading_days = summary['date']

# top panel: daily closing price
price_ax.plot(trading_days, summary['close_price'],
              color='steelblue', linewidth=1.5)
price_ax.set_title('SPY Daily Overview — Jan to Apr 2020')
price_ax.set_ylabel('Close Price ($)')

# middle panel: shares traded per day, scaled to millions
volume_in_millions = summary['total_volume'] / 1e6
volume_ax.bar(trading_days, volume_in_millions, color='gray', alpha=0.7)
volume_ax.set_ylabel('Volume (M shares)')

# bottom panel: daily range in percent
range_ax.bar(trading_days, summary['day_range_pct'],
             color='indianred', alpha=0.7)
range_ax.set_xlabel('Date')
range_ax.set_ylabel('Day Range (%)')

# rotate the date labels on the (current) bottom axis so they don't overlap
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## Intraday volume profile

Classic U-shape pattern — high volume at open and close, quiet midday.
Let's see how this holds up across different regimes.

In [None]:
# Tag every bar with its regime: bars dated 2020-02-24 or later are 'crisis',
# everything before that date is 'normal'.
crisis_start = pd.Timestamp('2020-02-24').date()
bars['period'] = np.where(bars['date'] >= crisis_start, 'crisis', 'normal')

# Position of each bar within the session, in minutes after the open
MARKET_OPEN_MINUTE = 570  # 570 = 9:30 in minutes
bar_times = bars['datetime']
bars['minutes_since_open'] = (
    bar_times.dt.hour * 60 + bar_times.dt.minute - MARKET_OPEN_MINUTE
)

# Mean volume for each (regime, minute-of-session) pair
vol_profile = bars.groupby(
    ['period', 'minutes_since_open'], as_index=False
)['volume'].mean()

# One line per regime; volume is shown in thousands of shares
regime_colors = {'crisis': 'indianred'}
fig, ax = plt.subplots(figsize=(14, 5))
for period, profile in vol_profile.groupby('period'):
    ax.plot(
        profile['minutes_since_open'],
        profile['volume'] / 1e3,
        label=period.capitalize(),
        color=regime_colors.get(period, 'steelblue'),
        alpha=0.8,
    )

ax.set_title('Intraday Volume Profile: Normal vs Crisis')
ax.set_xlabel('Minutes Since Open')
ax.set_ylabel('Avg Volume (K shares)')
ax.legend()
plt.tight_layout()
plt.show()

## Intraday volatility profile

Similar to volume — volatility (measured by bar range) should be higher
at open/close and spike during the crash period.

In [None]:
# Mean bar range (%) for each (regime, minute-of-session) pair; relies on the
# 'period' and 'minutes_since_open' columns already present on `bars`.
vol_profile_range = bars.groupby(
    ['period', 'minutes_since_open'], as_index=False
)['bar_range_pct'].mean()

# One line per regime, crisis in red and everything else in blue
regime_colors = {'crisis': 'indianred'}
fig, ax = plt.subplots(figsize=(14, 5))
for period, profile in vol_profile_range.groupby('period'):
    ax.plot(
        profile['minutes_since_open'],
        profile['bar_range_pct'],
        label=period.capitalize(),
        color=regime_colors.get(period, 'steelblue'),
        alpha=0.8,
    )

ax.set_title('Intraday Volatility Profile: Normal vs Crisis')
ax.set_xlabel('Minutes Since Open')
ax.set_ylabel('Avg Bar Range (%)')
ax.legend()
plt.tight_layout()
plt.show()

## Return distribution

Compare the distribution of 1-minute returns across regimes.
Expect fatter tails during the crash.

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# One histogram panel per regime, left to right
panels = [('normal', 'steelblue'), ('crisis', 'indianred')]
for panel, (period, color) in zip(axes, panels):
    # 1-minute returns of this regime, missing values removed
    subset = bars.loc[bars['period'] == period, 'returns'].dropna()

    panel.hist(subset, bins=200, color=color, alpha=0.7, density=True)
    panel.set_title(f'{period.capitalize()} Period')
    panel.set_xlabel('1-Min Return')
    panel.set_ylabel('Density')
    # zoom in on the centre of the distribution
    panel.set_xlim(-0.005, 0.005)

    # summary statistics boxed in the top-right corner of the panel
    stats_text = f'Mean: {subset.mean()*100:.4f}%\nStd: {subset.std()*100:.4f}%\nKurtosis: {subset.kurtosis():.1f}'
    box_style = {'boxstyle': 'round', 'facecolor': 'white', 'alpha': 0.8}
    panel.text(
        0.95, 0.95, stats_text,
        transform=panel.transAxes,
        verticalalignment='top',
        horizontalalignment='right',
        fontsize=10,
        bbox=box_style,
    )

plt.suptitle('1-Minute Return Distributions', y=1.02)
plt.tight_layout()
plt.show()

## Single-day deep dives

Look at a few individual days to understand intraday dynamics.
These are candidates for deeper microstructure analysis.

In [None]:
def plot_single_day(bars, date, title=None):
    """Plot price and volume for a single trading day.

    Draws a two-panel figure (close line with a low-high band on top, per-bar
    volume below), shows it, and prints a few summary statistics.

    Parameters
    ----------
    bars : DataFrame
        Minute bars. The function reads the columns 'date', 'datetime',
        'open', 'high', 'low', 'close', 'volume' and 'bar_range_pct'.
    date : object
        Day to plot. It is first compared to ``bars['date']`` as given; if
        that finds nothing and the value can be parsed by ``pd.Timestamp``
        (ISO string, Timestamp, datetime, ...), the lookup is retried with
        the corresponding calendar date.
    title : str, optional
        Figure title. When omitted, a title with the day's open-to-close
        return is generated.

    Returns
    -------
    None
        If no bar matches, a "No data" message is printed instead of a plot.
    """
    day = bars[bars['date'] == date]
    if day.empty:
        # A string or Timestamp never equals a datetime.date cell, so retry
        # with the key reduced to a calendar date. Inputs that already
        # matched above never reach this branch.
        try:
            calendar_date = pd.Timestamp(date).date()
        except (TypeError, ValueError):
            calendar_date = None
        if calendar_date is not None:
            day = bars[bars['date'] == calendar_date]
    if day.empty:
        print(f"No data for {date}")
        return

    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 7), sharex=True,
                                    gridspec_kw={'height_ratios': [2, 1]})

    if title is None:
        # first bar's open to last bar's close, in percent
        ret = (day['close'].iloc[-1] / day['open'].iloc[0] - 1) * 100
        title = f'SPY — {date} (Return: {ret:+.2f}%)'

    # price: close line with a shaded low-high band per bar
    ax1.plot(day['datetime'], day['close'], color='steelblue', linewidth=1)
    ax1.fill_between(day['datetime'], day['low'], day['high'], alpha=0.15, color='steelblue')
    ax1.set_ylabel('Price ($)')
    ax1.set_title(title)

    # volume: one thin bar per row, in thousands of shares
    ax2.bar(day['datetime'], day['volume'] / 1e3, width=0.0005, color='gray', alpha=0.6)
    ax2.set_ylabel('Volume (K)')
    ax2.set_xlabel('Time')

    plt.tight_layout()
    plt.show()

    # print some stats (the range percentage is relative to the first open)
    print(f"  Total volume: {day['volume'].sum():,.0f}")
    print(f"  Range: ${day['low'].min():.2f} - ${day['high'].max():.2f} ({(day['high'].max()-day['low'].min())/day['open'].iloc[0]*100:.2f}%)")
    print(f"  Avg bar volume: {day['volume'].mean():,.0f}")
    print(f"  Median bar range: {day['bar_range_pct'].median():.4f}%")

In [None]:
import datetime

# quiet session used as the calm reference day
quiet_day = datetime.date(2020, 1, 16)
plot_single_day(bars, quiet_day, title='Quiet Day — Jan 16')

In [None]:
# a session with moderate volatility
moderate_day = datetime.date(2020, 1, 24)
plot_single_day(bars, moderate_day, title='Moderate Vol — Jan 24')

In [None]:
# early in the crisis period
early_crisis_day = datetime.date(2020, 2, 27)
plot_single_day(bars, early_crisis_day, title='Early Crisis — Feb 27')

In [None]:
# the most extreme session of the selection
peak_chaos_day = datetime.date(2020, 3, 12)
plot_single_day(bars, peak_chaos_day, title='Peak Chaos — Mar 12')

In [None]:
# the bounce after the sell-off
recovery_day = datetime.date(2020, 3, 24)
plot_single_day(bars, recovery_day, title='Recovery Bounce — Mar 24')

## Volume-return relationship

Do big moves (measured here by the daily high-low range) come with big volume? (Spoiler: yes, but it's worth seeing.)

In [None]:
fig, ax = plt.subplots(figsize=(10, 6))

# One scatter series per regime; a day belongs to a regime when any of its
# minute bars carries that 'period' label.
colors = {'normal': 'steelblue', 'crisis': 'indianred'}
for period in ('normal', 'crisis'):
    regime_days = bars.loc[bars['period'] == period, 'date'].unique()
    subset = summary[summary['date'].isin(regime_days)]
    ax.scatter(
        subset['total_volume'] / 1e6,
        subset['day_range_pct'],
        color=colors[period],
        alpha=0.6,
        label=period.capitalize(),
        s=50,
    )

ax.set_title('Volume vs Range — Each Point is a Trading Day')
ax.set_xlabel('Daily Volume (M shares)')
ax.set_ylabel('Daily Range (%)')
ax.legend()
plt.tight_layout()
plt.show()

## Selected analysis days

Based on the exploration, these 5 days give good coverage across regimes:

| Day | Date | Character | Why |
|-----|------|-----------|-----|
| 1 | Jan 16 | Quiet | Low vol, tight range — baseline for "normal" microstructure |
| 2 | Jan 24 | Moderate | Slightly elevated vol, decent range — typical active day |
| 3 | Feb 27 | Early crisis | First big selloff, 4.6% range — transition to panic |
| 4 | Mar 12 | Peak chaos | 7.4% range, 347M shares — extreme stress |
| 5 | Mar 24 | Recovery | Big bounce, 4.4% range — how does the book recover? |

In [None]:
import datetime

# The five days picked for deeper analysis, as (month, day) pairs in 2020
analysis_dates = [
    datetime.date(2020, month, day)
    for month, day in [(1, 16), (1, 24), (2, 27), (3, 12), (3, 24)]
]

# Columns shown in the overview table, in display order
overview_columns = [
    'date', 'num_bars', 'total_volume', 'open_price', 'close_price',
    'day_range_pct', 'return_pct', 'avg_bar_volume',
]

is_selected = summary['date'].isin(analysis_dates)
selected = summary[is_selected]
selected[overview_columns].round(2)