# Pipeline API - Data Exploration & Research

This notebook demonstrates how to use Zipline's Pipeline API for:
- Querying and exploring stock data
- Building custom factors
- Filtering and screening stocks
- Analyzing factor distributions

Pipeline is perfect for research and factor development before building a strategy.

In [None]:
# Register Sharadar bundle (required for Jupyter notebooks)
from zipline.data.bundles import register
from zipline.data.bundles.sharadar_bundle import sharadar_bundle

# Build the bundle definition first, then register it under the name 'sharadar'.
sharadar_definition = sharadar_bundle(
    tickers=None,
    incremental=True,
    include_funds=True,
)
register('sharadar', sharadar_definition)
print("✓ Sharadar bundle registered")

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from zipline.pipeline import Pipeline, CustomFactor
from zipline.pipeline.data import USEquityPricing
from zipline.pipeline.factors import (
    Returns,
    SimpleMovingAverage,
    AverageDollarVolume,
    RSI,
)
from zipline.pipeline.loaders import USEquityPricingLoader
from zipline.data.bundles import load

# Global plotting defaults: seaborn dark grid and a 14x8 default figure size
sns.set_style('darkgrid')
plt.rcParams.update({'figure.figsize': (14, 8)})

print("✓ Imports complete")

## Setup Pipeline Environment

To run Pipeline standalone (without a backtest), we need to load the data bundle and build a pipeline engine on top of it.

In [None]:
from zipline.utils.calendar_utils import get_calendar
from zipline.pipeline.engine import SimplePipelineEngine
from zipline.pipeline.loaders import USEquityPricingLoader
from zipline.data.data_portal import DataPortal

# Load the bundle registered as 'sharadar'
bundle_data = load('sharadar')

# Trading calendar for exchange code 'XNYS'
trading_calendar = get_calendar('XNYS')

# Pricing loader backed by the bundle's daily bars and adjustments (no FX conversion)
daily_bar_reader = bundle_data.equity_daily_bar_reader
pricing_loader = USEquityPricingLoader.without_fx(
    daily_bar_reader,
    bundle_data.adjustment_reader,
)


# Create pipeline engine
def make_engine():
    """Return a SimplePipelineEngine that serves every column from ``pricing_loader``."""

    def _loader_for(column):
        # The requested column is ignored: one loader handles all of them.
        return pricing_loader

    return SimplePipelineEngine(
        get_loader=_loader_for,
        asset_finder=bundle_data.asset_finder,
    )


engine = make_engine()

print("✓ Pipeline engine initialized")
print(f"  Available date range: {bundle_data.equity_daily_bar_reader.first_trading_day} to {bundle_data.equity_daily_bar_reader.last_available_dt}")

## Example 1: Basic Pipeline - Query Stock Data

Let's create a simple pipeline to get price and volume data.

In [None]:
# Define date range for research.
# The pipeline engine only accepts trading sessions as start/end dates, and
# both 2023-01-01 and 2023-12-31 fall on Sundays. Snap the calendar-year
# window to the first and last sessions it contains.
research_sessions = trading_calendar.sessions_in_range(
    pd.Timestamp('2023-01-01'),
    pd.Timestamp('2023-12-31'),
)
start_date = research_sessions[0]
end_date = research_sessions[-1]

# Build a minimal pipeline: latest close/volume plus a liquidity measure
def make_basic_pipeline():
    """Return a Pipeline of close, volume and 30-day average dollar volume.

    The output is screened to the 500 assets with the highest average
    dollar volume.
    """
    # One factor instance serves as both an output column and the screen input
    avg_dollar_volume = AverageDollarVolume(window_length=30)

    columns = {
        'close': USEquityPricing.close.latest,
        'volume': USEquityPricing.volume.latest,
        'dollar_volume': avg_dollar_volume,
    }
    return Pipeline(columns=columns, screen=avg_dollar_volume.top(500))


# Evaluate the pipeline for one date only (start and end are the same)
pipeline = make_basic_pipeline()
result = engine.run_pipeline(pipeline, start_date, start_date)

print(f"✓ Pipeline returned {len(result)} stocks")
print("\nSample data:")
result.head(10)

## Example 2: Custom Factors - Momentum and Volatility

Create custom factors for momentum, volatility, and mean-reversion analysis.

In [None]:
class Momentum(CustomFactor):
    """
    Price momentum as a fractional return across the lookback window.

    Computed as ``(last - first) / first`` from the first and last rows of
    a 60-row window of closing prices.
    """
    inputs = [USEquityPricing.close]
    window_length = 60

    def compute(self, today, assets, out, close):
        # Per the CustomFactor contract, rows are ordered oldest to newest.
        window_start = close[0]
        window_end = close[-1]
        out[:] = (window_end - window_start) / window_start


class Volatility(CustomFactor):
    """
    Price volatility: standard deviation (NumPy default, ddof=0) of simple
    row-over-row returns within a 20-row window of closing prices.
    """
    inputs = [USEquityPricing.close]
    window_length = 20

    def compute(self, today, assets, out, close):
        previous_close = close[:-1]
        # Change between consecutive rows, relative to the earlier row
        simple_returns = (close[1:] - previous_close) / previous_close
        out[:] = simple_returns.std(axis=0)


class MeanReversion(CustomFactor):
    """
    Mean reversion: fractional deviation of the last close from the mean
    close of a 20-row window.
    """
    inputs = [USEquityPricing.close]
    window_length = 20

    def compute(self, today, assets, out, close):
        window_mean = close.mean(axis=0)
        latest_close = close[-1]
        out[:] = (latest_close - window_mean) / window_mean


# Confirmation that the factor-definition cell ran to completion
print("✓ Custom factors defined")

In [None]:
# Assemble a pipeline combining the custom and built-in factors
def make_research_pipeline():
    """Return a Pipeline of raw factors plus z-scored momentum and volatility.

    The universe is the 500 assets with the highest 30-day average dollar
    volume; it is used both as the z-score mask and as the pipeline screen.
    """
    liquidity = AverageDollarVolume(window_length=30)
    liquid_universe = liquidity.top(500)

    momentum_factor = Momentum()
    volatility_factor = Volatility()

    columns = {
        'close': USEquityPricing.close.latest,
        'momentum': momentum_factor,
        'volatility': volatility_factor,
        'mean_reversion': MeanReversion(),
        'rsi': RSI(),
        # z-scores are normalized over the liquid universe only
        'momentum_z': momentum_factor.zscore(mask=liquid_universe),
        'volatility_z': volatility_factor.zscore(mask=liquid_universe),
        'dollar_volume': liquidity,
    }
    return Pipeline(columns=columns, screen=liquid_universe)


# Evaluate the pipeline across the whole research window
pipeline = make_research_pipeline()
results = engine.run_pipeline(pipeline, start_date, end_date)

print(f"✓ Pipeline returned {len(results)} stock-date observations")
print(f"  Date range: {results.index.get_level_values(0).min()} to {results.index.get_level_values(0).max()}")
print(f"  Unique stocks: {results.index.get_level_values(1).nunique()}")
print("\nFactor statistics:")
results[['momentum', 'volatility', 'mean_reversion', 'rsi']].describe()

## Example 3: Factor Distribution Analysis

Analyze the distribution of factors across the universe.

In [None]:
# Cross-section of the results on the most recent date
latest_date = results.index.get_level_values(0).max()
latest_data = results.loc[latest_date]


def _draw_histogram(ax, column, title, xlabel, **hist_kwargs):
    """Draw a 50-bin histogram of one ``latest_data`` column on ``ax``."""
    ax.hist(latest_data[column].dropna(), bins=50, edgecolor='black', alpha=0.7, **hist_kwargs)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')


def _mark_median(ax, column):
    """Add a dashed red line at the column's median and show the legend."""
    ax.axvline(latest_data[column].median(), color='red', linestyle='--', label='Median')
    ax.legend()


# 2x2 grid: three histograms and one scatter plot
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
(momentum_ax, volatility_ax), (rsi_ax, scatter_ax) = axes

# Momentum distribution
_draw_histogram(momentum_ax, 'momentum', 'Momentum Distribution', '60-day Return')
_mark_median(momentum_ax, 'momentum')

# Volatility distribution
_draw_histogram(volatility_ax, 'volatility', 'Volatility Distribution', '20-day Std Dev', color='orange')
_mark_median(volatility_ax, 'volatility')

# RSI distribution with fixed reference lines at 30 and 70
_draw_histogram(rsi_ax, 'rsi', 'RSI Distribution', 'RSI', color='green')
rsi_ax.axvline(30, color='red', linestyle='--', label='Oversold')
rsi_ax.axvline(70, color='red', linestyle='--', label='Overbought')
rsi_ax.legend()

# Momentum vs Volatility scatter (rows missing either value are dropped)
scatter_data = latest_data[['momentum', 'volatility']].dropna()
scatter_ax.scatter(scatter_data['volatility'], scatter_data['momentum'], alpha=0.5, s=10)
scatter_ax.set_title('Momentum vs Volatility', fontsize=14, fontweight='bold')
scatter_ax.set_xlabel('Volatility')
scatter_ax.set_ylabel('Momentum')
scatter_ax.axhline(0, color='black', linestyle='-', alpha=0.3)

plt.tight_layout()
plt.show()

## Example 4: Factor Correlation Analysis

Understand how factors relate to each other.

In [None]:
# Pairwise correlations between the raw factors on the latest date
factor_cols = ['momentum', 'volatility', 'mean_reversion', 'rsi']
correlations = latest_data[factor_cols].corr()

# Heatmap styling: annotated cells, diverging colours centred on zero
heatmap_options = dict(
    annot=True,
    fmt='.3f',
    cmap='RdYlGn',
    center=0,
    vmin=-1,
    vmax=1,
    square=True,
    linewidths=1,
)

plt.figure(figsize=(10, 8))
sns.heatmap(correlations, **heatmap_options)
plt.title('Factor Correlation Matrix', fontsize=14, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

print("\nFactor Correlations:")
print(correlations)

## Example 5: Quintile Analysis

Divide stocks into quintiles based on a factor and analyze characteristics.

In [None]:
# Divide into quintiles based on momentum
quintile_labels = ['Q1 (Low)', 'Q2', 'Q3', 'Q4', 'Q5 (High)']

# Bucket on ranks rather than raw values: rank(method='first') breaks ties,
# which keeps the quantile edges unique and so keeps all five labels valid.
# (Passing duplicates='drop' together with a fixed label list raises a
# ValueError whenever an edge is actually dropped.) Missing momentum values
# keep a missing rank and therefore a missing quintile.
momentum_ranks = latest_data['momentum'].rank(method='first')

# assign() returns a new frame, so the column is not written into a slice of
# `results` (the source of SettingWithCopyWarning) and re-running this cell
# gives the same result.
latest_data = latest_data.assign(
    momentum_quintile=pd.qcut(
        momentum_ranks,
        q=len(quintile_labels),
        labels=quintile_labels,
    )
)

# Analyze characteristics by quintile.
# observed=False is spelled out so every label appears in the table and the
# pandas deprecation warning for categorical group keys is not triggered.
quintile_analysis = latest_data.groupby('momentum_quintile', observed=False)[
    ['momentum', 'volatility', 'rsi', 'dollar_volume']
].agg(['mean', 'median', 'count'])

print("\nMomentum Quintile Analysis:")
print("="*80)
print(quintile_analysis)

# One row of three bar charts summarising the quintile table
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
momentum_ax, volatility_ax, count_ax = axes

momentum_stats = quintile_analysis['momentum']
volatility_stats = quintile_analysis['volatility']

# Average momentum by quintile
momentum_stats['mean'].plot.bar(ax=momentum_ax, color='steelblue')
momentum_ax.set_title('Average Momentum by Quintile', fontsize=12, fontweight='bold')
momentum_ax.set_ylabel('60-day Return')
momentum_ax.axhline(0, color='black', linestyle='-', alpha=0.3)

# Average volatility by quintile
volatility_stats['mean'].plot.bar(ax=volatility_ax, color='coral')
volatility_ax.set_title('Average Volatility by Momentum Quintile', fontsize=12, fontweight='bold')
volatility_ax.set_ylabel('20-day Std Dev')

# Number of stocks with a momentum value in each quintile
momentum_stats['count'].plot.bar(ax=count_ax, color='lightgreen')
count_ax.set_title('Stock Count by Quintile', fontsize=12, fontweight='bold')
count_ax.set_ylabel('Number of Stocks')

plt.tight_layout()
plt.show()

## Example 6: Stock Screening

Use Pipeline to screen for specific stock characteristics.

In [None]:
# Screening rules applied to the latest cross-section:
# - momentum above its 80th percentile
# - volatility below its median
# - RSI below 70

momentum_cutoff = latest_data['momentum'].quantile(0.80)
volatility_cutoff = latest_data['volatility'].quantile(0.50)

high_momentum = latest_data['momentum'] > momentum_cutoff
low_volatility = latest_data['volatility'] < volatility_cutoff
not_overbought = latest_data['rsi'] < 70

# A stock must pass all three rules
passes_screen = high_momentum & low_volatility & not_overbought
screened_stocks = latest_data[passes_screen]

print(f"\n✓ Screen found {len(screened_stocks)} stocks")
print("\nTop 10 Screened Stocks:")
print("="*80)

# Ten highest-momentum survivors, shown with a fixed set of columns
top_stocks = screened_stocks.nlargest(10, 'momentum')
display_cols = ['close', 'momentum', 'volatility', 'rsi', 'dollar_volume']
print(top_stocks[display_cols].to_string())

# Resolve each asset to its ticker symbol and print a one-line summary
print("\nStock Symbols:")
summary_rows = zip(top_stocks.index, top_stocks['momentum'], top_stocks['volatility'])
for asset, momentum, volatility in summary_rows:
    symbol = bundle_data.asset_finder.retrieve_asset(asset.sid).symbol
    print(f"  {symbol:<10} Momentum: {momentum:>7.2%}  Volatility: {volatility:>6.4f}")

## Example 7: Time Series Analysis

Track factor values over time for a specific stock.

In [None]:
# Stock under study: AAPL, resolved as of the end of the research window
aapl = bundle_data.asset_finder.lookup_symbol('AAPL', as_of_date=end_date)

# Select AAPL's rows from the second index level, leaving a date-indexed frame
aapl_data = results.xs(aapl, level=1)
dates = aapl_data.index
aapl_momentum = aapl_data['momentum']

# Three stacked panels sharing the date axis
fig, axes = plt.subplots(3, 1, figsize=(14, 12), sharex=True)
momentum_ax, volatility_ax, rsi_ax = axes

# Momentum over time, shaded green above zero and red below
momentum_ax.plot(dates, aapl_momentum, linewidth=2, color='steelblue')
momentum_ax.axhline(0, color='black', linestyle='--', alpha=0.3)
momentum_ax.fill_between(dates, 0, aapl_momentum,
                         where=aapl_momentum >= 0, alpha=0.3, color='green', label='Positive')
momentum_ax.fill_between(dates, 0, aapl_momentum,
                         where=aapl_momentum < 0, alpha=0.3, color='red', label='Negative')
momentum_ax.set_ylabel('Momentum')
momentum_ax.set_title('AAPL Factor Analysis (2023)', fontsize=14, fontweight='bold')
momentum_ax.legend()

# Volatility over time
volatility_ax.plot(dates, aapl_data['volatility'], linewidth=2, color='coral')
volatility_ax.set_ylabel('Volatility')

# RSI over time with reference lines at 70 and 30
rsi_ax.plot(dates, aapl_data['rsi'], linewidth=2, color='purple')
rsi_ax.axhline(70, color='red', linestyle='--', alpha=0.5, label='Overbought')
rsi_ax.axhline(30, color='green', linestyle='--', alpha=0.5, label='Oversold')
rsi_ax.set_ylabel('RSI')
rsi_ax.set_xlabel('Date')
rsi_ax.legend()

# Same light grid on every panel
for panel in axes:
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Summary

This notebook demonstrated:

1. **Pipeline Setup** - Running pipelines standalone for research
2. **Custom Factors** - Building momentum, volatility, and mean reversion factors
3. **Distribution Analysis** - Understanding factor characteristics
4. **Correlation Analysis** - Finding relationships between factors
5. **Quintile Analysis** - Comparing stock characteristics across factor buckets
6. **Stock Screening** - Finding stocks that meet specific criteria
7. **Time Series Analysis** - Tracking factors over time

## Next Steps

- See `08_alphalens_factor_analysis.ipynb` for factor performance analysis
- See `09_multi_factor_research.ipynb` for combining multiple factors
- Use these techniques to develop and test your own factors
- Integrate successful factors into trading strategies