# Multi-Factor Research with Alphalens

This notebook demonstrates advanced factor research:
- Building and testing multiple factors simultaneously
- Creating composite factors from individual signals
- Comparing factor performance
- Building a multi-factor ranking system

This is how professional quant researchers develop systematic strategies.

In [None]:
# Register Sharadar bundle (required for Jupyter notebooks)
from zipline.data.bundles import register
from zipline.data.bundles.sharadar_bundle import sharadar_bundle

# Build the bundle definition first, then register it under the name
# 'sharadar' so it can be looked up by that name.
sharadar_definition = sharadar_bundle(
    tickers=None,
    incremental=True,
    include_funds=True,
)
register('sharadar', sharadar_definition)
print("✓ Sharadar bundle registered")

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

try:
    import alphalens as al
    print("✓ Alphalens imported")
except ImportError:
    print("⚠️  Install: pip install alphalens-reloaded")
    raise

from zipline.pipeline import Pipeline, CustomFactor
from zipline.pipeline.data import USEquityPricing
from zipline.pipeline.factors import AverageDollarVolume, Returns, RSI
from zipline.data.bundles import load
from zipline.utils.calendar_utils import get_calendar
from zipline.pipeline.engine import SimplePipelineEngine
from zipline.pipeline.loaders import USEquityPricingLoader

# Notebook-wide plotting defaults: 14x8 figures on seaborn's 'darkgrid' style.
plt.rcParams.update({'figure.figsize': (14, 8)})
sns.set_style('darkgrid')

print("✓ Imports complete")

## Setup Pipeline Environment

In [None]:
# Open the registered bundle and fetch the NYSE ('XNYS') trading calendar.
bundle_data = load('sharadar')
trading_calendar = get_calendar('XNYS')

# A single pricing loader backed by the bundle's daily bars and adjustments.
daily_bar_reader = bundle_data.equity_daily_bar_reader
adjustment_reader = bundle_data.adjustment_reader
pricing_loader = USEquityPricingLoader.without_fx(
    daily_bar_reader,
    adjustment_reader,
)


def _loader_for_column(column):
    """Return the shared pricing loader regardless of the requested column."""
    return pricing_loader


engine = SimplePipelineEngine(
    get_loader=_loader_for_column,
    asset_finder=bundle_data.asset_finder,
)

print("✓ Pipeline engine initialized")

## Define Multiple Factors

We'll create several factors representing different investment styles:
- **Momentum**: Trend-following
- **Mean Reversion**: Contrarian
- **Volatility**: Risk factor
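- **Value (proxy)**: RSI-based overbought/oversold signal — a technical stand-in, since RSI is a momentum oscillator rather than a fundamental value measure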

In [None]:
class Momentum(CustomFactor):
    """
    Price momentum measured over a 60-row window of closing prices.

    The factor value is the fractional price change between the first and
    the last row of the window (59 day-to-day intervals).
    """
    inputs = [USEquityPricing.close]
    window_length = 60

    def compute(self, today, assets, out, close):
        # One value per asset column: (last - first) / first.
        first_close, last_close = close[0], close[-1]
        out[:] = (last_close - first_close) / first_close


class ShortTermMomentum(CustomFactor):
    """
    Short-term momentum measured over a 20-row window of closing prices.

    The factor value is the fractional price change between the first and
    the last row of the window (19 day-to-day intervals).
    """
    inputs = [USEquityPricing.close]
    window_length = 20

    def compute(self, today, assets, out, close):
        # One value per asset column: (last - first) / first.
        first_close, last_close = close[0], close[-1]
        out[:] = (last_close - first_close) / first_close


class MeanReversion(CustomFactor):
    """
    Fractional deviation of the latest close from its 20-row moving average.

    Positive values mean the latest close sits above the window average,
    negative values mean it sits below.
    """
    inputs = [USEquityPricing.close]
    window_length = 20

    def compute(self, today, assets, out, close):
        # Column-wise average over the window (axis 0 is the window axis).
        window_average = np.mean(close, axis=0)
        latest_close = close[-1]
        out[:] = (latest_close - window_average) / window_average


class Volatility(CustomFactor):
    """
    Price volatility: standard deviation of simple row-to-row returns
    computed over a 20-row window of closing prices.
    """
    inputs = [USEquityPricing.close]
    window_length = 20

    def compute(self, today, assets, out, close):
        # Simple returns between consecutive rows: (p[t] - p[t-1]) / p[t-1].
        previous_close = close[:-1]
        price_changes = np.diff(close, axis=0)
        simple_returns = price_changes / previous_close
        # np.std defaults to the population standard deviation (ddof=0).
        out[:] = np.std(simple_returns, axis=0)


print("✓ Factors defined")

## Create Multi-Factor Pipeline

Compute all factors simultaneously for our universe.

In [None]:
def make_multi_factor_pipeline():
    """
    Build the research pipeline with raw, normalized and composite factors.

    Columns produced:
      * raw factors: momentum, short_momentum, mean_reversion, volatility, rsi
      * ``<name>_z``: each raw factor z-scored within the universe
      * momentum_quality: momentum z-score minus volatility z-score
      * momentum_pullback: momentum z-score minus half the mean-reversion
        z-score

    The universe (also used as the pipeline screen) is the top 500 assets
    by 30-day average dollar volume.
    """
    # Raw factors, keyed by their output column name
    raw_factors = {
        'momentum': Momentum(),
        'short_momentum': ShortTermMomentum(),
        'mean_reversion': MeanReversion(),
        'volatility': Volatility(),
        'rsi': RSI(),
    }

    # Universe: top 500 liquid stocks
    universe = AverageDollarVolume(window_length=30).top(500)

    # Z-score every raw factor within the universe so they can be combined
    z_scores = {
        f'{name}_z': raw_factor.zscore(mask=universe)
        for name, raw_factor in raw_factors.items()
    }
    momentum_z = z_scores['momentum_z']

    # Composites: high momentum with low volatility, and high momentum with
    # price below its moving average (a pullback), at half weight
    composites = {
        'momentum_quality': momentum_z - z_scores['volatility_z'],
        'momentum_pullback': momentum_z - 0.5 * z_scores['mean_reversion_z'],
    }

    # Column order: raw factors, then z-scores, then composites
    columns = {**raw_factors, **z_scores, **composites}
    return Pipeline(columns=columns, screen=universe)

print("✓ Multi-factor pipeline defined")

## Run Pipeline and Get Pricing Data

In [None]:
# Define research period (calendar dates; they need not be trading sessions)
start_date = pd.Timestamp('2022-01-01')
end_date = pd.Timestamp('2023-06-30')

print(f"Running pipeline from {start_date.date()} to {end_date.date()}...")

# The pipeline engine only accepts trading sessions as bounds, and
# 2022-01-01 is a Saturday. Snap the requested range to the first and last
# sessions that fall inside it.
research_sessions = trading_calendar.sessions_in_range(start_date, end_date)

pipeline = make_multi_factor_pipeline()
factor_data = engine.run_pipeline(
    pipeline,
    research_sessions[0],
    research_sessions[-1],
)

print(f"\n✓ Pipeline complete: {len(factor_data):,} observations")

# Get pricing data
assets = factor_data.index.get_level_values(1).unique()
# Extra ~30 calendar days so forward returns exist for the last factor dates
pricing_end = end_date + pd.Timedelta(days=30)

# Build the session index first and read exactly that span, so the bar reader
# is only ever asked for real sessions (pricing_end may fall on a weekend)
# and the price matrix has one row per entry in `dates`.
dates = trading_calendar.sessions_in_range(start_date, pricing_end)

pricing_data = bundle_data.equity_daily_bar_reader.load_raw_arrays(
    columns=['close'],
    start_date=dates[0],
    end_date=dates[-1],
    assets=assets,
)

# load_raw_arrays returns one array per requested column, shaped
# (sessions, assets). That already matches index=dates / columns=assets,
# so it must not be transposed.
# NOTE(review): these are raw bar-reader closes; confirm whether split and
# dividend adjustments are needed before trusting forward returns.
prices = pd.DataFrame(
    data=pricing_data[0],
    index=dates,
    columns=assets
)

print(f"✓ Pricing data loaded: {prices.shape}")

## Compare Individual Factors

Test each factor separately to see which ones are predictive.

In [None]:
# Raw factor columns to evaluate one at a time
factors_to_test = [
    'momentum',
    'short_momentum',
    'mean_reversion',
    'volatility',
    'rsi',
]

# Information-coefficient table for each factor that analyzed successfully
factor_ics = {}

for factor_name in factors_to_test:
    print(f"\nAnalyzing {factor_name}...")

    factor = factor_data[factor_name]

    # Best effort: report a failure for one factor and keep going
    try:
        # Align factor values with forward returns and bucket into quintiles
        factor_clean = al.utils.get_clean_factor_and_forward_returns(
            factor=factor,
            prices=prices,
            quantiles=5,
            periods=(1, 5, 10),
            max_loss=0.35,
        )

        # Per-date information coefficient for every forward-return period
        ic = al.performance.factor_information_coefficient(factor_clean)
        factor_ics[factor_name] = ic

        # Summary statistics for the 5-day horizon
        ic_5d = ic['5D']
        mean_ic = ic_5d.mean()
        print(f"  Mean IC (5D): {mean_ic:.4f}")
        std_ic = ic_5d.std()
        print(f"  IC Std (5D): {std_ic:.4f}")
        # NOTE(review): len(ic) counts every row, including any NaN ICs
        t_stat = mean_ic / std_ic * np.sqrt(len(ic))
        print(f"  IC t-stat: {t_stat:.2f}")

    except Exception as e:
        print(f"  Error: {e}")

print("\n✓ Individual factor analysis complete")

## Compare Factor Performance

Visualize IC for all factors side-by-side.

In [None]:
# One column of 5-day IC values per successfully analyzed factor
all_ics = pd.DataFrame({name: ic['5D'] for name, ic in factor_ics.items()})

# Two stacked panels: IC time series on top, mean IC bars below
fig, axes = plt.subplots(2, 1, figsize=(14, 10))
timeseries_ax, bar_ax = axes[0], axes[1]

# Top panel: IC over time
all_ics.plot(ax=timeseries_ax, alpha=0.7)
timeseries_ax.axhline(0, color='black', linestyle='-', linewidth=0.8)
timeseries_ax.set_title(
    'Information Coefficient Comparison (5-Day)',
    fontsize=14,
    fontweight='bold',
)
timeseries_ax.set_ylabel('IC')
timeseries_ax.legend(loc='best')
timeseries_ax.grid(True, alpha=0.3)

# Summary table: mean, standard deviation and their ratio per factor
mean_ic = all_ics.mean()
std_ic = all_ics.std()
ic_stats = pd.DataFrame({
    'Mean IC': mean_ic,
    'IC Std': std_ic,
    'IR (Mean/Std)': mean_ic / std_ic,
})

# Bottom panel: mean IC by factor
ic_stats['Mean IC'].plot(kind='bar', ax=bar_ax, color='steelblue', alpha=0.7)
bar_ax.axhline(0, color='black', linestyle='-', linewidth=0.8)
bar_ax.set_title(
    'Mean Information Coefficient by Factor',
    fontsize=14,
    fontweight='bold',
)
bar_ax.set_ylabel('Mean IC')
bar_ax.set_xlabel('Factor')
bar_ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

print("\nFactor IC Statistics:")
print("=" * 60)
print(ic_stats.sort_values('Mean IC', ascending=False))

## Analyze Composite Factors

Test our composite factors that combine multiple signals.

In [None]:
composite_factors = ['momentum_quality', 'momentum_pullback']

# 80-character rule printed above and below each section title
section_rule = '=' * 80

for factor_name in composite_factors:
    print(f"\n{section_rule}")
    print(f"Analyzing Composite Factor: {factor_name}")
    print(f"{section_rule}\n")

    factor = factor_data[factor_name]

    # Align the composite with forward returns and bucket into quintiles
    factor_clean = al.utils.get_clean_factor_and_forward_returns(
        factor=factor,
        prices=prices,
        quantiles=5,
        periods=(1, 5, 10),
        max_loss=0.35,
    )

    # Returns tear sheet for this composite
    al.tears.create_returns_tear_sheet(factor_clean)

## Build a Combined Multi-Factor Score

Create a weighted combination of the best factors.

In [None]:
# Hand-picked weights for the z-scored factors.
# (Adjust these weights based on your IC results above)

weights = {
    'momentum_z': 0.4,
    'short_momentum_z': 0.2,
    'volatility_z': -0.3,  # Negative weight: favor low volatility
    'rsi_z': 0.1,
}

# Weighted sum of the selected z-score columns, accumulated from zero
combined_score = 0
for name, weight in weights.items():
    combined_score = combined_score + factor_data[name] * weight

print("Multi-Factor Combined Score:")
print(f"  Weights: {weights}")
print("\nAnalyzing combined factor...")

# Align the combined score with forward returns and bucket into quintiles
combined_clean = al.utils.get_clean_factor_and_forward_returns(
    factor=combined_score,
    prices=prices,
    quantiles=5,
    periods=(1, 5, 10),
    max_loss=0.35,
)

heading_rule = "=" * 80
print("\n" + heading_rule)
print("COMBINED MULTI-FACTOR ANALYSIS")
print(heading_rule + "\n")

al.tears.create_full_tear_sheet(combined_clean)

## Factor Correlation Analysis

Understand how the factors relate to one another so that redundant signals can be identified.

In [None]:
# Take the cross-section for the most recent date in the pipeline output
all_dates = factor_data.index.get_level_values(0)
latest_date = all_dates.max()
latest_factors = factor_data.loc[latest_date]

# Pairwise correlations between the raw factors on that single date
factor_corr = latest_factors[factors_to_test].corr()

# Heatmap on a fixed [-1, 1] color scale centered at zero
heatmap_options = dict(
    annot=True,
    fmt='.2f',
    cmap='RdYlGn',
    center=0,
    vmin=-1,
    vmax=1,
    square=True,
    linewidths=1,
)
plt.figure(figsize=(10, 8))
sns.heatmap(factor_corr, **heatmap_options)
plt.title('Factor Correlation Matrix', fontsize=14, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

print("\nFactor Correlations:")
print("=" * 60)
print(factor_corr)
print("\nLow correlation = factors provide diversified signals")
print("High correlation = factors may be redundant")

## Summary & Recommendations

Based on this analysis:

### Key Questions to Answer:

1. **Which individual factors are predictive?**
   - Look at Mean IC and t-statistics
   - |IC| > 0.05 with |t-stat| > 2 is good (a consistently negative IC is just as usable — flip the factor's sign)

2. **Are factors correlated?**
   - Low correlation = good diversification
   - High correlation = consider removing redundant factors

3. **Do composite factors improve performance?**
   - Compare IC of composite vs individual factors
   - Check Sharpe ratio of factor returns

4. **What are optimal factor weights?**
   - Can use IC-based weights
   - Or equal-weight uncorrelated factors
   - Consider inverse volatility weighting

### Next Steps:

1. **Optimize factor weights** - Use historical IC to determine weights
2. **Test stability** - Run on different time periods
3. **Account for costs** - Factor turnover analysis
4. **Build strategy** - Use top-ranked stocks in a portfolio
5. **Backtest** - Test full strategy with transaction costs

### Research Best Practices:

- Test on out-of-sample data
- Avoid overfitting (don't optimize too much)
- Consider fundamental reasons why factors work
- Monitor factor decay and regime changes
- Account for transaction costs and market impact