# Load CSV Fundamental Data into Zipline Custom Database

This notebook demonstrates how to:
1. Load fundamental data from CSV files
2. Map symbols to Zipline SIDs
3. Create a custom SQLite database
4. Use the data in Zipline Pipeline

This is a zipline-reloaded native approach (no QuantRocket dependencies).

## 1. Setup and Imports

In [1]:
import os
import glob
import sqlite3
import pandas as pd
import numpy as np
from pathlib import Path

# Zipline imports
from zipline.data.bundles import load as load_bundle, register
from zipline.data.bundles.sharadar_bundle import sharadar_bundle
from zipline.pipeline import Pipeline
from zipline.pipeline.data.db import Database, Column

# Register Sharadar bundle (in case extension.py didn't load)
try:
    # Try to register the bundle
    register(
        'sharadar',
        sharadar_bundle(
            tickers=None,
            incremental=True,
            include_funds=True,
        ),
    )
    print("✓ Registered Sharadar bundle")
except Exception as e:
    # Bundle may already be registered
    print(f"✓ Sharadar bundle already registered (or error: {e})")

# Display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 50)
pd.set_option('display.width', 120)

print("✓ Imports complete")

✓ Registered Sharadar bundle
✓ Imports complete


## 2. Configuration

Set your database name and data directory paths.

In [2]:
# Configuration
DATABASE_NAME = "fundamentals"  # Name for your custom database
DATA_DIR = "/data/csv/"  # Directory with CSV files (persistent across Docker restarts)
VIX_SIGNAL_PATH = "/data/csv/vix_flag.csv"  # Optional VIX signal data

# Database will be created in ~/.zipline/data/custom/
DB_DIR = Path('/root/.zipline/data/custom')
DB_DIR.mkdir(parents=True, exist_ok=True)
DB_PATH = DB_DIR / f"{DATABASE_NAME}.sqlite"

# Database update mode:
# 'fresh' - Drop and recreate database (default)
# 'replace' - Insert or replace existing records (updates duplicates)
# 'ignore' - Insert or ignore (skips duplicates, keeps existing data)
UPDATE_MODE = 'fresh'  # Change to 'fresh', 'replace', or 'ignore'

print(f"Database will be created at: {DB_PATH}")
print(f"Update mode: {UPDATE_MODE}")
print(f"  - 'fresh': Drop and recreate database")
print(f"  - 'replace': Update existing records with new data")
print(f"  - 'ignore': Skip records that already exist")
print(f"\nLooking for CSV files in: {DATA_DIR}")
print(f"\n💡 Tip: Place your CSV files in /data/csv/ (inside container)")
print(f"   or ./data/csv/ (on host machine) for persistent storage")

Database will be created at: /root/.zipline/data/custom/fundamentals.sqlite
Update mode: fresh
  - 'fresh': Drop and recreate database
  - 'replace': Update existing records with new data
  - 'ignore': Skip records that already exist

Looking for CSV files in: /data/csv/

💡 Tip: Place your CSV files in /data/csv/ (inside container)
   or ./data/csv/ (on host machine) for persistent storage


## 3. Define Database Schema

Define the columns that will be in your custom database.

In [3]:
# Define your database schema
# This matches the columns from the QuantRocket example
SCHEMA = {
    'Symbol': 'TEXT',
    'Sid': 'INTEGER',
    'Date': 'TEXT',
    'RefPriceClose': 'REAL',
    'RefVolume': 'REAL',
    'CompanyCommonName': 'TEXT',
    'EnterpriseValue_DailyTimeSeries_': 'REAL',
    'CompanyMarketCap': 'REAL',
    'GICSSectorName': 'TEXT',
    'FOCFExDividends_Discrete': 'REAL',
    'InterestExpense_NetofCapitalizedInterest': 'REAL',
    'Debt_Total': 'REAL',
    'EarningsPerShare_Actual': 'REAL',
    'EarningsPerShare_SmartEstimate_prev_Q': 'REAL',
    'EarningsPerShare_ActualSurprise': 'REAL',
    'EarningsPerShare_SmartEstimate_current_Q': 'REAL',
    'LongTermGrowth_Mean': 'REAL',
    'PriceTarget_Median': 'REAL',
    'CombinedAlphaModelSectorRank': 'REAL',
    'CombinedAlphaModelSectorRankChange': 'REAL',
    'CombinedAlphaModelRegionRank': 'REAL',
    'TradeDate': 'TEXT',
    'EPS_SurpirsePrct_prev_Q': 'REAL',
    'Estpricegrowth_percent': 'REAL',
    'CashFlowComponent_Current': 'REAL',
    'EarningsQualityRegionRank_Current': 'REAL',
    'EnterpriseValueToEBIT_DailyTimeSeriesRatio_': 'REAL',
    'EnterpriseValueToEBITDA_DailyTimeSeriesRatio_': 'REAL',
    'EnterpriseValueToSales_DailyTimeSeriesRatio_': 'REAL',
    'Dividend_Per_Share_SmartEstimate': 'REAL',
    'CashFlowPerShare_BrokerEstimate': 'REAL',
    'FreeCashFlowPerShare_BrokerEstimate': 'REAL',
    'ForwardPEG_DailyTimeSeriesRatio_': 'REAL',
    'PriceEarningsToGrowthRatio_SmartEstimate_': 'REAL',
    'ReturnOnInvestedCapital_BrokerEstimate': 'REAL',
    'Recommendation_NumberOfTotal': 'REAL',
    'Recommendation_Median_1_5_': 'REAL',
    'Recommendation_NumberOfStrongBuy': 'REAL',
    'Recommendation_NumberOfBuy': 'REAL',
    'Recommendation_Mean_1_5_': 'REAL',
    'ReturnOnCapitalEmployed_Actual': 'REAL',
    'GrossProfitMargin_': 'REAL',
    'ReturnOnEquity_SmartEstimat': 'REAL',
    'ReturnOnAssets_SmartEstimate': 'REAL',
    'CashCashEquivalents_Total': 'REAL',
    'ForwardPriceToCashFlowPerShare_DailyTimeSeriesRatio_': 'REAL',
    'ForwardPriceToSalesPerShare_DailyTimeSeriesRatio_': 'REAL',
    'ForwardEnterpriseValueToOperatingCashFlow_DailyTimeSeriesRatio_': 'REAL',
    'GrossProfitMargin_ActualSurprise': 'REAL',
    'pred': 'REAL',  # VIX signal
}

print(f"✓ Schema defined with {len(SCHEMA)} columns")

✓ Schema defined with 50 columns


## 4. Load CSV Files

Load all CSV files from the data directory and concatenate them.

In [4]:
# Find all CSV files
os.chdir(DATA_DIR)
csv_files = sorted(glob.glob('LSEG_*.csv'))

print(f"Found {len(csv_files)} CSV files:")
for f in csv_files[:5]:  # Show first 5
    print(f"  - {f}")
if len(csv_files) > 5:
    print(f"  ... and {len(csv_files) - 5} more")

# Load and concatenate all CSV files
print("\nLoading CSV files...")
custom_data = pd.DataFrame()

for csv_file in csv_files:
    print(f"  Loading {csv_file}...")
    df = pd.read_csv(os.path.join(DATA_DIR, csv_file))
    custom_data = pd.concat([custom_data, df], ignore_index=True)

print(f"\n✓ Loaded {len(custom_data):,} total rows")
print(f"Date range: {custom_data['Date'].min()} to {custom_data['Date'].max()}")
print(f"Unique symbols: {custom_data['Symbol'].nunique()}")

# Show sample
print("\nSample data:")
custom_data.head()

Found 1 CSV files:
  - LSEG_20091231_20251111.csv

Loading CSV files...
  Loading LSEG_20091231_20251111.csv...

✓ Loaded 8,995,098 total rows
Date range: 2008-10-16 to 2025-11-11
Unique symbols: 4426

Sample data:


Unnamed: 0,Date,Symbol,Instrument,RefPriceClose,RefVolume,CompanyCommonName,EnterpriseValue_DailyTimeSeries_,CompanyMarketCap,GICSSectorName,FOCFExDividends_Discrete,InterestExpense_NetofCapitalizedInterest,Debt_Total,EarningsPerShare_Actual,EarningsPerShare_SmartEstimate_prev_Q,EarningsPerShare_ActualSurprise,EarningsPerShare_SmartEstimate_current_Q,LongTermGrowth_Mean,PriceTarget_Median,CombinedAlphaModelSectorRank,CombinedAlphaModelSectorRankChange,CombinedAlphaModelRegionRank,EarningsQualityRegionRank_Current,EnterpriseValueToEBIT_DailyTimeSeriesRatio_,EnterpriseValueToEBITDA_DailyTimeSeriesRatio_,EnterpriseValueToSales_DailyTimeSeriesRatio_,Dividend_Per_Share_SmartEstimate,CashCashEquivalents_Total,ForwardPEG_DailyTimeSeriesRatio_,PriceEarningsToGrowthRatio_SmartEstimate_,Recommendation_Median_1_5_,ReturnOnEquity_SmartEstimat,ReturnOnAssets_SmartEstimate,ForwardPriceToCashFlowPerShare_DailyTimeSeriesRatio_,ForwardPriceToSalesPerShare_DailyTimeSeriesRatio_,ForwardEnterpriseValueToOperatingCashFlow_DailyTimeSeriesRatio_,GrossProfitMargin_ActualSurprise,Estpricegrowth_percent,TradeDate
0,2008-10-16,APGT,APGT.PK,0.025,,Appgate Inc,13205550.0,343938.8,Information Technology,-14000.0,,,,,,,,,,,,,,,,,2000.0,,,,,,,,,,,2008-10-16
1,2009-04-22,FIZN,FIZN.PK,20.389298,,First Citizens Bancshares Inc (Tennessee),126856629.0,77937630.0,Financials,-753000.0,,110211000.0,,,,,,,,,,,8.229962,7.304464,2.417837,,,,,,,,,,,,,2009-04-22
2,2009-08-26,EMOR,EMOR.PK,0.04,,Healixa Inc,,90616.19,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2009-08-26
3,2009-08-26,UCIX,UCIX.PK,121.2,,Umbra Companies Inc,4970010.0,4949000.0,Industrials,8960.0,,5299.0,,,,,,,,,,,3427.593103,3427.593103,,,,,,,,,,,,,,2009-08-26
4,2009-11-23,HBIA,HBIA.PK,24.5,,Hills Bancorp,188291469.0,216020500.0,Financials,13470000.0,,504874000.0,,,,,,,,,,,5.76732,5.379756,1.953353,,114066000.0,,,,,,,,,,,2009-11-23


## 5. Optional: Load Recent Data Only

To reduce memory usage, you can filter to recent data only.

In [5]:
# Optional: Keep only recent data (e.g., last 600,000 rows)
# Comment out if you want all historical data
# RECENT_ROWS = 600000

# if len(custom_data) > RECENT_ROWS:
#     print(f"Filtering to most recent {RECENT_ROWS:,} rows...")
#     custom_data = custom_data.tail(RECENT_ROWS).copy()
#     print(f"✓ Filtered. New date range: {custom_data['Date'].min()} to {custom_data['Date'].max()}")
# else:
#     print(f"Dataset has {len(custom_data):,} rows - no filtering needed")

## 6. Map Symbols to Zipline SIDs (WITH TEMPORAL MAPPING)

Map your symbols to Zipline Security IDs (SIDs) using **temporal lookups** to handle symbol changes.

**IMPORTANT UPDATE**: This notebook now uses **temporal SID mapping** which correctly handles:
- Company name changes (FB → META, etc.)
- Ticker symbol changes over time
- Mergers and acquisitions

The temporal mapper uses `asset_finder.lookup_symbol(symbol, as_of_date)` to get the correct SID for each row's date, ensuring continuous data for companies that changed symbols.

**How it works:**
- For a row with Symbol='FB', Date='2020-01-01' → Returns META's SID (the company)
- For a row with Symbol='META', Date='2023-01-01' → Returns same META SID
- Result: Continuous data under one SID, no breaks at symbol changes

In [6]:
# =============================================================================
# TEMPORAL SID MAPPING - Handles symbol changes automatically
# =============================================================================

import sys
sys.path.insert(0, '/app/examples/custom_data')

# Load the Sharadar bundle to get the asset finder
print("="*80)
print("TEMPORAL SID MAPPING")
print("="*80)
print("Loading Sharadar bundle...")

bundle_timestamp = pd.Timestamp.now(tz='UTC')
bundle_data = load_bundle('sharadar', timestamp=bundle_timestamp)
asset_finder = bundle_data.asset_finder

print(f"✓ Asset finder loaded with {len(asset_finder.sids):,} securities")

# Import temporal mapper
from temporal_sid_mapper import TemporalSIDMapper

print(f"\nMapping {len(custom_data):,} rows to SIDs...")
print("Strategy: Using temporal lookups (handles FB→META, etc.)")

# Create temporal mapper
mapper = TemporalSIDMapper(asset_finder)

# Map SIDs using automatic strategy selection
custom_data['Sid'] = mapper.map_dataframe_auto(
    custom_data,
    symbol_col='Symbol',
    date_col='Date',
    verbose=True
)

# Report results
mapped = custom_data['Sid'].notna().sum()
unmapped = custom_data['Sid'].isna().sum()

print(f"\n" + "="*80)
print("MAPPING RESULTS")
print("="*80)
print(f"Mapped:   {mapped:,} rows ({mapped/len(custom_data)*100:.1f}%)")
print(f"Unmapped: {unmapped:,} rows ({unmapped/len(custom_data)*100:.1f}%)")

if unmapped > 0:
    unmapped_symbols = custom_data[custom_data['Sid'].isna()]['Symbol'].unique()
    print(f"\nUnmapped symbols (first 10): {list(unmapped_symbols[:10])}")
    print("Note: These symbols may not exist in the Sharadar bundle")

# Remove unmapped rows
print(f"\nRemoving unmapped rows...")
custom_data = custom_data[custom_data['Sid'].notna()].copy()
custom_data['Sid'] = custom_data['Sid'].astype(int)

print(f"✓ Final dataset: {len(custom_data):,} rows with valid SIDs")
print("="*80)

# Verify continuity for a known symbol change
print("\nVerifying FB→META continuity:")
try:
    fb_2020_sid = mapper.map_single_row('FB', '2020-01-01')
    meta_2023_sid = mapper.map_single_row('META', '2023-01-01')
    
    print(f"  FB (2020-01-01):   SID {fb_2020_sid}")
    print(f"  META (2023-01-01): SID {meta_2023_sid}")
    
    if fb_2020_sid == meta_2023_sid:
        print(f"  ✓ Same SID - continuous data maintained!")
    else:
        print(f"  ⚠ Different SIDs - this shouldn't happen")
except Exception as e:
    print(f"  Note: {e}")

TEMPORAL SID MAPPING
Loading Sharadar bundle...
✓ Asset finder loaded with 30,701 securities

Mapping 8,995,098 rows to SIDs...
Strategy: Using temporal lookups (handles FB→META, etc.)
Dataset > 1M rows: Using parallel processing
Mapping 8,995,098 rows using 16 parallel workers...
  Split into 17 chunks of ~562,193 rows each
  Completed chunk 1/17
  Completed chunk 2/17
  Completed chunk 3/17
  Completed chunk 4/17
  Completed chunk 5/17
  Completed chunk 6/17
  Completed chunk 7/17
  Completed chunk 8/17
  Completed chunk 9/17
  Completed chunk 10/17
  Completed chunk 11/17
  Completed chunk 12/17
  Completed chunk 13/17
  Completed chunk 14/17
  Completed chunk 15/17
  Completed chunk 16/17
  Completed chunk 17/17

MAPPING RESULTS
Mapped:   8,549,158 rows (95.0%)
Unmapped: 445,940 rows (5.0%)

Unmapped symbols (first 10): ['APGT', 'FIZN', 'EMOR', 'UCIX', 'HBIA', 'KAHL', 'ABIT', 'DBIN', 'CLPE', 'CNND']
Note: These symbols may not exist in the Sharadar bundle

Removing unmapped rows...

## 7. Merge VIX Signal Data (Optional)

If you have additional data like VIX signals, merge it here.

In [7]:
# Load VIX signal data if available
if os.path.exists(VIX_SIGNAL_PATH):
    print(f"Loading VIX signal from {VIX_SIGNAL_PATH}...")
    vix_signal = pd.read_csv(VIX_SIGNAL_PATH)
    
    # Standardize column names
    vix_signal.rename(columns={'symbol': 'Symbol', 'date': 'Date'}, inplace=True)
    vix_signal['Date'] = pd.to_datetime(vix_signal['Date'])
    
    # Merge with custom data
    custom_data['Date'] = pd.to_datetime(custom_data['Date'])
    custom_data = pd.merge(custom_data, vix_signal[['Symbol', 'Date', 'pred']], 
                          on=['Symbol', 'Date'], how='left')
    
    print(f"✓ Merged VIX signal data")
else:
    print(f"VIX signal file not found at {VIX_SIGNAL_PATH}")
    print("Skipping VIX merge (this is optional)")

Loading VIX signal from /data/csv/vix_flag.csv...


  vix_signal['Date'] = pd.to_datetime(vix_signal['Date'])


✓ Merged VIX signal data


## 8. Data Cleaning

Clean and prepare data for database insertion.

In [8]:
print("Cleaning data...")

# Ensure Date is datetime
custom_data['Date'] = pd.to_datetime(custom_data['Date'])

# Forward fill missing values by symbol
print("  Forward filling missing values by symbol...")
for col in custom_data.columns:
    if col not in ['Symbol', 'Sid', 'Date']:
        custom_data[col] = custom_data.groupby('Symbol')[col].transform(lambda x: x.ffill())

# Handle text columns vs numeric columns differently when filling remaining NaNs
print("  Filling remaining NaN values...")
for col in custom_data.columns:
    if col not in ['Symbol', 'Sid', 'Date']:
        # Check if column is text/object type
        if custom_data[col].dtype == 'object' or col in ['GICSSectorName', 'CompanyCommonName', 'TradeDate']:
            # Fill text columns with empty string
            custom_data[col] = custom_data[col].fillna('')
        else:
            # Fill numeric columns with 0
            custom_data[col] = custom_data[col].fillna(0)

# Convert Sid to integer
custom_data['Sid'] = custom_data['Sid'].astype(int)

# Sort by date and symbol
custom_data = custom_data.sort_values(['Date', 'Symbol'])

print(f"✓ Data cleaned")
print(f"\nFinal dataset:")
print(f"  Rows: {len(custom_data):,}")
print(f"  Columns: {len(custom_data.columns)}")
print(f"  Date range: {custom_data['Date'].min()} to {custom_data['Date'].max()}")
print(f"  Symbols: {custom_data['Symbol'].nunique()}")

# Verify text columns don't have numeric 0
text_cols = ['GICSSectorName', 'CompanyCommonName']
for col in text_cols:
    if col in custom_data.columns:
        zero_count = (custom_data[col] == 0).sum() + (custom_data[col] == '0').sum()
        if zero_count > 0:
            print(f"  WARNING: {col} has {zero_count} rows with numeric 0!")
        else:
            print(f"  ✓ {col}: No numeric zeros, {(custom_data[col] == '').sum()} empty strings")

# Show sample
print("\nSample cleaned data:")
custom_data.head()

Cleaning data...
  Forward filling missing values by symbol...
  Filling remaining NaN values...
✓ Data cleaned

Final dataset:
  Rows: 8,549,158
  Columns: 40
  Date range: 2009-12-29 00:00:00 to 2025-11-11 00:00:00
  Symbols: 3953
  ✓ GICSSectorName: No numeric zeros, 676555 empty strings
  ✓ CompanyCommonName: No numeric zeros, 14125 empty strings

Sample cleaned data:


Unnamed: 0,Date,Symbol,Instrument,RefPriceClose,RefVolume,CompanyCommonName,EnterpriseValue_DailyTimeSeries_,CompanyMarketCap,GICSSectorName,FOCFExDividends_Discrete,InterestExpense_NetofCapitalizedInterest,Debt_Total,EarningsPerShare_Actual,EarningsPerShare_SmartEstimate_prev_Q,EarningsPerShare_ActualSurprise,EarningsPerShare_SmartEstimate_current_Q,LongTermGrowth_Mean,PriceTarget_Median,CombinedAlphaModelSectorRank,CombinedAlphaModelSectorRankChange,CombinedAlphaModelRegionRank,EarningsQualityRegionRank_Current,EnterpriseValueToEBIT_DailyTimeSeriesRatio_,EnterpriseValueToEBITDA_DailyTimeSeriesRatio_,EnterpriseValueToSales_DailyTimeSeriesRatio_,Dividend_Per_Share_SmartEstimate,CashCashEquivalents_Total,ForwardPEG_DailyTimeSeriesRatio_,PriceEarningsToGrowthRatio_SmartEstimate_,Recommendation_Median_1_5_,ReturnOnEquity_SmartEstimat,ReturnOnAssets_SmartEstimate,ForwardPriceToCashFlowPerShare_DailyTimeSeriesRatio_,ForwardPriceToSalesPerShare_DailyTimeSeriesRatio_,ForwardEnterpriseValueToOperatingCashFlow_DailyTimeSeriesRatio_,GrossProfitMargin_ActualSurprise,Estpricegrowth_percent,TradeDate,Sid,pred
0,2009-12-29,PRG,PRG.N,12.905048,0.0,PROG Holdings Inc,141779600.0,174526600.0,Financials,29367000.0,2020000.0,114817000.0,0.3,0.0,5.996,0.0,12.5,23.33333,20.0,0.0,25.0,55.0,0.808409,0.651261,0.082844,0.0,88085000.0,0.678299,0.0,1.0,0.0,0.0,0.0,0.078117,0.0,1.608,0.808078,2009-12-29,198929,0.0
1,2009-12-30,HCI,HCI.N,7.75,0.0,Hci Group Inc,-10520450.0,50581550.0,Financials,-5146000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,83.0,-0.420314,-0.419258,-0.140396,0.0,61102000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2009-12-30,193989,0.0
2,2009-12-31,A,A.N,22.217753,0.0,Agilent Technologies Inc,11257190000.0,10838190000.0,Health Care,183000000.0,21000000.0,2904000000.0,0.32,0.0,37.404,0.0,15.0,33.0,40.0,0.0,41.0,40.0,32.535225,22.33569,2.512204,0.0,2479000000.0,0.993966,0.0,2.0,0.0,0.0,13.785078,1.609831,20.022743,4.081,0.485299,2009-12-31,196290,0.0
3,2009-12-31,AAON,AAON.OQ,5.774823,0.0,Aaon Inc,317775300.0,335578300.0,Industrials,7037000.0,0.0,3064000.0,0.13333,0.0,17.389,0.0,0.0,6.51852,47.0,0.0,46.0,70.0,7.359657,6.096876,1.264712,0.0,17894000.0,0.0,0.0,3.0,0.0,0.0,0.0,1.465819,0.0,24.99,0.128783,2009-12-31,198259,0.0
4,2009-12-31,AAP,AAP.N,40.48,0.0,Advance Auto Parts Inc,3945866000.0,3830672000.0,Consumer Discretionary,152861000.0,10678000.0,0.0,0.69,0.0,4.625,0.0,13.2,47.0,62.0,0.0,65.0,99.0,8.886985,6.663693,0.722495,0.0,216215000.0,0.937168,0.0,2.0,0.0,0.0,8.585366,0.679843,8.843541,-0.367,0.161067,2009-12-31,195735,0.0


## 9. Create SQLite Database

Create the custom SQLite database in Zipline format.

The notebook supports three update modes (configured in Cell 2):
- **`fresh`**: Drop and recreate the database (default for initial load)
- **`replace`**: INSERT OR REPLACE - Updates existing records based on (Sid, Date) key
- **`ignore`**: INSERT OR IGNORE - Skips records that already exist, keeps existing data

Use `replace` mode to update data with newer values, or `ignore` mode to only add new data without overwriting existing records.

In [None]:
# =============================================================================# DEDUPLICATE DATA - Fix for UNIQUE constraint failures# =============================================================================print("\n" + "=" * 60)print("DEDUPLICATING DATA")print("=" * 60)# Count beforerows_before = len(custom_data)print(f"Rows before deduplication: {rows_before:,}")# Check for duplicatesduplicates = custom_data[custom_data.duplicated(subset=['Sid', 'Date'], keep=False)]if len(duplicates) > 0:    print(f"⚠️  Found {len(duplicates):,} rows with duplicate (Sid, Date) pairs")        # Show sample    dup_counts = duplicates.groupby(['Sid', 'Date']).size().reset_index(name='count')    dup_counts = dup_counts.sort_values('count', ascending=False).head(5)    print("\nTop 5 most duplicated (Sid, Date) pairs:")    print(dup_counts.to_string(index=False))# Deduplicate - keep last occurrence (most recent data)custom_data = custom_data.drop_duplicates(subset=['Sid', 'Date'], keep='last')# Count afterrows_after = len(custom_data)duplicates_removed = rows_before - rows_afterprint(f"\nRows after deduplication: {rows_after:,}")print(f"Duplicates removed: {duplicates_removed:,}")if duplicates_removed == 0:    print("✓ No duplicates found - data is clean!")else:    print(f"✓ Removed {duplicates_removed:,} duplicate records")    print("  Strategy: Kept 'last' occurrence for each (Sid, Date) pair")print("=" * 60)

In [9]:
print(f"Creating database at {DB_PATH}...")

# Handle database based on update mode
db_exists = DB_PATH.exists()

if UPDATE_MODE == 'fresh' and db_exists:
    print(f"  Removing existing database (mode='fresh')...")
    DB_PATH.unlink()
    db_exists = False
elif db_exists:
    print(f"  Database exists - will {UPDATE_MODE} existing records...")

# Create database connection
conn = sqlite3.connect(str(DB_PATH))
cursor = conn.cursor()

# Create table if it doesn't exist (with UNIQUE constraint for upserts)
if not db_exists or UPDATE_MODE == 'fresh':
    columns_def = ', '.join([f'"{col}" {dtype}' for col, dtype in SCHEMA.items()])
    create_table_sql = f'''
    CREATE TABLE IF NOT EXISTS Price (
        {columns_def},
        UNIQUE(Sid, Date)
    );
    '''
    
    print("  Creating table...")
    cursor.execute(create_table_sql)
    
    # Create indices for fast lookups
    print("  Creating indices...")
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_sid ON Price(Sid);')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_date ON Price(Date);')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_symbol ON Price(Symbol);')

# Insert data
print(f"  Inserting {len(custom_data):,} rows with mode='{UPDATE_MODE}'...")

# Prepare data for insertion - only use columns that exist in custom_data
# Add missing columns with default values (0 for numeric, empty string for text)
insert_data = custom_data.copy()

# Add any missing schema columns with appropriate defaults
missing_cols = []
for col, dtype in SCHEMA.items():
    if col not in insert_data.columns:
        if dtype == 'TEXT':
            insert_data[col] = ''
        else:  # REAL or INTEGER
            insert_data[col] = 0
        missing_cols.append(col)

if missing_cols:
    print(f"  Added {len(missing_cols)} missing columns with default values")

# Select only the columns in the schema (in the correct order)
insert_data = insert_data[list(SCHEMA.keys())].copy()

# Convert Date to string format for SQLite
insert_data['Date'] = insert_data['Date'].dt.strftime('%Y-%m-%d')

# Choose SQL command based on update mode
if UPDATE_MODE == 'replace':
    sql_command = 'INSERT OR REPLACE'
elif UPDATE_MODE == 'ignore':
    sql_command = 'INSERT OR IGNORE'
else:  # 'fresh' or default
    sql_command = 'INSERT'

# Create parameterized INSERT statement
columns = list(SCHEMA.keys())
placeholders = ', '.join(['?' for _ in columns])
column_names = ', '.join([f'"{col}"' for col in columns])
insert_sql = f'{sql_command} INTO Price ({column_names}) VALUES ({placeholders})'

# Insert in chunks for better performance
chunk_size = 10000
total_chunks = (len(insert_data) + chunk_size - 1) // chunk_size
total_inserted = 0
total_skipped = 0

for i in range(0, len(insert_data), chunk_size):
    chunk = insert_data.iloc[i:i+chunk_size]
    
    # Execute batch insert
    cursor.executemany(insert_sql, chunk.values.tolist())
    
    rows_affected = cursor.rowcount
    if UPDATE_MODE == 'ignore':
        # With INSERT OR IGNORE, rowcount shows actual inserts (not skipped)
        total_inserted += rows_affected
        total_skipped += len(chunk) - rows_affected
    else:
        total_inserted += rows_affected
    
    chunk_num = i // chunk_size + 1
    if chunk_num % 10 == 0 or chunk_num == total_chunks:
        print(f"    Processed chunk {chunk_num}/{total_chunks} ({i+len(chunk):,} rows)...")

conn.commit()

# Report results
print(f"\n✓ Database operation completed!")
print(f"  Path: {DB_PATH}")
print(f"  Mode: {UPDATE_MODE}")
print(f"  Rows processed: {len(insert_data):,}")

if UPDATE_MODE == 'ignore' and total_skipped > 0:
    print(f"  Rows inserted: {total_inserted:,}")
    print(f"  Rows skipped (already existed): {total_skipped:,}")

# Get final row count
cursor.execute("SELECT COUNT(*) FROM Price")
total_rows = cursor.fetchone()[0]
print(f"  Total rows in database: {total_rows:,}")

conn.close()

print(f"  Size: {DB_PATH.stat().st_size / 1024 / 1024:.1f} MB")

Creating database at /root/.zipline/data/custom/fundamentals.sqlite...
  Removing existing database (mode='fresh')...
  Creating table...
  Creating indices...
  Inserting 8,549,158 rows with mode='fresh'...
  Added 11 missing columns with default values
    Processed chunk 10/855 (100,000 rows)...
    Processed chunk 20/855 (200,000 rows)...
    Processed chunk 30/855 (300,000 rows)...
    Processed chunk 40/855 (400,000 rows)...
    Processed chunk 50/855 (500,000 rows)...
    Processed chunk 60/855 (600,000 rows)...
    Processed chunk 70/855 (700,000 rows)...
    Processed chunk 80/855 (800,000 rows)...
    Processed chunk 90/855 (900,000 rows)...
    Processed chunk 100/855 (1,000,000 rows)...
    Processed chunk 110/855 (1,100,000 rows)...
    Processed chunk 120/855 (1,200,000 rows)...
    Processed chunk 130/855 (1,300,000 rows)...
    Processed chunk 140/855 (1,400,000 rows)...
    Processed chunk 150/855 (1,500,000 rows)...
    Processed chunk 160/855 (1,600,000 rows)...
    

IntegrityError: UNIQUE constraint failed: Price.Sid, Price.Date

## 10. Define Database Class

Create a Database class to use this data in Zipline Pipeline.

In [None]:
# Define the Database class
class CustomFundamentals(Database):
    """
    Custom Custom fundamentals database.
    
    Usage in Pipeline:
        roe = CustomFundamentals.ReturnOnEquity_SmartEstimat.latest
        sector = CustomFundamentals.GICSSectorName.latest
    """
    
    CODE = DATABASE_NAME
    LOOKBACK_WINDOW = 252  # Days to look back
    
    # Price and volume
    RefPriceClose = Column(float)
    RefVolume = Column(float)
    
    # Company info
    CompanyCommonName = Column(str)
    GICSSectorName = Column(str)
    
    # Valuation metrics
    EnterpriseValue_DailyTimeSeries_ = Column(float)
    CompanyMarketCap = Column(float)
    
    # Cash flow
    FOCFExDividends_Discrete = Column(float)
    CashFlowComponent_Current = Column(float)
    CashFlowPerShare_BrokerEstimate = Column(float)
    FreeCashFlowPerShare_BrokerEstimate = Column(float)
    
    # Debt and interest
    InterestExpense_NetofCapitalizedInterest = Column(float)
    Debt_Total = Column(float)
    
    # Earnings
    EarningsPerShare_Actual = Column(float)
    EarningsPerShare_SmartEstimate_prev_Q = Column(float)
    EarningsPerShare_ActualSurprise = Column(float)
    EarningsPerShare_SmartEstimate_current_Q = Column(float)
    EPS_SurpirsePrct_prev_Q = Column(float)
    
    # Growth and targets
    LongTermGrowth_Mean = Column(float)
    PriceTarget_Median = Column(float)
    Estpricegrowth_percent = Column(float)
    
    # Rankings
    CombinedAlphaModelSectorRank = Column(float)
    CombinedAlphaModelSectorRankChange = Column(float)
    CombinedAlphaModelRegionRank = Column(float)
    EarningsQualityRegionRank_Current = Column(float)
    
    # Ratios
    EnterpriseValueToEBIT_DailyTimeSeriesRatio_ = Column(float)
    EnterpriseValueToEBITDA_DailyTimeSeriesRatio_ = Column(float)
    EnterpriseValueToSales_DailyTimeSeriesRatio_ = Column(float)
    ForwardPEG_DailyTimeSeriesRatio_ = Column(float)
    PriceEarningsToGrowthRatio_SmartEstimate_ = Column(float)
    ForwardPriceToCashFlowPerShare_DailyTimeSeriesRatio_ = Column(float)
    ForwardPriceToSalesPerShare_DailyTimeSeriesRatio_ = Column(float)
    ForwardEnterpriseValueToOperatingCashFlow_DailyTimeSeriesRatio_ = Column(float)
    
    # Returns
    ReturnOnInvestedCapital_BrokerEstimate = Column(float)
    ReturnOnCapitalEmployed_Actual = Column(float)
    ReturnOnEquity_SmartEstimat = Column(float)
    ReturnOnAssets_SmartEstimate = Column(float)
    
    # Margins
    GrossProfitMargin_ = Column(float)
    GrossProfitMargin_ActualSurprise = Column(float)
    
    # Analyst recommendations
    Recommendation_NumberOfTotal = Column(float)
    Recommendation_Median_1_5_ = Column(float)
    Recommendation_NumberOfStrongBuy = Column(float)
    Recommendation_NumberOfBuy = Column(float)
    Recommendation_Mean_1_5_ = Column(float)
    
    # Cash
    CashCashEquivalents_Total = Column(float)
    
    # Dividends
    Dividend_Per_Share_SmartEstimate = Column(float)
    
    # VIX prediction signal
    pred = Column(float)


print("✓ CustomFundamentals Database class defined")
print(f"  Database code: {CustomFundamentals.CODE}")
print(f"  Lookback window: {CustomFundamentals.LOOKBACK_WINDOW} days")
# Count columns by checking for 'dataset' attribute (BoundColumn instances have this)
print(f"  Columns defined: {len([attr for attr in dir(CustomFundamentals) if hasattr(getattr(CustomFundamentals, attr, None), 'dataset')])}")

print("\nExample usage:")
print("  roe = CustomFundamentals.ReturnOnEquity_SmartEstimat.latest")
print("  pe_growth = CustomFundamentals.PriceEarningsToGrowthRatio_SmartEstimate_.latest")
print("  sector = CustomFundamentals.GICSSectorName.latest")

## 11. Verify Database

Query the database to verify data was loaded correctly.

In [None]:
# Connect and query
conn = sqlite3.connect(str(DB_PATH))

# Get row count
row_count = pd.read_sql("SELECT COUNT(*) as count FROM Price", conn).iloc[0, 0]
print(f"Total rows in database: {row_count:,}")

# Get date range
date_range = pd.read_sql("SELECT MIN(Date) as min_date, MAX(Date) as max_date FROM Price", conn)
print(f"Date range: {date_range.iloc[0, 0]} to {date_range.iloc[0, 1]}")

# Get symbol count
symbol_count = pd.read_sql("SELECT COUNT(DISTINCT Symbol) as count FROM Price", conn).iloc[0, 0]
print(f"Unique symbols: {symbol_count:,}")

# Show sample data for a specific symbol
print("\nSample data for AAPL:")
aapl_data = pd.read_sql("""
    SELECT Date, Symbol, RefPriceClose, CompanyMarketCap, 
           ReturnOnEquity_SmartEstimat, PriceTarget_Median
    FROM Price 
    WHERE Symbol = 'AAPL' 
    ORDER BY Date DESC 
    LIMIT 5
""", conn)
print(aapl_data)

print("\nSample data for IBM:")
ibm_data = pd.read_sql("""
    SELECT Date, Symbol, RefPriceClose, CompanyMarketCap, 
           ReturnOnEquity_SmartEstimat, GICSSectorName
    FROM Price 
    WHERE Symbol = 'IBM' 
    ORDER BY Date DESC 
    LIMIT 5
""", conn)
print(ibm_data)

conn.close()

print("\n✓ Database verification complete")

## 12. Usage Example

Example of how to use this database in a backtest.

In [None]:
print("To use this database in your backtests:")
print("\n1. Import the Database class:")
print("   from zipline.pipeline.data.db import Database, Column")
print("\n2. Define the CustomFundamentals class (from cell 10 above)")
print("\n3. Use in your pipeline:")
print("   ")
print("   def make_pipeline():")
print("       roe = CustomFundamentals.ReturnOnEquity_SmartEstimat.latest")
print("       growth = CustomFundamentals.LongTermGrowth_Mean.latest")
print("       sector = CustomFundamentals.GICSSectorName.latest")
print("       ")
print("       # Screen for quality companies")
print("       quality = (roe > 15) & (growth > 10)")
print("       ")
print("       return Pipeline(")
print("           columns={")
print("               'ROE': roe,")
print("               'Growth': growth,")
print("               'Sector': sector,")
print("           },")
print("           screen=quality")
print("       )")
print("\n4. The CustomSQLiteLoader will automatically load data based on CustomFundamentals.CODE")

print("\n✓ Setup complete! Your custom fundamentals database is ready to use.")

## Summary

This notebook:
1. ✅ Loaded CSV files with fundamental data
2. ✅ Mapped symbols to Zipline SIDs using the asset finder
3. ✅ Cleaned and prepared the data
4. ✅ Created a custom SQLite database in ~/.zipline/data/custom/
5. ✅ Defined a Database class for use in Pipeline
6. ✅ Verified the database contents

The database is now ready to use in your Zipline backtests with the CustomSQLiteLoader.

**Next steps:**
- See the examples below for using the data with Pipeline
- Copy the CustomFundamentals class definition to your backtest algorithm
- Use CustomFundamentals.ColumnName.latest in your pipeline
- The backtest_helpers.py will automatically detect and load the data

## 13. Pipeline Examples

Now let's demonstrate how to query and analyze the fundamentals data using Zipline Pipeline.

These examples show:
- Creating a pipeline with custom fundamentals
- Running the pipeline over date ranges
- Filtering stocks by fundamental criteria
- Extracting time series data for specific symbols
- Combining multiple fundamental factors

## 13. Pipeline Examples

Now let's demonstrate how to query and analyze the fundamentals data using Zipline Pipeline.

These examples show:
- Creating a pipeline with custom fundamentals
- Running the pipeline over date ranges
- Filtering stocks by fundamental criteria
- Extracting time series data for specific symbols
- Combining multiple fundamental factors


### Example 1: Setup Pipeline Engine

First, we need to set up the Pipeline engine to load our custom data.

In [None]:
from zipline.pipeline import Pipeline
from zipline.pipeline.engine import SimplePipelineEngine
from zipline.pipeline.loaders import USEquityPricingLoader
from zipline.pipeline.data import USEquityPricing
from zipline.pipeline.domain import US_EQUITIES
from zipline.utils.calendar_utils import get_calendar
from pathlib import Path

# Import the custom loader from zipline
from zipline.data.custom import CustomSQLiteLoader

# Get the trading calendar
trading_calendar = get_calendar('NYSE')

# Cache loader instances so we return the same object for all columns from a dataset
_loader_cache = {}

# Set up the pipeline engine with our custom loaders
def get_pipeline_loader(column):
    """
    Pipeline loader factory that routes columns to appropriate loaders.
    Returns the same loader instance for all columns from the same dataset.
    """
    # Route custom fundamentals to CustomSQLiteLoader
    # Domain-bound datasets don't have CODE attribute, so check the dataset's __name__
    dataset = column.dataset
    
    # Check if this is our CustomFundamentals dataset
    # Domain-bound datasets have a __name__ attribute with the dataset class name
    dataset_name = getattr(dataset, '__name__', '')
    
    if 'CustomFundamentals' in dataset_name or 'CustomFundamentals' in str(dataset):
        # Return cached loader instance for this database
        cache_key = CustomFundamentals.CODE
        if cache_key not in _loader_cache:
            # Specify the correct database directory where we created the database
            db_dir = Path('/root/.zipline/data/custom')
            _loader_cache[cache_key] = CustomSQLiteLoader(
                db_code=CustomFundamentals.CODE,
                db_dir=db_dir
            )
        return _loader_cache[cache_key]
    
    # Route pricing data to bundle
    if column in USEquityPricing.columns:
        # Use cached pricing loader
        if 'pricing' not in _loader_cache:
            _loader_cache['pricing'] = USEquityPricingLoader(
                bundle_data.equity_daily_bar_reader, 
                bundle_data.adjustment_reader
            )
        return _loader_cache['pricing']
    
    raise ValueError(f"No loader for {column}")

# Create the pipeline engine
engine = SimplePipelineEngine(
    get_loader=get_pipeline_loader,
    asset_finder=asset_finder,
    default_domain=US_EQUITIES,
)

print("✓ Pipeline engine configured with custom fundamentals loader")
print(f"  Trading calendar: {trading_calendar.name}")
print(f"  Asset finder: {len(asset_finder.sids):,} securities")
print(f"  Database directory: {Path('/root/.zipline/data/custom')}")

### Example 2: Basic Pipeline - Get Latest Fundamentals

Create a simple pipeline to get the latest fundamentals for all stocks.

In [None]:
# Define a pipeline with market cap filter for top 100 stocks
def make_basic_pipeline():
    """
    Get latest fundamentals for top 100 stocks by market cap.
    This reduces sparse data issues.
    """
    # Get fundamentals
    roe = CustomFundamentals.ReturnOnEquity_SmartEstimat.latest
    roa = CustomFundamentals.ReturnOnAssets_SmartEstimate.latest
    market_cap = CustomFundamentals.CompanyMarketCap.latest
    price = CustomFundamentals.RefPriceClose.latest
    sector = CustomFundamentals.GICSSectorName.latest
    ev_to_ebitda = CustomFundamentals.EnterpriseValueToEBITDA_DailyTimeSeriesRatio_.latest
    
    # Screen for top 100 stocks by market cap
    # This eliminates sparse data issues with small/inactive stocks
    top_100_by_mcap = market_cap.top(100)
    
    return Pipeline(
        columns={
            'ROE': roe,
            'ROA': roa,
            'Market_Cap': market_cap,
            'Price': price,
            'Sector': sector,
            'EV_to_EBITDA': ev_to_ebitda,
        },
        screen=top_100_by_mcap,
    )

# Run the pipeline for a single date
# Get a recent valid trading session from the bundle (last 3 months)
pipeline = make_basic_pipeline()

# Use recent trading sessions (last 3 months of data)
# Note: sessions_in_range expects timezone-naive dates at midnight
end_search = pd.Timestamp.now().normalize()
start_search = (end_search - pd.DateOffset(months=3)).normalize()

sessions = trading_calendar.sessions_in_range(start_search, end_search)
start_date = sessions[-5]  # Use 5 days back from the end
end_date = start_date

print(f"Using date: {start_date.date()}")
print(f"Running pipeline with top 100 stocks by market cap filter...")

result = engine.run_pipeline(pipeline, start_date, end_date)

print(f"\n✓ Pipeline run complete")
print(f"  Date: {start_date.date()}")
print(f"  Stocks in universe: {len(result):,}")
print(f"  Sector breakdown:")
print(result['Sector'].value_counts())

print(f"\nTop 10 stocks by ROE:")
print(result.nlargest(10, 'ROE')[['ROE', 'ROA', 'Market_Cap', 'Sector']])

print(f"\nTop 10 stocks by Market Cap:")
top_mcap = result.nlargest(10, 'Market_Cap')[['Market_Cap', 'ROE', 'Price', 'Sector']]
top_mcap['Market_Cap_B'] = top_mcap['Market_Cap'] / 1e9  # Convert to billions
print(top_mcap[['Market_Cap_B', 'ROE', 'Price', 'Sector']])

### Example 3: Filtered Pipeline - Quality Stocks

Filter stocks based on fundamental criteria (e.g., high ROE, profitable, large cap).

In [None]:
def make_quality_pipeline():
    """
    Screen for quality stocks with strong fundamentals.
    Limited to top 500 by market cap to avoid sparse data.
    """
    # Get fundamentals
    roe = CustomFundamentals.ReturnOnEquity_SmartEstimat.latest
    roa = CustomFundamentals.ReturnOnAssets_SmartEstimate.latest
    market_cap = CustomFundamentals.CompanyMarketCap.latest
    growth = CustomFundamentals.LongTermGrowth_Mean.latest
    price_target = CustomFundamentals.PriceTarget_Median.latest
    current_price = CustomFundamentals.RefPriceClose.latest
    sector = CustomFundamentals.GICSSectorName.latest
    
    # Calculate upside potential
    upside = ((price_target - current_price) / current_price) * 100
    
    # First filter: top 500 stocks by market cap (reduces sparse data)
    top_500_by_mcap = market_cap.top(500)
    
    # Quality criteria (applied to top 500)
    quality_screen = (
        top_500_by_mcap &
        (roe > 15) &  # Strong return on equity
        (roa > 5) &   # Profitable
        (market_cap > 1_000_000_000) &  # Large cap ($1B+)
        (growth > 10) &  # Double-digit growth
        (upside > 10)  # At least 10% upside
    )
    
    return Pipeline(
        columns={
            'ROE': roe,
            'ROA': roa,
            'Market_Cap': market_cap,
            'Growth': growth,
            'Price': current_price,
            'Target': price_target,
            'Upside_%': upside,
            'Sector': sector,
        },
        screen=quality_screen,
    )

# Run the filtered pipeline
# Use a recent valid trading session (last 3 months)
# Note: sessions_in_range expects timezone-naive dates at midnight
end_search = pd.Timestamp.now().normalize()
start_search = (end_search - pd.DateOffset(months=3)).normalize()

sessions = trading_calendar.sessions_in_range(start_search, end_search)
start_date = sessions[-5]  # Use 5 days back from the end

print(f"Running quality screen on top 500 stocks by market cap...")
pipeline = make_quality_pipeline()
result = engine.run_pipeline(pipeline, start_date, start_date)

print(f"\n✓ Quality screen results:")
print(f"  Date: {start_date.date()}")
print(f"  Stocks passing screen: {len(result)}")

if len(result) > 0:
    print(f"  Sector breakdown:")
    print(result['Sector'].value_counts())
    
    print(f"\nTop 10 by upside potential:")
    top_upside = result.nlargest(10, 'Upside_%')[['ROE', 'Growth', 'Price', 'Target', 'Upside_%', 'Sector']]
    print(top_upside)
else:
    print("\n  No stocks passed the quality screen criteria.")
    print("  Try relaxing the filters (e.g., ROE > 10, Growth > 5)")

### Example 4: Time Series Data - Track Fundamentals Over Time

Get historical fundamental data for specific symbols to analyze trends.

In [None]:
# Define symbols to track
symbols = ['AAPL', 'MSFT', 'GOOGL']

# Get the assets
assets = [asset_finder.lookup_symbol(sym, as_of_date=None) for sym in symbols]

# Create pipeline
pipeline = Pipeline(
    columns={
        'ROE': CustomFundamentals.ReturnOnEquity_SmartEstimat.latest,
        'Market_Cap': CustomFundamentals.CompanyMarketCap.latest,
        'Price': CustomFundamentals.RefPriceClose.latest,
        'Growth': CustomFundamentals.LongTermGrowth_Mean.latest,
        'EV_EBITDA': CustomFundamentals.EnterpriseValueToEBITDA_DailyTimeSeriesRatio_.latest,
    }
)

# Run over a date range (last 2 months of available data)
# Get valid trading sessions from the calendar
# Note: sessions_in_range expects timezone-naive dates at midnight
end_search = pd.Timestamp.now().normalize()
start_search = (end_search - pd.DateOffset(months=3)).normalize()

sessions = trading_calendar.sessions_in_range(start_search, end_search)
end_date = sessions[-5]  # Use 5 days back from the end
start_date = (end_date - pd.DateOffset(months=2)).normalize()

# Ensure start_date is a valid trading session
start_date = trading_calendar.sessions_in_range(start_date, end_date)[0]

print(f"Date range: {start_date.date()} to {end_date.date()}")

result = engine.run_pipeline(pipeline, start_date, end_date)

print(f"✓ Time series data extracted")
print(f"  Period: {start_date.date()} to {end_date.date()}")
print(f"  Total observations: {len(result):,}")

# Filter to our symbols of interest
symbol_data = result[result.index.get_level_values(1).isin(assets)]

print(f"  Observations for {symbols}: {len(symbol_data):,}")

# Show AAPL time series
aapl_asset = assets[0]
aapl_data = symbol_data.loc[pd.IndexSlice[:, aapl_asset], :]

print(f"\nAAPL Fundamental Trends (last 10 observations):")
print(aapl_data.tail(10)[['ROE', 'Market_Cap', 'Price', 'Growth']])

### Example 5: Visualize Time Series - Plot Fundamental Trends

Create charts to visualize how fundamentals change over time.

In [None]:
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

# Define symbols to track (re-define in case previous cell wasn't run)
symbols = ['AAPL', 'MSFT', 'GOOGL']
assets = [asset_finder.lookup_symbol(sym, as_of_date=None) for sym in symbols]

# Create pipeline if not already created in previous cell
if 'symbol_data' not in locals():
    print("Fetching time series data...")
    pipeline = Pipeline(
        columns={
            'ROE': CustomFundamentals.ReturnOnEquity_SmartEstimat.latest,
            'Market_Cap': CustomFundamentals.CompanyMarketCap.latest,
            'Price': CustomFundamentals.RefPriceClose.latest,
            'Growth': CustomFundamentals.LongTermGrowth_Mean.latest,
            'EV_EBITDA': CustomFundamentals.EnterpriseValueToEBITDA_DailyTimeSeriesRatio_.latest,
        }
    )
    
    # Get valid trading sessions (last 3 months, timezone-naive at midnight)
    end_search = pd.Timestamp.now().normalize()
    start_search = (end_search - pd.DateOffset(months=3)).normalize()
    
    sessions = trading_calendar.sessions_in_range(start_search, end_search)
    end_date = sessions[-5]
    start_date = (end_date - pd.DateOffset(months=2)).normalize()
    start_date = trading_calendar.sessions_in_range(start_date, end_date)[0]
    
    result = engine.run_pipeline(pipeline, start_date, end_date)
    symbol_data = result[result.index.get_level_values(1).isin(assets)]
    print(f"✓ Fetched {len(symbol_data):,} observations")

# Create figure with subplots
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Fundamental Trends: AAPL, MSFT, GOOGL', fontsize=16, fontweight='bold')

# Prepare data for each symbol
symbol_colors = {'AAPL': 'blue', 'MSFT': 'green', 'GOOGL': 'red'}

for idx, (symbol, asset) in enumerate(zip(symbols, assets)):
    sym_data = symbol_data.loc[pd.IndexSlice[:, asset], :]
    sym_data = sym_data.reset_index(names=['date', 'asset'])
    
    color = symbol_colors[symbol]
    
    # Plot 1: ROE over time
    axes[0, 0].plot(sym_data['date'], sym_data['ROE'], 
                    label=symbol, marker='o', color=color, alpha=0.7)
    
    # Plot 2: Market Cap over time
    axes[0, 1].plot(sym_data['date'], sym_data['Market_Cap'] / 1e9, 
                    label=symbol, marker='s', color=color, alpha=0.7)
    
    # Plot 3: Growth Rate over time
    axes[1, 0].plot(sym_data['date'], sym_data['Growth'], 
                    label=symbol, marker='^', color=color, alpha=0.7)
    
    # Plot 4: EV/EBITDA over time
    axes[1, 1].plot(sym_data['date'], sym_data['EV_EBITDA'], 
                    label=symbol, marker='D', color=color, alpha=0.7)

# Customize subplots
axes[0, 0].set_title('Return on Equity (%)', fontweight='bold')
axes[0, 0].set_ylabel('ROE (%)')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

axes[0, 1].set_title('Market Capitalization', fontweight='bold')
axes[0, 1].set_ylabel('Market Cap ($B)')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

axes[1, 0].set_title('Long-term Growth Rate', fontweight='bold')
axes[1, 0].set_ylabel('Growth (%)')
axes[1, 0].set_xlabel('Date')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)

axes[1, 1].set_title('Enterprise Value / EBITDA', fontweight='bold')
axes[1, 1].set_ylabel('EV/EBITDA Ratio')
axes[1, 1].set_xlabel('Date')
axes[1, 1].legend()
axes[1, 1].grid(True, alpha=0.3)

# Format x-axis dates
for ax in axes.flat:
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
    ax.xaxis.set_major_locator(mdates.MonthLocator(interval=1))
    plt.setp(ax.xaxis.get_majorticklabels(), rotation=45, ha='right')

plt.tight_layout()
plt.show()

print("✓ Fundamental trends visualization complete")

### Pipeline Examples Summary

You now know how to:
- ✅ Set up a Pipeline engine with custom fundamentals
- ✅ Query latest fundamentals for all stocks
- ✅ Filter stocks using fundamental criteria
- ✅ Extract time series data for specific symbols
- ✅ Visualize fundamental trends over time

**Key takeaways:**
- Use `CustomFundamentals.ColumnName.latest` to access any fundamental metric
- Combine multiple metrics with boolean operators (`&`, `|`) for screening
- Run pipelines over date ranges to analyze trends
- Filter results by asset to focus on specific symbols
- Integrate with matplotlib for visualization

**Next steps:**
- Use these patterns in your backtesting algorithms
- Create custom factors combining multiple fundamentals
- Integrate with price data from USEquityPricing
- Build sophisticated stock selection strategies

## 14. Running Backtests with Custom Fundamentals

### Using the Strategy File

I've created a working strategy file: **`strategy_top5_roe.py`**

This strategy:
- ✅ Uses your custom Custom fundamentals
- ✅ Filters to top 100 stocks by market cap
- ✅ Selects top 5 stocks by ROE
- ✅ Rebalances weekly (every Monday)
- ✅ Equal weights (20% each)

### How to Run

**From terminal/command line:**
```bash
cd /notebooks
python strategy_top5_roe.py
```

**From Jupyter:**

In [None]:
# Run the strategy
import subprocess
import sys

result = subprocess.run(
    [sys.executable, 'strategy_top5_roe.py'],
    cwd='/notebooks',
    capture_output=True,
    text=True
)

print(result.stdout)
if result.stderr:
    print("STDERR:", result.stderr)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load saved results
results = pd.read_pickle('/notebooks/backtest_results.pkl')

# Plot results
fig, axes = plt.subplots(2, 1, figsize=(14, 8), sharex=True)

# Portfolio value
axes[0].plot(results.index, results['portfolio_value'], linewidth=2)
axes[0].set_ylabel('Portfolio Value ($)', fontsize=12)
axes[0].set_title('Top 5 ROE Strategy Performance', fontsize=14, fontweight='bold')
axes[0].grid(True, alpha=0.3)

# Number of positions
axes[1].plot(results.index, results['num_positions'], linewidth=2, color='orange')
axes[1].set_ylabel('Number of Positions', fontsize=12)
axes[1].set_xlabel('Date', fontsize=12)
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\nPerformance Summary:")
print(f"  Final Value: ${results.portfolio_value.iloc[-1]:,.2f}")
print(f"  Total Return: {(results.portfolio_value.iloc[-1]/100000-1)*100:.2f}%")
print(f"  Avg Positions: {results.num_positions.mean():.1f}")

In [None]:
# Analyze with pyfolio
import pyfolio as pf

returns = results.returns
pf.create_simple_tear_sheet(returns)

### Customizing the Strategy

Edit `strategy_top5_roe.py` to:

**Change selection criteria:**
```python
# Select top 10 instead of top 5
top_10_roe = roe.top(10, mask=top_100_by_mcap)

# Use different fundamentals
roa = CustomFundamentals.ReturnOnAssets_SmartEstimate.latest
growth = CustomFundamentals.LongTermGrowth_Mean.latest

# Combine multiple factors
roe_z = roe.zscore(mask=top_100_by_mcap)
growth_z = growth.zscore(mask=top_100_by_mcap)
combined = (roe_z + growth_z) / 2
top_stocks = combined.top(5, mask=top_100_by_mcap)
```

**Change rebalancing frequency:**
```python
# Monthly rebalancing
schedule_function(
    rebalance,
    date_rules.month_start(),
    time_rules.market_open(hours=1),
)

# Daily rebalancing
schedule_function(
    rebalance,
    date_rules.every_day(),
    time_rules.market_open(hours=1),
)
```

**Change date range:**
```python
start = pd.Timestamp('2023-01-01', tz='UTC')
end = pd.Timestamp('2024-12-31', tz='UTC')
```