# Add Sharadar Tickers Metadata to Custom Database

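This notebook adds Sharadar ticker metadata (exchange, category, location, etc.) to the CustomFundamentals database so it can be used for pipeline filtering. The metadata is written to a `SharadarTickersDaily` table in which each ticker's row is repeated for every date found in the `Price` table.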

In [1]:
import pandas as pd
import sqlite3
from pathlib import Path
import numpy as np

## 1. Load Sharadar Tickers Metadata

In [2]:
# Read the Sharadar tickers metadata table from its HDF5 file
tickers_path = '/root/.zipline/data/sharadar/2025-11-23T04;09;32.033611/fundamentals/tickers.h5'
tickers = pd.read_hdf(tickers_path, key='tickers')

# Quick sanity summary: how many rows were read and which columns are available
n_loaded = len(tickers)
column_names = tickers.columns.tolist()
print(f'Loaded {n_loaded} tickers')
print(f'Columns: {column_names}')

# Last expression of the cell: rich preview of the first rows
tickers.head()

Loaded 60303 tickers
Columns: ['table', 'permaticker', 'ticker', 'name', 'exchange', 'isdelisted', 'category', 'cusips', 'siccode', 'sicsector', 'sicindustry', 'famasector', 'famaindustry', 'sector', 'industry', 'scalemarketcap', 'scalerevenue', 'relatedtickers', 'currency', 'location', 'lastupdated', 'firstadded', 'firstpricedate', 'lastpricedate', 'firstquarter', 'lastquarter', 'secfilings', 'companysite']


Unnamed: 0_level_0,table,permaticker,ticker,name,exchange,isdelisted,category,cusips,siccode,sicsector,...,currency,location,lastupdated,firstadded,firstpricedate,lastpricedate,firstquarter,lastquarter,secfilings,companysite
None,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,SFP,645772,IFLR,INNOVATOR INTERNATIONAL DEVELOPED MANAGED FLOO...,NYSEARCA,N,ETF,45784N387,,,...,USD,Illinois; U.S.A,2025-11-22,2025-11-20,2025-11-20,2025-11-21,,,https://www.sec.gov/cgi-bin/browse-edgar?actio...,
1,SFP,645771,CAIQ,CALAMOS NASDAQ AUTOCALLABLE INCOME ETF,NASDAQ,N,ETF,12811T530,,,...,USD,Illinois; U.S.A,2025-11-21,2025-11-20,2025-11-20,2025-11-21,,,https://www.sec.gov/cgi-bin/browse-edgar?actio...,
2,SFP,645770,ESBG,FIRST TRUST ENHANCED STOCKS BONDS & GOLD ETF,NYSEARCA,N,ETF,33739H200,,,...,USD,Illinois; U.S.A,2025-11-22,2025-11-20,2025-11-19,2025-11-21,,,https://www.sec.gov/cgi-bin/browse-edgar?actio...,
3,SFP,645769,TXXD,21SHARES 2X LONG DOGECOIN ETF,NASDAQ,N,ETF,53656G175,,,...,USD,Wisconsin; U.S.A,2025-11-21,2025-11-20,2025-11-20,2025-11-21,,,https://www.sec.gov/cgi-bin/browse-edgar?actio...,
4,SFP,645768,MNZL,MANZIL RUSSELL HALAL USA BROAD MARKET ETF,NASDAQ,N,ETF,02072Q317,,,...,USD,Pennsylvania; U.S.A,2025-11-21,2025-11-20,2025-11-19,2025-11-21,,,https://www.sec.gov/cgi-bin/browse-edgar?actio...,


## 2. Select Relevant Columns and Prepare Data

In [3]:
# Metadata fields kept for pipeline filtering:
#   ticker          symbol (renamed to 'Symbol' below)
#   exchange        NYSE, NASDAQ, NYSEARCA, etc.
#   category        Domestic Common Stock, ADR Common Stock, etc.
#   location        country/state
#   sector/industry and sicsector/sicindustry for additional filtering
#   scalemarketcap  market cap scale
cols = [
    'ticker',
    'exchange',
    'category',
    'location',
    'sector',
    'industry',
    'sicsector',
    'sicindustry',
    'scalemarketcap',
]

# Independent copy of just those columns, with 'ticker' exposed as 'Symbol'.
# NOTE(review): the source frame also has a 'table' column (e.g. 'SFP'); if a
# ticker is listed under more than one table, 'Symbol' is not unique here.
# Confirm before relying on one row per Symbol downstream.
tickers_subset = tickers[cols].copy().rename(columns={'ticker': 'Symbol'})

# Distributions used to eyeball the data before writing it anywhere
exchange_counts = tickers_subset['exchange'].value_counts()
top_category_counts = tickers_subset['category'].value_counts().head(10)

print(f'Selected {len(tickers_subset)} tickers')
print('\nExchange distribution:')
print(exchange_counts)
print('\nCategory distribution (top 10):')
print(top_category_counts)

tickers_subset.head()

Selected 60303 tickers

Exchange distribution:
exchange
NASDAQ      26970
NYSE        13381
None        12092
NYSEARCA     3775
NYSEMKT      2627
BATS         1441
OTC            12
INDEX           5
Name: count, dtype: int64

Category distribution (top 10):
category
Domestic Common Stock                    26649
Institutional Investor                   12092
ETF                                       6562
Domestic Common Stock Primary Class       4053
ADR Common Stock                          3384
Domestic Common Stock Warrant             1465
Domestic Common Stock Secondary Class     1182
Domestic Preferred Stock                  1115
CEF                                       1068
Canadian Common Stock                      662
Name: count, dtype: int64


Unnamed: 0_level_0,Symbol,exchange,category,location,sector,industry,sicsector,sicindustry,scalemarketcap
None,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,IFLR,NYSEARCA,ETF,Illinois; U.S.A,,,,,
1,CAIQ,NASDAQ,ETF,Illinois; U.S.A,,,,,
2,ESBG,NYSEARCA,ETF,Illinois; U.S.A,,,,,
3,TXXD,NASDAQ,ETF,Wisconsin; U.S.A,,,,,
4,MNZL,NASDAQ,ETF,Pennsylvania; U.S.A,,,,,


## 3. Create a Flag for ADR Stocks

In [4]:
# is_adr is True for every row whose category text contains 'ADR';
# rows with a missing category are treated as non-ADR (na=False).
adr_mask = tickers_subset['category'].str.contains('ADR', na=False)
tickers_subset['is_adr'] = adr_mask

n_adr = adr_mask.sum()
n_non_adr = (~adr_mask).sum()
print(f'ADR stocks: {n_adr}')
print(f'Non-ADR stocks: {n_non_adr}')

# A few rows from each side of the flag as a spot check
preview_cols = ['Symbol', 'exchange', 'category', 'location']
print('\nSample ADR stocks:')
print(tickers_subset.loc[adr_mask, preview_cols].head())
print('\nSample non-ADR stocks:')
print(tickers_subset.loc[~adr_mask, preview_cols].head())

ADR stocks: 4354
Non-ADR stocks: 55949

Sample ADR stocks:
      Symbol exchange          category   location
None                                              
20785   WSHP   NASDAQ  ADR Common Stock       None
20788   POAS  NYSEMKT  ADR Common Stock  Singapore
20790   AERO     NYSE  ADR Common Stock     Mexico
20816   AGCC   NASDAQ  ADR Common Stock       None
20818    NPT   NASDAQ  ADR Common Stock       None

Sample non-ADR stocks:
     Symbol  exchange category             location
None                                               
0      IFLR  NYSEARCA      ETF      Illinois; U.S.A
1      CAIQ    NASDAQ      ETF      Illinois; U.S.A
2      ESBG  NYSEARCA      ETF      Illinois; U.S.A
3      TXXD    NASDAQ      ETF     Wisconsin; U.S.A
4      MNZL    NASDAQ      ETF  Pennsylvania; U.S.A


## 4. Get Date Range from Price Table

In [5]:
# Open the custom fundamentals SQLite database; `conn` stays open for the
# cells that follow and is closed at the end of the verification cell.
db_path = '/data/custom_databases/fundamentals.sqlite'
conn = sqlite3.connect(db_path)

# Every distinct date present in the Price table, ascending
dates_df = pd.read_sql('SELECT DISTINCT Date FROM Price ORDER BY Date', conn)
dates = dates_df['Date'].tolist()

# Size of the ticker x date cross product that the next section will write
n_price_dates = len(dates)
n_meta_tickers = len(tickers_subset)
planned_rows = n_meta_tickers * n_price_dates

print(f'Found {n_price_dates} unique dates in Price table')
print(f'Date range: {min(dates)} to {max(dates)}')
print(f'\nTotal rows to create: {n_meta_tickers:,} tickers × {n_price_dates:,} dates = {planned_rows:,} rows')

Found 4002 unique dates in Price table
Date range: 2009-12-24 to 2025-11-18

Total rows to create: 60,303 tickers × 4,002 dates = 241,332,606 rows


<cell_type>markdown</cell_type>## 4. Process in Chunks to Save Memory

In [None]:
# Rebuild SharadarTickersDaily from scratch: one row per (ticker, date) pair.
# Dropping the table first makes this cell safe to re-run, and means every
# chunk below can simply append (the first append creates the table).
conn.execute('DROP TABLE IF EXISTS SharadarTickersDaily')
conn.commit()

# Tickers per chunk. Each chunk is expanded to CHUNK_SIZE * len(dates) rows in
# memory before it is written, so 1,000 tickers x ~4,000 dates is ~4M rows at a
# time. (The previous value of 10,000 contradicted its own "1000 tickers"
# comment and produced ~40M-row frames.) The rows written are the same either way.
CHUNK_SIZE = 1_000
SQL_BATCH_ROWS = 10_000  # rows per INSERT batch handed to DataFrame.to_sql

n_tickers = len(tickers_subset)
n_dates = len(dates)
total_rows = n_tickers * n_dates
n_chunks = (n_tickers + CHUNK_SIZE - 1) // CHUNK_SIZE  # ceiling division

print(f'Processing {n_tickers} tickers in {n_chunks} chunks of {CHUNK_SIZE}')
print('This will take a few minutes...')
print()

# Built once and reused for every chunk. Uses its own name so the `dates_df`
# query result from the previous section is not overwritten.
dates_frame = pd.DataFrame({'Date': dates})

for chunk_idx in range(n_chunks):
    start_idx = chunk_idx * CHUNK_SIZE
    end_idx = min(start_idx + CHUNK_SIZE, n_tickers)

    # Cartesian product of this slice of tickers with every date:
    # all metadata columns followed by 'Date'.
    expanded_chunk = tickers_subset.iloc[start_idx:end_idx].merge(dates_frame, how='cross')

    # Append to the table (created on the first iteration)
    expanded_chunk.to_sql(
        'SharadarTickersDaily',
        conn,
        index=False,
        if_exists='append',
        chunksize=SQL_BATCH_ROWS,
    )

    # Progress update
    rows_written = end_idx * n_dates
    pct = (rows_written / total_rows) * 100 if total_rows else 100.0
    print(f'Chunk {chunk_idx + 1}/{n_chunks}: Processed tickers {start_idx} to {end_idx} ({rows_written:,} / {total_rows:,} rows, {pct:.1f}%)')

    # Release the expanded frame before building the next one
    del expanded_chunk

print('\nAll chunks written!')

<cell_type>markdown</cell_type>## 5. Create Indexes for Fast Queries

In [None]:
import time

print('Creating indexes (this will take 5-10 minutes for 241M rows)...')
print('You can monitor progress in the database file size.')
print()

# One entry per index: (start message, index name, indexed columns,
# completion label, suffix printed after the elapsed seconds).
# NOTE(review): the composite (Symbol, Date) index may already serve
# Symbol-only lookups, which would make the first index redundant; confirm
# before dropping it.
index_jobs = [
    ('1/3: Creating Symbol index...',
     'idx_sharadar_daily_symbol', 'Symbol',
     'Symbol index created', 's'),
    ('2/3: Creating Date index...',
     'idx_sharadar_daily_date', 'Date',
     'Date index created', 's total'),
    ('3/3: Creating Symbol+Date composite index (this is the slowest)...',
     'idx_sharadar_daily_symbol_date', 'Symbol, Date',
     'Symbol+Date composite index created', 's total'),
]

start = time.time()

for start_message, index_name, indexed_columns, done_label, time_suffix in index_jobs:
    print(start_message)
    conn.execute(f'CREATE INDEX IF NOT EXISTS {index_name} ON SharadarTickersDaily({indexed_columns})')
    conn.commit()
    # Elapsed time is measured from the start of the cell, not per index
    elapsed = time.time() - start
    print(f'    ✓ {done_label} ({elapsed:.1f}{time_suffix})')

print()
print(f'All indexes created in {elapsed/60:.1f} minutes!')

<cell_type>markdown</cell_type>## 6. Verify the Data

In [None]:
def fetch_scalar(sql):
    """Run a query that yields a single value on `conn` and return that value."""
    return pd.read_sql(sql, conn).iloc[0, 0]


# Overall shape of the table: rows, distinct symbols, distinct dates
n_rows = fetch_scalar('SELECT COUNT(*) as count FROM SharadarTickersDaily')
print(f'Total rows in SharadarTickersDaily: {n_rows:,}')

n_symbols = fetch_scalar('SELECT COUNT(DISTINCT Symbol) as count FROM SharadarTickersDaily')
print(f'Unique symbols: {n_symbols:,}')

n_distinct_dates = fetch_scalar('SELECT COUNT(DISTINCT Date) as count FROM SharadarTickersDaily')
print(f'Unique dates: {n_distinct_dates:,}')

# Spot check: earliest rows for one well-known symbol
print('\nSample data for AAPL:')
aapl_sample = pd.read_sql(
    "SELECT * FROM SharadarTickersDaily WHERE Symbol = 'AAPL' ORDER BY Date LIMIT 10",
    conn,
)
print(aapl_sample)

# Distinct-symbol counts for the filters the pipeline is expected to use
n_nyse = fetch_scalar(
    "SELECT COUNT(DISTINCT Symbol) as count FROM SharadarTickersDaily WHERE exchange = 'NYSE'"
)
print(f'\nNYSE stocks: {n_nyse:,}')

n_domestic = fetch_scalar(
    "SELECT COUNT(DISTINCT Symbol) as count FROM SharadarTickersDaily WHERE category = 'Domestic Common Stock'"
)
print(f'Domestic Common Stock: {n_domestic:,}')

n_adr_symbols = fetch_scalar(
    "SELECT COUNT(DISTINCT Symbol) as count FROM SharadarTickersDaily WHERE is_adr = 1"
)
print(f'ADR stocks: {n_adr_symbols:,}')

# Last use of the connection in this notebook
conn.close()

<cell_type>markdown</cell_type>## 7. Summary

The `SharadarTickersDaily` table is now available in your custom database with the following columns:

- **Symbol**: Ticker symbol
- **Date**: Date (matches dates in Price table)
- **exchange**: NYSE, NASDAQ, NYSEARCA, NYSEMKT, BATS, etc.
- **category**: Domestic Common Stock, ADR Common Stock, ETF, etc.
- **is_adr**: Boolean flag for ADR stocks (1=True, 0=False)
- **location**: Country/state
- **sector**: Sharadar sector
- **industry**: Sharadar industry
- **sicsector**: SIC sector
- **sicindustry**: SIC industry
- **scalemarketcap**: Market cap scale

You can now create a Pipeline Database class for this table and use it for filtering:

```python
from zipline.pipeline import Database, Column

# Example declaration exposing the metadata columns written by this notebook
# to a pipeline.
# NOTE(review): nothing in this snippet names the SharadarTickersDaily table;
# presumably the Database base class resolves it from CODE and/or the class
# name. Confirm against the Database implementation.
class SharadarTickers(Database):
    CODE = "fundamentals"  # presumably matches fundamentals.sqlite; verify
    LOOKBACK_WINDOW = 1
    
    # One Column per metadata field stored in the table (Symbol and Date are
    # not declared here).
    exchange = Column(str)
    category = Column(str)
    is_adr = Column(bool)  # written from a pandas boolean column; queried as is_adr = 1
    location = Column(str)
    sector = Column(str)
    industry = Column(str)
    sicsector = Column(str)
    sicindustry = Column(str)
    scalemarketcap = Column(str)

# In your pipeline:
exchange = SharadarTickers.exchange.latest
category = SharadarTickers.category.latest
is_adr = SharadarTickers.is_adr.latest

# The equality test keeps only the exact category 'Domestic Common Stock'; the
# 'Domestic Common Stock Primary Class' / 'Secondary Class' categories seen in
# the category distribution are excluded. Since that category does not contain
# 'ADR', the extra `~is_adr` term does not remove anything further here.
# NOTE(review): confirm that `.in_(...)` is the membership method this zipline
# build provides for string columns (it may be spelled `element_of`).
base_universe = (
    exchange.in_(['NYSE', 'NASDAQ', 'NYSEMKT']) &
    (category == 'Domestic Common Stock') &
    ~is_adr
)
```