# Add Sharadar Metadata to LSEG Fundamentals CSV

This notebook enriches your LSEG fundamentals CSV with Sharadar ticker metadata (exchange, category, ADR status, sector, industry, market cap scale, etc.) so that all information is available in a single database table for Pipeline filtering.

## Metadata Columns Added

The following columns will be added with `sharadar_` prefix:

- `sharadar_exchange`: Exchange (NYSE, NASDAQ, NYSEMKT, etc.)
- `sharadar_category`: Stock category (Domestic Common Stock, ADR, ETF, etc.)
- `sharadar_is_adr`: Boolean ADR flag (1=True, 0=False)
- `sharadar_location`: Company location (USA, etc.)
- `sharadar_sector`: Sharadar sector
- `sharadar_industry`: Sharadar industry
- `sharadar_sicsector`: SIC sector
- `sharadar_sicindustry`: SIC industry
- `sharadar_scalemarketcap`: Market cap scale (1-6: Nano to Mega)

## Author
Kamran Sokhanvari / Hidden Point Capital

## 1. Setup and Configuration

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', None)

# Plotting style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

In [2]:
# Configuration
INPUT_CSV = '/data/csv/20091231_20251118.csv'  # Your LSEG fundamentals file
OUTPUT_CSV = '/data/csv/20091231_20251118_with_metad_fixed.csv'  # Output file
SHARADAR_BUNDLE = 'sharadar'  # Sharadar bundle name

print(f"Input CSV: {INPUT_CSV}")
print(f"Output CSV: {OUTPUT_CSV}")
print(f"Sharadar Bundle: {SHARADAR_BUNDLE}")

Input CSV: /data/csv/20091231_20251118.csv
Output CSV: /data/csv/20091231_20251118_with_metad_fixed.csv
Sharadar Bundle: sharadar


## 2. Load Sharadar Ticker Metadata (with Deduplication)

In [3]:
def load_sharadar_tickers(bundle_name='sharadar'):
    """
    Load Sharadar ticker metadata from the bundle.
    
    Parameters
    ----------
    bundle_name : str
        Name of the Sharadar bundle (default: 'sharadar')
    
    Returns
    -------
    pd.DataFrame
        DataFrame with columns: Symbol, exchange, category, location,
        sector, industry, sicsector, sicindustry, scalemarketcap, is_adr
    """
    print(f"Loading Sharadar tickers from bundle: {bundle_name}")
    
    # Find the most recent bundle ingestion
    bundle_dir = Path.home() / '.zipline' / 'data' / bundle_name
    
    if not bundle_dir.exists():
        # Try Docker path
        bundle_dir = Path('/root/.zipline/data') / bundle_name
    
    if not bundle_dir.exists():
        raise FileNotFoundError(
            f"Sharadar bundle '{bundle_name}' not found. "
            f"Please ingest the bundle first with: zipline ingest -b {bundle_name}"
        )
    
    # Get most recent ingestion
    ingestions = sorted([d for d in bundle_dir.iterdir() if d.is_dir()],
                       reverse=True)
    
    if not ingestions:
        raise FileNotFoundError(f"No ingestions found in {bundle_dir}")
    
    latest_ingestion = ingestions[0]
    tickers_file = latest_ingestion / 'fundamentals' / 'tickers.h5'
    
    if not tickers_file.exists():
        raise FileNotFoundError(f"Tickers file not found: {tickers_file}")
    
    print(f"Loading from: {tickers_file}")
    
    # Load tickers
    tickers = pd.read_hdf(tickers_file, key='tickers')
    
    print(f"Loaded {len(tickers)} tickers")
    
    # IMPORTANT: Deduplicate tickers - keep only active (non-delisted) entries
    # Sharadar has multiple entries per ticker (active + historical delisted)
    if 'isdelisted' in tickers.columns:
        # Prefer non-delisted entries
        tickers = tickers.sort_values('isdelisted')  # False comes before True
        tickers = tickers.drop_duplicates(subset='ticker', keep='first')
        print(f"After deduplication: {len(tickers)} unique tickers")
    
    # Select relevant columns
    metadata_cols = [
        'ticker', 'exchange', 'category', 'location',
        'sector', 'industry', 'sicsector', 'sicindustry',
        'scalemarketcap'
    ]
    
    # Keep only columns that exist
    available_cols = [col for col in metadata_cols if col in tickers.columns]
    tickers_subset = tickers[available_cols].copy()
    
    # Add is_adr flag
    if 'category' in tickers_subset.columns:
        tickers_subset['is_adr'] = tickers_subset['category'].str.contains(
            'ADR', na=False, case=False
        ).astype(int)
    else:
        tickers_subset['is_adr'] = 0
    
    # Rename ticker to Symbol for merging
    tickers_subset = tickers_subset.rename(columns={'ticker': 'Symbol'})
    
    return tickers_subset

In [4]:
# Load Sharadar metadata
sharadar_metadata = load_sharadar_tickers(SHARADAR_BUNDLE)

print(f"\nMetadata shape: {sharadar_metadata.shape}")
print(f"\nColumns: {list(sharadar_metadata.columns)}")
print(f"\nFirst 5 rows:")
sharadar_metadata.head()

Loading Sharadar tickers from bundle: sharadar
Loading from: /root/.zipline/data/sharadar/2025-11-23T04;09;32.033611/fundamentals/tickers.h5
Loaded 60303 tickers
After deduplication: 30801 unique tickers

Metadata shape: (30801, 10)

Columns: ['Symbol', 'exchange', 'category', 'location', 'sector', 'industry', 'sicsector', 'sicindustry', 'scalemarketcap', 'is_adr']

First 5 rows:


Unnamed: 0_level_0,Symbol,exchange,category,location,sector,industry,sicsector,sicindustry,scalemarketcap,is_adr
None,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,IFLR,NYSEARCA,ETF,Illinois; U.S.A,,,,,,0
28201,SILC,NASDAQ,ADR Common Stock,Israel,Technology,Communication Equipment,Manufacturing,Computer Communications Equipment,3 - Small,1
28199,AMRK,NASDAQ,Domestic Common Stock,California; U.S.A,Financial Services,Capital Markets,Wholesale Trade,Wholesale-Jewelry Watches Precious Stones & Me...,3 - Small,0
28198,ESEA,NASDAQ,ADR Common Stock,Greece,Industrials,Marine Shipping,Transportation Communications Electric Gas And...,Deep Sea Foreign Transportation Of Freight,3 - Small,1
28196,OVV,NYSE,Domestic Common Stock,Colorado; U.S.A,Energy,Oil & Gas E&P,Mining,Crude Petroleum & Natural Gas,5 - Large,0


## 3. Load LSEG Fundamentals CSV

In [5]:
# Load LSEG fundamentals
print(f"Loading LSEG fundamentals from: {INPUT_CSV}")
fundamentals = pd.read_csv(INPUT_CSV)

print(f"\nLoaded {len(fundamentals):,} rows, {len(fundamentals.columns)} columns")

if 'Date' in fundamentals.columns:
    print(f"Date range: {fundamentals['Date'].min()} to {fundamentals['Date'].max()}")
    print(f"Unique dates: {fundamentals['Date'].nunique():,}")

if 'Symbol' in fundamentals.columns:
    print(f"Unique symbols: {fundamentals['Symbol'].nunique():,}")

print(f"\nFirst 5 rows:")
fundamentals.head()

Loading LSEG fundamentals from: /data/csv/20091231_20251118.csv

Loaded 9,010,487 rows, 38 columns
Date range: 2008-10-16 to 2025-11-18
Unique dates: 4,012
Unique symbols: 4,440

First 5 rows:


Unnamed: 0,Date,Symbol,Instrument,RefPriceClose,RefVolume,CompanyCommonName,EnterpriseValue_DailyTimeSeries_,CompanyMarketCap,GICSSectorName,FOCFExDividends_Discrete,InterestExpense_NetofCapitalizedInterest,Debt_Total,EarningsPerShare_Actual,EarningsPerShare_SmartEstimate_prev_Q,EarningsPerShare_ActualSurprise,EarningsPerShare_SmartEstimate_current_Q,LongTermGrowth_Mean,PriceTarget_Median,CombinedAlphaModelSectorRank,CombinedAlphaModelSectorRankChange,CombinedAlphaModelRegionRank,EarningsQualityRegionRank_Current,EnterpriseValueToEBIT_DailyTimeSeriesRatio_,EnterpriseValueToEBITDA_DailyTimeSeriesRatio_,EnterpriseValueToSales_DailyTimeSeriesRatio_,Dividend_Per_Share_SmartEstimate,CashCashEquivalents_Total,ForwardPEG_DailyTimeSeriesRatio_,PriceEarningsToGrowthRatio_SmartEstimate_,Recommendation_Median_1_5_,ReturnOnEquity_SmartEstimat,ReturnOnAssets_SmartEstimate,ForwardPriceToCashFlowPerShare_DailyTimeSeriesRatio_,ForwardPriceToSalesPerShare_DailyTimeSeriesRatio_,ForwardEnterpriseValueToOperatingCashFlow_DailyTimeSeriesRatio_,GrossProfitMargin_ActualSurprise,Estpricegrowth_percent,TradeDate
0,2009-12-31,A,A.N,22.217753,,Agilent Technologies Inc,11257190000.0,10838190000.0,Health Care,183000000.0,21000000.0,2904000000.0,0.32,,37.404,,15.0,33.0,40.0,,41.0,40.0,32.535225,22.33569,2.512204,,2479000000.0,0.993966,,2.0,,,13.785078,1.609831,20.022743,4.081,0.485299,2009-12-31
1,2010-01-04,A,A.N,22.382223,894600.0,Agilent Technologies Inc,11337420000.0,10918420000.0,Health Care,183000000.0,21000000.0,2904000000.0,0.32,,37.404,,15.0,33.0,44.0,2.0,43.0,40.0,32.767107,22.494879,2.530109,,2479000000.0,0.999386,,2.0,,,13.85796,1.620728,20.123099,4.081,0.474384,2010-01-04
2,2010-01-05,A,A.N,22.139094,828100.0,Agilent Technologies Inc,11218820000.0,10799820000.0,Health Care,183000000.0,21000000.0,2904000000.0,0.32,,37.404,,15.0,33.0,42.0,-1.0,41.0,40.0,32.424325,22.259556,2.503641,,2479000000.0,0.988052,,2.0,,,13.700236,1.602871,19.902142,4.081,0.490576,2010-01-05
3,2010-01-06,A,A.N,22.060434,852400.0,Agilent Technologies Inc,11182320000.0,10763320000.0,Health Care,183000000.0,21000000.0,2904000000.0,0.32,,37.404,,15.0,33.0,43.0,2.0,42.0,40.0,32.318846,22.187144,2.495497,,2479000000.0,0.984066,,2.0,,,13.644399,1.597203,19.823539,4.081,0.495891,2010-01-06
4,2010-01-07,A,A.N,22.03183,603700.0,Agilent Technologies Inc,11168370000.0,10749370000.0,Health Care,183000000.0,21000000.0,2904000000.0,0.32,,37.404,,15.0,33.0,41.0,-1.0,40.0,40.0,32.278512,22.159454,2.492382,,2479000000.0,0.982315,,2.0,,,13.619564,1.594881,19.78842,4.081,0.497833,2010-01-07


## 4. Merge Metadata with Fundamentals

In [6]:
def add_metadata_to_fundamentals(fundamentals_df, metadata_df):
    """
    Add Sharadar metadata columns to fundamentals DataFrame.
    """
    print(f"\nMerging metadata...")
    print(f"Fundamentals shape: {fundamentals_df.shape}")
    print(f"Metadata shape: {metadata_df.shape}")
    
    # Prefix metadata columns (except Symbol)
    metadata_cols = [col for col in metadata_df.columns if col != 'Symbol']
    rename_dict = {col: f'sharadar_{col.lower()}' for col in metadata_cols}
    metadata_df_renamed = metadata_df.rename(columns=rename_dict)
    
    # Merge on Symbol (left join to keep all fundamental rows)
    merged = fundamentals_df.merge(
        metadata_df_renamed,
        on='Symbol',
        how='left'
    )
    
    print(f"Merged shape: {merged.shape}")
    
    # Report matching statistics
    matched_symbols = merged['sharadar_exchange'].notna().sum() if 'sharadar_exchange' in merged.columns else 0
    total_rows = len(merged)
    
    print(f"\nMatching statistics:")
    print(f"  Total rows: {total_rows:,}")
    print(f"  Rows with metadata: {matched_symbols:,} ({matched_symbols/total_rows*100:.1f}%)")
    
    # Fill missing metadata with defaults
    metadata_columns = [col for col in merged.columns if col.startswith('sharadar_')]
    for col in metadata_columns:
        if col == 'sharadar_is_adr':
            merged[col] = merged[col].fillna(0).astype(int)
        else:
            merged[col] = merged[col].fillna('')
    
    return merged

In [7]:
# Merge
enriched = add_metadata_to_fundamentals(fundamentals, sharadar_metadata)

print(f"\n{'='*80}")
print("ENRICHED DATA SUMMARY")
print(f"{'='*80}")
print(f"Original columns: {len(fundamentals.columns)}")
print(f"Enriched columns: {len(enriched.columns)}")
print(f"Metadata columns added: {len([c for c in enriched.columns if c.startswith('sharadar_')])}")


Merging metadata...
Fundamentals shape: (9010487, 38)
Metadata shape: (30801, 10)
Merged shape: (9010487, 47)

Matching statistics:
  Total rows: 9,010,487
  Rows with metadata: 8,616,181 (95.6%)

ENRICHED DATA SUMMARY
Original columns: 38
Enriched columns: 47
Metadata columns added: 9


## 5. Verify Row Count (Should Match Original)

In [8]:
print(f"Original rows: {len(fundamentals):,}")
print(f"Enriched rows: {len(enriched):,}")
print(f"\n✓ Row count {'MATCHES' if len(fundamentals) == len(enriched) else 'MISMATCH!'}")

if len(fundamentals) != len(enriched):
    print(f"\n⚠️  WARNING: Row count increased by {len(enriched) - len(fundamentals):,} rows!")
    print("This indicates duplicate ticker entries in Sharadar metadata.")
    print("Check the deduplication logic in load_sharadar_tickers()")

Original rows: 9,010,487
Enriched rows: 9,010,487

✓ Row count MATCHES


## 6. Preview Enriched Data

In [9]:
# Show preview with metadata columns
metadata_cols = [col for col in enriched.columns if col.startswith('sharadar_')]
preview_cols = ['Date', 'Symbol'] + metadata_cols
preview_cols = [col for col in preview_cols if col in enriched.columns]

print("=" * 80)
print("PREVIEW (first 10 rows with metadata):")
print("=" * 80)
print(enriched[preview_cols].head(10).to_string())

PREVIEW (first 10 rows with metadata):
         Date Symbol sharadar_exchange      sharadar_category  sharadar_location sharadar_sector       sharadar_industry sharadar_sicsector               sharadar_sicindustry sharadar_scalemarketcap  sharadar_is_adr
0  2009-12-31      A              NYSE  Domestic Common Stock  California; U.S.A      Healthcare  Diagnostics & Research      Manufacturing  Laboratory Analytical Instruments               5 - Large                0
1  2010-01-04      A              NYSE  Domestic Common Stock  California; U.S.A      Healthcare  Diagnostics & Research      Manufacturing  Laboratory Analytical Instruments               5 - Large                0
2  2010-01-05      A              NYSE  Domestic Common Stock  California; U.S.A      Healthcare  Diagnostics & Research      Manufacturing  Laboratory Analytical Instruments               5 - Large                0
3  2010-01-06      A              NYSE  Domestic Common Stock  California; U.S.A      Healthcare 

## 7. Save Enriched CSV

In [None]:
# Save enriched data
print(f"Saving enriched data to: {OUTPUT_CSV}")
enriched.to_csv(OUTPUT_CSV, index=False)

print(f"\n✓ Saved {len(enriched):,} rows to {OUTPUT_CSV}")

Saving enriched data to: /data/csv/20091231_20251118_with_metad_fixed.csv


## 8. Summary

In [None]:
print("\n" + "=" * 80)
print("FINAL SUMMARY")
print("=" * 80)
print(f"Original rows: {len(fundamentals):,}")
print(f"Enriched rows: {len(enriched):,}")
print(f"Original columns: {len(fundamentals.columns)}")
print(f"Enriched columns: {len(enriched.columns)}")
print(f"Metadata columns added: {len(metadata_cols)}")

print("\nNew metadata columns:")
for col in sorted(metadata_cols):
    print(f"  - {col}")

print("\n✓ Done!")