# Index Tickers Data Collection

This notebook collects ticker symbols from major indices (S&P 500, DOW, NASDAQ) along with their:
- Industry
- Sub-Industry  
- Market Cap

**Data Sources:**
- Ticker lists: Wikipedia (S&P 500, DOW, NASDAQ-100)
- Market data: yfinance (industry, market cap)


## 1. Install Required Packages

If `yfinance` is not installed, uncomment and run the cell below:


In [None]:
# Uncomment if yfinance is not installed:
# uv pip install yfinance

# Note: pandas.read_html requires html5lib or lxml
# If you get errors, install one of these:
# !pip install html5lib
# OR
# !pip install lxml


## 2. Imports


In [11]:
import yfinance as yf
import pandas as pd
import requests
from io import StringIO
import warnings
warnings.filterwarnings('ignore')



In [None]:
# ============================================================================
# 1. GET S&P 500, DOW, NASDAQ CONSTITUENTS
# ============================================================================

def get_sp500_constituents():
    """Get S&P 500 constituents from Wikipedia"""
    url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
    response = requests.get(url)
    tables = pd.read_html(StringIO(response.text))
    df = tables[0]
    df = df[['Symbol', 'Security', 'GICS Sector', 'GICS Sub-Industry']]
    df.rename(columns={'Symbol': 'Ticker', 'Security': 'Company'}, inplace=True)
    df['Index'] = 'S&P 500'
    return df

get_sp500_constituents()

ValueError: No tables found

## 3. Fetch Ticker Lists from Wikipedia


In [None]:
tables = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')

HTTPError: HTTP Error 403: Forbidden

In [None]:
def get_sp500_tickers() -> List[str]:
    """Fetch S&P 500 ticker symbols from Wikipedia"""
    url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
    
    try:
        # pandas.read_html can read tables directly from URL
        tables = pd.read_html(url)
        # The first table is usually the constituents table
        df = tables[0]
        
        # Find the Symbol column (case-insensitive)
        symbol_col = [col for col in df.columns if 'symbol' in col.lower()][0]
        tickers = df[symbol_col].tolist()
        
    except Exception as e:
        print(f"  ⚠ Error fetching S&P 500: {e}")
        return []
    
    # Clean ticker symbols (remove dots, convert to uppercase)
    tickers = [str(ticker).replace('.', '-').strip().upper() for ticker in tickers]
    tickers = [t for t in tickers if t and len(t) <= 5]  # Filter valid tickers
    
    print(f"✓ Fetched {len(tickers)} S&P 500 tickers")
    return tickers


def get_dow_tickers() -> List[str]:
    """Fetch DOW 30 ticker symbols from Wikipedia"""
    url = "https://en.wikipedia.org/wiki/Dow_Jones_Industrial_Average"
    
    try:
        tables = pd.read_html(url)
        # Find the table with ticker symbols (usually first or second table)
        for df in tables:
            # Look for a column with 'symbol' or 'ticker' in the name
            symbol_cols = [col for col in df.columns if 'symbol' in col.lower() or 'ticker' in col.lower()]
            if symbol_cols:
                tickers = df[symbol_cols[0]].tolist()
                break
        else:
            # Fallback: use known DOW tickers
            tickers = ['AAPL', 'AMGN', 'AXP', 'BA', 'CAT', 'CRM', 'CSCO', 'CVX', 
                       'DIS', 'DOW', 'GS', 'HD', 'HON', 'IBM', 'INTC', 'JNJ', 
                       'JPM', 'KO', 'MCD', 'MMM', 'MRK', 'MSFT', 'NKE', 'PG', 
                       'TRV', 'UNH', 'V', 'VZ', 'WBA', 'WMT']
            print("⚠ Using fallback DOW ticker list")
    except Exception as e:
        print(f"  ⚠ Error fetching DOW: {e}, using fallback")
        tickers = ['AAPL', 'AMGN', 'AXP', 'BA', 'CAT', 'CRM', 'CSCO', 'CVX', 
                   'DIS', 'DOW', 'GS', 'HD', 'HON', 'IBM', 'INTC', 'JNJ', 
                   'JPM', 'KO', 'MCD', 'MMM', 'MRK', 'MSFT', 'NKE', 'PG', 
                   'TRV', 'UNH', 'V', 'VZ', 'WBA', 'WMT']
    
    # Clean ticker symbols
    tickers = [str(ticker).replace('.', '-').strip().upper() for ticker in tickers]
    tickers = [t for t in tickers if t and len(t) <= 5]  # Filter valid tickers
    
    print(f"✓ Fetched {len(tickers)} DOW tickers")
    return tickers


def get_nasdaq100_tickers() -> List[str]:
    """Fetch NASDAQ-100 ticker symbols from Wikipedia"""
    url = "https://en.wikipedia.org/wiki/NASDAQ-100"
    
    try:
        tables = pd.read_html(url)
        # Find the table with ticker symbols
        for df in tables:
            symbol_cols = [col for col in df.columns if 'symbol' in col.lower() or 'ticker' in col.lower()]
            if symbol_cols:
                tickers = df[symbol_cols[0]].tolist()
                break
        else:
            # If no symbol column found, try first column of first table
            if tables:
                tickers = tables[0].iloc[:, 0].tolist()
            else:
                print("⚠ Could not find NASDAQ-100 table")
                return []
    except Exception as e:
        print(f"  ⚠ Error fetching NASDAQ-100: {e}")
        return []
    
    # Clean ticker symbols
    tickers = [str(ticker).replace('.', '-').strip().upper() for ticker in tickers]
    tickers = [t for t in tickers if t and len(t) <= 5]
    
    print(f"✓ Fetched {len(tickers)} NASDAQ-100 tickers")
    return tickers


# Fetch all ticker lists
print("Fetching ticker lists from Wikipedia...")
sp500_tickers = get_sp500_tickers()
dow_tickers = get_dow_tickers()
nasdaq100_tickers = get_nasdaq100_tickers()

# Combine and deduplicate
all_tickers = list(set(sp500_tickers + dow_tickers + nasdaq100_tickers))
print(f"\n✓ Total unique tickers: {len(all_tickers)}")
print(f"  - S&P 500: {len(sp500_tickers)}")
print(f"  - DOW: {len(dow_tickers)}")
print(f"  - NASDAQ-100: {len(nasdaq100_tickers)}")


Fetching ticker lists from Wikipedia...
  ⚠ Error fetching S&P 500: HTTP Error 403: Forbidden
  ⚠ Error fetching DOW: HTTP Error 403: Forbidden, using fallback
✓ Fetched 30 DOW tickers
  ⚠ Error fetching NASDAQ-100: HTTP Error 403: Forbidden

✓ Total unique tickers: 30
  - S&P 500: 0
  - DOW: 30
  - NASDAQ-100: 0


## 4. Fetch Market Data (Industry, Sub-Industry, Market Cap)


In [None]:
def fetch_ticker_info(ticker: str) -> Dict:
    """Fetch industry, sub-industry, and market cap for a ticker"""
    try:
        stock = yf.Ticker(ticker)
        info = stock.info
        
        # Extract relevant information
        data = {
            'ticker': ticker,
            'company_name': info.get('longName', info.get('shortName', 'N/A')),
            'industry': info.get('industry', 'N/A'),
            'sub_industry': info.get('industryDisp', info.get('sector', 'N/A')),
            'sector': info.get('sector', 'N/A'),
            'market_cap': info.get('marketCap', None),
            'index_sp500': ticker in sp500_tickers,
            'index_dow': ticker in dow_tickers,
            'index_nasdaq100': ticker in nasdaq100_tickers,
        }
        
        return data
    except Exception as e:
        print(f"  ✗ Error fetching {ticker}: {e}")
        return {
            'ticker': ticker,
            'company_name': 'N/A',
            'industry': 'N/A',
            'sub_industry': 'N/A',
            'sector': 'N/A',
            'market_cap': None,
            'index_sp500': ticker in sp500_tickers,
            'index_dow': ticker in dow_tickers,
            'index_nasdaq100': ticker in nasdaq100_tickers,
        }


# Fetch data for all tickers
print(f"Fetching market data for {len(all_tickers)} tickers...")
print("This may take several minutes due to API rate limits...\n")

ticker_data = []
for i, ticker in enumerate(all_tickers, 1):
    if i % 10 == 0:
        print(f"Progress: {i}/{len(all_tickers)} ({i/len(all_tickers)*100:.1f}%)")
    
    data = fetch_ticker_info(ticker)
    ticker_data.append(data)
    
    # Rate limiting - be respectful to API
    time.sleep(0.1)

print(f"\n✓ Completed fetching data for {len(ticker_data)} tickers")


## 5. Create Final DataFrame


In [None]:
# Convert to DataFrame
df = pd.DataFrame(ticker_data)

# Format market cap (convert to billions for readability)
df['market_cap_billions'] = df['market_cap'].apply(
    lambda x: x / 1e9 if pd.notna(x) and x is not None else None
)

# Reorder columns for better readability
column_order = [
    'ticker',
    'company_name',
    'sector',
    'industry',
    'sub_industry',
    'market_cap',
    'market_cap_billions',
    'index_sp500',
    'index_dow',
    'index_nasdaq100'
]

df = df[column_order]

# Sort by market cap (descending)
df = df.sort_values('market_cap', ascending=False, na_last=True)

print(f"✓ Created dataframe with {len(df)} rows and {len(df.columns)} columns")
print(f"\nDataFrame shape: {df.shape}")
print(f"\nFirst few rows:")
df.head(10)


## 6. Data Summary and Statistics


In [None]:
print("=== Data Summary ===\n")
print(f"Total tickers: {len(df)}")
print(f"Tickers with market cap data: {df['market_cap'].notna().sum()}")
print(f"Tickers with industry data: {df['industry'].ne('N/A').sum()}")
print(f"\nIndex breakdown:")
print(f"  - S&P 500: {df['index_sp500'].sum()}")
print(f"  - DOW: {df['index_dow'].sum()}")
print(f"  - NASDAQ-100: {df['index_nasdaq100'].sum()}")

print(f"\n=== Market Cap Statistics ===")
if df['market_cap'].notna().any():
    print(f"  Min: ${df['market_cap'].min()/1e9:.2f}B")
    print(f"  Max: ${df['market_cap'].max()/1e9:.2f}B")
    print(f"  Mean: ${df['market_cap'].mean()/1e9:.2f}B")
    print(f"  Median: ${df['market_cap'].median()/1e9:.2f}B")

print(f"\n=== Top 10 by Market Cap ===")
df[['ticker', 'company_name', 'industry', 'market_cap_billions']].head(10)


## 7. Industry Analysis


In [None]:
print("=== Industry Distribution ===")
industry_counts = df['industry'].value_counts()
print(f"\nTop 10 Industries:")
print(industry_counts.head(10))

print(f"\n=== Sector Distribution ===")
sector_counts = df['sector'].value_counts()
print(sector_counts)


## 8. Save to CSV (Optional)


In [None]:
# Uncomment to save the dataframe to CSV
# output_path = '../data/index_tickers_data.csv'
# df.to_csv(output_path, index=False)
# print(f"✓ Saved to {output_path}")
