In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import numpy as np
import os
import re
import requests
import json
import yfinance as yf
from time import sleep
from tqdm import tqdm
from datetime import datetime as dt
import random


Inputs

In [2]:
# Backtest window: every quarter (Q1-Q4) of calendar years 2015 through 2025 inclusive
# (range(2015, 2026) -> 11 years, 44 year/quarter combinations).
# Each entry looks like "2015-year/1-quarter"; the scraper appends it to the transcript URL.
years = range(2015, 2026)
quarters = range(1, 5)
year_quarters = [
    f"{yr}-year/{qtr}-quarter"
    for yr in years
    for qtr in quarters
]

In [26]:
"""
Construct a custom 50-stock universe from historical S&P 500 constituents.

Methodology
-----------
Goal:
    Build a mid-cap-oriented test universe for transcript-based signal backtesting,
    restricted to the top-performing sectors listed in TOP_SECTORS (currently three,
    identified from prior Sharpe analysis).

Steps:
1. Data Source:
    - Start with S&P 500 constituents (Wikipedia table).
    - Keep only tickers added on/before 2015-12-31 (ensures membership in 2015).
    - NOTE: Wikipedia doesn’t list removal dates; all tickers are marked “Present”
      unless using CRSP/Compustat for point-in-time accuracy.

2. Market Cap Screen:
    - Fetch current float-adjusted market capitalization from Yahoo Finance.
    - Exclude any ticker with current cap > $50B.

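3. Sector Tagging:
    - Use Yahoo Finance “sector” field to assign sectors.
    - Restrict universe to the sectors listed in TOP_SECTORS only
      (currently Financial Services, Utilities, and Consumer Cyclical, using Yahoo
      Finance sector names; alternatives such as Technology/Industrials can be swapped in).

4. Sector Balance:
    - Cap each chosen sector at ≤25 names (SECTOR_CAP, configurable).
    - Keeps any single sector from dominating: with three sectors the capped pool
      holds up to 75 names, from which exactly 50 are sampled in step 5.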

5. Final Sampling:
    - Randomly sample exactly 50 tickers from this restricted pool.
    - Random seed fixed for reproducibility.
    - Output includes Ticker, Company, Sector, S&P_Start, S&P_End, CurMktCap.

Caveats:
    - Same as before: approximate membership dates, current market cap proxy.
"""

import pandas as pd, yfinance as yf

# --- Universe-selection parameters -------------------------------------------
MARKET_CAP_CUTOFF = 50 * 1e9   # USD; tickers whose CurMktCap exceeds this are dropped
SAMPLE_SIZE = 50               # number of tickers in the final universe
SECTOR_CAP = 25                # max names kept per sector before the final draw
RANDOM_SEED = 42               # passed as random_state to every .sample() call below

# Prerequisite: a `filtered` DataFrame must already exist in the kernel
# (it is not created in this cell), with columns:
#   Ticker, Company, S&P_Start, S&P_End, CurMktCap

def fetch_sector(ticker):
    """
    Look up the Yahoo Finance 'sector' entry for a ticker.

    Inputs:
    ticker: ticker symbol passed to yf.Ticker

    Outputs:
    The value stored under 'sector' in the ticker's info mapping (None when the
    key is absent), or None if the lookup raises any Exception.
    """
    try:
        info = yf.Ticker(ticker).info
        sector = info.get('sector')
    except Exception:
        # Best-effort lookup: any failure is reported as "sector unknown".
        sector = None
    return sector

# 1) Add sector info
# `.assign` builds a new frame instead of writing into `filtered` in place, so no
# SettingWithCopyWarning is raised when `filtered` is itself a slice of another frame.
# NOTE: this performs one Yahoo Finance lookup per ticker (network-bound, slow).
filtered = filtered.assign(Sector=filtered['Ticker'].apply(fetch_sector))

# 2) Apply market cap cutoff
# NOTE(review): fillna(0) lets tickers with a *missing* market cap pass the screen —
# confirm that is intended rather than dropping them.
filtered = filtered[filtered['CurMktCap'].fillna(0) <= MARKET_CAP_CUTOFF]

# 3) Restrict to the chosen top sectors (edit these strings as you confirm them)
TOP_SECTORS = ['Financial Services', 'Utilities', 'Consumer Cyclical']
# If you want Tech/Industrials instead: ['Technology', 'Industrials']

restricted = filtered[filtered['Sector'].isin(TOP_SECTORS)]

# 4) Cap each sector at ≤ SECTOR_CAP names
# Sampling each sector group explicitly (instead of groupby(...).apply(...)) keeps the
# 'Sector' column in the result and avoids the pandas warning about apply operating
# on the grouping column. Groups are visited in sorted sector order.
sector_samples = [
    sector_rows.sample(min(len(sector_rows), SECTOR_CAP), random_state=RANDOM_SEED)
    for _, sector_rows in restricted.groupby('Sector')
]
# pd.concat([]) raises, so fall back to an empty frame with the same columns.
balanced_pool = pd.concat(sector_samples) if sector_samples else restricted.iloc[0:0]

# 5) Final SAMPLE_SIZE-stock sample
# De-duplicate BEFORE the size check so the check reflects the pool that is actually
# sampled from; otherwise duplicates could pass the check and make .sample() fail.
unique_pool = balanced_pool.drop_duplicates('Ticker')
if len(unique_pool) < SAMPLE_SIZE:
    raise ValueError(f"Only {len(unique_pool)} names in the top {len(TOP_SECTORS)} sectors; "
                     f"cannot sample {SAMPLE_SIZE}. Increase SECTOR_CAP or relax filters.")
out = (unique_pool
       .sample(n=SAMPLE_SIZE, random_state=RANDOM_SEED)
       .sort_values('Ticker')
       .reset_index(drop=True)
       .loc[:, ['Ticker','Company','Sector','S&P_Start','S&P_End','CurMktCap']])

# Save + show
out.to_csv("spx2015_top3sectors_50.csv", index=False)
print(out.to_string(index=False))


Ticker                         Company             Sector  S&P_Start S&P_End   CurMktCap
   AES                 AES Corporation          Utilities 1998-10-02 Present  9605554176
   AIG    American International Group Financial Services 1980-03-31 Present 45852139520
   AIZ                        Assurant Financial Services 2007-04-10 Present 10795241472
   AMP            Ameriprise Financial Financial Services 2005-10-03 Present 48216150016
  APTV                           Aptiv  Consumer Cyclical 2012-12-24 Present 17170375680
   AVY                  Avery Dennison  Consumer Cyclical 1987-12-31 Present 13707606016
   BBY                        Best Buy  Consumer Cyclical 1999-06-29 Present 15713649664
   BEN              Franklin Resources Financial Services 1998-04-30 Present 13171978240
   CCL                        Carnival  Consumer Cyclical 1998-12-22 Present 42357690368
   CMS                      CMS Energy          Utilities 1957-03-04 Present 21614981120
   CNP              C

  .apply(lambda g: g.sample(min(len(g), SECTOR_CAP), random_state=RANDOM_SEED))


In [23]:
# """
# Construct a custom 50-stock universe from historical S&P 500 constituents.

# Methodology
# -----------
# Goal:
#     Build a representative, mid-cap-oriented test universe for backtesting
#     transcript-based signals, without distortion from mega-caps like AAPL/MSFT.

# Steps:
# 1. Data Source:
#     - Start with the S&P 500 constituents table from Wikipedia.
#     - Keep only tickers with "Date Added" <= 2015-12-31, which ensures the
#       company was a member of the index at some point during 2015.
#     - NOTE: Wikipedia only lists the date a company joined the index; it does
#       not provide removal dates. For simplicity, all tickers are treated as
#       “Present” members unless removed manually. For point-in-time accuracy,
#       replace with CRSP/Compustat (WRDS) data if available.

# 2. Market Cap Screen:
#     - Fetch current float-adjusted market capitalization from Yahoo Finance.
#     - Exclude any ticker with current cap > $50B (configurable).
#     - Rationale: although all S&P 500 members are technically “large-cap,”
#       this filter strips out long-standing mega-caps and approximates a
#       mid-cap cohort more sensitive to sentiment shifts.

# 3. Sector Tagging:
#     - Retrieve each company’s sector from Yahoo Finance (Yahoo’s own sector names, e.g. “Consumer Cyclical”, not official GICS labels).
#     - Attach sector labels to allow balancing across industries.

# 4. Sector Balance:
#     - Cap each sector at a maximum of 10 names (configurable).
#     - This prevents over-representation from high-density categories such as
#       Utilities or REITs.
#     - Optionally, one could enforce a minimum per sector, but in this version
#       some sectors may be absent if they had <1 eligible ticker under $50B.

# 5. Final Sampling:
#     - Randomly sample exactly 50 tickers from the capped pool.
#     - Random seed is fixed (42) to ensure reproducibility.
#     - Output includes: Ticker, Company, Sector, S&P_Start (date added),
#       S&P_End (“Present”), and current market cap.

# Caveats:
#     - Membership dates are approximate (based only on “Date Added”).
#     - End dates are not captured unless using WRDS/CRSP.
#     - Market cap screen uses *current* capitalization as a proxy; companies
#       that briefly crossed $50B in the past but are below today remain in.
#     - If fewer than 50 names survive filtering, the script raises an error.

# Use Cases:
#     - Backtesting sentiment/earnings call transcript-based strategies without FAANG bias.
#     - Exploring cross-sector robustness of signals.
# """

# MARKET_CAP_CUTOFF = 50e9
# SAMPLE_SIZE = 50
# SECTOR_CAP = 10
# RANDOM_SEED = 42

# # assume you already have `filtered` DataFrame:
# # columns: Ticker, Company, S&P_Start, S&P_End, CurMktCap

# def fetch_sector(ticker):
#     try:
#         t = yf.Ticker(ticker)
#         sec = t.info.get('sector')
#         return sec
#     except Exception:
#         return None

# # 1) Add sector info
# filtered['Sector'] = filtered['Ticker'].apply(fetch_sector)

# # 2) Drop > $50B
# filtered = filtered[filtered['CurMktCap'].fillna(0) <= MARKET_CAP_CUTOFF]

# # 3) Cap each sector to ≤ 10 names
# balanced_pool = (
#     filtered.groupby('Sector', group_keys=False)
#     .apply(lambda g: g.sample(min(len(g), SECTOR_CAP), random_state=RANDOM_SEED))
# )

# # 4) From this capped pool, sample 50 tickers
# if len(balanced_pool) < SAMPLE_SIZE:
#     raise ValueError(f"Only {len(balanced_pool)} names after sector cap; "
#                      f"cannot sample {SAMPLE_SIZE}. Try increasing SECTOR_CAP.")
# out = (balanced_pool
#        .drop_duplicates('Ticker')
#        .sample(n=SAMPLE_SIZE, random_state=RANDOM_SEED)
#        .sort_values('Ticker')
#        .reset_index(drop=True)
#        .loc[:, ['Ticker','Company','Sector','S&P_Start','S&P_End','CurMktCap']])

# # 5) Save + show
# out.to_csv("spx2015_midcapish_50_balanced.csv", index=False)
# print(out.to_string(index=False))


Ticker                              Company                 Sector  S&P_Start S&P_End   CurMktCap
   ADM               Archer Daniels Midland     Consumer Defensive 1957-03-04 Present 30850592768
   AIG         American International Group     Financial Services 1980-03-31 Present 45852139520
  AKAM                  Akamai Technologies             Technology 2007-07-12 Present 10971895808
  ALLE                             Allegion            Industrials 2013-12-02 Present 14576735232
   AME                               Ametek            Industrials 2013-09-23 Present 42867372032
   AMP                 Ameriprise Financial     Financial Services 2005-10-03 Present 48216150016
   APA                      APA Corporation                 Energy 1997-07-28 Present  8000094720
  APTV                                Aptiv      Consumer Cyclical 2012-12-24 Present 17170375680
   AVY                       Avery Dennison      Consumer Cyclical 1987-12-31 Present 13707606016
   BAX              

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered['Sector'] = filtered['Ticker'].apply(fetch_sector)
  .apply(lambda g: g.sample(min(len(g), SECTOR_CAP), random_state=RANDOM_SEED))


Scrape Earnings Calls for Each Ticker

In [13]:
def get_earnings_call_text(url: str):
    """
    Open `url` in a fresh Chrome session and pull the transcript portion of the page.

    The page's <body> visible text is read in full; the returned string is whatever
    follows the last occurrence of "Earnings Call Transcript" and precedes the first
    "Footer" after it. If a marker is missing, that side of the text is left untrimmed.

    Inputs:
    url (str): URL string to scrape

    Outputs:
    Parsed earnings call text, or None if loading/reading the page raised an exception
    (the error is printed). The browser is always closed before returning.
    """
    chrome_options = Options()
    # Keep the browser window tiny. Headless mode ("--headless=new") is currently disabled.
    chrome_options.add_argument("--window-size=100,100")

    chrome_service = Service(ChromeDriverManager().install())
    browser = webdriver.Chrome(service=chrome_service, options=chrome_options)

    try:
        browser.get(url)
        page_text = browser.find_element("tag name", "body").text
        # Text after the last transcript header ...
        after_header = page_text.rpartition('Earnings Call Transcript')[2]
        # ... up to (not including) the first footer marker.
        transcript = after_header.partition('Footer')[0]
        return transcript
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return None
    finally:
        browser.quit()


def parse_quarter(quarter_str: str):
    """
    Convert a "<year>-year/<q>-quarter" string into the quarter's starting date.

    Input:
    quarter_str (str): year and quarter of earnings call, e.g. "2015-year/3-quarter"

    Output: Timestamp for the first day of that quarter (Q1 -> Jan 1, Q2 -> Apr 1,
    Q3 -> Jul 1, Q4 -> Oct 1). A quarter character outside '1'-'4' raises KeyError.
    """
    # Everything before "-year/" is the year; the first character after the
    # last "/" is the quarter digit.
    year_part = quarter_str.split("-year/")[0]
    quarter_digit = quarter_str.split("/")[-1][0]

    first_month_of_quarter = {'1': 1, '2': 4, '3': 7, '4': 10}
    start_month = first_month_of_quarter[quarter_digit]

    return pd.Timestamp(f"{year_part}-{start_month:02d}-01")

In [14]:
def get_earnings_calls_ticker(ticker: str):
    """
    Scrape all earnings calls for a given ticker and cache them as a CSV.

    If ./earnings_calls/{ticker}/scraped_earnings_calls.csv already exists, nothing is
    scraped. Otherwise one transcript page is fetched (via get_earnings_call_text) for
    every entry of the module-level `year_quarters` list, and the results are written
    to that CSV with columns: year_quarter, earnings_call_raw_text, ticker, date.

    Inputs:
    ticker (str): Ticker name

    Outputs:
    None. The result is the CSV file on disk.
    """

    print(f"scraping earnings calls for {ticker}")

    # Base URL for earnings call transcripts
    url_base = f"https://www.roic.ai/quote/{ticker}/transcripts/"

    # Directory for saving ticker data
    ticker_dir = f"./earnings_calls/{ticker}/"

    # CSV file holding the scraped earnings calls for this ticker
    scraped_earnings_calls_path = f"{ticker_dir}scraped_earnings_calls.csv"

    if os.path.exists(scraped_earnings_calls_path):
        print(f"{scraped_earnings_calls_path} exists")
        return

    print(f"{scraped_earnings_calls_path} does not exist, scraping earnings calls for {ticker}")
    # exist_ok=True: the directory may be left over from an earlier run that was
    # interrupted before the CSV was written; that must not block a retry.
    os.makedirs(ticker_dir, exist_ok=True)

    scraped_earnings_calls = [get_earnings_call_text(f"{url_base}{yq}") for yq in year_quarters]

    # Build the frame column-by-column. The scalar `ticker` is broadcast to every row;
    # passing it as a row of a list-of-lists would split the string into characters.
    scraped_earnings_calls_df = pd.DataFrame({
        'year_quarter': list(year_quarters),
        'earnings_call_raw_text': scraped_earnings_calls,
        'ticker': ticker,
    })

    scraped_earnings_calls_df['date'] = scraped_earnings_calls_df['year_quarter'].apply(parse_quarter)

    scraped_earnings_calls_df.to_csv(scraped_earnings_calls_path)

In [16]:
# Scrape (or skip, if already cached on disk) the transcripts for every ticker
# in the sampled universe `out`.
tickers_to_scrape = out["Ticker"].tolist()
for ticker in tickers_to_scrape:
    get_earnings_calls_ticker(ticker)

scraping earnings calls for VAC
./earnings_calls/VAC/scraped_earnings_calls.csv does not exist, scraping earnings calls for VAC
scraping earnings calls for SSD
./earnings_calls/SSD/scraped_earnings_calls.csv does not exist, scraping earnings calls for SSD
scraping earnings calls for PRI
./earnings_calls/PRI/scraped_earnings_calls.csv does not exist, scraping earnings calls for PRI
scraping earnings calls for CHH
./earnings_calls/CHH/scraped_earnings_calls.csv does not exist, scraping earnings calls for CHH
scraping earnings calls for HGV
./earnings_calls/HGV/scraped_earnings_calls.csv does not exist, scraping earnings calls for HGV
scraping earnings calls for UMBF
./earnings_calls/UMBF/scraped_earnings_calls.csv does not exist, scraping earnings calls for UMBF
scraping earnings calls for HLNE
./earnings_calls/HLNE/scraped_earnings_calls.csv does not exist, scraping earnings calls for HLNE
scraping earnings calls for HR
./earnings_calls/HR/scraped_earnings_calls.csv does not exist, scra