In [24]:
# Kerry Zhang
# 7/16/2023
# Objective: Scrape earnings call transcripts from roic.ai
# Notes: Earnings transcripts are divided by person with div class "p-3 rounded-lg false".

import os
import copy
import time
import pandas as pd
import numpy as np

from textblob import TextBlob
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options

# Avoid throttling: build Chrome options with Blink's "AutomationControlled"
# feature disabled.
# NOTE(review): these options only take effect if they are passed to the driver,
# e.g. webdriver.Chrome(options=options) -- confirm every driver creation does so.
options = Options()
options.add_argument('--disable-blink-features=AutomationControlled')

In [25]:
# Run configuration: edit these values to choose what gets scraped.

# NOTE(review): presumably the directory the CSV files should be written to
# (empty string = current working directory) -- confirm it is actually applied
# where the files are saved.
OUTPUT_FOLDER = ""

# Every (year, quarter, ticker) combination of the lists below is processed.
TICKERS = ["ABT"]   # ticker symbols as they appear in the roic.ai transcript URL
YEARS = [2022]      # values for the "y" query parameter
QUARTERS = [1]      # values for the "q" query parameter

In [26]:
class text_to_disappear_and_return_new_text(object):
    """Wait condition: succeed once a placeholder text has left an element.

    Intended for ``WebDriverWait(...).until(...)``. Each poll looks up the
    element identified by ``locator`` and inspects its text.

    Args:
        locator: ``(by, value)`` tuple, unpacked into ``driver.find_element``.
        initial_text: placeholder text that must no longer be contained in the
            element's text for the condition to succeed.

    Calling the instance returns the element's current text once
    ``initial_text`` is gone, and ``False`` otherwise (element missing, lookup
    failed, or placeholder still present) so the caller keeps polling.
    """

    def __init__(self, locator, initial_text):
        self.locator = locator
        self.initial_text = initial_text

    def __call__(self, driver):
        try:
            element_text = driver.find_element(*self.locator).text
        except Exception:
            # Lookup failed (element missing, stale, ...): keep waiting.
            # Deliberately not a bare ``except`` so that KeyboardInterrupt and
            # SystemExit still stop a long wait.
            return False

        if self.initial_text in element_text:
            # Placeholder still shown: keep waiting.
            return False

        return element_text

In [27]:
def scrape_transcript(ticker: str, year: int, quarter: int) -> tuple:
    """Fetch the rendered text of one earnings call transcript page on roic.ai.

    Args:
        ticker: ticker symbol placed in the URL path (e.g. "ABT").
        year: value for the ``y`` query parameter.
        quarter: value for the ``q`` query parameter.

    Returns:
        ``(text, url)``: the text of the ``__next`` element once the
        "Please wait for a while ..." placeholder has disappeared, and the URL
        that was requested.

    Raises:
        Whatever ``WebDriverWait.until`` raises when the placeholder is still
        present after 10 seconds (a selenium TimeoutException).
    """
    url = "https://roic.ai/transcripts/{:}?y={:}&q={:}".format(ticker, year, quarter)

    # Use the module-level Chrome options so the anti-throttling flag is applied.
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)

        # Poll until the loading placeholder has been replaced by real content.
        wait = WebDriverWait(driver, 10)
        locator = (By.ID, "__next")
        initial_text = "Please wait for a while ..."
        text = wait.until(text_to_disappear_and_return_new_text(locator, initial_text))
    finally:
        # Always close the browser, including when the wait times out.
        driver.quit()

    return text, url

In [28]:
def clean_text(earnings_transcript: str) -> list:
    """Turn the raw page text into a list of non-empty transcript lines.

    Steps:
      1. Repair a few known mis-decoded punctuation sequences (WIP list).
      2. Drop every line before the first one containing
         "· Earnings Call Transcript ·".
      3. Drop the roic.ai footer line and everything after it, when present.
      4. Remove empty lines.

    Args:
        earnings_transcript: raw text of the transcript page.

    Returns:
        List of non-empty lines starting at the header line. Empty when the
        header marker is not found.
    """
    header_marker = "· Earnings Call Transcript ·"
    footer_line = "Found an error or have an idea? Write us an email tosupport@roic.ai"

    # WIP: UTF-8 punctuation that was decoded as Latin-1. The last sequence is an
    # en dash, so it maps to a hyphen rather than an apostrophe.
    replacements = (
        ("â\x80\x99", "'"),
        ("â\x80\x98", "'"),
        ("â\x80\x93", "-"),
    )
    for broken, fixed in replacements:
        earnings_transcript = earnings_transcript.replace(broken, fixed)

    lines = earnings_transcript.splitlines()

    # Truncate beginning: keep from the header line onward (nothing if absent).
    start = next(
        (index for index, line in enumerate(lines) if header_marker in line),
        len(lines),
    )
    lines = lines[start:]

    # Truncate ending: a missing footer is not an error, keep everything.
    if footer_line in lines:
        lines = lines[:lines.index(footer_line)]

    # Remove the empty strings left by blank lines between paragraphs.
    return [line for line in lines if len(line) > 0]

In [29]:
def split_text(lst: list):
    """Split cleaned transcript lines into one DataFrame row per speaker turn.

    The first two entries of ``lst`` are treated as transcript metadata
    (details line and date) and are skipped. Every later line is either a
    speaker name, which starts a new turn, or a paragraph belonging to the
    current turn. Paragraphs that appear before the first speaker name are
    ignored.

    Args:
        lst: list of non-empty transcript lines.

    Returns:
        pandas.DataFrame with columns
        ``position`` (1-based turn number), ``name`` (speaker line),
        ``speech`` (paragraphs joined, each followed by a space),
        ``tot_len`` (total word count), ``avg_len`` (mean words per paragraph,
        NaN for a turn without paragraphs) and ``sentiment`` (TextBlob polarity
        of the speech). Empty when no speaker line is found.
    """
    columns = ["position", "name", "speech", "tot_len", "avg_len", "sentiment"]

    def is_name(segment: str) -> bool:
        # Heuristic: a speaker line is "Operator" or exactly two title-cased words.
        if segment == "Operator":
            return True
        words_in_segment = segment.split()
        return (
            len(words_in_segment) == 2
            and words_in_segment[0].istitle()
            and words_in_segment[1].istitle()
        )

    def get_sentiment(text):
        # Polarity score as reported by TextBlob.
        return TextBlob(text).sentiment.polarity

    # Group paragraphs under the speaker that precedes them.
    turns = []  # list of (speaker, [paragraphs])
    for segment in lst[2:]:
        if is_name(segment):
            turns.append((segment, []))
        elif turns:
            turns[-1][1].append(segment)

    # Building rows from completed turns also captures the final speaker, which
    # an append-on-next-name loop would drop.
    data = []
    for position, (caller, paragraphs) in enumerate(turns, start=1):
        speech = "".join(paragraph + " " for paragraph in paragraphs)
        paragraph_len_list = [len(paragraph.split()) for paragraph in paragraphs]
        data.append([
            position,
            caller,
            speech,
            np.sum(paragraph_len_list),
            # Avoid numpy's "mean of empty slice" warning for empty turns.
            np.mean(paragraph_len_list) if paragraph_len_list else np.nan,
            get_sentiment(speech),
        ])

    df = pd.DataFrame(data, columns=columns)

    return df

In [30]:
# Scrape, clean, split and save one transcript per (year, quarter, ticker).

# Make sure the configured output folder exists (empty string = current directory).
if OUTPUT_FOLDER:
    os.makedirs(OUTPUT_FOLDER, exist_ok=True)

for year in YEARS:
    for quarter in QUARTERS:
        for ticker in TICKERS:
            raw_text, url = scrape_transcript(ticker, year, quarter)
            transcript_lines = clean_text(raw_text)
            earnings_transcript = split_text(transcript_lines)

            print(earnings_transcript)

            # Write into OUTPUT_FOLDER rather than always the working directory.
            csv_name = "{:}_{:}Q{:}.csv".format(ticker, year, quarter)
            earnings_transcript.to_csv(os.path.join(OUTPUT_FOLDER, csv_name))

            # Temp: pause between requests so the site is less likely to answer
            # with the "Please wait for a while ..." placeholder.
            time.sleep(7)

    position               name  \
0          1           Operator   
1          2  Scott Leinenweber   
2          3        Robert Ford   
3          4          Bob Funck   
4          5           Operator   
5          6      Robbie Marcus   
6          7        Robert Ford   
7          8      Robbie Marcus   
8          9          Bob Funck   
9         10      Robbie Marcus   
10        11           Operator   
11        12    Larry Biegelsen   
12        13        Robert Ford   
13        14    Larry Biegelsen   
14        15        Robert Ford   
15        16    Larry Biegelsen   
16        17           Operator   
17        18      Josh Jennings   
18        19        Robert Ford   
19        20      Josh Jennings   
20        21           Operator   
21        22     Joanne Wuensch   
22        23        Robert Ford   
23        24     Joanne Wuensch   
24        25        Robert Ford   
25        26     Joanne Wuensch   
26        27           Operator   
27        28        

# Take advantage of div separation via xpath (WIP)

In [6]:
# Set up Selenium WebDriver
driver = webdriver.Chrome()

try:
    # Navigate to the website
    driver.get("https://roic.ai/transcripts/ABT?y=2022&q=4")

    # Wait (up to 10 s) for an element with class "p-3" to become clickable and
    # click it, then collect the elements matched by the hard-coded xpath.
    # NOTE(review): the per-speaker divs use class "p-3 rounded-lg false"; this
    # xpath does not select by that class -- confirm it matches the intended divs.
    WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CLASS_NAME, "p-3"))).click()
    div_elements = driver.find_elements("xpath", '//*[@id="__next"]/div/main/div[3]/div/div[2]/div/div[2]')

    # Scrape the text content of each matching <div> element
    for div_element in div_elements:
        text = div_element.text
        print(text)
finally:
    # Close the browser even when the wait above times out.
    driver.quit()

TimeoutException: Message: 
