# Data Extraction via Selenium and CIQ
### Text Mining on Earnings Calls during a Pandemic as a Means to Predict End-Of-The-Month Stock Performances
####  Olin School of Business <br> Jose Luis Rodriguez  <br> jlr@wustl.edu <br> Fall 2021

In [1]:
import os
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

In [33]:
# Launch a Chrome browser through Selenium; every later cell drives this `driver`.
driver = webdriver.Chrome()

### HTML Page containing a list of Earnings Calls for the respective time period

### Open Page and redirect to login

In [None]:
# Capital IQ landing URL; it is opened in the next cell.
login = 'https://www.capitaliq.spglobal.com'

In [None]:
# Open the landing page in the Selenium-controlled browser.
driver.get(login)

In [None]:
# Load the locally saved 2021 listing page (stored under ./html of the current
# working directory) through a file:// URL.
html = '/html/hotels-rest-leisure-2021.html'
local_url = 'file:///{}{}'.format(os.getcwd(), html)
driver.get(local_url)

In [None]:
# Turn the saved listing table into `entities`, a dict keyed by transcript URL.
# Each column is located by its `data-colkey` attribute and the columns are
# paired up by position.
# The lookups use `find_elements(By.XPATH, ...)`: the `find_elements_by_*`
# helpers were removed in Selenium 4.3 and other cells of this notebook already
# use the `By` API.
names = driver.find_elements(By.XPATH, '//*/td[@data-colkey="name"]')
links = driver.find_elements(By.XPATH, '//*/td[@data-colkey="name"]/a')
dates = driver.find_elements(By.XPATH, '//*/td[@data-colkey="date"]')
trscs = driver.find_elements(By.XPATH, '//*/td[@data-colkey="transcript_type"]')
related = driver.find_elements(By.XPATH, '//*/td[@data-colkey="relatedEntities"]')
entities = {}
for n, _ in enumerate(names):
    # Parse the timestamp once, then split it into its date and time parts.
    stamp = pd.to_datetime(dates[n].text)
    # The first two '-'-separated pieces of the type label are joined with '_';
    # remaining spaces become underscores as well.
    type_parts = trscs[n].text.lower().split('-')
    doc_type = type_parts[0].strip() + '_' + type_parts[1].strip()
    d = {
        'date': stamp.date(),
        'time': stamp.time(),
        'doc_type': doc_type.replace(' ', '_'),
        # Text before the first ')' with every '(' removed.
        'related': related[n].text.split(')')[0].replace('(', ''),
        'link': links[n].get_attribute('href'),
    }
    entities[d['link']] = d

### Single Link Workflow Word Doc Download Approach

In [None]:
# Pick a single transcript URL (position 150 in insertion order) to prototype
# the download steps on one page.
key = list(entities)[150]

In [None]:
# Open the selected transcript page.
driver.get(key)

In [None]:
# Collect every <button> on the page. `find_elements_by_tag_name` was removed
# in Selenium 4.3, so the `By` locator API is used instead.
buttons = driver.find_elements(By.TAG_NAME, 'button')

In [None]:
# Index the buttons by their lower-cased visible label (later duplicates of a
# label replace earlier ones).
buttons = {btn.text.lower(): btn for btn in buttons}

In [None]:
# Last space-separated token of the download button's `class` attribute.
CLASS_NAME = buttons['download'].get_attribute('class').rsplit(' ', 1)[-1]

In [None]:
# Locate the grid container of the download dialog. `find_element_by_xpath`
# was removed in Selenium 4.3, so the `By` locator API is used instead.
check = driver.find_element(By.XPATH, '//*/div[@class="ag-center-cols-clipper"]')

In [None]:
# Take the sixth grid cell. `find_elements_by_xpath` was removed in Selenium
# 4.3, so the `By` locator API is used instead.
# NOTE(review): the XPath starts with `//`, so Selenium evaluates it against the
# whole document even though it is called on `check`; prefix it with `.` if the
# search is meant to stay inside `check` (the index 5 may then need adjusting).
e = check.find_elements(By.XPATH, '//*/div[@role="gridcell"]')[5]

In [None]:
# Inspect the `type` attribute of the second <input> inside that cell.
# `find_elements_by_tag_name` was removed in Selenium 4.3.
e.find_elements(By.TAG_NAME, 'input')[1].get_attribute('type')

In [None]:
# Click the second <input> of the cell through JavaScript, addressing it by its
# DOM id.
# The original string concatenation had a mismatched quote (a SyntaxError); the
# id is now handed to the script as `arguments[0]`, so no quoting is needed.
c = e.find_elements(By.TAG_NAME, 'input')[1]
script = 'document.getElementById(arguments[0]).click()'
driver.execute_script(script, c.get_attribute('id'))

In [None]:
# Press the first button found in the modal footer. `find_element_by_xpath`
# was removed in Selenium 4.3, so the `By` locator API is used instead.
driver.find_element(By.XPATH, '//*/div[@class="spg-modal-footer"]/div/button').click()

In [None]:
# Batch version of the single-link workflow: for every transcript page, open
# the download dialog, select the Word option and confirm the download.
# `entities` is keyed by transcript URL, so iterating it yields the URLs
# themselves; the previous `entry['link']` indexed a string and raised
# TypeError. Element lookups use the `By` API because the `find_*_by_*`
# helpers were removed in Selenium 4.3.
for link in entities:
    driver.get(link)
    time.sleep(8)  # fixed wait for the page to render
    buttons = driver.find_elements(By.TAG_NAME, 'button')
    buttons = {btn.text.lower(): btn for btn in buttons}
    buttons['download'].click()
    time.sleep(15)  # fixed wait for the download dialog
    # NOTE(review): the second XPath starts with `//`, so it is evaluated
    # against the whole document rather than only inside `cells`.
    cells = driver.find_element(By.XPATH, '//*/div[@class="ag-center-cols-clipper"]')
    cells = cells.find_elements(By.XPATH, '//*/div[@role="gridcell"]')
    # Key each cell by its lower-cased text with newlines turned into '_', then
    # pick the cell whose text reads "pdf_word".
    tags = {cell.text.lower().replace('\n', '_'): cell for cell in cells}
    tags = tags['pdf_word'].find_elements(By.TAG_NAME, 'input')
    # Map the second '_'-separated token of each input id to the full id.
    input_id = {tag.get_attribute("id").split('_')[1]: tag.get_attribute("id") for tag in tags}
    # Click the Word option via JavaScript; the id is passed as an argument
    # instead of being spliced into the script text.
    script = 'document.getElementById(arguments[0]).click()'
    driver.execute_script(script, input_id['WORD'])
    time.sleep(2)
    # Look the buttons up again after the dialog changed, then confirm.
    buttons = driver.find_elements(By.TAG_NAME, 'button')
    buttons = {btn.text.lower(): btn for btn in buttons}
    buttons['download'].click()
    time.sleep(5)

## HTML Extract Approach

In [None]:
# Visit every transcript page and extract company, participants and what each
# speaker said straight from the rendered HTML. One record per page is
# appended to `data`; the record is the same dict stored in `entities[key]`.
# Changes from the earlier version: element lookups use the `By` API (the
# `find_*_by_*` helpers were removed in Selenium 4.3) and the bare `except:`
# is narrowed to Selenium errors so Ctrl-C and programming errors surface.
data = []
for key in entities:
    driver.get(key)
    time.sleep(2)
    try:
        content = driver.find_element(By.CLASS_NAME, 'json-rtf')
    except WebDriverException:
        # Transcript body not available yet: wait once more and retry. A second
        # failure propagates and stops the loop.
        time.sleep(20)
        content = driver.find_element(By.CLASS_NAME, 'json-rtf')

    record = entities[key]
    # NOTE(review): the XPaths below start with `//`, so Selenium evaluates them
    # against the whole document even though they are called on `content`.
    record['company'] = content.find_element(By.XPATH, '//*/header[@data-id="children.0"]').text
    record['speakers_info'] = {}
    # `transcript` is collected but not used further down in this cell.
    transcript = content.find_elements(By.XPATH, '//*/section[@data-id="children.2"]')
    speakers = content.find_elements(By.XPATH, '//*/section/strong')
    participants = content.find_element(By.XPATH, '//*/section[@data-id="children.1"]')
    participants = participants.find_elements(By.TAG_NAME, 'li')
    record['speakers_number'] = len(participants) - 1

    # Participant entries are either "name\ninfo", "name;info" or a bare name.
    for el in participants:
        el = el.text.split('\n')
        if len(el) > 1:
            name = el[0].upper().replace('.', '').replace(' ', '_')
            info = el[1]
        elif ';' in el[0]:
            # NOTE(review): unlike the other branches this name is not
            # upper-cased/underscored; confirm whether that is intended.
            name = el[0].split(';')[0]
            info = el[0].split(';')[1]
        else:
            name = el[0].upper().replace('.', '').replace(' ', '_')
            info = name
        record['speakers_info'][name] = info

    # Every <strong> inside a section is treated as a speaker label; its text is
    # found via the section whose data-id is the first four dotted components
    # of the label's data-id.
    for el in speakers:
        spk_name = el.text.upper().replace('.', '').replace(' ', '_')
        spk_sec = ".".join(el.get_attribute('data-id').split('.')[:4])
        spk_content = content.find_element(By.XPATH, '//*/section[@data-id="' + spk_sec + '"]').text
        if spk_name != 'OPERATOR':
            # Drop the first line of the section text; the rest is accumulated
            # under the speaker's own key on the record.
            record.setdefault(spk_name, []).extend(spk_content.split('\n')[1:])
    data.append(record)
    time.sleep(2)

## Transcripts Download

### Hotels, Restaurants and Leisure

In [34]:
# Open the Capital IQ web client and pause a fixed 8 seconds before the login
# form is filled in by the next cell.
driver.get('https://www.capitaliq.spglobal.com/web/client?auth=inherit#')
time.sleep(8)

In [35]:
# Fill in and submit the login form.
#
# Credentials are read from the CIQ_USERNAME / CIQ_PASSWORD environment
# variables instead of being written into the notebook.
# NOTE: the account and password that used to be hard-coded here remain in the
# file's history and should be treated as compromised (rotate them).
try:
    ciq_username = os.environ['CIQ_USERNAME']
    ciq_password = os.environ['CIQ_PASSWORD']
except KeyError as missing:
    raise RuntimeError(
        'Set the CIQ_USERNAME and CIQ_PASSWORD environment variables before logging in'
    ) from missing

# Map every <input> with a non-empty `name` attribute to its element.
inputs = {i.get_attribute('name'): i
          for i in driver.find_elements(by=By.TAG_NAME, value='input')
          if i.get_attribute('name') != ''}
# The submit button is fetched via JavaScript from the sixth element carrying
# the "login-content" class.
sq = 'return document.getElementsByClassName("login-content")[5].getElementsByTagName("button")'
button = driver.execute_script(sq)
user = inputs['username']
pwd = inputs['password']

user.send_keys(ciq_username)
pwd.send_keys(ciq_password)

button[0].click()
time.sleep(10)  # fixed wait after submitting the form

In [21]:
# Load the locally saved 2020 listing page (stored under ./html of the current
# working directory) through a file:// URL.
html = '/html/hotels-rest-leisure-2020.html'
local_url = ''.join(('file:///', os.getcwd(), html))
driver.get(local_url)

In [22]:
# Turn the saved listing table into `entities`, a dict keyed by transcript URL.
# Each column is located by its `data-colkey` attribute and the columns are
# paired up by position.
# The lookups use `find_elements(By.XPATH, ...)`: the `find_elements_by_*`
# helpers were removed in Selenium 4.3 and other cells of this notebook already
# use the `By` API.
names = driver.find_elements(By.XPATH, '//*/td[@data-colkey="name"]')
links = driver.find_elements(By.XPATH, '//*/td[@data-colkey="name"]/a')
dates = driver.find_elements(By.XPATH, '//*/td[@data-colkey="date"]')
trscs = driver.find_elements(By.XPATH, '//*/td[@data-colkey="transcript_type"]')
related = driver.find_elements(By.XPATH, '//*/td[@data-colkey="relatedEntities"]')
entities = {}
for n, _ in enumerate(names):
    # Parse the timestamp once, then split it into its date and time parts.
    stamp = pd.to_datetime(dates[n].text)
    # The first two '-'-separated pieces of the type label are joined with '_';
    # remaining spaces become underscores as well.
    type_parts = trscs[n].text.lower().split('-')
    doc_type = type_parts[0].strip() + '_' + type_parts[1].strip()
    d = {
        'date': stamp.date(),
        'time': stamp.time(),
        'doc_type': doc_type.replace(' ', '_'),
        # Text before the first ')' with every '(' removed.
        'related': related[n].text.split(')')[0].replace('(', ''),
        'link': links[n].get_attribute('href'),
    }
    entities[d['link']] = d

In [25]:
# Scrape every listed transcript that has not been collected yet and append its
# record to `data`.
# `data` is deliberately not reset here so an interrupted run can be resumed;
# assign `data = []` beforehand to start from scratch.
# Links already present in `data` are skipped. The skip-set is built in this
# cell so it no longer depends on a `keys` variable created by a later cell
# (which raised NameError when the notebook ran top to bottom).
# Element lookups use the `By` API (the `find_*_by_*` helpers were removed in
# Selenium 4.3) and the bare `except:` clauses are narrowed to Selenium errors.
keys = {entry['link'] for entry in data}

for key in entities:
    if key in keys:
        continue
    driver.get(key)
    time.sleep(7)
    header = driver.find_element(by=By.TAG_NAME, value='main')
    header = header.find_element(by=By.TAG_NAME, value='header')
    pdf = header.get_attribute('data-display-format')
    if pdf == 'pdf':
        # Pages whose header reports the 'pdf' display format are skipped
        # (presumably there is no HTML transcript to parse).
        continue
    try:
        content = driver.find_element(By.CLASS_NAME, 'json-rtf')
    except WebDriverException:
        try:
            # Give the page one longer chance to render the transcript body.
            time.sleep(15)
            content = driver.find_element(By.CLASS_NAME, 'json-rtf')
        except WebDriverException:
            # Still nothing to parse: keep the raw <main> markup on disk for
            # later inspection and move on to the next link.
            main = driver.find_element(by=By.TAG_NAME, value='main')
            src_html = main.get_attribute('outerHTML')
            name = entities[key]['related'].replace(':', '_')
            # Explicit UTF-8 so non-ASCII markup does not depend on the
            # platform's default encoding.
            with open('html/' + name, 'w', encoding='utf-8') as f:
                f.write(src_html)
            continue

    record = entities[key]
    # NOTE(review): the XPaths below start with `//`, so Selenium evaluates them
    # against the whole document even though they are called on `content`.
    record['company'] = content.find_element(By.XPATH, '//*/header[@data-id="children.0"]').text
    record['speakers_info'] = {}
    record['speakers_transcript'] = {}
    # `transcript` is collected but not used further down in this cell.
    transcript = content.find_elements(By.XPATH, '//*/section[@data-id="children.2"]')
    speakers = content.find_elements(By.XPATH, '//*/section/strong')
    participants = content.find_element(By.XPATH, '//*/section[@data-id="children.1"]')
    participants = participants.find_elements(By.TAG_NAME, 'li')
    record['speakers_number'] = len(participants) - 1

    # Participant entries are either "name\ninfo", "name;info" or a bare name.
    for el in participants:
        el = el.text.split('\n')
        if len(el) > 1:
            name = el[0].upper().replace('.', '').replace(' ', '_')
            info = el[1]
        elif ';' in el[0]:
            # NOTE(review): unlike the other branches this name is not
            # upper-cased/underscored; confirm whether that is intended.
            name = el[0].split(';')[0]
            info = el[0].split(';')[1]
        else:
            name = el[0].upper().replace('.', '').replace(' ', '_')
            info = name
        record['speakers_info'][name] = info

    # Every <strong> inside a section is treated as a speaker label; its text is
    # found via the section whose data-id is the first four dotted components
    # of the label's data-id.
    turns = []
    for el in speakers:
        spk_name = el.text.upper().replace('.', '').replace(' ', '_')
        spk_sec = ".".join(el.get_attribute('data-id').split('.')[:4])
        spk_content = content.find_element(By.XPATH, '//*/section[@data-id="' + spk_sec + '"]').text
        if spk_name != 'OPERATOR':
            # Drop the first line of the section text and keep the rest.
            spk_lines = spk_content.split('\n')[1:]
            record['speakers_transcript'].setdefault(spk_name, []).extend(spk_lines)
            turns.append("\n".join(spk_lines))
    # NOTE(review): turns are concatenated without a separator, exactly as
    # before; confirm that gluing the end of one turn to the next is intended.
    corpus = "".join(turns)
    record['corpus'] = corpus
    data.append(record)
    time.sleep(1)

In [26]:
# Rebuild each record's `corpus` from its per-speaker transcript lines and
# gather the records in `hrl`. Lines of one speaker are joined with newlines;
# the per-speaker chunks are appended back to back after a leading space.
hrl = []
for entity in data:
    chunks = ("\n".join(lines) for lines in entity['speakers_transcript'].values())
    corpus = " " + "".join(chunks)
    entity['corpus'] = corpus
    hrl.append(entity)

In [27]:
# One row per scraped record; the cell then displays the table's dimensions.
hrl_df = pd.DataFrame(hrl)
hrl_df.shape

(352, 10)

In [29]:
# Write hrl_df to data/hrl_2020.csv, leaving out the DataFrame index.
hrl_df.to_csv('data/hrl_2020.csv', index = False)

### Transportation

In [50]:
# Load the locally saved transportation listing page (stored under ./html of
# the current working directory) through a file:// URL.
src_path = os.getcwd()
html = '/html/transportation-2021.html'
local_url = 'file:///{}{}'.format(src_path, html)
driver.get(local_url)

In [46]:
# Turn the saved listing table into `entities`, a dict keyed by transcript URL.
# Each column is located by its `data-colkey` attribute and the columns are
# paired up by position.
# The lookups use `find_elements(By.XPATH, ...)`: the `find_elements_by_*`
# helpers were removed in Selenium 4.3 and other cells of this notebook already
# use the `By` API.
names = driver.find_elements(By.XPATH, '//*/td[@data-colkey="name"]')
links = driver.find_elements(By.XPATH, '//*/td[@data-colkey="name"]/a')
dates = driver.find_elements(By.XPATH, '//*/td[@data-colkey="date"]')
trscs = driver.find_elements(By.XPATH, '//*/td[@data-colkey="transcript_type"]')
related = driver.find_elements(By.XPATH, '//*/td[@data-colkey="relatedEntities"]')
entities = {}
for n, _ in enumerate(names):
    # Parse the timestamp once, then split it into its date and time parts.
    stamp = pd.to_datetime(dates[n].text)
    # The first two '-'-separated pieces of the type label are joined with '_';
    # remaining spaces become underscores as well.
    type_parts = trscs[n].text.lower().split('-')
    doc_type = type_parts[0].strip() + '_' + type_parts[1].strip()
    d = {
        'date': stamp.date(),
        'time': stamp.time(),
        'doc_type': doc_type.replace(' ', '_'),
        # Text before the first ')' with every '(' removed.
        'related': related[n].text.split(')')[0].replace('(', ''),
        'link': links[n].get_attribute('href'),
    }
    entities[d['link']] = d

In [41]:
# Links of every record already collected in `data`.
keys = {record['link'] for record in data}

In [47]:
# Scrape every listed transcript into a fresh `data` list: one record per page
# with company, participants, per-speaker lines and a concatenated corpus.
# Element lookups use the `By` API (the `find_*_by_*` helpers were removed in
# Selenium 4.3) and the bare `except:` clauses are narrowed to Selenium errors
# so Ctrl-C and programming errors are no longer swallowed.
data = []

for key in entities:
    driver.get(key)
    time.sleep(8)
    try:
        header = driver.find_element(by=By.TAG_NAME, value='main')
        header = header.find_element(by=By.TAG_NAME, value='header')
        pdf = header.get_attribute('data-display-format')
        if pdf == 'pdf':
            # Pages whose header reports the 'pdf' display format are skipped
            # (presumably there is no HTML transcript to parse).
            continue
        content = driver.find_element(By.CLASS_NAME, 'json-rtf')
    except WebDriverException:
        try:
            # Give the page one longer chance to render the transcript body.
            time.sleep(15)
            content = driver.find_element(By.CLASS_NAME, 'json-rtf')
        except WebDriverException:
            # Still nothing to parse: keep the raw <main> markup on disk for
            # later inspection and move on to the next link.
            main = driver.find_element(by=By.TAG_NAME, value='main')
            src_html = main.get_attribute('outerHTML')
            name = entities[key]['related'].replace(':', '_')
            # Explicit UTF-8 so non-ASCII markup does not depend on the
            # platform's default encoding.
            with open('html/' + name, 'w', encoding='utf-8') as f:
                f.write(src_html)
            continue

    record = entities[key]
    # NOTE(review): the XPaths below start with `//`, so Selenium evaluates them
    # against the whole document even though they are called on `content`.
    record['company'] = content.find_element(By.XPATH, '//*/header[@data-id="children.0"]').text
    record['speakers_info'] = {}
    record['speakers_transcript'] = {}
    # `transcript` is collected but not used further down in this cell.
    transcript = content.find_elements(By.XPATH, '//*/section[@data-id="children.2"]')
    speakers = content.find_elements(By.XPATH, '//*/section/strong')
    participants = content.find_element(By.XPATH, '//*/section[@data-id="children.1"]')
    participants = participants.find_elements(By.TAG_NAME, 'li')
    record['speakers_number'] = len(participants) - 1

    # Participant entries are either "name\ninfo", "name;info" or a bare name.
    for el in participants:
        el = el.text.split('\n')
        if len(el) > 1:
            name = el[0].upper().replace('.', '').replace(' ', '_')
            info = el[1]
        elif ';' in el[0]:
            # NOTE(review): unlike the other branches this name is not
            # upper-cased/underscored; confirm whether that is intended.
            name = el[0].split(';')[0]
            info = el[0].split(';')[1]
        else:
            name = el[0].upper().replace('.', '').replace(' ', '_')
            info = name
        record['speakers_info'][name] = info

    # Every <strong> inside a section is treated as a speaker label; its text is
    # found via the section whose data-id is the first four dotted components
    # of the label's data-id.
    turns = []
    for el in speakers:
        spk_name = el.text.upper().replace('.', '').replace(' ', '_')
        spk_sec = ".".join(el.get_attribute('data-id').split('.')[:4])
        spk_content = content.find_element(By.XPATH, '//*/section[@data-id="' + spk_sec + '"]').text
        if spk_name != 'OPERATOR':
            # Drop the first line of the section text and keep the rest.
            spk_lines = spk_content.split('\n')[1:]
            record['speakers_transcript'].setdefault(spk_name, []).extend(spk_lines)
            turns.append("\n".join(spk_lines))
    # NOTE(review): turns are concatenated without a separator, exactly as
    # before; confirm that gluing the end of one turn to the next is intended.
    corpus = "".join(turns)
    record['corpus'] = corpus
    data.append(record)
    time.sleep(1)

In [48]:
# Number of records collected in `data`.
len(data)

241

In [49]:
# One row per scraped record, written to data/trs_2021.csv without the index.
trs_df = pd.DataFrame(data)
trs_df.to_csv('data/trs_2021.csv', index=False)