In [21]:
from bs4 import BeautifulSoup
import pandas as pd
import re
import requests
import time
import numpy as np

In [34]:
# Getting the list of companies in DOW_30
def dow_30_companies_func():
    """Scrape the Dow 30 constituents table from Wikipedia.

    Downloads the Dow Jones Industrial Average article, takes the second
    ``<table>`` on the page and turns the text of its ``<td>`` cells into a
    DataFrame.

    Returns:
        pandas.DataFrame with the columns 'Company', 'Echange', 'Symbol',
        'Industry', 'Date_Added' and 'Notes' (the 'Echange' spelling is kept
        because it is part of the returned frame). The first table row is
        dropped, and the 'NYSE:' prefix and surrounding whitespace are removed
        from 'Symbol'.

    Raises:
        requests.HTTPError: if Wikipedia answers with an error status.
        requests.Timeout: if the page does not respond within 30 seconds.
        IndexError: if the page contains fewer than two tables.

    NOTE(review): assumes the second table on the page is the constituents
    table with six <td> cells per row -- confirm against the current article
    layout.
    """
    url = 'https://en.wikipedia.org/wiki/Dow_Jones_Industrial_Average'
    # A timeout keeps a stalled connection from hanging the notebook forever,
    # and raise_for_status stops an error page from being parsed as data.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')
    table = soup.find_all('table')[1]
    all_cols = [
        [cell.text.strip() for cell in row.find_all('td')]
        for row in table.find_all('tr')
    ]
    doq_30_df = pd.DataFrame(
        all_cols,
        columns=['Company', 'Echange', 'Symbol', 'Industry', 'Date_Added', 'Notes'],
    )
    # The first <tr> is the header row; it has no <td> cells, so it becomes an
    # all-empty record that is removed here (without mutating in place).
    doq_30_df = doq_30_df.drop(doq_30_df.index[[0]])
    # Remove the exchange prefix outright instead of swapping it for a space,
    # then trim what is left so the ticker is clean (e.g. 'MMM', not ' MMM').
    doq_30_df['Symbol'] = (
        doq_30_df['Symbol'].str.replace('NYSE:', '', regex=False).str.strip()
    )
    print('Dow 30 Comapnies have been scraped')
    return doq_30_df

In [23]:
def ear_call_trans(i, cookies, timeout=30):
    """Return the earnings-call transcript links listed for one ticker.

    Args:
        i: ticker symbol; it is converted to ``str`` and stripped before being
            placed in the Seeking Alpha URL.
        cookies: raw cookie header string sent with the request.
        timeout: seconds to wait for the server before giving up (default 30).

    Returns:
        list of absolute 'https://seekingalpha.com...' URLs, one for every
        anchor whose text contains 'Earnings Call Transcript'. The list is
        empty when the page has no such anchors.

    Raises:
        requests.Timeout: if the server does not answer within ``timeout``.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
               'cookie': cookies}
    url = 'https://seekingalpha.com/symbol/' + str(i).strip() + '/earnings/transcripts'
    # Without a timeout a stalled connection would block the whole scrape.
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')
    # href=True skips anchors without a target, which would otherwise raise
    # KeyError on a['href'].
    anchors = soup.find_all('a', href=True,
                            string=re.compile('Earnings Call Transcript'))
    return ['https://seekingalpha.com' + a['href'] for a in anchors]

In [24]:
def get_links(doq_30_df,cookies):
    """Collect the transcript links of every company into one wide table.

    Args:
        doq_30_df: DataFrame with a 'Symbol' column (looked up one by one via
            ``ear_call_trans``) and a 'Company' column (used as column labels).
        cookies: raw cookie header string forwarded to ``ear_call_trans``.

    Returns:
        pandas.DataFrame with one column per company and one transcript link
        per row; cells with no link hold the string 'EMPTY'.
    """
    links_per_company = [
        ear_call_trans(symbol, cookies)
        for symbol in doq_30_df['Symbol'].tolist()
    ]
    # Each company is a row at this point; transposing makes it a column.
    # Companies with fewer links get missing cells, labelled 'EMPTY' below.
    links_table = pd.DataFrame(links_per_company).T
    links_table.columns = doq_30_df['Company'].tolist()
    links_table = links_table.fillna('EMPTY')
    print('Companies Earning calls links have been scraped')
    return links_table

In [25]:
# Scraping the URL of each company to get the speaker and the paragraph

def get_each_link(link, cookies, timeout=30):
    """Download one transcript page and return its paragraph tags.

    Args:
        link: absolute URL of the transcript page.
        cookies: raw cookie header string sent with the request.
        timeout: seconds to wait for the server before giving up (default 30).

    Returns:
        The result of ``soup.find_all('p')``: every ``<p>`` tag on the page,
        as BeautifulSoup tag objects (not plain strings).

    Raises:
        requests.Timeout: if the server does not answer within ``timeout``.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
           'cookie': cookies}
    # Without a timeout a stalled connection would block the whole scrape.
    response = requests.get(link, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.text, 'lxml')
    return soup.find_all('p')

In [26]:
def get_qtr_year(link):
    """Extract the fiscal quarter and year from a transcript URL.

    The fixed offsets work for links that end in
    '-qN-YYYY-results-earnings-call-transcript'. When the text at those
    offsets is not a quarter/year pair (for example because the URL has a
    query string or a different ending), the last 'qN-YYYY' token found
    anywhere in the link is used instead.

    Args:
        link: transcript URL as a string.

    Returns:
        ``[quarter, year]`` as strings, e.g. ``['q3', '2019']``. If no
        quarter/year token can be found, the raw fixed-offset slices are
        returned unchanged.
    """
    quarter, year = link[-40:-38], link[-37:-33]
    # Fast path: the fixed offsets already hold a well-formed quarter and year.
    if re.fullmatch(r'[qQ][1-4]', quarter) and re.fullmatch(r'\d{4}', year):
        return [quarter, year]
    # Fallback: take the last 'qN-YYYY' token that is not glued to other
    # letters or digits.
    found = re.findall(r'(?<![A-Za-z0-9])(q[1-4])-(\d{4})(?!\d)', link,
                       flags=re.IGNORECASE)
    if found:
        return list(found[-1])
    # Nothing recognisable: return the raw slices so callers still get a pair.
    return [quarter, year]

In [27]:
def data_companies(df, cookies):
    """Download the transcript paragraphs behind every link in the table.

    Args:
        df: DataFrame with one column per company whose cells are transcript
            links; cells containing 'EMPTY' are skipped.
        cookies: raw cookie header string forwarded to ``get_each_link``.

    Returns:
        pandas.DataFrame with the columns 'company', 'link', 'qtr', 'year'
        and 'text', one row per downloaded transcript.
    """
    records = []
    for company in df.columns:
        # Keep only real links; the 'EMPTY' entries are padding.
        links = [entry for entry in df[company].tolist() if 'EMPTY' not in entry]
        for link in links:
            quarter_and_year = get_qtr_year(link)
            quarter, year = quarter_and_year[0], quarter_and_year[1]
            transcript = get_each_link(link, cookies)
            records.append([company, link, quarter, year, transcript])
            # Wait five seconds after every download (presumably to stay
            # under the site's rate limiting).
            time.sleep(5)
    df_store = pd.DataFrame(records, columns = ['company','link', 'qtr','year','text'])
    print('Companies Earning calls text have been scraped')
    return df_store

In [28]:
def rerun_companies(df, cookies):
    """Re-download transcripts that came back as the anti-bot notice.

    A row counts as failed when its 'text' starts with content matching the
    "please enable Javascript and cookies in your browser" notice. Failed
    rows are downloaded once more and the fresh rows replace the failed ones.
    Any link whose text still matches the notice afterwards is dropped.

    Args:
        df: DataFrame with at least the columns 'company', 'link' and 'text'.
            'text' may hold strings (e.g. loaded from CSV) or the raw objects
            returned by ``get_each_link``; both are compared via ``str()``.
        cookies: raw cookie header string forwarded to ``get_each_link``.

    Returns:
        pandas.DataFrame without the rows that are still failing.
    """
    pattern = r'[^.]*please\ enable\ Javascript\ and\ cookies\ in\ your\ browser'

    def _is_blocked(frame):
        # astype(str) makes the check work on non-string cells too; a bare
        # .str.match would give NaN for them and break boolean indexing.
        return frame['text'].astype(str).str.match(pattern, na=False)

    blocked = df[_is_blocked(df)]
    rerun_list = blocked['link'].tolist()
    rerun_company_list = blocked['company'].tolist()

    rescraped = []
    for company, link in zip(rerun_company_list, rerun_list):
        print(link)
        qtr_year = get_qtr_year(link)
        paragraphs = get_each_link(link, cookies)
        rescraped.append([company, link, qtr_year[0], qtr_year[1], paragraphs])
        # Wait five seconds after every download, as in the first pass.
        time.sleep(5)

    df = df[~df['link'].isin(rerun_list)]
    if rescraped:
        df_temp = pd.DataFrame(
            rescraped, columns=['company', 'link', 'qtr', 'year', 'text'])
        # The concatenated frame must be assigned back, otherwise the
        # re-scraped rows are silently lost.
        df = pd.concat([df, df_temp], ignore_index=True, sort=False)

    # Drop whatever is still blocked after the second attempt.
    still_blocked = df.loc[_is_blocked(df), 'link'].tolist()
    df = df[~df['link'].isin(still_blocked)]
    print('Failed to scrape companies have been scraped again')
    return df

In [None]:
def get_mappings(soup):
    """Split a parsed transcript into intro, Q&A and conclusion sections.

    A paragraph that contains a ``<strong>`` tag is treated as a speaker
    heading. Every plain paragraph between one heading and the next is paired
    with that heading's speaker as ``(speaker, text)``. When the next heading
    has a first ``<span>`` whose first class is 'question' or 'answer', a
    marker ``(speaker, kind, heading_text)`` is recorded for it.

    Args:
        soup: parsed document exposing ``find_all('p')``; its paragraphs must
            support ``find_all``, ``find_all_next`` and ``.text``.

    Returns:
        A list of three lists:
          * intro: the ``(speaker, text)`` pairs before the first Q/A marker;
          * Q&A: one ``(speaker, kind, text)`` per marker, where ``text`` is
            the first paragraph following that heading, or '' when the
            heading has no paragraph of its own;
          * conclusion: the ``(speaker, text)`` pairs after the last marker.
        When there is no Q/A marker at all, intro and conclusion both contain
        every pair.
    """
    def _speaker(paragraph):
        # The speaker is read from the heading's own <strong> tag so that
        # <strong> tags elsewhere in the page cannot shift the attribution.
        strongs = paragraph.find_all('strong')
        return strongs[0].text if strongs else None

    def _qa_kind(paragraph):
        # Only the first <span> decides; spans without a class are ignored
        # instead of raising KeyError.
        spans = paragraph.find_all('span')
        if not spans:
            return None
        classes = spans[0].get('class') or []
        kind = classes[0] if classes else None
        return kind if kind in ('question', 'answer') else None

    mappings = []
    for paragraph in soup.find_all('p'):
        speaker = _speaker(paragraph)
        if speaker is None:
            continue
        for following in paragraph.find_all_next('p'):
            next_speaker = _speaker(following)
            if next_speaker is None:
                mappings.append((speaker, following.text))
                continue
            kind = _qa_kind(following)
            if kind is not None:
                mappings.append((next_speaker, kind, following.text))
            # The next heading ends this speaker's run of paragraphs.
            break

    marker_positions = [pos for pos, entry in enumerate(mappings) if len(entry) == 3]

    first_marker = marker_positions[0] if marker_positions else len(mappings)
    processed_intro_mappings = mappings[:first_marker]

    processed_qa_mappings = []
    for pos in marker_positions:
        speaker, kind = mappings[pos][0], mappings[pos][1]
        follower = mappings[pos + 1] if pos + 1 < len(mappings) else None
        # Use the following paragraph only when it really is one: a marker at
        # the very end, or one followed directly by another marker, has no
        # text of its own.
        text = follower[1] if follower is not None and len(follower) == 2 else ''
        processed_qa_mappings.append((speaker, kind, text))

    last_marker = marker_positions[-1] if marker_positions else -1
    processed_conclusion_mappings = mappings[last_marker + 1:]

    return [processed_intro_mappings, processed_qa_mappings, processed_conclusion_mappings]
# Parse every scraped transcript into its intro / Q&A / conclusion mappings.
# NOTE(review): `data` is not defined anywhere in the visible notebook;
# presumably it is the scraped transcript table (columns include 'text')
# loaded in another cell -- confirm before running on a fresh kernel.
# Start from a fresh list so re-running the cell does not depend on leftover
# kernel state.
list_all = []
for each in data.loc[:,'text']:
    soup = BeautifulSoup(each, 'html.parser')
    list_all.append(get_mappings(soup))

In [37]:
if __name__== "__main__":
    # Pipeline: Dow 30 table -> transcript links -> transcript text -> retry
    # of blocked downloads, with the result written to 'companies.csv'.
    # getpass reads the session cookie without echoing it, so the credential
    # does not end up in the saved notebook output. Never paste a real cookie
    # into the source.
    from getpass import getpass

    dow_30_companies = dow_30_companies_func()
    cookies = getpass("Enter your cookies: ")
    dow_30_companies = get_links(dow_30_companies, cookies)
    dow_30_companies = data_companies(dow_30_companies,cookies)
    # Checkpoint the first pass so a failure during the retry does not lose it.
    dow_30_companies.to_csv('companies.csv')
    dow_30_companies = rerun_companies(dow_30_companies, cookies)
    dow_30_companies.to_csv('companies.csv')

Dow 30 Comapnies have been scraped
Enter your cookies: [redacted]