In [1]:
import pip
import sys
import signal
import time
import pickle
import os
import re
import json
import csv
import pandas as pd
import numpy as np
from functools import reduce
from operator import iconcat
from dotenv import load_dotenv
from concurrent.futures import ThreadPoolExecutor

import chromedriver_autoinstaller
# chromedriver_autoinstaller.install()

load_dotenv()

True

In [2]:
# Package installer function to handle missing packages
def install(package):
    """Install ``package`` with pip into the environment running this kernel.

    pip offers no supported in-process API (``pip.main`` was removed in
    pip 10), so pip is run as a subprocess of the current interpreter. That
    also guarantees the package lands in the same environment as the kernel.

    Args:
        package: Name of the distribution to install, e.g. ``'selenium'``.

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
    """
    import subprocess

    print(package + ' package for Python not found, pip installing now....')
    # check_call raises on failure, so the success message is only printed
    # when the installation really succeeded.
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])
    print(package + ' package has been successfully installed for Python\n Continuing Process...')

# Ensure beautifulsoup4 is installed
import importlib

try:
    from bs4 import BeautifulSoup
except ImportError:
    # Only a missing package should trigger an install; any other error
    # (e.g. a broken installation) is left to surface normally.
    install('beautifulsoup4')
    # Let the running interpreter see the freshly installed package.
    importlib.invalidate_caches()
    from bs4 import BeautifulSoup

# Ensure selenium is installed
try:
    from selenium import webdriver
except ImportError:
    install('selenium')
    importlib.invalidate_caches()
    from selenium import webdriver

# from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, InvalidCookieDomainException

In [3]:
class MarketWatchETL:
    """Scrape MarketWatch stock pages with Selenium and BeautifulSoup.

    An instance owns at most one Chrome session (``self.driver``), which is
    created lazily by ``_initiate_driver``. The public ``get_*`` methods
    navigate to the relevant page for a ticker, parse the rendered HTML and
    return plain nested dictionaries keyed by the lower-cased ticker.

    Credentials are read from the ``MARKETWATCH_USER`` and ``MARKETWATCH_KEY``
    environment variables.
    """

    # chromedriver binary used when neither the constructor argument nor the
    # CHROMEDRIVER_PATH environment variable provides a location.
    _DEFAULT_CHROMEDRIVER = '/Users/jerryli/Downloads/chromedriver-mac-arm64/chromedriver'

    # URL path segments that follow ".../<ticker>/financials" for each `mode`
    # accepted by get_stock_financials. Unknown modes map to no extra segment.
    _FINANCIAL_PAGES = {
        'is_qt': ('income', 'quarter'),
        'bs': ('balance-sheet',),
        'bs_qt': ('balance-sheet', 'quarter'),
        'cf': ('cash-flow',),
        'cf_qt': ('cash-flow', 'quarter'),
    }

    def __init__(self, chromedriver_path=None):
        """Prepare driver settings; no browser is started here.

        Args:
            chromedriver_path: Optional path to the chromedriver binary. When
                omitted, ``CHROMEDRIVER_PATH`` from the environment is used,
                falling back to ``_DEFAULT_CHROMEDRIVER``.
        """
        self.driver = None
        self._service = Service(
            chromedriver_path
            or os.getenv('CHROMEDRIVER_PATH')
            or self._DEFAULT_CHROMEDRIVER
        )
        self._opts = webdriver.ChromeOptions()
        self._base_url = 'https://www.marketwatch.com/investing/stock/'
        self._retries = 10
        self._MW_USER = os.getenv('MARKETWATCH_USER')
        self._MW_KEY = os.getenv('MARKETWATCH_KEY')

    def login(self):
        """Start the browser (if needed) and sign in through the site's login form.

        Raises:
            RuntimeError: If either credential environment variable is unset.
        """
        # Fail before launching Chrome rather than part-way through the form.
        if not self._MW_USER or not self._MW_KEY:
            raise RuntimeError(
                'MARKETWATCH_USER and MARKETWATCH_KEY must be set in the environment.'
            )

        self._initiate_driver()
        self.driver.get(self._base_url)

        close_popup_icon = self.driver.find_element(
            By.XPATH, "//div[@id='cx-scrim']//div[@id='cx-scrim-wrapper']/button"
        )
        close_popup_icon.click()

        profile_xpath = "//div[@class='profile logged-out']/label[@class='btn--text btn--profile j-toggle-label']"
        profile_icon = self.driver.find_element(By.XPATH, profile_xpath)
        # Clicked through JavaScript instead of a native click.
        self.driver.execute_script("arguments[0].click();", profile_icon)

        options_xpath = "//ul[@class='profile__menu j-menu-contents j-toggle--profile']/li"
        option_list = self.driver.find_elements(By.XPATH, options_xpath)
        # The second menu entry opens the sign-in flow.
        option_list[1].click()

        try:
            username_input_box = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.ID, 'username'))
            )
            username_input_box.send_keys(self._MW_USER)
        except TimeoutException as e:
            print(e)

        continue_xpath = "//button[@class='solid-button continue-submit dj-btn-primary group']"
        continue_btn = self.driver.find_element(By.XPATH, continue_xpath)
        self.driver.execute_script("arguments[0].click();", continue_btn)

        try:
            pw_xpath = "//div[@id='password-login-card-container']//div[@class='input-icon-container']/input[@id='password-login-password']"
            pw_class = "password dj-input w-full pr-10"
            # Wait until the password field exists before touching it.
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.XPATH, pw_xpath))
            )
            # The password is handed over as a script argument instead of being
            # spliced into the JavaScript source, so quotes or backslashes in
            # it cannot break (or inject into) the script.
            self.driver.execute_script(
                "document.getElementsByClassName(arguments[0])[0].setAttribute('value', arguments[1]);",
                pw_class,
                self._MW_KEY,
            )
        except TimeoutException as e:
            print(e)

        signin_xpath = "//button[@class='solid-button new-design basic-login-submit dj-btn-primary group w-full']"
        signin_btn = WebDriverWait(self.driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, signin_xpath))
        )
        signin_btn.click()

    @staticmethod
    def _cleaned_key_data_object(ticker, raw_data):
        """Turn parallel label/value elements into ``{ticker: {label: value}}``.

        ``raw_data`` holds two parallel lists under ``'labels'`` and
        ``'values'`` whose entries expose ``get_text()``. Pairs where either
        element is ``None`` (the tag was not found in the list item) are
        skipped instead of raising ``AttributeError``.
        """
        cleaned_data = {
            str(label.get_text()): value.get_text()
            for label, value in zip(raw_data['labels'], raw_data['values'])
            if label is not None and value is not None
        }
        return {ticker: cleaned_data}

    @staticmethod
    def _cleaned_competitors_object(ticker, raw_data):
        """Build ``{ticker: {name: {'per_chg': ..., 'market_cap': ...}}}``.

        ``raw_data`` must contain the parallel lists ``'Name'``, ``'Chg %'``
        and ``'Market Cap'``.
        """
        rows = zip(raw_data['Name'], raw_data['Chg %'], raw_data['Market Cap'])
        cleaned_data = {
            str(name): {'per_chg': per_chg, 'market_cap': market_cap}
            for name, per_chg, market_cap in rows
        }
        return {ticker: cleaned_data}

    @staticmethod
    def _cleaned_financials_object(ticker, raw_data):
        """Build ``{ticker: {item: {year: value}}}`` from the scraped long form.

        ``raw_data['Values']`` is keyed by ``'<item>_<year>'``. Every unique
        item is combined with every unique year, so a missing combination
        raises ``KeyError``. Items and years are emitted in sorted order.
        """
        values = raw_data['Values']
        items = sorted(set(raw_data['Item']))
        years = sorted(set(raw_data['Years']))
        cleaned_data = {
            item: {year: values['_'.join((item, year))] for year in years}
            for item in items
        }
        return {ticker: cleaned_data}

    def _initiate_driver(self):
        """Start the Chrome session once; later calls are no-ops.

        Prints a setup error and exits the process with status 1 when the
        driver cannot be created.
        """
        if self.driver is not None:
            return
        try:
            self.driver = webdriver.Chrome(service=self._service, options=self._opts)
        except Exception as exc:
            print('***SETUP ERROR: The Chrome Web Driver is either not configured or incorrectly configured!***')
            print(exc)
            sys.exit(1)

    def _get_site_url(self, ticker, **site_kwargs):
        """Return the page URL for ``ticker``.

        The value of every keyword whose name contains ``'page'`` is appended
        as a path segment, in the order the keywords were passed.
        """
        sub_page = ''.join(
            '/' + str(value) for key, value in site_kwargs.items() if 'page' in key
        )
        return self._base_url + ticker + sub_page

    def _open_page(self, url):
        """Make sure a browser exists and that it is showing ``url``."""
        self._initiate_driver()
        if self.driver.current_url != url:
            self.driver.get(url)

    def _page_soup(self):
        """Parse the DOM currently rendered in the browser."""
        html = self.driver.execute_script("return document.documentElement.outerHTML;")
        return BeautifulSoup(html, "html.parser")

    def _abort_scrape(self):
        """Report a failed scrape, terminate the driver process and exit."""
        print('Please check your internet connection!\nUnable to connect...')
        self.driver.service.process.send_signal(signal.SIGTERM)
        sys.exit(1)

    def _scrape_key_data(self):
        """Collect label/value elements of the key-data list on the current page.

        Returns:
            ``{'labels': [...], 'values': [...]}`` of parsed elements.

        Up to ``self._retries`` attempts are made; when none yields data the
        driver process is terminated and the interpreter exits with status 1.
        """
        for _ in range(self._retries):
            try:
                soup = self._page_soup()
                # Fresh accumulators per attempt so a failed attempt can never
                # leave partial entries behind for the next one.
                labels, values = [], []
                for item in soup.find_all('li', class_="kv__item"):
                    labels.append(item.find('small', class_="label"))
                    values.append(item.find('span', class_="primary"))
            except Exception:
                continue
            if labels and values:
                return {'labels': labels, 'values': values}
        self._abort_scrape()

    def _scrape_competitors(self):
        """Read the competitors table of the current page.

        Returns:
            A dict mapping the three table header texts to parallel lists of
            cell texts (name, percentage change, market cap).

        Up to ``self._retries`` attempts are made; when none yields data the
        driver process is terminated and the interpreter exits with status 1.
        """
        for _ in range(self._retries):
            try:
                soup = self._page_soup()
                table = soup \
                    .find_all('div', class_="element element--table overflow--table Competitors")[0] \
                    .find('table')
                headers = table \
                    .find('thead') \
                    .find('tr', class_="table__row") \
                    .find_all('th')
                comp_name_lbl, per_chg_lbl, mkt_cap_lbl = [th.get_text() for th in headers]

                names, chgs, mktcap = [], [], []
                for row in table.find('tbody').find_all('tr'):
                    # Each row is expected to hold exactly three text cells.
                    comp_name, per_chg, mc = [
                        text for text in row.get_text().split('\n') if text != ''
                    ]
                    names.append(comp_name)
                    chgs.append(per_chg)
                    mktcap.append(mc)
            except Exception:
                continue
            if names and chgs and mktcap:
                return {comp_name_lbl: names, per_chg_lbl: chgs, mkt_cap_lbl: mktcap}
        self._abort_scrape()

    @staticmethod
    def _extract_financial_table(container, item_names, years, values):
        """Append one financial table's cells to the shared accumulators.

        For every (row, year-column) pair the row's item name is appended to
        ``item_names``, the column label to ``years`` and the cell text stored
        in ``values`` under ``'<item>_<year>'``.

        In both the header and each row, the first text is the item label or
        name, the second text is discarded, and the remaining texts are the
        year labels or values.

        Returns:
            The text of the first header cell (the item column label).

        Raises:
            ValueError: If a row's value count differs from the header's.
        """
        table = container.find('table')
        header_cells = table \
            .find('thead') \
            .find('tr', class_="table__row") \
            .find_all('th')
        header_texts = [
            text
            for th in header_cells
            for text in th.get_text().split('\n')
            if text != ''
        ]
        item_label, _, *year_labels = header_texts

        for row in table.find('tbody').find_all('tr'):
            item_name, _, *item_values = [
                text for text in row.get_text().split('\n') if text != ''
            ]
            if len(year_labels) != len(item_values):
                raise ValueError(
                    'Length of years ({}) is not the same as number of values ({})'.format(
                        len(year_labels), len(item_values)
                    )
                )
            for year, value in zip(year_labels, item_values):
                item_names.append(item_name)
                years.append(year)
                values[item_name + '_' + year] = value
        return item_label

    def _scrape_financials(self, mode=None):
        """Read every financial-statement table on the current page.

        Args:
            mode: Unused; kept for interface compatibility.

        Returns:
            ``{<item header>: [...], 'Years': [...], 'Values': {...}}`` with the
            rows of all tables concatenated, or ``{}`` when the tables hold no
            data. When tables differ in their item header, the last one is
            used as the key.

        Raises:
            IndexError: If the page contains no financial table.
        """
        containers = self._page_soup().find_all(
            'div', class_="element element--table table--fixed financials"
        )
        if not containers:
            raise IndexError('No financial statement table found on the current page.')

        item_label = None
        item_names, years, values = [], [], {}
        for container in containers:
            item_label = self._extract_financial_table(container, item_names, years, values)

        if not (item_names and years and values):
            return {}
        return {item_label: item_names, 'Years': years, 'Values': values}

    def get_competitor_data(self, ticker):
        """Return ``{ticker: {competitor: {'per_chg', 'market_cap'}}}`` for ``ticker``."""
        symbol = ticker.lower()
        self._open_page(self._get_site_url(symbol))
        return self._cleaned_competitors_object(symbol, self._scrape_competitors())

    def get_stock_key_data(self, ticker):
        """Return ``{ticker: {label: value}}`` from the ticker's key-data list."""
        symbol = ticker.lower()
        self._open_page(self._get_site_url(symbol))
        return self._cleaned_key_data_object(symbol, self._scrape_key_data())

    def get_stock_financials(self, ticker, mode=None):
        """Return ``{ticker: {item: {year: value}}}`` for one financial statement.

        Args:
            ticker: Stock symbol; lower-cased for the URL and the result key.
            mode: ``'is_qt'``, ``'bs'``, ``'bs_qt'``, ``'cf'`` or ``'cf_qt'``.
                Any other value (including ``None``) opens the default
                ``/financials`` page.
        """
        symbol = ticker.lower()
        segments = ('financials',) + self._FINANCIAL_PAGES.get(mode, ())
        pages = {f'page{n}': segment for n, segment in enumerate(segments, start=1)}
        self._open_page(self._get_site_url(symbol, **pages))
        return self._cleaned_financials_object(symbol, self._scrape_financials())
    
    
def transform_financials(data, ticker, drop_col):
    """Reshape one ticker's scraped statement into a frame led by 'Company'.

    The ``ticker`` column of ``data`` holds one dict per row. Those dicts are
    expanded into columns, ``drop_col`` is removed, and the result is
    transposed. The former column labels land in 'Statement Items', and a
    'Company' column with the upper-cased ticker is placed first.

    Args:
        data: DataFrame with a column named after the lower-cased ticker.
        ticker: Stock symbol, in any case.
        drop_col: Expanded column to discard; must exist.

    Returns:
        A new DataFrame; ``data`` is not modified.
    """
    expanded = data.copy()[ticker.lower()].apply(pd.Series)
    trimmed = expanded.drop(drop_col, axis=1)

    reshaped = trimmed.T.reset_index().rename({'index': 'Statement Items'}, axis=1)
    reshaped['Company'] = ticker.upper()

    # Rotate the last column to the front.
    column_order = reshaped.columns.tolist()
    column_order = column_order[-1:] + column_order[:-1]
    return reshaped[column_order]

def transform_comps(data, ticker):
    """Expand one ticker's competitor dicts into a flat, relabelled frame.

    Args:
        data: DataFrame with a column named after the lower-cased ticker whose
            cells are dicts (one row per competitor).
        ticker: Stock symbol, in any case.

    Returns:
        A new DataFrame with the former index as 'Company' and the
        'per_chg' / 'market_cap' keys renamed to display labels.
    """
    display_names = {
        'index': 'Company',
        'per_chg': 'Percentage Change',
        'market_cap': 'Market Capitalization',
    }
    competitor_dicts = data.copy()[ticker.lower()]
    expanded = competitor_dicts.apply(pd.Series)
    flattened = expanded.reset_index()
    return flattened.rename(display_names, axis=1)

def transform_key_data(data):
    """Transpose key data so each original column becomes one row.

    The former column labels are exposed as an upper-cased 'Company' column.

    Args:
        data: DataFrame whose columns are tickers and whose index holds labels.

    Returns:
        A new DataFrame; ``data`` is not modified.
    """
    transposed = data.copy().T
    by_company = transposed.reset_index().rename({'index': 'Company'}, axis=1)
    by_company['Company'] = by_company['Company'].str.upper()
    return by_company

def export_to_json(output_folder, datasets=None):
    """Write every processed per-ticker DataFrame to ``<output_folder>/<name>.json``.

    Args:
        output_folder: Destination directory; created when it does not exist.
        datasets: Optional mapping of filename template (containing
            ``{ticker}``) to a list of ``{ticker: DataFrame}`` dicts. When
            omitted, the module-level ``ticker_*_proc`` lists are looked up by
            name, as before.

    Raises:
        NameError: If ``datasets`` is omitted and any expected
            ``ticker_*_proc`` variable is undefined. Nothing is written in
            that case.
    """
    if datasets is None:
        templates = ['{ticker}_fin_is', '{ticker}_fin_is_qt', '{ticker}_fin_bs', '{ticker}_fin_bs_qt',
                     '{ticker}_fin_cf', '{ticker}_fin_cf_qt', '{ticker}_comps', '{ticker}_key_data']
        namespace = globals()
        datasets, missing = {}, []
        for template in templates:
            # '{ticker}_comps' -> 'ticker_comps_proc'
            variable = template.format(ticker='ticker') + '_proc'
            if namespace.get(variable) is None:
                missing.append(variable)
            else:
                datasets[template] = namespace[variable]
        if missing:
            # Fail with the names instead of a bare "NoneType is not iterable".
            raise NameError('Processed data not found; run the transform cell first: ' + ', '.join(missing))

    os.makedirs(output_folder, exist_ok=True)
    for template, records in datasets.items():
        for record in records:
            for ticker, frame in record.items():
                target = os.path.join(output_folder, template.format(ticker=ticker) + '.json')
                frame.to_json(target, orient='records')

def import_to_df(ticker, input_folder=''):
    """Load the eight JSON exports of one ticker back into DataFrames.

    Args:
        ticker: Ticker prefix exactly as used in the file names
            (``<ticker>_fin_is.json`` and so on).
        input_folder: Directory containing the files. The default (empty
            string) reads from the current working directory, as before.

    Returns:
        ``(comps, key_data, fin_is, fin_is_qt, fin_bs, fin_bs_qt, fin_cf,
        fin_cf_qt)`` as DataFrames.

    Raises:
        FileNotFoundError: If any of the eight files is missing.
    """
    # Read in the original order so the first missing file reported is unchanged.
    suffixes = ['fin_is', 'fin_is_qt', 'fin_bs', 'fin_bs_qt', 'fin_cf', 'fin_cf_qt', 'comps', 'key_data']
    frames = {}
    for suffix in suffixes:
        path = os.path.join(input_folder, f'{ticker}_{suffix}.json')
        with open(path, 'r') as file:
            frames[suffix] = pd.read_json(file)

    return (frames['comps'], frames['key_data'], frames['fin_is'], frames['fin_is_qt'],
            frames['fin_bs'], frames['fin_bs_qt'], frames['fin_cf'], frames['fin_cf_qt'])

In [4]:
# import symbols from MarketWatch watchlist csv output for scraping

batch_size = 5  # passed to run_extraction as the number of parallel scrapers
# newline='' lets the csv module do its own newline handling, so quoted
# fields containing line breaks are parsed correctly.
with open('marketWatchWatchlistExport.csv', 'r', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    # Keep only rows whose charting symbol marks a stock; a short row yields
    # None for the column, which is treated as "not a stock".
    symbols_list = [
        row['Symbol']
        for row in reader
        if 'STOCK' in (row['Charting Symbol'] or '')
    ]

In [5]:
# mw = MarketWatchETL()
# mw.login()

In [6]:
def load_dump_cookies(mode, cookies=None, cookie_fname='mw_cookies_curr_sess.pkl'):
    """Persist session cookies to a pickle file, or read them back.

    Args:
        mode: ``'dump'`` saves ``cookies``; any other value loads from disk.
        cookies: Cookies to save; required when dumping.
        cookie_fname: Path of the pickle file.

    Returns:
        The cookies that were saved or loaded.

    Raises:
        AssertionError: When dumping without cookies, or loading a file that
            does not exist.
    """
    if mode != 'dump':
        assert os.path.exists(cookie_fname), 'Cookies file is not present for loading.'
        # NOTE: pickle.load can execute arbitrary code; only load files this
        # notebook wrote itself.
        with open(cookie_fname, 'rb') as source:
            restored = pickle.load(source)
        print('Cookies for current session loaded.')
        return restored

    assert cookies is not None, 'No cookies present for saving.'
    with open(cookie_fname, 'wb') as target:
        pickle.dump(cookies, target)
    print('Cookies for current session saved.')
    return cookies
    
def init_mw_scrapers(cookies=None, base_url=None):
    """Create a MarketWatch scraper, by logging in or by reusing cookies.

    Args:
        cookies: Cookies of an existing session. When ``None`` a fresh login
            is performed and the resulting cookies are saved to disk.
        base_url: Page to open before injecting ``cookies``; required whenever
            ``cookies`` is given.

    Returns:
        ``(scraper, cookies)`` when a login was performed, otherwise just the
        ``scraper`` (the two shapes are kept for existing callers).

    Raises:
        AssertionError: If ``cookies`` is given without ``base_url``.

    Any failure after the browser has been launched closes that browser
    before the exception propagates, so no orphaned Chrome process is left.
    """
    if cookies is not None:
        assert base_url is not None, 'No base URL provided.'

    scraper = MarketWatchETL()
    try:
        if cookies is None:
            scraper.login()

            # Short pause before the session cookies are read.
            time.sleep(2)
            mw_cookies = load_dump_cookies('dump', cookies=scraper.driver.get_cookies())
            return scraper, mw_cookies

        scraper._initiate_driver()

        # Load the page first, then add the cookies.
        scraper.driver.get(base_url)
        for cookie in cookies:
            scraper.driver.add_cookie(cookie)

        close_popup_icon = WebDriverWait(scraper.driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, "//div[@id='cx-scrim']//div[@id='cx-scrim-wrapper']/button"))
        )
        close_popup_icon.click()

        # Reload the page now that the cookies have been added.
        scraper.driver.get(base_url)
        return scraper
    except BaseException:
        # The scraper is never handed to the caller on failure, so this is the
        # only place its browser can be closed.
        if scraper.driver is not None:
            try:
                scraper.driver.quit()
            except Exception as quit_error:
                print(f'Could not close the browser after a failed start-up: {quit_error}')
        raise
        
def clean_scrape_process(scrapers, cookie_fname='mw_cookies_curr_sess.pkl'):
    """Close every scraper's browser and delete the saved cookie file.

    Cleanup is best-effort: a scraper without a driver is skipped, a browser
    that fails to quit is reported without stopping the remaining ones, and a
    cookie file that is already gone is not an error.

    Args:
        scrapers: Iterable of objects exposing a ``driver`` attribute.
        cookie_fname: Path of the cookie pickle to remove.
    """
    for scraper in scrapers:
        driver = getattr(scraper, 'driver', None)
        if driver is None:
            continue
        try:
            driver.quit()
        except Exception as quit_error:
            print(f'Could not close a scraper browser: {quit_error}')

    if os.path.exists(cookie_fname):
        os.remove(cookie_fname)
    print('Scraper and cookies removed.')

In [7]:
def extract_scraper_data(symbols_list, scraper):
    """Run every extraction step for each symbol with a single scraper.

    Args:
        symbols_list: Iterable of ticker symbols.
        scraper: Object providing ``get_stock_financials(symbol, mode=...)``,
            ``get_competitor_data(symbol)`` and ``get_stock_key_data(symbol)``.

    Returns:
        A tuple of eight lists, in this order: income statement, income
        statement quarterly, balance sheet, balance sheet quarterly, cash
        flow, cash flow quarterly, competitors, key data. Each list holds
        whatever the scraper returned per symbol.

    A failing step ends processing of that symbol only. Results of its earlier
    steps are kept, so the eight lists may differ in length.
    """
    # (progress label, scraper method, keyword arguments) in execution order.
    steps = (
        ('income statement', 'get_stock_financials', {}),
        ('income statement quarterly', 'get_stock_financials', {'mode': 'is_qt'}),
        ('balance sheet annual', 'get_stock_financials', {'mode': 'bs'}),
        ('balance sheet quarterly', 'get_stock_financials', {'mode': 'bs_qt'}),
        ('cash flow annually', 'get_stock_financials', {'mode': 'cf'}),
        ('cash flow quarterly', 'get_stock_financials', {'mode': 'cf_qt'}),
        ('competitors', 'get_competitor_data', {}),
        ('key stock data', 'get_stock_key_data', {}),
    )
    results = tuple([] for _ in steps)

    for symbol in symbols_list:
        print(f'Running extraction for {symbol}...')
        try:
            for bucket, (label, method_name, kwargs) in zip(results, steps):
                bucket.append(getattr(scraper, method_name)(symbol, **kwargs))
                print(f'Processed {label}')
        except (Exception, SystemExit) as exc:
            # SystemExit is included because a scraper step may call sys.exit()
            # when it cannot read a page; that must not end the whole worker.
            # KeyboardInterrupt is deliberately allowed to propagate.
            print(f'Cannot extract for {symbol}. ({type(exc).__name__}: {exc})')
            continue

    return results

In [8]:
%%time

def run_extraction(symbols_list, num_scrapers):
    """Scrape all symbols in parallel, one browser session per symbol chunk.

    The first scraper logs in; the others reuse its cookies. The symbols are
    split into ``num_scrapers`` chunks and each chunk is processed by its own
    scraper on a worker thread.

    Args:
        symbols_list: Ticker symbols to scrape.
        num_scrapers: Number of browser sessions (and symbol chunks).

    Returns:
        A tuple of eight lists (income statement, income statement quarterly,
        balance sheet, balance sheet quarterly, cash flow, cash flow
        quarterly, competitors, key data). Each contains one sub-list per
        scraper, in chunk order.

    Raises:
        AssertionError: If fewer scrapers than chunks could be started.

    All browsers and the cookie file are cleaned up whether the run succeeds
    or fails.
    """
    symbol_chunks = np.array_split(np.array(symbols_list), num_scrapers)
    main_mw, mw_cookies = init_mw_scrapers()
    main_base_url = str(main_mw._base_url)

    scrapers = [main_mw]
    collected = [[] for _ in range(8)]
    try:
        try:
            for _ in range(num_scrapers - 1):
                scrapers.append(init_mw_scrapers(cookies=mw_cookies, base_url=main_base_url))
        except InvalidCookieDomainException as e:
            # Reported here; the pairing check below then stops the run.
            print(e)

        assert len(symbol_chunks) == len(scrapers), 'Unequal pairing of symbols list and scrapers.'

        # One worker per scraper so no chunk waits behind another.
        with ThreadPoolExecutor(max_workers=max(1, len(scrapers))) as executor:
            futures = [
                executor.submit(extract_scraper_data, chunk, scraper)
                for chunk, scraper in zip(symbol_chunks, scrapers)
            ]

        for future in futures:
            # Each result is a tuple of eight lists; file each into its bucket.
            for bucket, part in zip(collected, future.result()):
                bucket.append(part)
    finally:
        # Runs on success and on any failure above, so Chrome processes and
        # the cookie file never outlive the run.
        clean_scrape_process(scrapers)

    return tuple(collected)

# Run the scrape with `batch_size` parallel scrapers and unpack the eight
# result lists returned by run_extraction. Each holds one sub-list per scraper.
(
    ticker_fin_is_list, 
    ticker_fin_is_qt_list, 
    ticker_fin_bs_list, 
    ticker_fin_bs_qt_list, 
    ticker_fin_cf_list, 
    ticker_fin_cf_qt_list,  
    ticker_comps_list, 
    ticker_key_data_list
) = run_extraction(symbols_list, batch_size)


Cookies for current session saved.
Running extraction for HSBC...Running extraction for AAPL...
Running extraction for SONY...

Running extraction for ORCL...
Running extraction for ANET...
Processed income statement
Processed income statement
Processed income statement
Processed income statement
Processed income statement
Processed income statement quarterly
Processed income statement quarterly
Processed income statement quarterly
Processed income statement quarterly
Processed income statement quarterly
Processed balance sheet annual
Processed balance sheet annual
Processed balance sheet annual
Processed balance sheet quarterly
Processed balance sheet annual
Processed balance sheet quarterly
Processed balance sheet quarterly
Processed balance sheet annual
Processed balance sheet quarterly
Processed cash flow annually
Processed cash flow annually
Processed cash flow annually
Processed cash flow annually
Cannot extract for HSBC.
Running extraction for MS...
Processed balance sheet quart

In [27]:
# Flatten the per-scraper sub-lists into one list per dataset. Folding with
# iconcat extends a fresh list with each sub-list in turn.
ticker_fin_is = reduce(iconcat, ticker_fin_is_list, [])
ticker_fin_is_qt = reduce(iconcat, ticker_fin_is_qt_list, [])
ticker_fin_bs = reduce(iconcat, ticker_fin_bs_list, [])
ticker_fin_bs_qt = reduce(iconcat, ticker_fin_bs_qt_list, [])
ticker_fin_cf = reduce(iconcat, ticker_fin_cf_list, [])
ticker_fin_cf_qt = reduce(iconcat, ticker_fin_cf_qt_list, [])
ticker_comps = reduce(iconcat, ticker_comps_list, [])
ticker_key_data = reduce(iconcat, ticker_key_data_list, [])

In [30]:
def _process_records(records, transform):
    """Apply ``transform(frame, ticker)`` to every ``{ticker: data}`` record.

    Each record is turned into a DataFrame and transformed once per key; the
    result is a list of single-entry ``{ticker: DataFrame}`` dicts.
    """
    processed = []
    for record in records:
        for symbol in record.keys():
            processed.append({symbol: transform(pd.DataFrame(record), symbol)})
    return processed


def _financials_without(trend_column):
    """Return a transform that drops ``trend_column`` from a statement frame."""
    return lambda frame, symbol: transform_financials(frame, symbol, trend_column)


ticker_key_data_proc = _process_records(ticker_key_data, lambda frame, symbol: transform_key_data(frame))
ticker_comps_proc = _process_records(ticker_comps, transform_comps)
# Annual statements carry a '5-year trend' column, quarterly ones '5- qtr trend'.
ticker_fin_is_proc = _process_records(ticker_fin_is, _financials_without('5-year trend'))
ticker_fin_is_qt_proc = _process_records(ticker_fin_is_qt, _financials_without('5- qtr trend'))
ticker_fin_bs_proc = _process_records(ticker_fin_bs, _financials_without('5-year trend'))
ticker_fin_bs_qt_proc = _process_records(ticker_fin_bs_qt, _financials_without('5- qtr trend'))
ticker_fin_cf_proc = _process_records(ticker_fin_cf, _financials_without('5-year trend'))
ticker_fin_cf_qt_proc = _process_records(ticker_fin_cf_qt, _financials_without('5- qtr trend'))

In [45]:
# Export every processed frame to ./ticker_outputs. The folder is created
# first so a fresh checkout does not fail on the first write.
output_dir = os.path.join(os.getcwd(), 'ticker_outputs')
os.makedirs(output_dir, exist_ok=True)
export_to_json(output_dir)