In [1]:
import re
import os
import unicodedata
import pandas as pd
import numpy as np
import requests
import bs4 as bs

from lxml import html
from tqdm import tqdm
from bs4 import BeautifulSoup
from time import gmtime, strftime
from datetime import datetime, timedelta

### 1. Get CIK and DJIA Data

In [2]:
# yahoo_url = "https://finance.yahoo.com/quote/%5EDJI/components/"
# djia_table = pd.read_html(yahoo_url, header=0, index_col=0)[0]
# djia_table = djia_table.reset_index()

In [3]:
# djia_table.head()

In [4]:
# How the CIK table was originally built from Wikipedia (kept for reference):
# wiki_url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
# cik_df = pd.read_html(wiki_url,header=0,index_col=0)[0]
# cik_df['GICS Sector'] = cik_df['GICS Sector'].astype("category")
# cik_df['GICS Sub Industry'] = cik_df['GICS Sector'].astype("category")
# cik_df = cik_df.reset_index()
# cik_df.head()
# NOTE(review): the 'GICS Sub Industry' line above is filled from 'GICS Sector',
# and the two columns are identical in the preview of the CSV - confirm whether
# that was intended before relying on the sub-industry column.

# The table is now read from a local CSV snapshot instead of being re-downloaded.
cik_mapper_csv = "cik_mapper.csv"
cik_df = pd.read_csv(cik_mapper_csv)
cik_df.head()

Unnamed: 0,Symbol,Security,SEC filings,GICS Sector,GICS Sub Industry,Headquarters Location,Date first added,CIK,Founded
0,MMM,3M Company,reports,Industrials,Industrials,"St. Paul, Minnesota",,66740,1902.0
1,AXP,American Express Co,reports,Financials,Financials,"New York, New York",30/6/1976,4962,1850.0
2,AAPL,Apple Inc.,reports,Information Technology,Information Technology,"Cupertino, California",30/11/1982,320193,1977.0
3,BA,Boeing Company,reports,Industrials,Industrials,"Chicago, Illinois",,12927,1916.0
4,CAT,Caterpillar Inc.,reports,Industrials,Industrials,"Deerfield, Illinois",,18230,1925.0


### 2. Data Scraping

The SEC limits users to 10 requests per second, so we need to make sure we are not making requests too quickly.

In [5]:
def WriteLogFile(log_file_name, text):
    
    '''
    Helper function.
    Appends one message to the log file that collects the notes and
    error messages of a scraping "session".
    
    Parameters
    ----------
    log_file_name : str
        Path of the log file to append to (should be a .txt file).
    text : str
        Message to add. It is written exactly as given; no newline
        is appended.
        
    Returns
    -------
    None.
    
    '''
    
    # Append mode creates the file on first use and keeps earlier entries.
    log_handle = open(log_file_name, "a")
    try:
        log_handle.write(text)
    finally:
        # Always release the handle, even if the write fails.
        log_handle.close()

In [6]:
# Output directories for the scraped filings, relative to the directory
# the notebook is run from.
_raw_data_dir = '../../Raw Data'
pathname_10k = _raw_data_dir + '/Data_10K'
pathname_10q = _raw_data_dir + '/Data_10Q'

In [7]:
def Scrape10K(browse_url_base, filing_url_base, doc_url_base, cik, log_file_name, start_year=2010):
    
    '''
    Scrapes all 10-Ks (and 10-K405s) for a particular CIK from EDGAR.
    
    A directory named after the CIK is created in the current working
    directory and each downloaded filing is saved inside it as
    <cik>_<filing date>.txt or <cik>_<filing date>.html (two filings with
    the same date are appended to the same file). If the directory already
    exists the CIK is treated as already scraped and nothing is requested.
    Failed requests and unavailable documents are appended to the log file.
    The working directory is never changed by this function.
    
    Args
        browse_url_base : str
            Base URL for browsing EDGAR (one %s placeholder: the CIK).
        filing_url_base : str
            Base URL for filings listings on EDGAR
            (two %s placeholders: CIK, accession number).
        doc_url_base : str
            Base URL for one filing's document tables
            page on EDGAR (three %s placeholders: CIK,
            accession number without dashes, document name).
        cik : str
            Central Index Key.
        log_file_name : str
            Name of the log file (should be a .txt file).
        start_year : int
            The beginning year in which scraping will start from;
            filings dated before this year are not downloaded.
        
    Returns
    None.
    
    '''
    
    # Function-scope imports: these two names are not provided by the
    # notebook's import cell.
    from io import StringIO
    from time import sleep
    
    form_types = ['10-K', '10-K405']
    
    # Seconds to wait before each request; 0.1s keeps a single scraper
    # at or below the 10 requests per second the SEC allows.
    request_pause = 0.1
    
    def fetch(url):
        # NOTE(review): EDGAR's access guidelines also ask automated clients
        # to send a descriptive User-Agent header - confirm whether requests
        # without one are still accepted and add headers here if not.
        sleep(request_pause)
        return requests.get(url)
    
    def log_failed_request(response, url):
        text = "Request failed with error code " + str(response.status_code) + \
               "\nFailed URL: " + url + '\n'
        WriteLogFile(log_file_name, text)
    
    # Check if we've already scraped this CIK
    try:
        os.mkdir(cik)
    except OSError:
        print("Already scraped CIK", cik)
        return
    
    print('Scraping CIK', cik)
    
    # --- Collect the paginated list of filings ---
    page = 0
    pages = []
    url = browse_url_base % cik
    while True:
        print('---', page, '---')
        
        # Request list of 10-K filings
        res = fetch(url)
        
        # If the request failed, remove the (still empty) directory so the
        # CIK is retried on the next run, log the URL that failed and exit
        if res.status_code != 200:
            os.rmdir(cik)
            log_failed_request(res, url)
            return
        
        # Parse the response HTML using BeautifulSoup
        soup = bs.BeautifulSoup(res.text, 'lxml')
        
        # Extract all tables from the response
        html_tables = soup.find_all('table')
        
        # Check that the table we're looking for exists
        # If it doesn't, exit (the empty CIK directory is left in place,
        # so this CIK is reported as already scraped on later runs)
        if len(html_tables) < 3:
            print("table too short")
            return
        
        # Parse the Filings table of this page.
        # The markup is wrapped in StringIO because passing literal HTML
        # strings to read_html is deprecated in recent pandas versions.
        page_df = pd.read_html(StringIO(str(html_tables[2])), header=0)[0]
        if page_df.empty:
            break
        page_df['Filings'] = page_df['Filings'].map(str)
        page_df['Filing Date'] = pd.to_datetime(page_df['Filing Date'])
        pages.append(page_df)
        
        # Stop paging once the last filing on the page predates start_year
        if page_df['Filing Date'].iloc[-1].year < start_year:
            break
        
        # Look for a "Next" button in the last table cell of the page
        cells = soup.find_all("td")
        buttons = cells[-1].find_all("input") if cells else []
        if not buttons:
            break
        
        last_button = buttons[-1]
        next_page = last_button.get("onclick")
        label = (last_button.get("value") or '').split()
        if not next_page or not label or label[0] != 'Next':
            break
        
        next_page = next_page.split("parent.location=")[1].replace('"','').replace("'", '')
        url = "https://www.sec.gov" + next_page
        page += 1
        print(url)
    
    if not pages:
        return
    
    # Combine all pages once (DataFrame.append no longer exists in pandas 2.x)
    filings_table = pd.concat(pages, ignore_index=True)
    filings_table = filings_table[filings_table['Filing Date'].dt.year >= start_year]
    
    # Get only 10-K and 10-K405 document filings; copy so the column
    # assignments below do not write into a slice of another frame
    filings_table = filings_table[filings_table['Filings'].isin(form_types)].copy()
    
    # If filings table doesn't have any
    # 10-Ks or 10-K405s, exit
    if filings_table.empty:
        return
    
    filings_table['Filing Date'] = filings_table['Filing Date'].dt.date.map(str)
    
    # Get accession number for each 10-K and 10-K405 filing
    filings_table['Acc_No'] = [x.replace('\xa0',' ')
                               .split('Acc-no: ')[1]
                               .split(' ')[0] for x in filings_table['Description']]
    
    # Iterate through each filing and 
    # scrape the corresponding document...
    for _, row in filings_table.iterrows():
        
        # Get the accession number for the filing
        acc_no = str(row['Acc_No'])
        
        # Navigate to the page for the filing
        index_url = filing_url_base % (cik, acc_no)
        docs_page = fetch(index_url)
        
        # If request fails, log the failure
        # and skip to the next filing
        if docs_page.status_code != 200:
            log_failed_request(docs_page, index_url)
            continue
        
        # Parse the table of documents for the filing
        docs_page_soup = bs.BeautifulSoup(docs_page.text, 'lxml')
        docs_html_tables = docs_page_soup.find_all('table')
        if len(docs_html_tables) == 0:
            continue
        docs_table = pd.read_html(StringIO(str(docs_html_tables[0])), header=0)[0]
        docs_table['Type'] = docs_table['Type'].map(str)
        
        # Get the 10-K and 10-K405 entries for the filing
        docs_table = docs_table[docs_table['Type'].isin(form_types)]
        
        # If there aren't any 10-K or 10-K405 entries,
        # skip to the next filing
        if docs_table.empty:
            continue
        
        # Otherwise grab the first document. A missing cell is a float NaN,
        # so it has to be detected before any string method is called on it.
        raw_docname = docs_table.iloc[0]['Document']
        name_parts = [] if pd.isna(raw_docname) else str(raw_docname).split()
        
        # If that first entry is unavailable,
        # log the failure and skip to the next filing
        if not name_parts or name_parts[0] == 'nan':
            text = 'File with CIK: %s and Acc_No: %s is unavailable' % (cik, acc_no) + '\n'
            WriteLogFile(log_file_name, text)
            continue
        
        # The cell can carry extra words after the file name; keep the name only
        docname = name_parts[0]
        
        # Request the file
        doc_url = doc_url_base % (cik, acc_no.replace('-', ''), docname)
        doc_response = fetch(doc_url)
        
        # If the request fails, log the failure and skip to the next filing
        if doc_response.status_code != 200:
            log_failed_request(doc_response, doc_url)
            continue
        
        # Save the file in appropriate format (TXT or HTML)
        extension = '.txt' if '.txt' in docname else '.html'
        filename = os.path.join(cik, cik + '_' + str(row['Filing Date']) + extension)
        
        # Explicit UTF-8 so characters outside the platform's default
        # encoding cannot make the write fail
        with open(filename, 'a', encoding='utf-8') as out_file:
            out_file.write(doc_response.text)
        
    return

In [8]:
def Scrape10Q(browse_url_base, filing_url_base, doc_url_base, cik, log_file_name, start_year=2010):
    
    '''
    Scrapes all 10-Qs for a particular CIK from EDGAR.
    
    A directory named after the CIK is created in the current working
    directory and each downloaded filing is saved inside it as
    <cik>_<filing date>.txt or <cik>_<filing date>.html (two filings with
    the same date are appended to the same file). If the directory already
    exists the CIK is treated as already scraped and nothing is requested.
    Failed requests and unavailable documents are appended to the log file.
    The working directory is never changed by this function.
    
    Args
        browse_url_base : str
            Base URL for browsing EDGAR (one %s placeholder: the CIK).
        filing_url_base : str
            Base URL for filings listings on EDGAR
            (two %s placeholders: CIK, accession number).
        doc_url_base : str
            Base URL for one filing's document tables
            page on EDGAR (three %s placeholders: CIK,
            accession number without dashes, document name).
        cik : str
            Central Index Key.
        log_file_name : str
            Name of the log file (should be a .txt file).
        start_year : int
            The beginning year in which scraping will start from;
            filings dated before this year are not downloaded.
        
    Returns
    None.
    
    '''
    
    # Function-scope imports: these two names are not provided by the
    # notebook's import cell.
    from io import StringIO
    from time import sleep
    
    form_types = ['10-Q']
    
    # Seconds to wait before each request; 0.1s keeps a single scraper
    # at or below the 10 requests per second the SEC allows.
    request_pause = 0.1
    
    def fetch(url):
        # NOTE(review): EDGAR's access guidelines also ask automated clients
        # to send a descriptive User-Agent header - confirm whether requests
        # without one are still accepted and add headers here if not.
        sleep(request_pause)
        return requests.get(url)
    
    def log_failed_request(response, url):
        text = "Request failed with error code " + str(response.status_code) + \
               "\nFailed URL: " + url + '\n'
        WriteLogFile(log_file_name, text)
    
    # Check if we've already scraped this CIK
    try:
        os.mkdir(cik)
    except OSError:
        print("Already scraped CIK", cik)
        return
    
    print('Scraping CIK', cik)
    
    # --- Collect the paginated list of filings ---
    page = 0
    pages = []
    url = browse_url_base % cik
    while True:
        print('---', page, '---')
        
        # Request list of 10-Q filings
        res = fetch(url)
        
        # If the request failed, remove the (still empty) directory so the
        # CIK is retried on the next run, log the URL that failed and exit
        if res.status_code != 200:
            os.rmdir(cik)
            log_failed_request(res, url)
            return
        
        # Parse the response HTML using BeautifulSoup
        soup = bs.BeautifulSoup(res.text, 'lxml')
        
        # Extract all tables from the response
        html_tables = soup.find_all('table')
        
        # Check that the table we're looking for exists
        # If it doesn't, exit (the empty CIK directory is left in place,
        # so this CIK is reported as already scraped on later runs)
        if len(html_tables) < 3:
            print("table too short")
            return
        
        # Parse the Filings table of this page.
        # The markup is wrapped in StringIO because passing literal HTML
        # strings to read_html is deprecated in recent pandas versions.
        page_df = pd.read_html(StringIO(str(html_tables[2])), header=0)[0]
        if page_df.empty:
            break
        page_df['Filings'] = page_df['Filings'].map(str)
        page_df['Filing Date'] = pd.to_datetime(page_df['Filing Date'])
        pages.append(page_df)
        
        # Stop paging once the last filing on the page predates start_year
        if page_df['Filing Date'].iloc[-1].year < start_year:
            break
        
        # Look for a "Next" button in the last table cell of the page
        cells = soup.find_all("td")
        buttons = cells[-1].find_all("input") if cells else []
        if not buttons:
            break
        
        last_button = buttons[-1]
        next_page = last_button.get("onclick")
        label = (last_button.get("value") or '').split()
        if not next_page or not label or label[0] != 'Next':
            break
        
        next_page = next_page.split("parent.location=")[1].replace('"','').replace("'", '')
        url = "https://www.sec.gov" + next_page
        page += 1
        print(url)
    
    if not pages:
        return
    
    # Combine all pages once (DataFrame.append no longer exists in pandas 2.x)
    filings_table = pd.concat(pages, ignore_index=True)
    filings_table = filings_table[filings_table['Filing Date'].dt.year >= start_year]
    
    # Get only 10-Q document filings; copy so the column assignments
    # below do not write into a slice of another frame
    filings_table = filings_table[filings_table['Filings'].isin(form_types)].copy()
    
    # If filings table doesn't have any
    # 10-Qs, exit
    if filings_table.empty:
        return
    
    filings_table['Filing Date'] = filings_table['Filing Date'].dt.date.map(str)
    
    # Get accession number for each 10-Q filing
    filings_table['Acc_No'] = [x.replace('\xa0',' ')
                               .split('Acc-no: ')[1]
                               .split(' ')[0] for x in filings_table['Description']]
    
    # Iterate through each filing and 
    # scrape the corresponding document...
    for _, row in filings_table.iterrows():
        
        # Get the accession number for the filing
        acc_no = str(row['Acc_No'])
        
        # Navigate to the page for the filing
        index_url = filing_url_base % (cik, acc_no)
        docs_page = fetch(index_url)
        
        # If request fails, log the failure
        # and skip to the next filing
        if docs_page.status_code != 200:
            log_failed_request(docs_page, index_url)
            continue
        
        # Parse the table of documents for the filing
        docs_page_soup = bs.BeautifulSoup(docs_page.text, 'lxml')
        docs_html_tables = docs_page_soup.find_all('table')
        if len(docs_html_tables) == 0:
            continue
        docs_table = pd.read_html(StringIO(str(docs_html_tables[0])), header=0)[0]
        docs_table['Type'] = docs_table['Type'].map(str)
        
        # Get the 10-Q entries for the filing
        docs_table = docs_table[docs_table['Type'].isin(form_types)]
        
        # If there aren't any 10-Q entries,
        # skip to the next filing
        if docs_table.empty:
            continue
        
        # Otherwise grab the first document. A missing cell is a float NaN,
        # so it has to be detected before any string method is called on it.
        raw_docname = docs_table.iloc[0]['Document']
        name_parts = [] if pd.isna(raw_docname) else str(raw_docname).split()
        
        # If that first entry is unavailable,
        # log the failure and skip to the next filing
        if not name_parts or name_parts[0] == 'nan':
            text = 'File with CIK: %s and Acc_No: %s is unavailable' % (cik, acc_no) + '\n'
            WriteLogFile(log_file_name, text)
            continue
        
        # The cell can carry extra words after the file name; keep the name only
        docname = name_parts[0]
        
        # Request the file
        doc_url = doc_url_base % (cik, acc_no.replace('-', ''), docname)
        doc_response = fetch(doc_url)
        
        # If the request fails, log the failure and skip to the next filing
        if doc_response.status_code != 200:
            log_failed_request(doc_response, doc_url)
            continue
        
        # Save the file in appropriate format (TXT or HTML)
        extension = '.txt' if '.txt' in docname else '.html'
        filename = os.path.join(cik, cik + '_' + str(row['Filing Date']) + extension)
        
        # Explicit UTF-8 so characters outside the platform's default
        # encoding cannot make the write fail
        with open(filename, 'a', encoding='utf-8') as out_file:
            out_file.write(doc_response.text)
        
    return

In [9]:
# Scrape10K / Scrape10Q use the CIK as a directory name and concatenate it
# into file names, so it has to be text rather than a number.
cik_df["CIK"] = cik_df["CIK"].map(str)

In [10]:
# Run the function to scrape 10-Ks
# This cell changes the working directory to a relative path, so it must be
# started from the directory the notebook lives in.

# Define parameters
# (all three EDGAR URL templates use HTTPS, matching the browse URL)
browse_url_base_10k = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=%s&type=10-K'
filing_url_base_10k = 'https://www.sec.gov/Archives/edgar/data/%s/%s-index.html'
doc_url_base_10k = 'https://www.sec.gov/Archives/edgar/data/%s/%s/%s'

# Set correct directory
os.chdir(pathname_10k)

# Initialize log file
# (log file name = the time we initiate scraping session, in UTC)
time = strftime("%Y-%m-%d %Hh%Mm%Ss", gmtime())
log_file_name = 'log '+time+'.txt'
# Opening in append mode is enough to create the (empty) file;
# the with-block closes it on exit.
with open(log_file_name, 'a'):
    pass

# Iterate over CIKs and scrape 10-Ks
# (cik_df.CIK must already hold strings)
for cik in tqdm(cik_df.CIK):
    Scrape10K(browse_url_base=browse_url_base_10k, 
          filing_url_base=filing_url_base_10k, 
          doc_url_base=doc_url_base_10k, 
          cik=cik,
          log_file_name=log_file_name)

In [11]:
# Leave the 10-K output directory again; this relative path is resolved from
# the directory the previous cell switched into.
notebook_dir = "../../../Analytics/10K10Q/"
os.chdir(notebook_dir)

In [12]:
# Run the function to scrape 10-Qs
# This cell changes the working directory to a relative path, so it must be
# started from the directory the notebook lives in.

# Define parameters
# (all three EDGAR URL templates use HTTPS, matching the browse URL)
browse_url_base_10q = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=%s&type=10-Q'
filing_url_base_10q = 'https://www.sec.gov/Archives/edgar/data/%s/%s-index.html'
doc_url_base_10q = 'https://www.sec.gov/Archives/edgar/data/%s/%s/%s'

# Set correct directory
os.chdir(pathname_10q)

# Initialize log file
# (log file name = the time we initiate scraping session, in UTC)
time = strftime("%Y-%m-%d %Hh%Mm%Ss", gmtime())
log_file_name = 'log '+time+'.txt'
# Opening in append mode is enough to create the (empty) file;
# the with-block closes it on exit.
with open(log_file_name, 'a'):
    pass

# Iterate over CIKs and scrape 10-Qs
# (cik_df.CIK must already hold strings)
for cik in tqdm(cik_df.CIK):
    Scrape10Q(browse_url_base=browse_url_base_10q, 
          filing_url_base=filing_url_base_10q, 
          doc_url_base=doc_url_base_10q, 
          cik=cik,
          log_file_name=log_file_name)