In [9]:
import sys
import re
import os
import unicodedata
import pandas as pd
import numpy as np
import pickle
import requests
import bs4 as bs
from lxml import html
from tqdm import tqdm
import time

In [2]:
resp = requests.get('http://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
soup = bs.BeautifulSoup(resp.text, 'lxml')
table = soup.find('table', {'class': 'wikitable sortable'})
tickers = []
for row in table.findAll('tr')[1:]:
    ticker = row.findAll('td')[1].text
    tickers.append(ticker)
tickers = tickers[:2]

In [10]:
url = 'http://www.sec.gov/cgi-bin/browse-edgar?CIK={}&Find=Search&owner=exclude&action=getcompany'
cik_re = re.compile(r'.*CIK=(\d{10}).*')
all_cik = {}
for ticker in tqdm(tickers):
    res = cik_re.findall(requests.get(url.format(ticker)).text)
    if len(res):
        all_cik[str(ticker).lower()] = str(res[0])
        
ciks_tick_df = pd.DataFrame.from_dict(data=all_cik, orient='index')
ciks_tick_df.reset_index(inplace=True)
ciks_tick_df.columns = ['ticker', 'cik']

ciks_tick_df.to_csv('AllSecTickers.csv', sep=',', encoding='utf-8', index=False)

100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.04it/s]


ValueError: Length mismatch: Expected axis has 1 elements, new values have 2 elements

In [4]:
def SaveLogFile(log_file_name, text):
    
    with open(log_file_name, "a") as log_file:
        log_file.write(text)

    return
#closing log file
def CloseLogFile(log_file_name):
    
    with open(log_file_name, 'a') as log_file:
        log_file.close()

    return

In [5]:
scraped10k_path = "C:/Users/Tanay/SCB Intern Project/Portfolio Construction/Data/scraped_10k"
scraped10q_path = "C:/Users/Tanay/SCB Intern Project/Portfolio Construction/Data/scraped_10q"

In [6]:
def Extract10K(browse_url_base, filing_url_base, doc_url_base, cik, log_file_name):
    
    requestcount=0
    if not os.path.exists(cik):
        os.makedirs(cik)
    
    os.chdir(cik)
    print('Scraping CIK ', cik)
    
    SaveLogFile(log_file_name, 'Scraping CIK: '+cik)
            
    res = requests.get(browse_url_base % cik)
    requestcount+=1
    if(requestcount==10):
        time.sleep(1)
        requestcount=0

    if res.status_code != 200:
        text = "\nFailed at step 1.\n Failed to hit browse base URL for CIK with error " + str(res.status_code) + \
               "\nFailed URL: " + (browse_url_base % cik) + '\n'
        
        SaveLogFile(log_file_name, text)
            
        return

    SaveLogFile(log_file_name,"\nSuccessfully reached browse base UrL\n")
        
    #HTML parsing using BeautifulSoup
    soup = bs.BeautifulSoup(res.text, "lxml")

 
    html_tables = soup.find_all('table')
    
    if len(html_tables)<3:
        os.chdir('..')
        return
    

    sec_filing_tbl = pd.read_html(str(html_tables[2]), header=0)[0]
    sec_filing_tbl['Filings'] = [str(x) for x in sec_filing_tbl['Filings']]

    sec_filing_tbl = sec_filing_tbl[(sec_filing_tbl['Filings'] == '10-K') | (sec_filing_tbl['Filings'] == '10-K405')]
    sec_filing_tbl = sec_filing_tbl[(sec_filing_tbl['Filing Date'] >= '2008-01-01')]

    if len(sec_filing_tbl)==0:
        os.chdir('..')
        return
    
    sec_filing_tbl['Acc_No'] = [x.replace('\xa0',' ')
                               .split('Acc-no: ')[1]
                               .split(' ')[0] for x in sec_filing_tbl['Description']]

   
    SaveLogFile(log_file_name, "Getting all documents for this CIK\n")
    
    total_files_to_scrape = len(sec_filing_tbl)
    files_scraped = 0
    
    for index, row in sec_filing_tbl.iterrows():
        
       
        acc_no = str(row['Acc_No'])
        
        #check if file is already scraped 
        date = str(row['Filing Date'])
        if (os.path.exists(cik + '_' + date + '.html')) or os.path.exists(cik + '_' + date + '.txt'):
            SaveLogFile(log_file_name, "The file for date: " + date + " exists, acc no.: " + acc_no + "\n")
            files_scraped+=1
            continue
            
   
        docs_page = requests.get(filing_url_base % (cik, acc_no))
        requestcount+=1
        if(requestcount==10):
            time.sleep(1)
            requestcount=0
        
        
        #In case of request failure,log it and jump to the next filing 
        if docs_page.status_code != 200:
            text = "Request failed with error code " + str(docs_page.status_code) + \
                   "\nFailed URL: " + (filing_url_base % (cik, acc_no)) + '\n'
            SaveLogFile(log_file_name, text)
            continue
            
        SaveLogFile(log_file_name, "Got acc no. " + acc_no + "\n")
       
        docs_page_soup = bs.BeautifulSoup(docs_page.text, 'lxml')
        docs_html_tables = docs_page_soup.find_all('table')
        if len(docs_html_tables)==0:
            continue
        docs_table = pd.read_html(str(docs_html_tables[0]), header=0)[0]
        docs_table['Type'] = [str(x) for x in docs_table['Type']]
        
 
        docs_table = docs_table[(docs_table['Type'] == '10-K') | (docs_table['Type'] == '10-K405')]
        
        if len(docs_table)==0:
            continue
        elif len(docs_table)>0:
            docs_table = docs_table.iloc[0]
        
        docname = docs_table['Document']
        
         # check for nan values, if present then log the failure
        if str(docname) == 'nan':
            text = 'File with CIK: %s and Acc_No: %s is unavailable' % (cik, acc_no) + '\n'
            SaveLogFile(log_file_name, text)
            continue       
        
        file = requests.get(doc_url_base % (cik, acc_no.replace('-', ''), docname.replace(' iXBRL','')))
        requestcount+=1
        if(requestcount==10):
            time.sleep(1)
            requestcount=0
            
       
        if file.status_code != 200:
            os.chdir('..')
            text = "Request failed with error code " + str(file.status_code) + \
                   "\nFailed URL: " + (doc_url_base % (cik, acc_no.replace('-', ''), docname)) + '\n'
            SaveLogFile(log_file_name, text)
            os.chdir(cik)
            continue
        
        SaveLogFile(log_file_name, "Got 10K for acc no. " + acc_no + "\n")
        files_scraped+=1
        
        #save HTML or Text file with .txt extension
        if '.txt' in docname:
            filename = cik + '_' + date + '.txt'
            html_file = open(filename, 'a')
            html_file.write(file.text)
            html_file.close()
        else:
            filename = cik + '_' + date + '.html'
            html_file = open(filename, 'a')
            html_file.write(file.text)
            html_file.close()
    
    if(total_files_to_scrape!=files_scraped):
        print("Some files failed to scrape\n")
        text="Some files failed to scrape\n" + str(total_files_to_scrape) + "!="+ str(files_scraped) + "\n"
        SaveLogFile(log_file_name, text)
        
    SaveLogFile(log_file_name, "saving log files=================\n")
    CloseLogFile(log_file_name)
    os.chdir('..')
        
    return

In [7]:
def Extract10Q(browse_url_base, filing_url_base, doc_url_base, cik, log_file_name):
    
    requestcount=0

    if not os.path.exists(cik):
        os.makedirs(cik)
    
    os.chdir(cik)
    print('Scraping CIK ', cik)
    
    SaveLogFile(log_file_name, 'Scraping CIK: '+cik)
            
    res = requests.get(browse_url_base % cik)
    requestcount+=1
    if(requestcount==10):
        time.sleep(1)
        requestcount=0
    
     #In case of request failure,log it and jump to the next filing 
    if res.status_code != 200:
        text = "\nFailed at step 1.\n Failed to hit browse base URL for CIK with error " + str(res.status_code) + \
               "\nFailed URL: " + (browse_url_base % cik) + '\n'
        
        SaveLogFile(log_file_name, text)
            
        return

   
    SaveLogFile(log_file_name,"\nSuccessfully reached browse base UrL\n")
        
    soup = bs.BeautifulSoup(res.text, "lxml")

    # fetching all the html tables
    html_tables = soup.find_all('table')
    
    
    if len(html_tables)<3:
        os.chdir('..')
        return
    
    # Parsing the html filing table
    sec_filing_tbl = pd.read_html(str(html_tables[2]), header=0)[0]
    sec_filing_tbl['Filings'] = [str(x) for x in sec_filing_tbl['Filings']]


    sec_filing_tbl = sec_filing_tbl[sec_filing_tbl['Filings'] == '10-Q']
    sec_filing_tbl = sec_filing_tbl[(sec_filing_tbl['Filing Date'] >= '2008-01-01')]
    
    if len(sec_filing_tbl)==0:
        os.chdir('..')
        return
    
  
    sec_filing_tbl['Acc_No'] = [x.replace('\xa0',' ')
                               .split('Acc-no: ')[1]
                               .split(' ')[0] for x in sec_filing_tbl['Description']]

   
    SaveLogFile(log_file_name, "Getting all documents for this CIK\n")
    
    total_files_to_scrape = len(sec_filing_tbl)
    files_scraped = 0
    
    for index, row in sec_filing_tbl.iterrows():
        
     
        acc_no = str(row['Acc_No'])
        
        # check if file already exist
        date = str(row['Filing Date'])
        if (os.path.exists(cik + '_' + date + '.html')) or os.path.exists(cik + '_' + date + '.txt'):
            SaveLogFile(log_file_name, "The file for date: " + date + " exists, acc no.: " + acc_no + "\n")
            files_scraped+=1
            continue
            
      
        docs_page = requests.get(filing_url_base % (cik, acc_no))
        requestcount+=1
        if(requestcount==10):
            time.sleep(1)
            requestcount=0
        
        #In case of request failure,log it and jump to the next filing 
        if docs_page.status_code != 200:
            text = "Request failed with error code " + str(docs_page.status_code) + \
                   "\nFailed URL: " + (filing_url_base % (cik, acc_no)) + '\n'
            SaveLogFile(log_file_name, text)
            continue
            
        SaveLogFile(log_file_name, "Got acc no. " + acc_no + "\n")

        docs_page_soup = bs.BeautifulSoup(docs_page.text, 'lxml')
        docs_html_tables = docs_page_soup.find_all('table')
        
        if len(docs_html_tables)==0:
            continue
            
        docs_table = pd.read_html(str(docs_html_tables[0]), header=0)[0]
        docs_table['Type'] = [str(x) for x in docs_table['Type']]
        
        docs_table = docs_table[docs_table['Type'] == '10-Q']
        
        if len(docs_table)==0:
            continue
            
        elif len(docs_table)>0:
            docs_table = docs_table.iloc[0]
        
        docname = docs_table['Document']
    
        # check for nan values, if present then log the failure
        if str(docname) == 'nan':
            text = 'File with CIK: %s and Acc_No: %s is unavailable' % (cik, acc_no) + '\n'
            SaveLogFile(log_file_name, text)
            continue       
        
       
        file = requests.get(doc_url_base % (cik, acc_no.replace('-', ''), docname.replace(' iXBRL','')))
        requestcount+=1
        if(requestcount==10):
            time.sleep(1)
            requestcount=0
            
        #In case of request failure,log it and exit
        if file.status_code != 200:
            os.chdir('..')
            text = "Request failed with error code " + str(file.status_code) + \
                   "\nFailed URL: " + (doc_url_base % (cik, acc_no.replace('-', ''), docname)) + '\n'
            SaveLogFile(log_file_name, text)
            os.chdir(cik)
            continue
        
      
        SaveLogFile(log_file_name, "Got 10Q for acc no. " + acc_no + "\n")
        files_scraped+=1
        
         #save HTML or Text file with .txt extension
        if '.txt' in docname:
            filename = cik + '_' + date + '.txt'
            html_file = open(filename, 'a')
            html_file.write(file.text)
            html_file.close()
        else:
            filename = cik + '_' + date + '.html'
            html_file = open(filename, 'a')
            html_file.write(file.text)
            html_file.close()
    
    if(total_files_to_scrape!=files_scraped):
        print("Some files failed to scrape\n")
        text="Some files failed to scrape\n" + str(total_files_to_scrape) + "!="+ str(files_scraped) + "\n"
        SaveLogFile(log_file_name, text)
        
    SaveLogFile(log_file_name, "saving log files==========================\n")
    CloseLogFile(log_file_name)
    os.chdir('..')
        
    return

In [8]:
# Define parameters for Extract10K method
browse_url_10k = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=%s&type=10-K'
filing_url_10k = 'http://www.sec.gov/Archives/edgar/data/%s/%s-index.html'
doc_url_10k = 'http://www.sec.gov/Archives/edgar/data/%s/%s/%s'

# moving the directory, where scraped 10-K filings are
os.chdir(scraped10k_path)
log_file = 'log.txt'

for cik in tqdm(ciks_tick_df['cik']):
    Extract10K(browse_url_base=browse_url_10k, 
          filing_url_base=filing_url_10k, 
          doc_url_base=doc_url_10k, 
          cik=cik,
          log_file_name=log_file)

KeyError: 'cik'

In [None]:
# To scrape 10-Ks, call the Extract10Q method
# Define parameters for Extract10Q method
browse_url_10q = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=%s&type=10-Q&count=1000'
filing_url_10q = 'http://www.sec.gov/Archives/edgar/data/%s/%s-index.html'
doc_url_10q = 'http://www.sec.gov/Archives/edgar/data/%s/%s/%s'

# Set correct directory (fill this out yourself!)
os.chdir(scraped10q_path)
log_file_name = 'log.txt'

# moving the directory, where scraped 10-K filings are
for cik in tqdm(ciks_tick_df['cik']):
    Extract10Q(browse_url_base=browse_url_10q, 
          filing_url_base=filing_url_10q, 
          doc_url_base=doc_url_10q, 
          cik=cik,
          log_file_name=log_file_name)