In [1]:
# Build the list of EDGAR quarterly index files from start_year (EDGAR's earliest is 1993) through the current quarter
import datetime
 
# Take one snapshot of today's date so that the year and the quarter are derived
# from the same instant. Two separate today() calls could straddle a year
# boundary and disagree, which would silently drop quarters from the list.
today = datetime.date.today()
current_year = today.year
current_quarter = (today.month - 1) // 3 + 1  # months 1-3 -> 1, 4-6 -> 2, 7-9 -> 3, 10-12 -> 4
start_year = 2007
years = list(range(start_year, current_year))
quarters = ['QTR1', 'QTR2', 'QTR3', 'QTR4']
# All four quarters of every year before the current one ...
history = [(y, q) for y in years for q in quarters]
# ... followed by the quarters of the current year, up to and including the current quarter.
history.extend((current_year, 'QTR%d' % i) for i in range(1, current_quarter + 1))
urls = ['https://www.sec.gov/Archives/edgar/full-index/%d/%s/crawler.idx' % (year, quarter)
        for year, quarter in history]
urls.sort()
 
# Download index files and write content into SQLite
import sqlite3
import requests
 
# Rebuild the `idx` table from scratch: one row per line of every downloaded
# crawler.idx file, split into fixed-width columns located via the header row.
con = sqlite3.connect('edgar_htm_idx.db')
try:
    cur = con.cursor()
    cur.execute('DROP TABLE IF EXISTS idx')
    cur.execute('CREATE TABLE idx (conm TEXT, type TEXT, cik TEXT, date TEXT, path TEXT)')

    for url in urls:
        # NOTE(review): SEC.gov may reject requests that do not declare a
        # User-Agent header - confirm against the SEC's EDGAR access guidance.
        response = requests.get(url, timeout=60)
        # Fail loudly on 4xx/5xx instead of parsing an error page as an index file.
        response.raise_for_status()
        lines = response.text.splitlines()

        # The column header is expected on the 8th line (index 7); the column
        # start offsets are read from it, and data rows start at index 9.
        if len(lines) < 8:
            raise ValueError('Unexpected index file layout: %s' % url)
        header = lines[7]
        nameloc = header.find('Company Name')
        typeloc = header.find('Form Type')
        cikloc = header.find('CIK')
        dateloc = header.find('Date Filed')
        urlloc = header.find('URL')
        # str.find returns -1 for a missing label; slicing with -1 would quietly
        # produce garbage columns, so stop here instead.
        if min(nameloc, typeloc, cikloc, dateloc, urlloc) < 0:
            raise ValueError('Unexpected index file layout: %s' % url)

        records = [(line[:typeloc].strip(),
                    line[typeloc:cikloc].strip(),
                    line[cikloc:dateloc].strip(),
                    line[dateloc:urlloc].strip(),
                    line[urlloc:].strip())
                   for line in lines[9:]]
        cur.executemany('INSERT INTO idx VALUES (?, ?, ?, ?, ?)', records)
        print(url, 'downloaded and wrote to SQLite')

    con.commit()
finally:
    # Release the database file even when a download or parse step fails.
    con.close()
 
# Write SQLite database to Stata
import pandas
from sqlalchemy import create_engine
 
# Read the whole `idx` table back through SQLAlchemy and export it as a Stata
# .dta file. The read and the export both run inside one connection/transaction.
engine = create_engine('sqlite:///edgar_htm_idx.db')
with engine.connect() as conn:
    with conn.begin():
        data = pandas.read_sql_table('idx', conn)
        data.to_stata('edgar_htm_idx.dta')

https://www.sec.gov/Archives/edgar/full-index/2007/QTR1/crawler.idx downloaded and wrote to SQLite
https://www.sec.gov/Archives/edgar/full-index/2007/QTR2/crawler.idx downloaded and wrote to SQLite
https://www.sec.gov/Archives/edgar/full-index/2007/QTR3/crawler.idx downloaded and wrote to SQLite
https://www.sec.gov/Archives/edgar/full-index/2007/QTR4/crawler.idx downloaded and wrote to SQLite
https://www.sec.gov/Archives/edgar/full-index/2008/QTR1/crawler.idx downloaded and wrote to SQLite
https://www.sec.gov/Archives/edgar/full-index/2008/QTR2/crawler.idx downloaded and wrote to SQLite
https://www.sec.gov/Archives/edgar/full-index/2008/QTR3/crawler.idx downloaded and wrote to SQLite
https://www.sec.gov/Archives/edgar/full-index/2008/QTR4/crawler.idx downloaded and wrote to SQLite
https://www.sec.gov/Archives/edgar/full-index/2009/QTR1/crawler.idx downloaded and wrote to SQLite
https://www.sec.gov/Archives/edgar/full-index/2009/QTR2/crawler.idx downloaded and wrote to SQLite
https://ww

In [3]:
import sys
!{sys.executable} -m pip install selenium

Collecting selenium
  Using cached https://files.pythonhosted.org/packages/80/d6/4294f0b4bce4de0abf13e17190289f9d0613b0a44e5dd6a7f5ca98459853/selenium-3.141.0-py2.py3-none-any.whl
Installing collected packages: selenium
Successfully installed selenium-3.141.0


In [4]:
import csv
import random
import time
import selenium
from selenium import webdriver
 
# For every row of sample.csv, open the row's URL (column index 5) in a fresh
# Chrome session, scrape the filing date, period of report and the link to the
# form document, and append the outcome to log.csv.
#
# The input file is opened first on purpose: if 'sample.csv' is missing, the
# FileNotFoundError is raised before 'log.csv' is opened in write mode, so an
# existing log is not truncated by a run that cannot do any work.
with open('sample.csv', newline='') as infile, open('log.csv', 'w', newline='') as log:
    records = csv.reader(infile)
    logwriter = csv.writer(log)

    for r in records:
        log_row = r.copy()
        print('Start fetching URL to', r[2], r[3], 'filed on', r[4], '...')
        start_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())

        driver = webdriver.Chrome('./chromedriver')

        try:
            driver.get(r[5])
            # Randomised 3-6 second pause after the page load request.
            time.sleep(3 + random.random() * 3)
            filing_date = driver.find_element_by_xpath('//*[@id="formDiv"]/div[2]/div[1]/div[2]').text
            period_of_report = driver.find_element_by_xpath('//*[@id="formDiv"]/div[2]/div[2]/div[2]').text
            form_text = driver.find_element_by_xpath('//*[@id="formDiv"]/div/table/tbody/tr[2]/td[3]/a').text
            form_link = driver.find_element_by_link_text(form_text).get_attribute('href')
            end_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
            print('Success!', start_time, ' --> ', end_time, '\n')
            log_row = log_row + [start_time, end_time, filing_date, period_of_report, form_link]

        except Exception:
            # Best-effort scraping: record the failure and move on to the next
            # row. `Exception` (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate so the loop can still be interrupted.
            end_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
            print('Error!', start_time, ' --> ', end_time, '\n')
            log_row = log_row + [start_time, end_time, 'ERROR!']

        finally:
            # Always shut the browser down, including when the loop is interrupted.
            driver.quit()

        logwriter.writerow(log_row)

FileNotFoundError: [Errno 2] No such file or directory: 'sample.csv'