In [1]:
# Generate the list of index files archived in EDGAR since start_year (earliest: 1993) until the most recent quarter
import datetime

current_year = datetime.date.today().year
current_quarter = (datetime.date.today().month - 1) // 3 + 1
start_year = 1993
years = list(range(start_year, current_year))
quarters = ['QTR1', 'QTR2', 'QTR3', 'QTR4']
history = [(y, q) for y in years for q in quarters]
for i in range(1, current_quarter + 1):
    history.append((current_year, 'QTR%d' % i))
urls = ['https://www.sec.gov/Archives/edgar/full-index/%d/%s/crawler.idx' % (x[0], x[1]) for x in history]
urls.sort()

# Download index files and write content into SQLite
import sqlite3
import requests

con = sqlite3.connect('edgar_htm_idx.db')
cur = con.cursor()
cur.execute('DROP TABLE IF EXISTS idx')
cur.execute('CREATE TABLE idx (conm TEXT, type TEXT, cik TEXT, date TEXT, path TEXT)')

for url in urls:
    lines = requests.get(url).text.splitlines()
    nameloc = lines[7].find('Company Name')
    typeloc = lines[7].find('Form Type')
    cikloc = lines[7].find('CIK')
    dateloc = lines[7].find('Date Filed')
    urlloc = lines[7].find('URL')
    records = [tuple([line[:typeloc].strip(), line[typeloc:cikloc].strip(), line[cikloc:dateloc].strip(),
                      line[dateloc:urlloc].strip(), line[urlloc:].strip()]) for line in lines[9:]]
    cur.executemany('INSERT INTO idx VALUES (?, ?, ?, ?, ?)', records)
    print(url, 'downloaded and wrote to SQLite')

con.commit()
con.close()

# Write SQLite database to Stata
import pandas
from sqlalchemy import create_engine

engine = create_engine('sqlite:///edgar_htm_idx.db')
with engine.connect() as conn, conn.begin():
    data = pandas.read_sql_table('idx', conn)
    data.to_stata('edgar_htm_idx.dta')

https://www.sec.gov/Archives/edgar/full-index/1993/QTR1/crawler.idx downloaded and wrote to SQLite
https://www.sec.gov/Archives/edgar/full-index/1993/QTR2/crawler.idx downloaded and wrote to SQLite
https://www.sec.gov/Archives/edgar/full-index/1993/QTR3/crawler.idx downloaded and wrote to SQLite
https://www.sec.gov/Archives/edgar/full-index/1993/QTR4/crawler.idx downloaded and wrote to SQLite
https://www.sec.gov/Archives/edgar/full-index/1994/QTR1/crawler.idx downloaded and wrote to SQLite
https://www.sec.gov/Archives/edgar/full-index/1994/QTR2/crawler.idx downloaded and wrote to SQLite
https://www.sec.gov/Archives/edgar/full-index/1994/QTR3/crawler.idx downloaded and wrote to SQLite
https://www.sec.gov/Archives/edgar/full-index/1994/QTR4/crawler.idx downloaded and wrote to SQLite
https://www.sec.gov/Archives/edgar/full-index/1995/QTR1/crawler.idx downloaded and wrote to SQLite
https://www.sec.gov/Archives/edgar/full-index/1995/QTR2/crawler.idx downloaded and wrote to SQLite
https://ww

In [167]:
import csv
import random
import time

from selenium import webdriver

with open('Data/log.csv', 'w', newline='') as log:
    logwriter = csv.writer(log)

    with open('Data/df2_late_test.csv', newline='') as infile:
        records = csv.reader(infile)

        for r in records:
            log_row = r.copy()
            print('Start fetching URL to', r[2], r[3], 'filed on', r[4], '...')
            start_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())

            driver = webdriver.Chrome('chromedriver.exe')

            try:
                driver.get(r[5])
                time.sleep(3 + random.random() * 3)
                filing_date = driver.find_element_by_xpath('//*[@id="formDiv"]/div[2]/div[1]/div[2]').text
                period_of_report = driver.find_element_by_xpath('//*[@id="formDiv"]/div[2]/div[2]/div[2]').text
                form_text = driver.find_element_by_xpath('//*[@id="formDiv"]/div/table/tbody/tr[2]/td[3]/a').text
                form_link = driver.find_element_by_link_text(form_text).get_attribute('href')
                end_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
                print('Success!', start_time, ' --> ', end_time, '\n')
                log_row = log_row + [form_link]

            except:
                end_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
                print('Error!', start_time, ' --> ', end_time, '\n')
                log_row = log_row + [start_time, end_time, 'ERROR!']

            driver.quit()

            logwriter.writerow(log_row)

Start fetching URL to type cik filed on date ...
Error! 2019-12-11 22:20:17  -->  2019-12-11 22:20:20 

Start fetching URL to NT 10-Q 1400000 filed on 2010-01-15 ...
Success! 2019-12-11 22:20:22  -->  2019-12-11 22:20:29 

Start fetching URL to NT 10-Q 1431880 filed on 2010-02-17 ...
Success! 2019-12-11 22:20:31  -->  2019-12-11 22:20:37 

Start fetching URL to NT 10-Q 770034 filed on 2010-02-17 ...
Success! 2019-12-11 22:20:39  -->  2019-12-11 22:20:46 

Start fetching URL to NT 10-Q 1261734 filed on 2010-02-16 ...
Success! 2019-12-11 22:20:48  -->  2019-12-11 22:20:56 

Start fetching URL to NT 10-Q 1066717 filed on 2010-02-12 ...
Success! 2019-12-11 22:20:58  -->  2019-12-11 22:21:04 



In [123]:

import pandas as pd

#Load the dataset
df2 = pd.read_stata("edgar_htm_idx.dta")

In [165]:
df2_late = df2[df2['type']=='NT 10-Q']
df2_late.head()

Unnamed: 0,index,conm,type,cik,date,path
253,253,AIR & WATER TECHNOLOGIES CORP,NT 10-Q,823556,1994-03-18,https://www.sec.gov/Archives/edgar/data/823556...
596,596,AMERICAN RICE INC,NT 10-Q,824206,1994-02-14,https://www.sec.gov/Archives/edgar/data/824206...
3114,3114,COMPREHENSIVE CARE CORP,NT 10-Q,22872,1994-01-14,https://www.sec.gov/Archives/edgar/data/22872/...
3143,3143,CONCURRENT COMPUTER CORP/DE,NT 10-Q,749038,1994-02-14,https://www.sec.gov/Archives/edgar/data/749038...
3650,3650,DATAPOINT CORP,NT 10-Q,205239,1994-03-15,https://www.sec.gov/Archives/edgar/data/205239...


In [125]:
df2_10Q = df2[df2['type']=='10-Q']

In [126]:
df2_10Q_2010 = df2_10Q[df2_10Q['date']>'2010']

In [127]:
df2_late_2010 = df2_late[df2_late['date']>'2010']

In [152]:
df2_late_test = df2_late_2010.tail()

In [166]:
df2_late_2010.to_csv("Data/df2_late10Q.csv", index = False)
df2_10Q_2010.to_csv("Data/df2_10Q.csv", index = False)

In [164]:
df2_late_test_df = pd.read_csv("log.csv")

Unnamed: 0,index,conm,type,cik,date,path,html link,2019-12-09 13:14:50,ERROR!
0,9536460,"ABAKAN, INC",NT 10-Q,1400000,2010-01-15,https://www.sec.gov/Archives/edgar/data/140000...,https://www.sec.gov/Archives/edgar/data/140000...,,
1,9538671,ADTOMIZE INC,NT 10-Q,1431880,2010-02-17,https://www.sec.gov/Archives/edgar/data/143188...,https://www.sec.gov/Archives/edgar/data/143188...,,
2,9538728,ADVANCE DISPLAY TECHNOLOGIES INC,NT 10-Q,770034,2010-02-17,https://www.sec.gov/Archives/edgar/data/770034...,https://www.sec.gov/Archives/edgar/data/770034...,,
3,9539728,"AEOLUS PHARMACEUTICALS, INC.",NT 10-Q,1261734,2010-02-16,https://www.sec.gov/Archives/edgar/data/126173...,https://www.sec.gov/Archives/edgar/data/126173...,,
4,9542221,"ALKANE, INC.",NT 10-Q,1066717,2010-02-12,https://www.sec.gov/Archives/edgar/data/106671...,https://www.sec.gov/Archives/edgar/data/106671...,,


In [151]:
import urllib.request, urllib.error, urllib.parse
from bs4 import BeautifulSoup


url = "https://www.sec.gov/Archives/edgar/data/1261734/000114420410007988/v174477_nt10q.htm"
html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html)

# kill all script and style elements
for script in soup(["script", "style"]):
    script.extract()    # rip it out

# get text
text = soup.body.get_text()

# break into lines and remove leading and trailing space on each
lines = (line.strip() for line in text.splitlines())
# break multi-headlines into a line each
chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
# drop blank lines
text = '\n'.join(chunk for chunk in chunks if chunk)

#for line in re.findall("Part III.*", text):
    #print(line)
print(text)

NT 10-Q
1
v174477_nt10q.htm
UNITED
STATES
SECURITIES
AND EXCHANGE COMMISSION
Washington,
D.C. 20549
FORM
12b-25
SEC
FILE NUMBER
000-50481
NOTIFICATION
OF LATE FILING
CUSIP
NUMBER
00765G109
(Check One):  o Form
10-K    o Form
20-F    o Form
11-K    x Form
10-Q    o Form
10-D    o Form N-SAR
o Form
N-CSR
For Period Ended: December
31,
2009
o Transition Report
on Form 10-K
o Transition Report
on Form 20-F
o Transition Report
on Form 11-K
o Transition Report
on Form 10-Q
o Transition Report
on Form N-SAR
For the
Transition Period Ended:
_____________________________________________________
Read
Instruction (on back page) Before Preparing Form. Please Print or
Type.
Nothing
in this form shall be construed to imply that the Commission has verified any
information contained herein.
If the
notification relates to a portion of the filing checked above, identify the
Item(s) to which the notification relates:
PART
I — REGISTRANT INFORMATION
Aeolus Pharmaceuticals, Inc.
Full Name of Registrant
For

In [102]:
text

"NT 10-Q\n1\ndoc1.htm\nForm 12b-25: Notification of Late Filing\nUnited States\nSecurities and Exchange Commission\nWashington, D.C. 20549\nFORM 12b-25Notification of Late Filing\n(Amendment No.\n0)*\nOMB Number\n3234-0058\nSEC File Number0-15224CUSIP Number0074223 06 5(Check one):\nForm 10-K\nForm 20-F\nForm 11-K\n√\nForm 10-Q\nForm 10-D\nForm N-SAR\nForm N-CSRFor Period Ended:December\xa031,\xa02009\nTransition Report on Form 10-K\nTransition Report on Form 20-F\nTransition Report on Form 11-K\nTransition Report on Form 10-Q\nTransition Report on Form N-SARFor the Transition Period Ended:\nRead Instruction (on back page) Before Preparing Form. Please Print or Type.\nNothing in this form shall be construed to imply that the Commission has verified any information contained herein.\nIf the notification relates to a portion of the filing checked above, identify the Item(s) to which the notification relates:\nPart I - Registrant InformationAdvance Display Technologies, Inc.Full Name of R

In [114]:

import re

for line in re.findall("Part III.*", text):
    print(line)
    
    #if line in re.findall("IV — OTHER INFORMATION.*", text):


Part III of this form could not be eliminated without unreasonable effort or expense.(b)The subject annual report, semi-annual report, transition report on Form 10-K, Form 20-F, Form 11-K, Form N-SAR or Form N-CSR, or portion thereof, will be filed on or before the fifteenth calendar day following the prescribed due date; or the subject quarterly report or transition report on Form 10-Q or subject distribution report on Form 10-D, or portion thereof, will be filed on or before the fifth calendar day following the prescribed due date; and(c)The accountant's statement or other exhibit required by Rule 12b-25(c) has been attached if applicable.Part III - NarrativeState below in reasonable detail why Forms 10-K, 20-F, 11-K, 10-Q,10-D, N-SAR, N-CSR, or the transition report or portion thereof, could not be filed within the prescribed time period.As the result of some recent turnover in the Company's accounting department and the Company's limited resources generally, the Company could not c

In [106]:
for line in re.findall("III - NARRATIVE.*", text):
    print(line)
    
#for line in re.findall("IV — OTHER INFORMATION.*", text):
    
    