In [1]:
import os
import urllib

import requests
from bs4 import BeautifulSoup

# define a helper that builds a url from a base and a list of path components
def make_url(base_url, comp):
    """Append every entry of ``comp`` to ``base_url``, separated by '/'.

    Args:
        base_url: The leading part of the URL.
        comp: An iterable of path components; each one is rendered with
            ``'{}'.format`` so non-string values (e.g. ints) are accepted.

    Returns:
        The combined URL string. When ``comp`` is empty, ``base_url`` is
        returned unchanged. No slashes are stripped or de-duplicated.
    """
    segments = ['{}'.format(piece) for piece in comp]
    if not segments:
        # Nothing to append: hand the base back exactly as it was given.
        return base_url
    return '/'.join(['{}'.format(base_url)] + segments)

# Example: assemble the URL of one filing's index-headers page from the
# EDGAR data root, two directory segments and the file name.
base_url = r'https://www.sec.gov/Archives/edgar/data'
components = [
    '886982',
    '000156459019011378',
    '0001564590-19-011378-index-headers.html',
]
make_url(base_url, components)

'https://www.sec.gov/Archives/edgar/data/886982/000156459019011378/0001564590-19-011378-index-headers.html'

In [5]:
# base url for the daily index files
base_url = r"https://www.sec.gov/Archives/edgar/daily-index"

# create the daily index url for 2021
year_url = make_url(base_url, ['2021', 'index.json'])
print(year_url)

# NOTE(review): SEC's fair-access guidance asks automated clients to send a
# User-Agent that identifies the requester (name plus contact e-mail) --
# confirm that this value satisfies it.
headers = {'User-Agent': 'miaowa.pro'}

# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors (e.g. 403/429) directly instead of
# letting .json() fail later with an unrelated-looking decode error.
content = requests.get(year_url, headers=headers, timeout=30)
content.raise_for_status()

# index.json describes the directory listing; the quarter entries are read
# from decoded_content['directory']['item'] further down.
decoded_content = content.json()
decoded_content


https://www.sec.gov/Archives/edgar/daily-index/2021/index.json


{'directory': {'item': [{'last-modified': '03/31/2021 10:08:18 PM',
    'name': 'QTR1',
    'type': 'dir',
    'href': 'QTR1/',
    'size': '20 KB'},
   {'last-modified': '07/10/2021 03:24:30 PM',
    'name': 'QTR2',
    'type': 'dir',
    'href': 'QTR2/',
    'size': '20 KB'},
   {'last-modified': '09/30/2021 10:06:33 PM',
    'name': 'QTR3',
    'type': 'dir',
    'href': 'QTR3/',
    'size': '20 KB'},
   {'last-modified': '10/01/2021 12:20:13 AM',
    'name': 'QTR4',
    'type': 'dir',
    'href': 'QTR4/',
    'size': '4 KB'}],
  'name': 'daily-index/2021/',
  'parent-dir': '../'}}

In [70]:
# Walk the quarter folders of the 2021 daily index and print the URL of every
# master index file found in the selected quarters.
for item in decoded_content['directory']['item']:

    # only pull the master index files for QTR3 and QTR4
    if item['name'] not in ('QTR3', 'QTR4'):
        continue

    # get the name of the folder
    print('-' * 100)
    print('Pulling url for quarter {}'.format(item['name']))

    # create the qtr url
    qtr_url = make_url(base_url, ['2021', item['name'], 'index.json'])
    print(qtr_url)

    # request the url and decode it; fail loudly on a stalled connection or
    # an HTTP error rather than on a confusing JSON decode error.
    file_content = requests.get(qtr_url, headers=headers, timeout=30)
    file_content.raise_for_status()
    decoded_file_content = file_content.json()

    print('-' * 100)
    print('Pulling file')

    for file in decoded_file_content['directory']['item']:

        # the listing holds several index flavours; keep only the ones whose
        # name starts with 'master'
        if file['name'].startswith('master'):
            file_url = make_url(base_url, ['2021', item['name'], file['name']])
            print(file_url)
                                       

----------------------------------------------------------------------------------------------------
Pulling url for quarter QTR3
https://www.sec.gov/Archives/edgar/daily-index/2021/QTR3/index.json
----------------------------------------------------------------------------------------------------
Pulling file
https://www.sec.gov/Archives/edgar/daily-index/2021/QTR3/master.20210701.idx
https://www.sec.gov/Archives/edgar/daily-index/2021/QTR3/master.20210702.idx
https://www.sec.gov/Archives/edgar/daily-index/2021/QTR3/master.20210706.idx
https://www.sec.gov/Archives/edgar/daily-index/2021/QTR3/master.20210707.idx
https://www.sec.gov/Archives/edgar/daily-index/2021/QTR3/master.20210708.idx
https://www.sec.gov/Archives/edgar/daily-index/2021/QTR3/master.20210709.idx
https://www.sec.gov/Archives/edgar/daily-index/2021/QTR3/master.20210712.idx
https://www.sec.gov/Archives/edgar/daily-index/2021/QTR3/master.20210713.idx
https://www.sec.gov/Archives/edgar/daily-index/2021/QTR3/master.20210714

In [65]:
file_url = r'https://www.sec.gov/Archives/edgar/daily-index/2021/QTR4/master.20211112.idx'

# Download the master index once and cache it under ./data, so the cell that
# reads ./data/master.20211112.txt also works on a fresh kernel / checkout.
# The request is skipped when the cached copy already exists, so re-running
# this cell does not hit sec.gov again.
master_path = './data/master.20211112.txt'

if not os.path.exists(master_path):
    os.makedirs(os.path.dirname(master_path), exist_ok=True)

    response = requests.get(file_url, headers=headers, timeout=30)
    # do not cache an HTTP error page as if it were the index file
    response.raise_for_status()

    with open(master_path, 'wb') as f:
        f.write(response.content)


In [71]:
# read the master index file
with open('./data/master.20211112.txt', 'rb') as f:
    byte_data = f.read()

data = byte_data.decode('utf-8').split('\n')

# Records start on the line after the dashed separator that closes the header.
# Look the separator up explicitly and take its FIRST occurrence: if it is
# missing, start_ind would otherwise be undefined (or silently left over from
# an earlier run of this cell), and a later line that happens to contain
# dashes must not discard the records before it.
start_ind = next(
    (index for index, item in enumerate(data) if '-----' in item),
    None,
)
if start_ind is None:
    raise ValueError(
        "no '-----' separator line found in ./data/master.20211112.txt"
    )

data_set = data[start_ind + 1:]

# CIK|Company Name|Form Type|Date Filed|File Name
FIELD_NAMES = ('CIK', 'COMPANY_NAME', 'FORM_TYPE', 'DATE', 'FILE_NAME')

master_data = []

for line in data_set:

    # strip() removes a trailing '\r' (CRLF line endings) or stray whitespace
    # that would otherwise end up inside FILE_NAME and the printed URLs.
    fields = line.strip().split('|')

    # blank or malformed lines do not have exactly five fields and are skipped
    if len(fields) == len(FIELD_NAMES):
        master_data.append(dict(zip(FIELD_NAMES, fields)))

# find out 10-K, 10-Q & 6-K (exact form-type matches only, so amended forms
# such as '10-K/A' are not included)
master_data_10_K = [rec for rec in master_data if rec['FORM_TYPE'] == '10-K']
master_data_10_Q = [rec for rec in master_data if rec['FORM_TYPE'] == '10-Q']
master_data_6_K = [rec for rec in master_data if rec['FORM_TYPE'] == '6-K']

print('10-K:', len(master_data_10_K))
print('10-Q:', len(master_data_10_Q))
print('6-K:', len(master_data_6_K))

print('-' * 100)

# The loop variable is deliberately not called `data`, so the list of raw
# lines above stays intact after this cell has run.
for filing in master_data_10_K:
    print(filing['COMPANY_NAME'], 'https://www.sec.gov/Archives/' + filing['FILE_NAME'])

print('-' * 100)

# only show the 6-K filings of the company with CIK 1736541
for filing in master_data_6_K:

    cik_num = filing['CIK']
    if cik_num == '1736541':
        print(filing['COMPANY_NAME'], 'https://www.sec.gov/Archives/' + filing['FILE_NAME'])

10-K: 12
10-Q: 734
6-K: 267
----------------------------------------------------------------------------------------------------
CMC Materials, Inc. https://www.sec.gov/Archives/edgar/data/1102934/0001102934-21-000052.txt
VERDE RESOURCES, INC. https://www.sec.gov/Archives/edgar/data/1506929/0001640334-21-002756.txt
Clancy Corp https://www.sec.gov/Archives/edgar/data/1681769/0001520138-21-000713.txt
Elite Performance Holding Corp https://www.sec.gov/Archives/edgar/data/1753681/0001477932-21-008045.txt
KURA SUSHI USA, INC. https://www.sec.gov/Archives/edgar/data/1772177/0001564590-21-056423.txt
Phoenix Plus Corp. https://www.sec.gov/Archives/edgar/data/1785493/0001493152-21-028105.txt
Hill-Rom Holdings, Inc. https://www.sec.gov/Archives/edgar/data/47518/0000047518-21-000084.txt
MONMOUTH REAL ESTATE INVESTMENT CORP https://www.sec.gov/Archives/edgar/data/67625/0001493152-21-028191.txt
ATMOS ENERGY CORP https://www.sec.gov/Archives/edgar/data/731802/0000731802-21-000040.txt
LANDSTAR SYSTEM