# Web Scraping the SEC Query Page
Joseph Obonyo  
Source code adapted from [Sigma Coding](https://github.com/josephobonyo/sigma_coding_youtube/tree/master/python/python-finance/sec-web-scraping), with additional code added.

In [1]:
# import our libraries
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [71]:
# base URL for the SEC EDGAR browser
endpoint = r"https://www.sec.gov/cgi-bin/browse-edgar"

# define our parameters dictionary
param_dict = {'action':'getcompany',
              'owner':'exclude',
              'company':'walt disney'}  # Replace the company value with the one you want to search for

# identify ourselves to the server, the same way the other requests in this notebook do.
headers = {"User-Agent": "My-User_Agent", 'From': 'jobonyo@drew.edu'}

# request the url; the timeout keeps a stalled connection from hanging the cell forever.
response = requests.get(url = endpoint, params = param_dict, headers=headers, timeout=30)

# raise on a 4xx/5xx status so we never parse an error page or report a false success.
response.raise_for_status()

# parse the response.
soup = BeautifulSoup(response.content, 'html.parser')

# Let the user know it was successful (only reached when the status check passed).
print('Request Successful')
print(response.url)

Request Successful
https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&owner=exclude&company=walt+disney


In [102]:
# base URL for the SEC EDGAR browser
endpoint = r"https://www.sec.gov/cgi-bin/browse-edgar"

# define our parameters dictionary
param_dict = {'action':'getcompany',
              'CIK':'0001744489',
              'type':'8-k',
              'dateb':'20210101',
              'owner':'exclude',
              'start':'',
              'output':'',
              'count':'5'}

# identify ourselves to the server with a user-agent and contact address.
headers = {"User-Agent": "My-User_Agent", 'From': 'jobonyo@drew.edu'}

# request the url; the timeout keeps a stalled connection from hanging the cell forever.
response = requests.get(url = endpoint, params = param_dict, headers=headers, timeout=30)

# raise on a 4xx/5xx status so we never parse an error page or report a false success.
response.raise_for_status()

# parse the response.
soup = BeautifulSoup(response.content, 'html.parser')

# Let the user know it was successful (only reached when the status check passed).
print('Request Successful')
print(response.url)

Request Successful
https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=0001744489&type=8-k&dateb=20210101&owner=exclude&start=&output=&count=5


In [103]:
# find the document table with our data
doc_table = soup.find_all('table', class_='tableFile2')

# define a base url that will be used for link building.
base_url_sec = r"https://www.sec.gov"

# request headers for the per-filing pages; built once because they never change.
headers = {"User-Agent": "My-User_Agent", 'From': 'jobonyo@drew.edu'}

# marker that identifies the link to the filing's HTML document on the filing index page.
# (tighter than a bare 'ix' so unrelated hrefs that merely contain those letters don't match)
filing_identifier = '/ix?doc='

master_list = []


def _absolute_link(anchor):
    """Return the full sec.gov URL for an <a> tag, or 'no link' when the tag or its href is missing."""
    href = anchor.get('href') if anchor is not None else None
    if not href:
        return 'no link'
    return base_url_sec + href


# guard against a page without the results table (e.g. an empty search or an error page).
if doc_table:
    filing_rows = doc_table[0].find_all('tr')
else:
    filing_rows = []
    print('No filings table found in the response.')

# loop through each row in the table.
for row in filing_rows:

    # find all the columns
    cols = row.find_all('td')

    # header rows have no <td> cells, and we index up to cols[4] below,
    # so skip anything that does not have at least five columns.
    if len(cols) < 5:
        continue

    # grab the text
    # NOTE(review): cols[4] text appears to fuse two numbers together
    # (e.g. '001-38842201373408') - confirm whether they should be split.
    filing_type = cols[0].text.strip()
    filing_date = cols[3].text.strip()
    filing_numb = cols[4].text.strip()

    # find the links
    filing_doc_href = cols[1].find('a', {'href':True, 'id':'documentsbutton'})
    filing_int_href = cols[1].find('a', {'href':True, 'id':'interactiveDataBtn'})
    filing_num_href = cols[4].find('a')

    # build the absolute urls ('no link' when a button is absent)
    filing_doc_link = _absolute_link(filing_doc_href)
    filing_int_link = _absolute_link(filing_int_href)
    filing_num_link = _absolute_link(filing_num_href)

    # default for rows where the HTML file link cannot be determined.
    filing_link = 'no link'

    # only visit the filing index page when we actually have a url for it.
    if filing_doc_link != 'no link':

        # make request using user-agent to the filings page
        # NOTE: this rebinds `response` and `soup`, replacing the search-results page,
        # so re-run the request cell before re-running this one.
        response = requests.get(url = filing_doc_link, headers=headers, timeout=30)

        # a failed request just leaves this row's filing link as 'no link'.
        if response.ok:
            soup = BeautifulSoup(response.content, 'html.parser')

            # collect the hrefs of the filing links; anchors without an href are ignored.
            filing_hrefs = [a['href'] for a in soup.find_all('a', href=True)
                            if filing_identifier in a['href']]

            # use the first match only, so several matches can never be glued
            # together into one invalid url.
            if filing_hrefs:
                filing_link = base_url_sec + filing_hrefs[0]

    # create and store data in the dictionary
    file_dict = {
        'file_type': filing_type,
        'file_number': filing_numb,
        'file_date': filing_date,
        'links': {
            'documents': filing_doc_link,
            'interactive_data': filing_int_link,
            'filing_number': filing_num_link,
            'filing_link': filing_link,
        },
    }

    # let the user know it's working
    print('-'*100)
    print("Filing Type: " + filing_type)
    print("Filing Date: " + filing_date)
    print("Filing Number: " + filing_numb)
    print("Document Link: " + filing_doc_link)
    print("Filing Number Link: " + filing_num_link)
    print("Interactive Data Link: " + filing_int_link)
    print("Link to HTML File: " + filing_link)

    # append dictionary to master list
    master_list.append(file_dict)

----------------------------------------------------------------------------------------------------
Filing Type: 8-K
Filing Date: 2020-12-07
Filing Number: 001-38842201373408
Document Link: https://www.sec.gov/Archives/edgar/data/1744489/000174448920000223/0001744489-20-000223-index.htm
Filing Number Link: https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&filenum=001-38842&owner=exclude&count=10
Interactive Data Link: https://www.sec.gov/cgi-bin/viewer?action=view&cik=1744489&accession_number=0001744489-20-000223&xbrl_type=v
Link to HTML File: https://www.sec.gov/ix?doc=/Archives/edgar/data/1744489/000174448920000223/dis-20201203.htm
----------------------------------------------------------------------------------------------------
Filing Type: 8-K
Filing Date: 2020-11-12
Filing Number: 001-38842201306643
Document Link: https://www.sec.gov/Archives/edgar/data/1744489/000174448920000189/0001744489-20-000189-index.htm
Filing Number Link: https://www.sec.gov/cgi-bin/browse-edga