### Scraping financial statements from EDGAR 

In [1]:
import bs4 as bs
import requests
import pandas as pd
import re

# Filing to look up: company name as it appears in the EDGAR index, form type,
# and the year/quarter of the index in which the filing was published.
company = 'Facebook Inc'
filing = '10-Q'
year = 2020
quarter = 'QTR3'

# SEC's fair-access policy expects automated clients to declare a User-Agent.
# Replace the placeholder with your own name and contact e-mail.
SEC_HEADERS = {'User-Agent': 'Sample Company Name admin@example.com'}

# Download the quarterly master index (one pipe-delimited line per filing).
index_url = f'https://www.sec.gov/Archives/edgar/full-index/{year}/{quarter}/master.idx'
index_resp = requests.get(index_url, headers=SEC_HEADERS, timeout=30)
# Fail here instead of treating an HTTP error page as index content.
index_resp.raise_for_status()
# Some index files contain bytes that are not valid UTF-8; latin-1 maps every
# byte to a character, so decoding cannot raise.
download = index_resp.content.decode('latin-1').split('\n')

In [2]:
# Locate the filing in the master index and build the URL of its main document.

# SEC's fair-access policy expects automated clients to declare a User-Agent.
# Replace the placeholder with your own name and contact e-mail.
SEC_HEADERS = {'User-Agent': 'Sample Company Name admin@example.com'}
ARCHIVES_URL = 'https://www.sec.gov/Archives/'

url = None
for item in download:
    # Filing rows are CIK|Company Name|Form Type|Date Filed|Filename; anything
    # else (preamble, separator, blank line) is skipped.
    fields = item.strip().split('|')
    if len(fields) != 5:
        continue
    # Compare the form type exactly so that e.g. 'NT 10-Q' or '10-Q/A' rows are
    # not mistaken for the report itself. As before, the last matching row wins.
    if company in fields[1] and fields[2] == filing:
        url = fields[-1]

if url is None:
    raise ValueError(f'No {filing} filing found for {company!r} in the downloaded index')

print(url) #edgar/data/1326801/0001326801-20-000076.txt

# The filing's folder is the accession number with the dashes and '.txt' removed.
folder, accession_file = url.rsplit('/', 1)
url2 = folder + '/' + accession_file.split('.txt')[0].replace('-', '')
print(url2) #edgar/data/1326801/000132680120000076

# The full-submission text file names every document in the filing; the first
# <FILENAME> entry is taken as the main document.
to_get_html_site = ARCHIVES_URL + url
submission_resp = requests.get(to_get_html_site, headers=SEC_HEADERS, timeout=30)
# Fail here instead of searching an HTTP error page for the file name.
submission_resp.raise_for_status()
submission_text = submission_resp.content.decode('utf-8', errors='replace')
_, marker, remainder = submission_text.partition('FILENAME>')
if not marker:
    raise ValueError(f'No <FILENAME> entry found in {to_get_html_site}')
data = remainder.split('\n', 1)[0].strip()

url_to_use = ARCHIVES_URL + url2 + '/' + data
print(url_to_use)

edgar/data/1326801/0001326801-20-000076.txt
edgar/data/1326801/000132680120000076
https://www.sec.gov/Archives/edgar/data/1326801/000132680120000076/fb-06302020x10q.htm


In [3]:
# Download the main filing document and parse it for the sections below.
# SEC's fair-access policy expects automated clients to declare a User-Agent.
# Replace the placeholder with your own name and contact e-mail.
SEC_HEADERS = {'User-Agent': 'Sample Company Name admin@example.com'}
resp = requests.get(url_to_use, headers=SEC_HEADERS, timeout=30)
# Fail here instead of parsing an HTTP error page as if it were the filing.
resp.raise_for_status()
soup = bs.BeautifulSoup(resp.text, 'lxml')

### Analysing SEC EDGAR Annual Reports with Python

Sample questions: 

- What are the company's key competitors?
- What risks is the company facing?

In [6]:
import nltk
nltk.download('punkt')

# Keyword to look for; edit this value to search for a different term.
word_to_analyze = 'compete'

# Visit every <span> that follows the first <div> of the document. For each
# span whose text contains the keyword, print that span's text split into
# sentences (all of its sentences, not only the ones with the keyword).
for span in soup.div.find_all_next('span'):
    span_text = span.getText()
    if word_to_analyze not in span_text:
        continue
    print(nltk.sent_tokenize(span_text))

['We compete with companies that sell advertising, as well as with companies that provide social, media, and communication products and services that are designed to engage users on mobile devices and online.', 'We face significant competition in every aspect of our business, including from companies that facilitate communication and the sharing of content and information, companies that enable marketers to display advertising, companies that distribute video and other forms of media content, and companies that provide development platforms for applications developers.', 'We compete with companies that offer products across broad platforms that replicate capabilities we provide.', 'For example, among other areas, we compete with Apple in messaging, Google and YouTube in advertising and video, Tencent and Snap in messaging and social media, ByteDance and Twitter in social media, and Amazon in advertising.', 'We also compete with companies that provide regional social networks and messag

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/eunicepark/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Scraping Balance Sheet from EDGAR using Python

In [8]:
def _ix_value_to_float(cell):
    """Convert one ``ix:nonfraction`` tag to a float, or NaN if it holds no number."""
    # Drop thousands separators; an opening parenthesis is read as a minus sign.
    text = cell.text.replace(',', '').replace('(', '-').replace(')', '').strip()
    # Negative facts may carry the minus in a ``sign`` attribute instead of the text.
    sign = cell.get('sign')
    if sign and not text.startswith(sign):
        text = sign + text
    try:
        return float(text)
    except ValueError:
        # Placeholders such as '—' or an empty cell are not numbers.
        return float('nan')


def balance_Sheet(soup, year, quarter):
    """Extract the balance sheet of a parsed filing into a one-column DataFrame.

    The table is located through the text node "Current assets:". Each table row
    containing an ``ix:nonfraction`` tag contributes one line item: the tag's
    ``name`` attribute (with any ``us-gaap:`` prefix removed) becomes the row
    label and the first such tag in the row provides the value. Values are taken
    as printed in the filing; no scaling is applied.

    Parameters
    ----------
    soup : parsed filing document (BeautifulSoup object).
    year, quarter : combined as ``str(year) + quarter`` (e.g. ``'2020QTR3'``)
        to form the column label.

    Returns
    -------
    pandas.DataFrame
        One float column; rows are in the order they appear in the filing and
        cells that do not hold a number are NaN.

    Raises
    ------
    ValueError
        If no table containing "Current assets:" is found.
    """
    anchor = soup.find(string="Current assets:")
    table = anchor.find_parent("table") if anchor is not None else None
    if table is None:
        raise ValueError('Could not find a table containing "Current assets:" in the filing')

    period = str(year) + quarter
    line_items = {}

    # The first three rows are skipped, as in the original implementation
    # (presumably the table's header rows).
    for row in table.find_all('tr')[3:]:
        cell = row.find("ix:nonfraction")
        # Rows without a tagged, named fact (section headings, spacers) are skipped.
        if cell is None or 'name' not in cell.attrs:
            continue
        item = cell.attrs['name'].replace('us-gaap:', '')
        line_items[item] = _ix_value_to_float(cell)

    # Passing the labels as an explicit index keeps the filing's row order.
    BS = pd.DataFrame({period: list(line_items.values())}, index=list(line_items))
    return BS

# Build the balance sheet from the filing parsed into `soup`; as the cell's last
# expression, the returned DataFrame is displayed by the notebook.
balance_Sheet(soup,year,quarter)

 5
— 5


Unnamed: 0,2020QTR3
CashAndCashEquivalentsAtCarryingValue,21045
AvailableForSaleSecuritiesDebtSecuritiesCurrent,37195
AccountsReceivableNetCurrent,7483
PrepaidExpenseAndOtherAssetsCurrent,2407
AssetsCurrent,68130
PropertyPlantAndEquipmentNet,39006
OperatingLeaseRightOfUseAsset,9429
IntangibleAssetsNetExcludingGoodwill,859
Goodwill,19029
OtherAssetsNoncurrent,3238
