# Capstone Project Part 02: Web Scraping from the SEC's EDGAR

In [1]:
import pandas as pd

# A small library for accessing filings from the SEC's EDGAR system.
from edgar.company import Company
from edgar.xbrl import XBRL, XBRLElement

In [2]:
# Load the S&P 500 constituents table. Every column is read as text so that
# zero-padded identifiers such as the CIK (e.g. 0000066740) keep their
# leading zeros instead of being parsed as integers.
df = pd.read_csv(
    '../data/sp500.csv',
    dtype='str',
)
df

Unnamed: 0,ticker,security,sector,cik
0,MMM,3M Company,Industrial Conglomerates,0000066740
1,ABT,Abbott Laboratories,Health Care Equipment,0000001800
2,ABBV,AbbVie Inc.,Pharmaceuticals,0001551152
3,ABMD,ABIOMED Inc,Health Care Equipment,0000815094
4,ACN,Accenture plc,IT Consulting & Other Services,0001467373
...,...,...,...,...
500,YUM,Yum! Brands Inc,Restaurants,0001041061
501,ZBRA,Zebra Technologies,Electronic Equipment & Instruments,0000877212
502,ZBH,Zimmer Biomet Holdings,Health Care Equipment,0001136869
503,ZION,Zions Bancorp,Regional Banks,0000109380


In [3]:
# Three parallel lists: position k in each one describes the same XBRL element.
securities = []    # company name the element was scraped for
dicts = []         # XBRLElement(...).to_dict() output for the element
context_refs = []  # the element's `contextRef` attribute, or None when absent

EDGAR_CSV_PATH = '../data/edgar.csv'


def _save_progress():
    """Write everything scraped so far to EDGAR_CSV_PATH, overwriting the file.

    The three module-level lists are joined column-wise: `security`, then the
    columns produced from `dicts`, then `context_ref`.
    """
    pd.concat(
        (
            pd.Series(securities, name='security'),
            pd.DataFrame(dicts),
            pd.Series(context_refs, name='context_ref'),
        ),
        axis=1,
    ).to_csv(EDGAR_CSV_PATH, index=False)


for ticker_count, i in enumerate(df.index):
    # To get XBRL data
    security = df.loc[i, 'security']
    company = Company(
        name=security,  # company name
        cik=df.loc[i, 'cik'],  # company CIK (Central Index Key) number
    )
    results = company.get_data_files_from_10K(
        document_type="EX-101.INS",  # Type of document requested

        # Number of documents to be retrieved, i.e. FY2009 to FY2018 when
        # the 'EX-101.INS' format is applicable
        no_of_documents=10,

        # Default: False.
        # By default, parsing is case-insensitive and done with `html` in
        # `lxml`. When True, the document is parsed with `etree`, which is
        # case sensitive.
        isxml=True,
    )

    # Parse one filing at a time rather than building a list of every parsed
    # filing for the company up front.
    for result in results:
        xbrl = XBRL(result)  # Parses data from XBRL

        # Children that are not `context`, `unit`, `schemaRef`, with cleaned tags
        for e in xbrl.relevant_children_parsed:
            # Dictionary of name, value, and schemaRef
            record = XBRLElement(e).to_dict()

            # Only a missing attribute should fall back to None; a bare
            # `except:` here would also swallow KeyboardInterrupt/SystemExit.
            try:
                context_ref = e.attrib['contextRef']
            except (KeyError, AttributeError):
                context_ref = None

            # Append to all three lists together so they can never drift out
            # of alignment, even if a later element or filing raises.
            securities.append(security)
            dicts.append(record)
            context_refs.append(context_ref)

    # Checkpoint every 10th company so a crash does not lose the whole run.
    if ticker_count % 10 == 0:
        print('{} completed.'.format(ticker_count))
        _save_progress()

print('All completed.')
_save_progress()

0 completed.
10 completed.
20 completed.
30 completed.
40 completed.
50 completed.
60 completed.
70 completed.
80 completed.
90 completed.
100 completed.
110 completed.
120 completed.
130 completed.
140 completed.
150 completed.
160 completed.
170 completed.
180 completed.
190 completed.
200 completed.
210 completed.
220 completed.
230 completed.
240 completed.
250 completed.
260 completed.
270 completed.
280 completed.
290 completed.
300 completed.
310 completed.
320 completed.
330 completed.
340 completed.
350 completed.
360 completed.
370 completed.
380 completed.
390 completed.
400 completed.
410 completed.
420 completed.
430 completed.
440 completed.
450 completed.
460 completed.
470 completed.
480 completed.
490 completed.
500 completed.
All completed.
