# Capstone Project Part 05: Webscraping from SEC's EDGAR

## Import libraries and modules

In [1]:
# Basic libraries
import pandas as pd

# A small library to access filings from the SEC's EDGAR system.
from edgar.company import Company
from edgar.xbrl import XBRL, XBRLElement

In [2]:
# Load the S&P 1000 constituent table. Every column is read as a string so
# the zero-padded `cik` values are kept as text rather than parsed as numbers.
df = pd.read_csv(
    filepath_or_buffer='../data/sp1000.csv',
    dtype='str',
)
df  # last expression of the cell: displays the loaded table in the notebook

Unnamed: 0,ticker,security,sector,cik
0,AAN,Aaron's Inc.,Consumer Discretionary,0000706688
1,AAOI,Applied Optoelect,Information Technology,0001158114
2,AAON,AAON Inc,Industrials,0000824142
3,AAT,American Assets Trust,Real Estate,0001500217
4,AAWW,Atlas Air Worldwide Holdings,Industrials,0001135185
...,...,...,...,...
995,XPER,Xperi Corporation,Information Technology,0001690666
996,Y,Alleghany Corporation,Financials,0000775368
997,ZBRA,Zebra Technologies Corp,Information Technology,0000877212
998,ZEUS,"Olympic Steel, Inc.",Materials,0000917470


In [3]:
# Destination of both the periodic checkpoints and the final result.
OUTPUT_PATH = '../data/edgar_sp1000.csv'


def _save_progress(securities, dicts, context_refs, path=OUTPUT_PATH):
    """Write everything collected so far to ``path`` as a single CSV.

    The three sequences are placed side by side: the security name, the
    columns produced from the element dictionaries, and the context reference.
    The file at ``path`` is overwritten on every call.
    """
    pd.concat(
        (
            pd.Series(securities, name='security'),
            pd.DataFrame(dicts),
            pd.Series(context_refs, name='context_ref'),
        ),
        axis=1,
    ).to_csv(path, index=False)


# Three parallel lists: one entry per XBRL element collected.
securities = []
dicts = []
context_refs = []

for ticker_count, i in enumerate(df.index):
    # Look up the company on EDGAR by name and CIK (Central Index Key).
    security = df.loc[i, 'security']
    company = Company(
        name=security,
        cik=df.loc[i, 'cik'],
    )
    results = company.get_data_files_from_10K(
        document_type="EX-101.INS",  # type of document requested

        # Number of documents to retrieve, i.e. FY2009 to FY2018, the years
        # for which the 'EX-101.INS' format is applicable.
        no_of_documents=10,

        # Default: False. By default parsing is case-insensitive and done
        # with `html` in `lxml`. When True, the document is parsed with
        # `etree`, which is case-sensitive.
        isxml=True,
    )

    # Parse one filing at a time so only a single parsed document is held
    # in memory, instead of building a list of all of them first.
    for result in results:
        xbrl = XBRL(result)  # parses data from XBRL
        # Children that are not `context`, `unit`, `schemaRef`, with cleaned tags.
        for e in xbrl.relevant_children_parsed:
            # Dictionary of name, value, and schemaRef.
            dicts.append(XBRLElement(e).to_dict())
            try:
                context_refs.append(e.attrib['contextRef'])
            except (KeyError, AttributeError):
                # Element has no contextRef; keep the row with an empty value.
                # Only these errors are caught so Ctrl-C is not swallowed.
                context_refs.append(None)
            # Appended per element so the three lists always stay aligned.
            securities.append(security)

    # Checkpoint every tenth ticker (ticker_count is the zero-based position)
    # so a crash part-way through does not lose all collected data.
    if ticker_count % 10 == 0:
        print('{} completed.'.format(ticker_count))
        _save_progress(securities, dicts, context_refs)

print('All completed.')
_save_progress(securities, dicts, context_refs)

0 completed.
10 completed.
20 completed.
30 completed.
40 completed.
50 completed.
60 completed.
70 completed.
80 completed.
90 completed.
100 completed.
110 completed.
120 completed.
130 completed.
140 completed.
150 completed.
160 completed.
170 completed.
180 completed.
190 completed.
200 completed.
210 completed.
220 completed.
230 completed.
240 completed.
250 completed.
260 completed.
270 completed.
280 completed.
290 completed.
300 completed.
310 completed.
320 completed.
330 completed.
340 completed.
350 completed.
360 completed.
370 completed.
380 completed.
390 completed.
400 completed.
410 completed.
420 completed.
430 completed.
440 completed.
450 completed.
460 completed.
470 completed.
480 completed.
490 completed.
500 completed.
510 completed.
520 completed.
530 completed.
540 completed.
550 completed.
560 completed.
570 completed.
580 completed.
590 completed.
600 completed.
610 completed.
620 completed.
630 completed.
640 completed.
650 completed.
660 completed.
670 co