In [11]:

import requests
from lxml import html
from bs4 import BeautifulSoup
import re
from datetime import date
import pandas as pd
import requests

### GET TICKER FROM USER (VALIDATION INCLUDED)

In [12]:
def get_ticker():
    """Ask for a ticker until the user confirms a company, then return its CIK.

    The ticker is looked up on the SEC EDGAR company search page. Any <h1>
    on the result page is treated as the "no match" notice: its text is
    printed and the user is asked again. Otherwise the text of the first
    ``span.companyName`` element is shown and the user confirms with Y or N.

    Returns:
        str: the digits that follow ``CIK#:`` in the displayed company name.

    NOTE(review): SEC EDGAR asks automated clients to identify themselves
    with a User-Agent header; this request sends none - confirm whether one
    is needed.
    """
    #count = 5 so the page parses fast; only the company name is needed
    baselink = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK={}&type=&dateb=&owner=exclude&count=5'

    while True:
        # surrounding whitespace would otherwise end up inside the URL
        comp = input('Please enter the ticker for the company you want to search(required):').strip()
        if comp == '':
            continue

        #search lowercase
        url = baselink.format(comp.lower())
        # a timeout keeps the prompt from hanging forever on a stalled connection
        page = requests.get(url, timeout=30)
        soup = BeautifulSoup(page.content, 'html.parser')

        #an <h1> on the result page means no match was found
        no_match = soup.find_all('h1')
        if len(no_match) != 0:
            print(no_match[0].text)
            print('Please try again')
            continue

        #a page without a company name cannot be confirmed, so ask again
        name_spans = soup.find_all('span', class_='companyName')
        if len(name_spans) == 0:
            print('Could not find a company name on the search result page.')
            print('Please try again')
            continue
        comp_name = name_spans[0].text.replace('(see all company filings)', '')

        #ask the user whether this is the company they are looking for
        answer = ''
        while answer not in ('Y', 'N'):
            print('')
            print('')
            print('Is this the company you are looking for? (Press Y for yes N for no)')
            answer = input(comp_name).strip().upper()

        if answer == 'N':
            continue

        # prefer an explicit pattern over positional splitting, which depends
        # on the exact spacing at the end of the span text
        cik_match = re.search(r'CIK#:\s*(\d+)', comp_name)
        if cik_match:
            return cik_match.group(1)
        return comp_name.split(' ')[-2]


### GET THE DATE SEARCH RANGE AND A SEPARATE DATE VALIDATION

In [13]:
def date_validation(start, end):
    """Validate a start/end pair typed by the user as YYYYMMDD strings.

    Args:
        start: start date as typed, or '' when skipped.
        end: end date as typed, or '' when skipped.

    Returns:
        ['', ''] when both entries are empty (no date restriction),
        [start, end] as integers in YYYYMMDD form when the range is valid
        (an end date later than today is replaced by today), or
        False after printing a hint when the entry is rejected.
    """
    #ignore if empty start and end
    if start == '' and end == '':
        return ['', '']

    #check format length
    if len(start) != 8 or len(end) != 8:
        print('Please check date entered!')
        return False

    #see if user entered a real calendar date made of digits
    try:
        pd.to_datetime(start, format='%Y%m%d')
        pd.to_datetime(end, format='%Y%m%d')
        start = int(start)
        end = int(end)
    # only parsing/conversion failures are treated as bad input, so that
    # KeyboardInterrupt and genuine programming errors still surface
    except (ValueError, TypeError, OverflowError):
        print('Please enter the right date (numbers only)')
        return False

    current_date = int(date.today().strftime('%Y%m%d'))

    #check if reasonable begin & end date
    if start > current_date or start < 20090501 or end < 20090501 or end < start:
        print('Please check the year entry. The application do not support search prior to May 2009.')
        return False

    #an end date in the future is replaced by today
    return [start, min(end, current_date)]
        
        
def get_year():
    """Prompt for a start and end date until date_validation accepts them.

    Returns:
        whatever date_validation returned for the accepted entry; judging by
        the comment on that function, a two-item list [startdate, enddate].
    """
    while True:
        print('Date format: YYYYMMDD')
        begin = input('optional - Search Start year (Press enter to pass):')
        finish = input('optional - Search end year (Press enter to pass):')
        outcome = date_validation(begin, finish)
        # the validator signals rejection with False, so compare against it
        # explicitly instead of relying on truthiness
        if outcome != False:
            return outcome

### GET FILE TYPE (10-K ONLY SO FAR)

In [14]:
def get_file_type():
    """Prompt for a document type until a supported one is entered.

    Only the 10-K form is supported so far. The entry is accepted in any
    letter case, with or without a space or hyphen between "10" and "K",
    and with surrounding whitespace.

    Returns:
        str: always '10-K'.
    """
    accepted_spellings = ('10K', '10 K', '10-K')
    while True:
        answer = input ('what document are you looking for?').strip()
        if answer == '':
            continue
        if answer.upper() in accepted_spellings:
            return '10-K'
        print('Please check the file type!')
    

### GET THE ACTUAL 10K FILES

#### Get the initial search page

In [15]:
def get_files(CIK, search_range,file_type):
    """Download the EDGAR filing list for one company and parse it.

    Args:
        CIK: company identifier placed in the ``CIK`` query parameter.
        search_range: [start, end] pair; only the end value is used here,
            because the search link only accepts a "prior to" date (``dateb``).
        file_type: form type placed in the ``type`` query parameter.

    Returns:
        BeautifulSoup: the parsed result page (up to 100 entries requested).

    NOTE(review): SEC EDGAR asks automated clients to identify themselves
    with a User-Agent header; this request sends none - confirm whether one
    is needed.
    """
    baselink = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK={}&type={}&dateb={}&owner=exclude&count=100'

    #the start of the range cannot be expressed in the link
    end_year = search_range[1]
    link = baselink.format(CIK,file_type,end_year)

    #get html content; the timeout avoids hanging on a stalled connection
    page = requests.get(link, timeout=30)
    soup = BeautifulSoup(page.content, 'html.parser')
    return soup

#### get dataframe that contains the file type, link and date

In [16]:
def get_files_df(soup,search_range):
    """Collect type, link and date of the 10-K rows on a filing list page.

    Args:
        soup: parsed filing list page; its first three <tr> elements are
            skipped and every later one is read as a filing row.
        search_range: [start, end] pair of YYYYMMDD values (int or str).
            An empty string means "no limit" on that side, which is what
            the date prompt yields when the user skips it.

    Returns:
        pandas.DataFrame with the columns 'file type', 'file link' and
        'file date' (date as a YYYYMMDD string). Only rows whose type is
        exactly '10-K' and whose date lies strictly between the given
        bounds are kept.
    """
    def as_bound(value):
        # '' (skipped date) becomes None so that no comparison is made
        text = str(value).strip()
        return int(text) if text else None

    lower = as_bound(search_range[0])
    upper = as_bound(search_range[1])
    sec_link = 'http://sec.gov'

    #create lists that contain each file's information
    f_types = []
    f_links = []
    f_dates = []

    for row in list(soup.find_all('tr'))[3:]:
        nowrap_cells = row.find_all('td',{'nowrap':'nowrap'})
        all_cells = row.find_all('td')
        #skip rows that do not have the layout of a filing row
        if len(nowrap_cells) < 2 or len(all_cells) < 4:
            continue
        anchors = nowrap_cells[1].find_all('a', href = True)
        if len(anchors) == 0:
            continue

        file_type = nowrap_cells[0].text.strip()
        file_date = all_cells[3].text.replace('-','').strip()
        if not file_date.isdigit():
            continue
        file_link = sec_link + anchors[0]['href']

        date_number = int(file_date)
        after_start = lower is None or date_number > lower
        before_end = upper is None or date_number < upper
        if after_start and before_end and file_type == '10-K':
            f_types.append(file_type)
            f_dates.append(file_date)
            f_links.append(file_link)

    #construct a table:
    #['file type']: 10-k | ['file link']: sec.gov/Archive/....| ['file date': 19960303]
    files_table = pd.DataFrame({'file type':f_types,'file link':f_links,'file date':f_dates})
    return files_table


#### create a dataframe that filters for the right file types and dates

In [17]:
def filtered_df(search_range,file_type,files_table):
    """Return ``files_table`` unchanged.

    This step is a pass-through kept as a placeholder for date and
    file-type filtering: ``search_range`` and ``file_type`` are accepted
    but not used, and no rows are removed here.
    """
    return files_table

#### get the link to all the 10-k files 

In [18]:
def get_list_doc_link_text(filtered_df):
    """Resolve every filing index page to the link in its last table row.

    Args:
        filtered_df: table with a 'file link' column holding the URL of
            each filing's index page.

    Returns:
        list[str]: one absolute link per filing, built from the first
        anchor in the last row of the first table on the index page
        (in the recorded run this is the complete-submission .txt file).
    """
    sec_link = 'http://sec.gov'
    doc_links = []
    for URL in filtered_df['file link'].tolist():
        # the timeout avoids hanging on a stalled connection
        page_specific_10k = requests.get(URL, timeout=30)
        soup = BeautifulSoup(page_specific_10k.content, 'html.parser')
        last_row = soup.find_all('table')[0].find_all('tr')[-1]
        complete_file_link = last_row.find_all('a',href = True)[0]['href']
        doc_links.append(sec_link + complete_file_link)
    return doc_links
def get_link(filtered_df):
    """Return the 'file link' entry of the given table."""
    link_column = filtered_df['file link']
    return link_column

#### MAIN FUNCTION THAT USES THE OTHER FUNCTIONS

In [19]:

def main():
    """Run the interactive search from start to finish.

    Asks for the company, the date range and the document type, fetches
    the filing list, and resolves each kept filing to its document link.

    Returns:
        the list of document links produced by get_list_doc_link_text.

    TODO: should eventually return the result of the FCFF computation
    instead of the raw links.
    """
    cik = get_ticker()
    date_window = get_year()
    doc_type = get_file_type()
    listing_soup = get_files(cik, date_window, doc_type)
    listing_table = get_files_df(listing_soup, date_window)
    kept_rows = filtered_df(date_window, doc_type, listing_table)
    return get_list_doc_link_text(kept_rows)
    
    
    
    

In [20]:
#use the following test IF NEEDED
########################################
# CIK = int('0001652044')(googl)
#     search_range = [20170101,20180101]
#     file_type = any variation of 10k works (eg '10k','10 k','10K'....)
######################################
# main()

KeyboardInterrupt: 

## CODE ABOVE THIS LINE WORKS
# RUN EVERYTHING BEFORE 'MAIN()'
# UNFINISHED BELOW THIS LINE

In [21]:
#working on parsing each 10k
# doc_links = main()
# print (doc_links)

Please enter the ticker for the company you want to search(required):amzn


Is this the company you are looking for? (Press Y for yes N for no)
AMAZON COM INC CIK#: 0001018724 y
Date format: YYYYMMDD
optional - Search Start year (Press enter to pass):20170101
optional - Search end year (Press enter to pass):20180101
what document are you looking for?10k
['http://sec.gov/Archives/edgar/data/1018724/000101872417000011/0001018724-17-000011.txt']


In [1]:
# #writes the html to local
# for lk in doc_links:
#     t_10k = requests.get(lk)
#     file_soup = BeautifulSoup(t_10k.content, 'html.parser')
#     name_format = '{}.html'
#     name = name_format.format(i)
#     test= open(name,"w")
#     test.write(str(file_soup))
#     test.close()


NameError: name 'doc_links' is not defined

In [22]:
# comp_name = get_ticker()
# search_range = get_year()
# file_type = get_file_type()
# initial_search_html = get_files(comp_name,search_range,file_type)
# soup = initial_search_html



Please enter the ticker for the company you want to search(required):amzn


Is this the company you are looking for? (Press Y for yes N for no)
AMAZON COM INC CIK#: 0001018724 y
Date format: YYYYMMDD
optional - Search Start year (Press enter to pass):20170101
optional - Search end year (Press enter to pass):20180101
what document are you looking for?10k


In [23]:

# file_rows = list(soup.find_all('tr'))[3:]
# sec_link = 'http://sec.gov'
# f_links = []

# for row in file_rows:
#     file_type = row.find_all('td',{'nowrap':'nowrap'})[0].text
#     file_date = row.find_all('td')[3].text
#     file_link = sec_link+row.find_all('td',{'nowrap':'nowrap'})[1].find_all('a', href = True)[0]['href']
        


# print (file_link)



http://sec.gov/Archives/edgar/data/1018724/0000891020-98-000448-index.html


# New test starts here
The following cells extract the four or five financial statements from the filing's HTML. The last cell writes these statements to an .html file for testing purposes.

In [70]:
# Run the interactive search; `link` holds the list of document links that main() returns.
link = main()
print(link)

Please enter the ticker for the company you want to search(required):amzn


Is this the company you are looking for? (Press Y for yes N for no)
AMAZON COM INC CIK#: 0001018724 y
Date format: YYYYMMDD
optional - Search Start year (Press enter to pass):20150101
optional - Search end year (Press enter to pass):20160101
what document are you looking for?10k
['http://sec.gov/Archives/edgar/data/1018724/000101872415000006/0001018724-15-000006.txt']


In [71]:
import urllib.request

# Download the first document found by the search and decode it as UTF-8.
# The context manager closes the HTTP response as soon as the body is read.
with urllib.request.urlopen(link[0]) as response:
    broken_xml = response.read().decode('utf-8')

# Parsed view of the whole filing; the raw text is kept for splitting later.
file_soup = BeautifulSoup(broken_xml, 'html.parser')
len(broken_xml)

15432046

In [72]:
# tables = file_soup.find_all('table')[8]

In [73]:

# finstat =  tables.find(lambda tag:tag.name=="tr" and "Financial Statements" in tag.text)

In [128]:
##### split the filing text on the divider marker
# every occurrence of the literal 'page-break' starts a new chunk
pages = broken_xml.split('page-break')
len(pages)

78

In [129]:
# Titles of the sections to look for in the filing, one per line.
documents = [
    'Consolidated Statements of Cash Flows',
    'Consolidated Statements of Operations',
    'Consolidated Statements of Comprehensive Income',
    'Consolidated Balance Sheets',
    'Consolidated Statements of Stockholders’ Equity',
    'Notes to Consolidated Financial Statements',
]

In [307]:
def _tag_statement_pages(pages, titles):
    """Pair each page with the statement title it mentions.

    A page is kept when the upper-cased form of at least one title occurs
    in its text. When several titles occur, the one listed last in
    ``titles`` is used.

    Returns:
        list of [title, page] pairs in page order.
    """
    tagged = []
    for page in pages:
        matched_title = None
        for title in titles:
            if title.upper() in page:
                # later titles overwrite earlier ones on purpose
                matched_title = title
        if matched_title is not None:
            tagged.append([matched_title, page])
    return tagged


# not filled in this cell; kept so the name stays defined
fin_document_dict = {}
doc_list = _tag_statement_pages(pages, documents)

In [308]:
# Turn every tagged page into a [name, DataFrame] pair built from the page's first table.
# display_html is imported here but not used in this cell.
from IPython.display import display_html
statements = []
for pair in doc_list:
    name = pair[0]
    page = pair[1]
    
    #parse page
    file_soup = BeautifulSoup(page, 'html.parser')
    # string form of the list of all <table> tags found on the page
    table = str(file_soup.find_all('table'))
    # first parsed table only: skip its first three rows, drop columns that are
    # entirely empty, and replace the remaining missing cells with 0
    df = pd.read_html(table)[0].iloc[3:,:].dropna(axis = 1, how = 'all').fillna(0)
    # promote the first remaining row to column headers
    head = df.iloc[0]
    df = df[1:]
    df.columns = head
    # column labelled 0 - presumably the row-label column, whose empty header
    # cell became 0 through fillna above (TODO confirm)
    # strips parentheses, commas, semicolons and every '0' character from it
    # NOTE(review): .replace('','') does nothing as written - possibly a lost special character
    df[0] = df[0].apply(lambda x:x.replace('(','').replace(')','').replace('','').replace(',','').replace(';','').replace('0',''))
    df = df.set_index(0)
    pair = [name,df]
    statements.append(pair)
    
    
    
    #break
    

In [309]:
def _collapse_statement_columns(raw_table):
    """Build a table that keeps one raw column per group of three.

    The number of output columns is the raw column count divided by three
    (rounded down). Output column i takes the values of raw column 3*i + 1
    and is named after the i-th distinct raw column label. The row labels
    are carried over under the index name 'index'.
    """
    group_count = int(len(raw_table.columns) / 3)
    distinct_labels = raw_table.columns.drop_duplicates().tolist()

    tidy = pd.DataFrame()
    tidy['index'] = raw_table.index.tolist()
    tidy = tidy.set_index('index')

    for position in range(group_count):
        values = raw_table.iloc[:, 3 * position + 1].values
        tidy[distinct_labels[position]] = values
    return tidy


# one [name, cleaned table] pair per parsed statement, in the same order
clean_statements = [
    [entry[0], _collapse_statement_columns(entry[1])]
    for entry in statements
]
        
    
    
    

# Run the previous code
`clean_statements` is a nested list. Its structure looks like this:
clean_statements =
[
                 [statement's name, statement dataframe (which contains all the numbers)],
                 ......
                 ]

In [312]:
# statements[0].iloc[:,1]
# show the fourth [name, dataframe] pair; in the recorded output this is the balance sheet
clean_statements[3]

['Consolidated Balance Sheets',
                                               2014    2013
 index                                                     
 ASSETS                                           0       0
 Current assets:                                  0       0
 Cash and cash equivalents                    14557    8658
 Marketable securities                         2859    3789
 Inventories                                   8299    7411
 Accounts receivable net and other             5612    4767
 Total current assets                         31327   24625
 Property and equipment net                   16967   10949
 Goodwill                                      3319    2655
 Other assets                                  2892    1930
 Total assets                                 54505   40159
 LIABILITIES AND STOCKHOLDERS’ EQUITY             0       0
 Current liabilities:                             0       0
 Accounts payable                             16459   15133
 Accrued

ImportError: html5lib not found, please install it

In [44]:
# # write to html as test
# file = open('test.html','w')
# for i in temp_list:
#     file.write(i)
# file.close()