In [1]:

import requests
from lxml import html
from bs4 import BeautifulSoup
import re
from datetime import date
import pandas as pd
import requests

### GET TICKER FROM USER (VALIDATION INCLUDED)

In [70]:
def get_ticker():
    """Ask for a ticker until the user confirms a company, then return its CIK.

    The ticker is looked up on the SEC EDGAR company-browse page. The company
    heading found there is shown to the user, who confirms with Y or rejects
    with N (rejecting asks for another ticker).

    Returns:
        str: the CIK taken from the confirmed company heading,
        e.g. '0001652044'.
    """
    # count=5 keeps the page small: only the company heading is needed here
    baselink = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK={}&type=&dateb=&owner=exclude&count=5'

    while True:
        comp = input('Please enter the ticker for the company you want to search(required):')
        # surrounding whitespace would otherwise end up inside the URL;
        # the search itself is done with the lowercase ticker
        comp = comp.strip().lower()
        if comp == '':
            continue

        page = requests.get(baselink.format(comp))
        soup = BeautifulSoup(page.content, 'html.parser')

        # the page carries an <h1> message when no match is found
        no_match = soup.find_all('h1')
        if len(no_match) != 0:
            print(no_match[0].text)
            print('Please try again')
            continue

        name_spans = soup.find_all('span', class_='companyName')
        if len(name_spans) == 0:
            # neither an error heading nor a company heading: the page is not
            # what this parser expects, so ask again instead of failing with
            # an IndexError
            print('Please try again')
            continue

        comp_name = name_spans[0].text.replace('(see all company filings)', '')

        # keep asking until the answer is Y or N (case-insensitive)
        answer = ''
        while answer != 'Y' and answer != 'N':
            print('')
            print('')
            print('Is this the company you are looking for? (Press Y for yes N for no)')
            answer = input(comp_name).upper()

        if answer == 'Y':
            # read the digits that follow "CIK#:" rather than relying on the
            # exact spacing of the heading text
            cik_match = re.search(r'CIK#:\s*(\d+)', comp_name)
            if cik_match:
                return cik_match.group(1)
            # fall back to the positional lookup for headings in another shape
            return comp_name.split(' ')[-2]
        # answer was N: loop around and ask for another ticker


### GET THE DATE SEARCH RANGE AND A SEPARATE DATE VALIDATION

In [71]:
def date_validation(start, end):
    """Validate a pair of dates typed by the user in YYYYMMDD form.

    Args:
        start: start date as text; '' only when `end` is '' as well.
        end: end date as text; '' only when `start` is '' as well.

    Returns:
        ['', ''] when both inputs are empty (no date range requested);
        [start, end] as integers when the range is accepted, with `end`
        capped at today's date;
        False when the input is rejected (a message is printed first).
    """
    current_date = int(date.today().strftime('%Y%m%d'))

    # ignore if empty start and end
    if start == '' and end == '':
        return ['', '']
    # check format length
    if len(start) != 8 or len(end) != 8:
        print('Please check date entered!')
        return False

    # to_datetime rejects non-numeric text and impossible calendar dates;
    # the int conversion lives in the same guarded region so that malformed
    # input can never escape as an exception. Only conversion errors are
    # caught, so Ctrl-C still interrupts the prompt loop.
    try:
        pd.to_datetime(start, format='%Y%m%d')
        pd.to_datetime(end, format='%Y%m%d')
        start = int(start)
        end = int(end)
    except (ValueError, TypeError):
        print('Please enter the right date (numbers only)')
        return False

    # check if reasonable begin & end date
    if start > current_date or start < 19900101 or end < 19900101 or end < start:
        print('Please check the year entry. The application do not support search prior to 1990.')
        return False

    # an end date in the future is replaced by today's date
    return [start, min(end, current_date)]
        
        
def get_year():
    """Prompt for a start and an end date until date_validation accepts them.

    Returns:
        whatever date_validation returned for the accepted input: a two-item
        list [startdate, enddate].
    """
    while True:
        print('Date format: YYYYMMDD')
        start_text = input('optional - Search Start year (Press enter to pass):')
        end_text = input('optional - Search end year (Press enter to pass):')
        checked = date_validation(start_text, end_text)
        # a False result means the input was rejected, so ask again
        if checked == False:
            continue
        return checked

### GET FILE TYPE (10-K ONLY SO FAR)

In [72]:
def get_file_type():
    """Ask which document the user wants until a supported answer is given.

    Only the 10-K is supported so far; several spellings of it are accepted.
    An empty answer re-prompts silently, any other answer prints a hint first.

    Returns:
        str: always '10-K'.
    """
    accepted_spellings = ('10k', '10 k', '10-k', '10K', '10 K', '10-K')
    while True:
        answer = input ('what document are you looking for?')
        if answer in accepted_spellings:
            return '10-K'
        if answer != '':
            print('Please check the file type!')
    

### GET THE ACTUAL 10K FILES

#### Get the initial search page

In [73]:
def get_files(CIK, search_range,file_type):
    """Download the EDGAR filing-list page for a company and return it parsed.

    Args:
        CIK: company identifier placed in the `CIK` query parameter.
        search_range: two-item sequence [start, end]; only the end date is
            sent, because the link only takes a "prior to" date.
        file_type: value placed in the `type` query parameter.

    Returns:
        BeautifulSoup: the parsed HTML of the search page.
    """
    url_template = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK={}&type={}&dateb={}&owner=exclude&count=100'

    # the start of the range is not part of the request
    range_end = search_range[1]
    response = requests.get(url_template.format(CIK, file_type, range_end))
    return BeautifulSoup(response.content, 'html.parser')

#### get dataframe that contains the file type, link and date

In [74]:
def get_files_df(soup):
    """Turn the parsed filing-list page into a table of filings.

    Args:
        soup: parsed search page, as returned by get_files.

    Returns:
        pandas.DataFrame with one row per filing and the columns
        'file type' (e.g. 10-K), 'file link' (address under sec.gov) and
        'file date' (integer such as 19960303).
    """
    sec_link = 'http://sec.gov'
    columns = {'file type': [], 'file link': [], 'file date': []}

    # the first three <tr> elements are skipped; presumably they are not
    # filing rows -- TODO confirm against the page layout
    for row in list(soup.find_all('tr'))[3:]:
        # filing type comes from the first nowrap cell of the row
        type_cell = row.find_all('td',{'nowrap':'nowrap'})[0]
        columns['file type'].append(type_cell.text)

        # the first anchor of the row is a site-relative link for the filing
        first_anchor = row.find_all('a',href = True)[0]
        columns['file link'].append(sec_link + first_anchor['href'])

        # the fourth cell holds the date; dashes are dropped so that it can be
        # stored as an integer
        date_text = row.find_all('td')[3].text
        columns['file date'].append(int(date_text.replace('-','')))

    return pd.DataFrame(columns)


#### create a dataframe that filters for the right file types and dates

In [75]:
def filtered_df(search_range,file_type,files_table):
    """Keep only the filings of the requested type inside the date range.

    The date filter is applied here because the SEC search only allows
    'prior to'. The type filter is an exact match, so amended filings such
    as 10-K/A are dropped.

    Args:
        search_range: [start, end] as YYYYMMDD integers; either bound may be
            '' (or None) to leave that side of the range open.
        file_type: filing type to keep; an empty value falls back to '10-K'.
        files_table: DataFrame with 'file type' and 'file date' columns.

    Returns:
        pandas.DataFrame: the matching rows of `files_table`. Both date
        bounds are exclusive.
    """
    start, end = search_range[0], search_range[1]
    wanted_type = file_type if file_type else '10-K'

    keep = files_table['file type'] == wanted_type
    # an empty bound means the user skipped the date prompt; comparing the
    # integer dates with '' would raise, so that side is left unfiltered
    if start != '' and start is not None:
        keep = keep & (files_table['file date'] > start)
    if end != '' and end is not None:
        keep = keep & (files_table['file date'] < end)
    return files_table[keep]

#### get the link to all the 10-k files 

In [76]:
def get_list_doc_link(filtered_df):
    """Visit every filing page and collect the link to its document.

    Args:
        filtered_df: DataFrame whose 'file link' column holds the address of
            each filing page.

    Returns:
        list of str: one link per row, in row order; an empty list when there
        are no rows. Each link is the first anchor in the last row of the
        first table on the filing page (presumably the complete submission
        text file -- verify against the page layout).
    """
    sec_link = 'http://sec.gov'
    doc_links = []
    for filing_url in filtered_df['file link'].tolist():
        filing_page = requests.get(filing_url)
        soup = BeautifulSoup(filing_page.content, 'html.parser')
        last_row = soup.find_all('table')[0].find_all('tr')[-1]
        complete_file_link = last_row.find_all('a',href = True)[0]['href']
        doc_links.append(sec_link + complete_file_link)
    # return only once every filing has been visited; returning inside the
    # loop would stop after the first filing
    return doc_links

#### MAIN FUNCTION THAT USES THE OTHER FUNCTIONS

In [77]:

def main():
    """Run the interactive search from ticker prompt to document links.

    Asks for the company, the date range and the document type, fetches the
    matching filing list and resolves every remaining filing to a link.

    Returns:
        the list of file links produced by get_list_doc_link.

    TODO: should eventually return the process of getting FCFF.
    """
    # get_ticker hands back the company's CIK, which drives the search
    cik = get_ticker()
    date_range = get_year()
    doc_type = get_file_type()

    search_page = get_files(cik, date_range, doc_type)
    all_filings = get_files_df(search_page)
    wanted_filings = filtered_df(date_range, doc_type, all_filings)
    return get_list_doc_link(wanted_filings)
    
    
    
    

In [78]:
# Interactive run of the whole search; main() prompts for every input.
# Use the following example answers IF NEEDED:
########################################
#     ticker       = googl (CIK 0001652044)
#     search_range = 20170101 to 20180101
#     file_type    = any variation of 10k works (eg '10k','10 k','10K'....)
######################################
main()

Please enter the ticker for the company you want to search(required):googl


Is this the company you are looking for? (Press Y for yes N for no)
Alphabet Inc. CIK#: 0001652044 y
Date format: YYYYMMDD
optional - Search Start year (Press enter to pass):20170101
optional - Search end year (Press enter to pass):20180101
what document are you looking for?10k


['http://sec.gov/Archives/edgar/data/1652044/000165204417000008/0001652044-17-000008.txt']

# CODE ABOVE THIS LINE WORKS
# RUN EVERYTHING BEFORE 'MAIN()'
# UNFINISHED BELOW THIS LINE

In [79]:
# Work in progress: parsing each 10-K.
# Runs the interactive search again and keeps the returned document links
# in `doc_links`.
doc_links = main()

Please enter the ticker for the company you want to search(required):googl


Is this the company you are looking for? (Press Y for yes N for no)
Alphabet Inc. CIK#: 0001652044 y
Date format: YYYYMMDD
optional - Search Start year (Press enter to pass):20170101
optional - Search end year (Press enter to pass):20180101
what document are you looking for?10k


In [84]:
# Download every document in doc_links and write its parsed HTML to a local file.
# NOTE: every iteration writes to the same "test.html", so only the last
# document is left on disk once the loop ends.
for lk in doc_links:
    t_10k = requests.get(lk)
    file_soup = BeautifulSoup(t_10k.content, 'html.parser')
    # `with` closes the file even when the write fails; an explicit utf-8
    # encoding keeps the write independent of the platform default
    with open("test.html", "w", encoding="utf-8") as test:
        test.write(str(file_soup))
