In [1]:
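# %pip install sec-api
# %pip install pdfkit
# NOTE: pdfkit shells out to the `wkhtmltopdf` executable, which must be installed
# separately (OS package manager or wkhtmltopdf.org) and be on PATH; the PyPI
# package below does not provide that binary.
# %pip install wkhtmltopdf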

In [47]:
from sec_api import ExtractorApi
import pdfkit
import pandas as pd
import os
from sec_api import RenderApi
from pathlib import Path
import multiprocessing

In [7]:
# show the working directory that the relative ./mikecik_2015 paths below resolve against
os.getcwd()

'/Users/satoshiido/Documents/programming/RA/10k_filling'

In [21]:
# read the Russell 3000 holdings CSV (columns: cik, Ticker) into a dataframe
holdings_csv = Path('./mikecik_2015') / 'mikecik_2015.csv'
holdings = pd.read_csv(holdings_csv)
display(holdings.head())

Unnamed: 0,cik,Ticker
0,1090872,A
1,815094,ABMD
2,1427925,ACRX
3,926282,ADTN
4,1506928,AVGR


Unnamed: 0,cik,Ticker
45,1429560,TRVN
46,1519061,TSE
47,1105705,TWX
48,1124610,VMW
49,105770,WST


In [78]:
# set api_key and QueryApi to search and filter filings later
from sec_api import QueryApi

# The sec-api.io key is a secret: read it from the environment instead of committing
# it to the notebook. The key that used to be hardcoded here has been exposed in the
# file history and should be revoked/rotated in the sec-api.io dashboard.
api_key = os.environ.get("SEC_API_KEY")
if not api_key:
    raise RuntimeError(
        "SEC_API_KEY is not set; export your sec-api.io API key before running this notebook."
    )
queryApi = QueryApi(api_key=api_key)

In [79]:
# split a flat ticker list into consecutive groups: [[A,B,C], [D,E,F], ...]
# no group holds more than max_length_of_batch tickers
def create_batches(tickers = [], max_length_of_batch = 100):
  """Group `tickers` into consecutive lists of at most `max_length_of_batch` items.

  The input order is preserved. The result always contains at least one list,
  so an empty input yields `[[]]`.
  """
  grouped = [[]]

  for symbol in tickers:
    current = grouped[-1]
    # the open group is full -> start a new one before adding the ticker
    if len(current) == max_length_of_batch:
      current = []
      grouped.append(current)

    current.append(symbol)

  return grouped

# module-level preview of the batching; download_10K_metadata builds its own batches
batches = create_batches(holdings['Ticker'].tolist())


In [80]:
# Create metadata which contains ticker, cik, formType, filedAt, and filingUrl

def download_10K_metadata(tickers = [], start_year = 2015, end_year = 2015):
  """Return 10-K filing metadata for `tickers`, one row per filing.

  If ./mikecik_2015/metadata.csv already exists it is loaded and returned as-is;
  note that this cache is NOT keyed on the arguments, so delete the file when
  the ticker list or the year range changes. Otherwise the Query API is called
  once per (year, ticker batch) via the module-level `queryApi`, and a non-empty
  result is written to that same CSV so the next call hits the cache.

  Parameters:
    tickers: ticker symbols to look up.
    start_year, end_year: inclusive range of filing years to query.

  Returns:
    DataFrame with columns ticker, cik, formType, filedAt, filingUrl
    (empty, with those columns, when nothing was queried).
  """
  # single source of truth for the cache location: the check and the write must
  # use the same path, otherwise the cache is never found after a fresh download
  cache_path = Path('./mikecik_2015/metadata.csv')
  if cache_path.is_file():
    return pd.read_csv(cache_path)

  print('Starting download process')

  # create ticker batches, with 100 tickers per batch
  batches = create_batches(tickers)
  columns = ['ticker', 'cik', 'formType', 'filedAt', 'filingUrl']
  frames = []

  for year in range(start_year, end_year + 1):
    for batch in batches:
      if not batch:
        # create_batches returns [[]] for an empty ticker list; an empty
        # `ticker:()` clause is not a meaningful query, so skip it
        continue

      tickers_joined = ', '.join(batch)

      query_string = f'ticker:({tickers_joined}) AND filedAt:[{year}-01-01 TO {year}-12-31] AND formType:"10-K" AND NOT formType:"10-K/A" AND NOT formType:NT'

      # NOTE(review): only the first page (from 0, size 200) is requested; if a
      # batch can match more filings than one response carries, paginate here.
      query = {
        "query": { "query_string": {
            "query": query_string,
            "time_zone": "America/New_York"
        } },
        "from": "0",
        "size": "200",
        "sort": [{ "filedAt": { "order": "desc" } }]
      }

      response = queryApi.get_filings(query)

      # keep only the fields needed downstream, renaming the detail link
      metadata = [{'ticker': f['ticker'],
                   'cik': f['cik'],
                   'formType': f['formType'],
                   'filedAt': f['filedAt'],
                   'filingUrl': f['linkToFilingDetails']} for f in response['filings']]

      frames.append(pd.DataFrame.from_records(metadata))

    print('Downloaded metadata for year', year)

  if frames:
    # ignore_index gives the same 0..n-1 index a later cached read_csv produces
    result = pd.concat(frames, ignore_index=True)
  else:
    result = pd.DataFrame(columns=columns)

  if not result.empty:
    # do not persist an empty result: it would be served from the cache forever
    cache_path.parent.mkdir(parents=True, exist_ok=True)
    result.to_csv(cache_path, index=False)

  number_metadata_downloaded = len(result)
  print('Downloaded completed. Metadata downloaded for {} filings.'.format(number_metadata_downloaded))

  return result

In [81]:
# filing-year window (inclusive on both ends)
start = 2015
end = 2015

# every ticker listed in the holdings file
tickers = holdings['Ticker'].tolist()

# fetch (or load the cached) 10-K metadata and preview the first rows
metadata = download_10K_metadata(tickers=tickers, start_year=start, end_year=end)
metadata.head()

Unnamed: 0,ticker,cik,formType,filedAt,filingUrl
0,A,1090872,10-K,2015-12-18T19:33:31-05:00,https://www.sec.gov/Archives/edgar/data/109087...
1,NDSN,72331,10-K,2015-12-15T11:36:58-05:00,https://www.sec.gov/Archives/edgar/data/72331/...
2,RSTI,1019361,10-K,2015-11-30T16:41:39-05:00,https://www.sec.gov/Archives/edgar/data/101936...
3,BDX,10795,10-K,2015-11-25T11:35:36-05:00,https://www.sec.gov/Archives/edgar/data/10795/...
4,FFIV,1048695,10-K,2015-11-06T13:28:33-05:00,https://www.sec.gov/Archives/edgar/data/104869...


In [82]:
# convert filedAt column to datetime format and extract year
# The raw strings carry a UTC offset (e.g. -05:00). filedAt is stored as naive UTC,
# but the year is taken in US Eastern time -- the zone the filedAt range of the query
# is evaluated in -- so a filing accepted on the evening of Dec 31 is not assigned to
# the following year. Re-running this cell is safe: naive values are read back as UTC.
filed_at_utc = pd.to_datetime(metadata['filedAt'], utc=True)
metadata['filedAt'] = filed_at_utc.dt.tz_convert(None)
metadata["year"] = filed_at_utc.dt.tz_convert('America/New_York').dt.year

In [83]:
# this part below is where we want to change using extractorApi
import requests
import datetime

# Section extraction uses ExtractorApi. The links below are the sec-api references
# kept from the earlier Filing Render & Download API version of this cell:
# api key
## https://sec-api.io/login
## https://github.com/janlukasschroeder/sec-api-python/blob/master/README.md#filing-render--download-api
## https://pypi.org/project/sec-api/#filing-render--download-api
# Reuse the key configured for QueryApi instead of embedding a second copy of the secret.
extractorApi = ExtractorApi(api_key)


# this part below is also the part where we want to change (we want to convert html to pdf file)
# Filings that could not be extracted or rendered, reported after the loop so that a
# single bad filing does not abort the whole batch.
failed_filings = []

for filing_url, filing_year, filing_cik in zip(metadata["filingUrl"], metadata["year"], metadata["cik"]):
    # set output filename for the PDF
    year = str(filing_year)
    cik = str(filing_cik)
    ## change file_name below
    file_name = "section1_1a"
    # NOTE: the name is keyed on year and cik only, so two filings of the same
    # company in the same year write to the same file (the later one wins).
    output_pdf = f"{year}-{cik}-{file_name}.pdf"

    try:
        # get the original HTML of section 1 "Business"
        section1_html = extractorApi.get_section(filing_url, "1", "html")
        # get the original HTML of section 1A "Risk Factors"
        section1a_html = extractorApi.get_section(filing_url, "1A", "html")

        # combine the HTML content into one large HTML string
        combined_html = section1_html + section1a_html

        # create pdf file with specified name
        pdfkit.from_string(combined_html, output_pdf)
    except Exception as error:
        # Deliberately broad: any API or wkhtmltopdf failure is recorded and reported
        # rather than swallowed, and the remaining filings are still processed.
        failed_filings.append((filing_url, error))
        print(f"FAILED to create {output_pdf} from {filing_url}: {error}")
        continue

    print(f"The PDF file has been created: {output_pdf}")

if failed_filings:
    print(f"{len(failed_filings)} filing(s) could not be converted; see failed_filings for details.")


The PDF file has been created: 2015-1090872-section1_1a.pdf
The PDF file has been created: 2015-72331-section1_1a.pdf
The PDF file has been created: 2015-1019361-section1_1a.pdf
The PDF file has been created: 2015-10795-section1_1a.pdf
The PDF file has been created: 2015-1048695-section1_1a.pdf
The PDF file has been created: 2015-836429-section1_1a.pdf
The PDF file has been created: 2015-1474439-section1_1a.pdf
The PDF file has been created: 2015-55242-section1_1a.pdf
The PDF file has been created: 2015-789019-section1_1a.pdf
The PDF file has been created: 2015-709283-section1_1a.pdf
The PDF file has been created: 2015-815094-section1_1a.pdf
The PDF file has been created: 2015-946563-section1_1a.pdf
The PDF file has been created: 2015-1493761-section1_1a.pdf
The PDF file has been created: 2015-1506928-section1_1a.pdf
The PDF file has been created: 2015-354950-section1_1a.pdf
The PDF file has been created: 2015-1429560-section1_1a.pdf
The PDF file has been created: 2015-1253689-section1