<a href="https://colab.research.google.com/github/kkt86/quant-notebooks/blob/master/nlp_10k_fillings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NLP on 10k SEC fillings

Original post at https://towardsdatascience.com/nlp-in-the-stock-market-8760d062eb92


In [0]:
import nltk
import numpy as np
import pandas as pd
import pickle
import pprint

### Implement helper functions from the original project
Original code at https://github.com/roshan-adusumilli/nlp_10-ks/blob/master/project_helper.py

In [2]:
%%bash
pip install ratelimit 

Collecting ratelimit
  Downloading https://files.pythonhosted.org/packages/ab/38/ff60c8fc9e002d50d48822cc5095deb8ebbc5f91a6b8fdd9731c87a147c9/ratelimit-2.2.1.tar.gz
Building wheels for collected packages: ratelimit
  Building wheel for ratelimit (setup.py): started
  Building wheel for ratelimit (setup.py): finished with status 'done'
  Created wheel for ratelimit: filename=ratelimit-2.2.1-cp36-none-any.whl size=5893 sha256=264eb650f18db689a66cb443fd30f7474190e86843b3c3148f3b85ab3ec05819
  Stored in directory: /root/.cache/pip/wheels/05/d9/82/3c6044cf1a54aab9151612458446d9b17a38416869e1b1d9b8
Successfully built ratelimit
Installing collected packages: ratelimit
Successfully installed ratelimit-2.2.1


In [0]:
import matplotlib.pyplot as plt
import requests

from ratelimit import limits, sleep_and_retry

class SecAPI(object):
    SEC_CALL_LIMIT = {'calls': 10, 'seconds': 1}

    @staticmethod
    @sleep_and_retry
    # Dividing the call limit by half to avoid coming close to the limit
    @limits(calls=SEC_CALL_LIMIT['calls'] / 2, period=SEC_CALL_LIMIT['seconds'])
    def _call_sec(url):
        return requests.get(url)

    def get(self, url):
        return self._call_sec(url).text


def print_ten_k_data(ten_k_data, fields, field_length_limit=50):
    indentation = '  '

    print('[')
    for ten_k in ten_k_data:
        print_statement = '{}{{'.format(indentation)
        for field in fields:
            value = str(ten_k[field])

            # Show return lines in output
            if isinstance(value, str):
                value_str = '\'{}\''.format(value.replace('\n', '\\n'))
            else:
                value_str = str(value)

            # Cut off the string if it gets too long
            if len(value_str) > field_length_limit:
                value_str = value_str[:field_length_limit] + '...'

            print_statement += '\n{}{}: {}'.format(indentation * 2, field, value_str)

        print_statement += '},'
        print(print_statement)
    print(']')


def plot_similarities(similarities_list, dates, title, labels):
    assert len(similarities_list) == len(labels)

    plt.figure(1, figsize=(10, 7))
    for similarities, label in zip(similarities_list, labels):
        plt.title(title)
        plt.plot(dates, similarities, label=label)
        plt.legend()
        plt.xticks(rotation=90)

    plt.show()


In [7]:
nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [0]:
cik_lookup = {
    'AMZN': '0001018724',
    'BMY': '0000014272',   
    'CNP': '0001130310',
    'CVX': '0000093410',
    'FL': '0000850209',
    'FRT': '0000034903',
    'HON': '0000773840'
    }

In [0]:
from bs4 import BeautifulSoup

# pull 10-k from SEC
sec_api = SecAPI()

def get_sec_data(cik, doc_type, start=0, count=60):
  rss_url = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany' \
        '&CIK={}&type={}&start={}&count={}&owner=exclude&output=atom' \
        .format(cik, doc_type, start, count)
  sec_data = sec_api.get(rss_url)
  feed = BeautifulSoup(sec_data.encode("ascii"), "xml").feed
  entries = [(entry.content.find("filing-href").getText(),
              entry.content.find("filing-type").getText(),
              entry.content.find("filing-date").getText()
             ) for entry in feed.find_all("entry", recursive=False)]
  return entries

In [0]:
example_ticker = "AMZN"
sec_data = {}

for ticker, cik in cik_lookup.items():
  sec_data[ticker] = get_sec_data(cik, "10-K") # can switch to 10-Q 

In [14]:
pprint.pprint(sec_data[example_ticker][:5])

[('https://www.sec.gov/Archives/edgar/data/1018724/000101872420000004/0001018724-20-000004-index.htm',
  '10-K',
  '2020-01-31'),
 ('https://www.sec.gov/Archives/edgar/data/1018724/000101872419000004/0001018724-19-000004-index.htm',
  '10-K',
  '2019-02-01'),
 ('https://www.sec.gov/Archives/edgar/data/1018724/000101872418000005/0001018724-18-000005-index.htm',
  '10-K',
  '2018-02-02'),
 ('https://www.sec.gov/Archives/edgar/data/1018724/000101872417000011/0001018724-17-000011-index.htm',
  '10-K',
  '2017-02-10'),
 ('https://www.sec.gov/Archives/edgar/data/1018724/000101872416000172/0001018724-16-000172-index.htm',
  '10-K',
  '2016-01-29')]


In [18]:
# import raw filings
from tqdm import tqdm

raw_filings_by_ticker = {}

for ticker, data in sec_data.items():
  raw_filings_by_ticker[ticker] = {}
  for index_url, file_type, file_date in tqdm(data, desc=f"Downloading {ticker} Fillings", unit=" filling"):
    if (file_type == "10-K"):
      file_url = index_url.replace("-index.htm", ".txt").replace(".txt1", ".txt")
      raw_filings_by_ticker[ticker][file_date] = sec_api.get(file_url)

Downloading AMZN Fillings: 100%|██████████| 25/25 [00:07<00:00,  3.37filling/s]
Downloading BMY Fillings: 100%|██████████| 29/29 [00:09<00:00,  4.84filling/s]
Downloading CNP Fillings: 100%|██████████| 21/21 [00:06<00:00,  3.17filling/s]
Downloading CVX Fillings: 100%|██████████| 27/27 [00:10<00:00,  5.47filling/s]
Downloading FL Fillings: 100%|██████████| 24/24 [00:10<00:00,  2.38filling/s]
Downloading FRT Fillings: 100%|██████████| 31/31 [00:07<00:00,  6.61filling/s]
Downloading HON Fillings: 100%|██████████| 27/27 [00:13<00:00,  2.01filling/s]


In [23]:
print("Example Document \n\n{}".format(next(iter(raw_filings_by_ticker[example_ticker].values()))[:10000]))

Example Document 

<SEC-DOCUMENT>0001018724-20-000004.txt : 20200131
<SEC-HEADER>0001018724-20-000004.hdr.sgml : 20200131
<ACCEPTANCE-DATETIME>20200130204613
ACCESSION NUMBER:		0001018724-20-000004
CONFORMED SUBMISSION TYPE:	10-K
PUBLIC DOCUMENT COUNT:		109
CONFORMED PERIOD OF REPORT:	20191231
FILED AS OF DATE:		20200131
DATE AS OF CHANGE:		20200130

FILER:

	COMPANY DATA:	
		COMPANY CONFORMED NAME:			AMAZON COM INC
		CENTRAL INDEX KEY:			0001018724
		STANDARD INDUSTRIAL CLASSIFICATION:	RETAIL-CATALOG & MAIL-ORDER HOUSES [5961]
		IRS NUMBER:				911646860
		STATE OF INCORPORATION:			DE
		FISCAL YEAR END:			1231

	FILING VALUES:
		FORM TYPE:		10-K
		SEC ACT:		1934 Act
		SEC FILE NUMBER:	000-22513
		FILM NUMBER:		20562951

	BUSINESS ADDRESS:	
		STREET 1:		410 TERRY AVENUE NORTH
		CITY:			SEATTLE
		STATE:			WA
		ZIP:			98109
		BUSINESS PHONE:		2062661000

	MAIL ADDRESS:	
		STREET 1:		410 TERRY AVENUE NORTH
		CITY:			SEATTLE
		STATE:			WA
		ZIP:			98109
</SEC-HEADER>
<DOCUMENT>
<TYPE>10-K
<

Document can be identified within the tags \<DOCUMENT> and \</DOCUMENT>

In [31]:
import re

def get_document(text):
  extracted_docs = []

  doc_start_pattern = re.compile(r"<DOCUMENT>")
  doc_end_pattern = re.compile(r"</DOCUMENT>")

  doc_start_is = [x.end() for x in doc_start_pattern.finditer(text)]
  doc_end_is = [x.start() for x in doc_end_pattern.finditer(text)]

  for doc_start_i, doc_end_i in zip(doc_start_is, doc_end_is):
    extracted_docs.append(text[doc_start_i: doc_end_i])

  return extracted_docs

filling_documents_by_ticker = {}

for ticker, raw_fillings in raw_filings_by_ticker.items():
  filling_documents_by_ticker[ticker] = {}
  for file_date, filling in tqdm(raw_fillings.items(), desc=f'Getting Documents from {ticker} Fillings', unit=" fillings"):
    filling_documents_by_ticker[ticker][file_date] = get_document(filling)
                                 
print("/n/n")

Getting Documents from AMZN Fillings: 100%|██████████| 20/20 [00:00<00:00, 85.94 fillings/s]
Getting Documents from BMY Fillings: 100%|██████████| 25/25 [00:00<00:00, 50.22 fillings/s]
Getting Documents from CNP Fillings: 100%|██████████| 17/17 [00:00<00:00, 41.54 fillings/s]
Getting Documents from CVX Fillings: 100%|██████████| 23/23 [00:00<00:00, 43.02 fillings/s]
Getting Documents from FL Fillings: 100%|██████████| 18/18 [00:00<00:00, 63.48 fillings/s]
Getting Documents from FRT Fillings: 100%|██████████| 21/21 [00:00<00:00, 61.03 fillings/s]
Getting Documents from HON Fillings: 100%|██████████| 22/22 [00:00<00:00, 49.89 fillings/s]

/n





In [0]:
# define function to return the type of document given
def get_document_type(doc):
  type_pattern = re.compile(r"<TYPE>[^\n]+")
  doc_type = type_pattern.findall(doc)[0][len("<TYPE>"):]
  return doc_type.lower()

In [36]:
# filter out non 10-K documents

ten_ks_by_ticker = {}

for ticker, filling_documents in filling_documents_by_ticker.items():
  ten_ks_by_ticker[ticker] = []
  for file_date, documents in filling_documents.items():
    for document in documents:
      if get_document_type(document) == "10-k":
        ten_ks_by_ticker[ticker].append({
            "cik": cik_lookup[ticker],
            "file": document,
            "file_date": file_date
        })

print_ten_k_data(ten_ks_by_ticker[example_ticker][:5], ['cik', 'file', 'file_date'])

[
  {
    cik: '0001018724'
    file: '\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-2019123...
    file_date: '2020-01-31'},
  {
    cik: '0001018724'
    file: '\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-2018123...
    file_date: '2019-02-01'},
  {
    cik: '0001018724'
    file: '\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-2017123...
    file_date: '2018-02-02'},
  {
    cik: '0001018724'
    file: '\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-2016123...
    file_date: '2017-02-10'},
  {
    cik: '0001018724'
    file: '\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-2015123...
    file_date: '2016-01-29'},
]


## Process data
In the next section, html code is removed and all documents are cleaned

In [38]:
def clean_text(text):
  return BeautifulSoup(text.lower(), "html.parser").get_text() 

for ticker, ten_ks in ten_ks_by_ticker.items():
  for ten_k in tqdm(ten_ks, desc=f"Cleaning {ticker} 10-Ks", unit=" 10-K"):
    ten_k["file_clean"] = clean_text(ten_k["file"])

print_ten_k_data(ten_ks_by_ticker[example_ticker][:5], ["file_clean"])


Cleaning AMZN 10-Ks: 100%|██████████| 18/18 [00:32<00:00,  2.36s/ 10-K]
Cleaning BMY 10-Ks: 100%|██████████| 18/18 [01:04<00:00,  2.24s/ 10-K]
Cleaning CNP 10-Ks: 100%|██████████| 17/17 [00:56<00:00,  3.33s/ 10-K]
Cleaning CVX 10-Ks: 100%|██████████| 18/18 [01:37<00:00,  5.41s/ 10-K]
Cleaning FL 10-Ks: 100%|██████████| 18/18 [00:24<00:00,  1.36s/ 10-K]
Cleaning FRT 10-Ks: 100%|██████████| 17/17 [00:48<00:00,  2.83s/ 10-K]
Cleaning HON 10-Ks: 100%|██████████| 19/19 [00:51<00:00,  2.69s/ 10-K]

[
  {
    file_clean: '\n10-k\n1\namzn-20191231x10k.htm\n10-k\n\n\n\n\n\...},
  {
    file_clean: '\n10-k\n1\namzn-20181231x10k.htm\n10-k\n\n\n\n\n\...},
  {
    file_clean: '\n10-k\n1\namzn-20171231x10k.htm\n10-k\n\n\n\n\n\...},
  {
    file_clean: '\n10-k\n1\namzn-20161231x10k.htm\nform 10-k\n\n\n...},
  {
    file_clean: '\n10-k\n1\namzn-20151231x10k.htm\nform 10-k\n\n\n...},
]





## Lemmatize data

In [44]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

def lemmatize_words(words):
  lemmatized_words = [WordNetLemmatizer().lemmatize(word, "v") for word in words]

  return lemmatized_words

word_pattern = re.compile("\w+")

for ticker, ten_ks in ten_ks_by_ticker.items():
  for ten_k in tqdm(ten_ks, desc=f"Lemmatize {ticker} 10-Ks", unit=" 10-K"):
    ten_k["file_lemma"] = lemmatize_words(word_pattern.findall(ten_k["file_clean"]))

print_ten_k_data(ten_ks_by_ticker[example_ticker][:5], ["file_lemma"])

Lemmatize AMZN 10-Ks: 100%|██████████| 18/18 [00:03<00:00,  4.83 10-K/s]
Lemmatize BMY 10-Ks: 100%|██████████| 18/18 [00:06<00:00,  3.52 10-K/s]
Lemmatize CNP 10-Ks: 100%|██████████| 17/17 [00:06<00:00,  2.51 10-K/s]
Lemmatize CVX 10-Ks: 100%|██████████| 18/18 [00:05<00:00,  3.43 10-K/s]
Lemmatize FL 10-Ks: 100%|██████████| 18/18 [00:03<00:00,  5.74 10-K/s]
Lemmatize FRT 10-Ks: 100%|██████████| 17/17 [00:04<00:00,  4.53 10-K/s]
Lemmatize HON 10-Ks: 100%|██████████| 19/19 [00:04<00:00,  5.50 10-K/s]

[
  {
    file_lemma: '['10', 'k', '1', 'amzn', '20191231x10k', 'htm', '...},
  {
    file_lemma: '['10', 'k', '1', 'amzn', '20181231x10k', 'htm', '...},
  {
    file_lemma: '['10', 'k', '1', 'amzn', '20171231x10k', 'htm', '...},
  {
    file_lemma: '['10', 'k', '1', 'amzn', '20161231x10k', 'htm', '...},
  {
    file_lemma: '['10', 'k', '1', 'amzn', '20151231x10k', 'htm', '...},
]





## Remove stopwords

In [47]:
from nltk.corpus import stopwords

lemma_english_stopwords = lemmatize_words(stopwords.words("english"))

for ticker, ten_ks in ten_ks_by_ticker.items():
  for ten_k in tqdm(ten_ks, desc=f"Remove stop words for {ticker} 10-Ks", unit=" 10-K"):
    ten_k["file_lemma"] = [word for word in ten_k["file_lemma"] if word not in lemma_english_stopwords]

Remove stop words for AMZN 10-Ks: 100%|██████████| 18/18 [00:01<00:00, 12.47 10-K/s]
Remove stop words for BMY 10-Ks: 100%|██████████| 18/18 [00:02<00:00,  6.49 10-K/s]
Remove stop words for CNP 10-Ks: 100%|██████████| 17/17 [00:02<00:00,  5.83 10-K/s]
Remove stop words for CVX 10-Ks: 100%|██████████| 18/18 [00:02<00:00,  8.05 10-K/s]
Remove stop words for FL 10-Ks: 100%|██████████| 18/18 [00:01<00:00, 14.38 10-K/s]
Remove stop words for FRT 10-Ks: 100%|██████████| 17/17 [00:01<00:00, 10.17 10-K/s]
Remove stop words for HON 10-Ks: 100%|██████████| 19/19 [00:01<00:00, 12.39 10-K/s]
