# 10-K Risk Section

[CIK Lookup](https://www.sec.gov/cgi-bin/cik_lookup)

* Microsoft - 0000789019
* Tesla - 0001318605
* Google, Inc - 0001288776
* Walmart Stores - 0000217476
* Amazon - 0001018724
* Exxon Mobile - 0000034088
* CVS Health - 0000064803
* Berkshire Hathaway Corp - 0001067983
* Alphabet, Inc - 0001652044

[NASDAQ CSV](https://www.nasdaq.com/market-activity/stocks/screener)

[S&P 500](https://en.wikipedia.org/wiki/S%26P_500)
[S&P 500 Investopedia](https://www.investopedia.com/ask/answers/040215/what-does-sp-500-index-measure-and-how-it-calculated.asp)

* The Standard and Poor's 500, or simply the S&P 500, is a stock market index tracking the stock performance of 500 large companies listed on exchanges in the United States.
* It is one of the most commonly followed equity indices. As of December 31, 2020, more than $5.4 trillion was invested in assets tied to the performance of the index.
* It is a free-float weighted/capitalization-weighted index

* As of September 30, 2021, the nine largest companies on the list of S&P 500 companies accounted for 28.1% of the market capitalization of the index and were, in order of weighting, Apple, Microsoft,Alphabet (including both class A & C shares), Amazon.com, Meta Platforms, Tesla, Nvidia, Berkshire Hathaway and JPMorgan Chase
* The components that have increased their dividends in 25 consecutive years are known as the S&P 500 Dividend Aristocrats.
* In 2017, companies in the index derived on average 72% of their revenue in the United States.
* The index is one of the factors in computation of the Conference Board Leading Economic Index, used to forecast the direction of the economy.
* The index is associated with many ticker symbols, including ^GSPC, INX, and $SPX, depending on market or website.
* The S&P 500 is maintained by S&P Dow Jones Indices, a joint venture majority-owned by S&P Global, and its components are selected by a committee.

In [53]:
import os
import re
from glob import glob
import pandas as pd
import nltk
from nltk import sent_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from sec_cik_mapper import StockMapper
from pathlib import Path

nltk.download('punkt')
nltk.download('stopwords')

import numpy as np
import yfinance as yf
from pandas_datareader import data as pdr

#Create the downloader
# dl = Downloader()

#Download 5 years of 10k documents
# dl.get("10-K", "0001341439", after="2014-01-01", before="2022-07-31")

In [54]:
DATA_DIR = os.path.join(os.environ["HOME"], "Datasets",
                        "Financial","SEC_10K")

ML_DATA_DIR = os.path.join(DATA_DIR, "ML_Dataset")

TXT_10K_DIR = os.path.join(DATA_DIR, "sec_10k_text_files")
# HTML_10K_DIR = os.path.join(DATA_DIR, "sec_10k_html_files")

TXT_10K_FILES = sorted(glob(os.path.join(TXT_10K_DIR, "*.txt")))

# HTML_10K_FILES = sorted(glob(os.path.join(HTML_10K_DIR, "*.html")))
# HTML_10K_FILES[:5]

# Test Sample Docs

In [None]:
sample_company = TXT_10K_FILES[10]
# sample_company = HTML_10K_FILES[10]
sample_company

In [None]:
single_filepath = '/Users/jonghang/Datasets/Financial/SEC_10K/sec_10k_text_files/0000001961_2017.txt'
singlefile = os.path.basename(single_filepath)
fileparts = singlefile.split('_')
cik = fileparts[0]
year = fileparts[1].replace('.txt','')
print(cik, ' ', year)

In [None]:
from bs4 import BeautifulSoup

docs_in_sample_company = glob(os.path.join(sample_company, "**/*.txt"), recursive=True)

#Read in text files
sample_docs = []
for single_file in TXT_10K_FILES[100:110]:
    with open(single_file, 'rb') as f:
        #Convert to html then read in the text piece
        singlefile = os.path.basename(single_file)
        fileparts = singlefile.split('_')
        cik = fileparts[0]
        year = fileparts[1].replace(".txt","")
        sample_docs.append(BeautifulSoup(f.read()).text)

In [None]:
# Check first document
sample_text = sample_docs[2]
print(sample_text[:30000])

In [None]:
# Prepare regular expression
pattern = r'(>Item(\s|&#160;|&nbsp;)(1A|1B|7A|7|8)\.{0,1})|(ITEM\s(1A|1B|7A|7|8))'
regex = re.compile(pattern)
# Find matches
matches = regex.finditer(sample_text)

# Write a for loop to print the matches
for match in matches:
    print(match)

In [None]:
# Matches
matches = regex.finditer(sample_text)
# Create the dataframe
regex_df = pd.DataFrame([(x.group(), x.start(), x.end()) for x in matches])
regex_df.columns = ['item', 'start', 'end']
regex_df['item'] = regex_df.item.str.upper()

# Display the dataframe
regex_df.head()

In [None]:
# Get rid of unnesesary charcters from the dataframe
regex_df.replace('&#160;',' ',regex=True,inplace=True)
regex_df.replace('&nbsp;',' ',regex=True,inplace=True)
regex_df.replace(' ','',regex=True,inplace=True)
regex_df.replace('\.','',regex=True,inplace=True)
regex_df.replace('>','',regex=True,inplace=True)

# display the dataframe
regex_df.head()

In [None]:
# Drop duplicates
pos_df = regex_df.sort_values('start', ascending=True).drop_duplicates(subset=['item'], keep='last')
# Set item as the dataframe index
pos_df.set_index('item', inplace = True)

# Display the dataframe
pos_df

In [None]:
# Get Item 1a raw content
item_1a_raw = sample_text[pos_df['start'].loc['ITEM1A']:pos_df['start'].loc['ITEM1B']]
# Get Item 7 raw content
item_7_raw = sample_text[pos_df['start'].loc['ITEM7']:pos_df['start'].loc['ITEM7A']]
# Get Item 7a raw content
item_7a_raw = sample_text[pos_df['start'].loc['ITEM7A']:pos_df['start'].loc['ITEM8']]

item_1a_raw

# Apply BeautifulSoup to refine the content

In [None]:
# Create bs4 object from the raw text
item_1a_content = BeautifulSoup(item_1a_raw, 'lxml')
item_7_content = BeautifulSoup(item_7_raw, 'lxml')
item_7a_content = BeautifulSoup(item_7a_raw, 'lxml')

# Further organize into a proper tree structure by applying .pretiffy()
print(item_1a_content.prettify()[0:1000])

In [None]:
### Our goal is though to remove html tags and see the content
### Method get_text() is what we need, \n\n is optional, I just added this to read text
### more cleanly, it's basically new line character between sections.
print(item_1a_content.get_text("\n\n")[0:1500])

In [None]:
processed_text = item_1a_content.get_text().strip().replace('\n',' ').replace('\t', ' ')
print(processed_text[:1500])

# Assemble Functions

## Clean Text using NLTK

In [57]:
def clean_text(input_text):
    stemmer = nltk.stem.SnowballStemmer('english')
    input_text = stemmer.stem(input_text)
    tokens = word_tokenize(input_text)
    # convert to lower case
    tokens = [w.lower() for w in tokens]
    # remove punctuation from each word
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    words = [word for word in stripped if word.isalpha()]
    # remove stop words
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if not w in stop_words]
    words = words[5:]
    cleaned_sentense = ' '.join(words)
    # the first 5 words are heading, remove them
    return cleaned_sentense


## Assemble NLP Table for ML

In [58]:
def build_single_10k_table(input_cik, input_year, input_text):
    # Write the regex
    regex = re.compile(r'(>Item(\s|&#160;|&nbsp;)(1A|1B|7A|7|8)\.{0,1})|(ITEM\s(1A|1B|7A|7|8))')
    # Use finditer to math the regex
    matches = regex.finditer(input_text)

    # Create the dataframe
    regex_df = pd.DataFrame([(x.group(), x.start(), x.end()) for x in matches])
    regex_df.columns = ['item', 'start', 'end']
    regex_df['item'] = regex_df.item.str.upper()

    # clean up the dataframe by removing special characters
    regex_df.replace('&#160;',' ',regex = True,inplace = True)
    regex_df.replace('&nbsp;',' ',regex = True,inplace = True)
    regex_df.replace(' ','',regex = True, inplace = True)
    regex_df.replace('\.','',regex = True, inplace = True)
    regex_df.replace('>','',regex = True, inplace = True)

    # Drop duplicates
    pos_df = regex_df.sort_values('start', ascending = True).drop_duplicates(subset = ['item'], keep = 'last')
    # Set item as the dataframe index
    pos_df.set_index('item', inplace = True)

    # display the dataframe
    # display(regex_df.head())

    # Get Item 1a
    item_1a_raw = input_text[pos_df['start'].loc['ITEM1A']:pos_df['start'].loc['ITEM1B']]
    item_1a_content = BeautifulSoup(item_1a_raw, 'lxml')
    processed_item1a = item_1a_content.get_text().strip().replace('\n',' ').replace('\t', ' ').lower()
    cleaned_item1a = clean_text(processed_item1a)
    # Get Item 7
    item_7_raw = input_text[pos_df['start'].loc['ITEM7']:pos_df['start'].loc['ITEM7A']]
    item_7_content = BeautifulSoup(item_7_raw, 'lxml')
    processed_item7 = item_7_content.get_text().strip().replace('\n',' ').replace('\t', ' ').lower()
    cleaned_item7 = clean_text(processed_item7)
    # Get Item 7a
    item_7a_raw = input_text[pos_df['start'].loc['ITEM7A']:pos_df['start'].loc['ITEM8']]
    item_7a_content = BeautifulSoup(item_7a_raw, 'lxml')
    processed_item7a = item_7a_content.get_text().strip().replace('\n',' ').replace('\t', ' ').lower()
    cleaned_item7a = clean_text(processed_item7a)

    single_dict = {"CIK":[input_cik], "YEAR": input_year, "ITEM_1A":[processed_item1a],
                   "ITEM_7":[processed_item7], "ITEM_7A":[processed_item7a],
                   "CLEANED_ITEM_1A":[cleaned_item1a],
                   "CLEANED_ITEM_7":[cleaned_item7],
                   "CLEANED_ITEM_7A":[cleaned_item7a]}
    single_df_10k = pd.DataFrame(single_dict)
    return single_df_10k


# Build ML Dataset
## Functions

In [59]:
from bs4 import BeautifulSoup

def build_nlp_10k_ml_dataset(text_10k_filepaths, start_idx = 0):
    cleaned_10k_dfs = []

    for single_file in text_10k_filepaths[start_idx:]:
        with open(single_file, 'rb') as f:
            #Convert to html then read in the text piece
            singlefile = os.path.basename(single_file)
            fileparts = singlefile.split('_')
            cik = fileparts[0]
            year = fileparts[1].replace(".txt","")
            single_df = build_single_10k_table(cik,
                                               year,
                                               BeautifulSoup(f.read()).text)
            cleaned_10k_dfs.append(single_df)
    return cleaned_10k_dfs

## Assemble Candidates ML Dataset

In [61]:
TXT_10K_CANDIDATES_FILES = glob(os.path.join(DATA_DIR, "sec_10k_text_files_CANDIDATES","*.txt"))
# print(TXT_10K_CANDIDATES_FILES[:5])

candidates_ml_dfs = build_nlp_10k_ml_dataset(TXT_10K_CANDIDATES_FILES, start_idx=0)
candidates_ml_df = pd.concat(candidates_ml_dfs, axis = 0)

# Create id column of CIK_YEAR

candidates_ml_df.insert(loc = 0,
                        column = "ID_CIK_YEAR",
                        value = "ID_" + candidates_ml_df["CIK"] + "_" + candidates_ml_df["YEAR"])

display(candidates_ml_df.shape)
display(candidates_ml_df.head())

(1925, 9)

Unnamed: 0,ID_CIK_YEAR,CIK,YEAR,ITEM_1A,ITEM_7,ITEM_7A,CLEANED_ITEM_1A,CLEANED_ITEM_7,CLEANED_ITEM_7A
0,ID_0001637207_2020,1637207,2020,item 1a.risk factorsthe following is a descrip...,item 7. management’s discussion and anal...,item 7a.quantitative and qualitative disclosur...,risks inherent businesswe affected factors adv...,condition results operations dollars millions ...,exposure market risk includes fluctuating inte...
0,ID_0001300699_2018,1300699,2018,item 1a. risk factors the foll...,item 7. management's discussion and analysi...,item 7a. quantitative and qualitative discl...,principal risks inherent business affected fac...,condition results operations dollars millions ...,risk exposure market risk includes fluctuating...
0,ID_0001227500_2018,1227500,2018,item 1a. risk factors the foll...,item 7. management's discussion and analysi...,item 7a. quantitative and qualitative discl...,principal risks inherent business affected fac...,condition results operations dollars millions ...,risk exposure market risk includes fluctuating...
0,ID_0001653653_2020,1653653,2020,item 1a.risk factorsthe following is a descrip...,item 7. management’s discussion and anal...,item 7a.quantitative and qualitative disclosur...,risks inherent businesswe affected factors adv...,condition results operations dollars millions ...,exposure market risk includes fluctuating inte...
0,ID_0000007323_2017,7323,2017,item 1a. risk factors the foll...,item 7. management's discussion and analysi...,item 7a. quantitative and qualitative discl...,principal risks inherent business affected fac...,condition results operations dollars millions ...,risk dollars millions exposure market risk inc...


**Write to CSV**

In [62]:
# candidates_ml_df.to_csv(os.path.join(DATA_DIR, "ML_Dataset", "SEC_10K_Candidates.csv"), index = False)

In [63]:
# Write NLTKly parsed data
candidates_ml_cleaned_df = candidates_ml_df[["ID_CIK_YEAR","CIK","YEAR",
                                   "CLEANED_ITEM_1A","CLEANED_ITEM_7",
                                   "CLEANED_ITEM_7A"]]
candidates_ml_cleaned_df.to_csv(os.path.join(DATA_DIR, "ML_Dataset", "SEC_10K_Candidates_Cleaned.csv"), index = False)

## Assemble S&P 500 ML Dataset

In [65]:
TXT_10K_SP500_FILES = glob(os.path.join(DATA_DIR, "sec_10k_text_files_SP500","*.txt"))
# print(TXT_10K_CANDIDATES_FILES[:5])

sp500_ml_dfs = build_nlp_10k_ml_dataset(TXT_10K_SP500_FILES, start_idx=0)
# combine all DFs into a single dataframe
sp500_ml_df = pd.concat(sp500_ml_dfs, axis = 0)

# add id column
sp500_ml_df.insert(loc = 0,
                    column = "ID_CIK_YEAR",
                    value = "ID_" + sp500_ml_df["CIK"] + "_" + sp500_ml_df["YEAR"])

# Add target column
sp500_ml_df["SP_500"] = 1

display(sp500_ml_df.shape)
display(sp500_ml_df.head())

(2480, 10)

Unnamed: 0,ID_CIK_YEAR,CIK,YEAR,ITEM_1A,ITEM_7,ITEM_7A,CLEANED_ITEM_1A,CLEANED_ITEM_7,CLEANED_ITEM_7A,SP_500
0,ID_0001335258_2020,1335258,2020,item 1a.risk factorsthe following is a descrip...,item 7. management’s discussion and anal...,item 7a.quantitative and qualitative disclosur...,risks inherent businesswe affected factors adv...,condition results operations dollars millions ...,exposure market risk includes fluctuating inte...,1
0,ID_0000900075_2021,900075,2021,item 1a.risk factorsthe following is a descrip...,item 7. management’s discussion and anal...,item 7a.quantitative and qualitative disclosur...,risks inherent business risks uncertainties de...,condition results operations dollars millions ...,exposure market risk includes fluctuating inte...,1
0,ID_0000105770_2019,105770,2019,item 1a. risk factors the foll...,item 7. management's discussion and analysi...,item 7a. quantitative and qualitative discl...,principal risks inherent business affected fac...,condition results operations dollars millions ...,risk exposure market risk includes fluctuating...,1
0,ID_0001283699_2019,1283699,2019,item 1a. risk factors the foll...,item 7. management's discussion and analysi...,item 7a. quantitative and qualitative discl...,principal risks inherent business affected fac...,condition results operations dollars millions ...,risk exposure market risk includes fluctuating...,1
0,ID_0001137789_2018,1137789,2018,item 1a. risk factors the foll...,item 7. management's discussion and analysi...,item 7a. quantitative and qualitative discl...,principal risks inherent business affected fac...,condition results operations dollars millions ...,risk exposure market risk includes fluctuating...,1


**Write to CSV**

In [None]:
sp500_ml_df.to_csv(os.path.join(DATA_DIR, "ML_Dataset", "SEC_10K_SP500_part.csv"), index = False)

In [66]:
# only NLTKly parsed

sp500_ml_cleaned_df = sp500_ml_df[["ID_CIK_YEAR","CIK","YEAR",
                                   "CLEANED_ITEM_1A","CLEANED_ITEM_7",
                                   "CLEANED_ITEM_7A","SP_500"]]
sp500_ml_cleaned_df.to_csv(os.path.join(DATA_DIR, "ML_Dataset", "SEC_10K_SP500_Cleaned.csv"), index = False)

In [None]:
cleaned_10k_df[["CIK","YEAR","CLEANED_ITEM_1A","CLEANED_ITEM_7","CLEANED_ITEM_7A"]].head(20)

##

In [None]:
sp500_ml_cleaned_df.head(15)

## Assemble All Datasets

In [None]:
TXT_10K_FILES = glob(os.path.join(DATA_DIR, "sec_10k_text_files","*.txt"))

all_ml_dfs = build_nlp_10k_ml_dataset(TXT_10K_FILES, start_idx=0)
# combine all DFs into a single dataframe
all_ml_df = pd.concat(all_ml_dfs, axis = 0)

# add id column
all_ml_df.insert(loc = 0,
            column = "ID_CIK_YEAR",
            value = "ID_" + all_ml_df["CIK"] + "_" + all_ml_df["YEAR"])

# Add target column
all_ml_df["SP_500"] = 0

display(all_ml_df.shape)
display(all_ml_df.head())

In [None]:
# all_ml_df.to_csv(os.path.join(DATA_DIR, "ML_Dataset", "SEC_10K_All.csv"), index = False)

In [None]:
# only NLTKly parsed

all_ml_cleaned_df = all_ml_df[["ID_CIK_YEAR","CIK","YEAR",
                                   "CLEANED_ITEM_1A","CLEANED_ITEM_7",
                                   "CLEANED_ITEM_7A","SP_500"]]
all_ml_cleaned_df.to_csv(os.path.join(DATA_DIR, "ML_Dataset", "SEC_10K_ALL_Cleaned.csv"), index = False)

# Advanced NLTK

In [None]:
import nltk
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

#Let's grab word counts like we did prior
stemmer = nltk.stem.SnowballStemmer('english')
risk_sections = [stemmer.stem(risk_section) for risk_section in risk_sections]
vectorizer = CountVectorizer(stop_words='english')
counts = vectorizer.fit_transform(risk_sections)
counts = pd.DataFrame(counts.toarray(),columns=vectorizer.get_feature_names()).transpose()
counts.columns = [2018,2017,2016,2015,2014]
print(counts)

In [None]:
#In this case we don't need to turn it into frequency if we want to also penalize for different length risk sections
print(sum((counts[2018]-counts[2017])**2)**.5)

In [None]:
#We can apply the difference in a horizontal manner
print(counts.diff(axis=1).dropna(axis=1))

In [None]:
#And then get the distance
print((counts.diff(axis=1).dropna(axis=1)**2).sum()**.5)

In [None]:
#Create a function for the distance
def find_distance(risk_sections):
    stemmer = nltk.stem.SnowballStemmer('english')
    risk_sections = [stemmer.stem(risk_section) for risk_section in risk_sections]
    vectorizer = CountVectorizer(stop_words='english')
    counts = vectorizer.fit_transform(risk_sections)
    counts = pd.DataFrame(counts.toarray(),columns=vectorizer.get_feature_names()).transpose()
    counts.columns = [2018,2017,2016,2015,2014]
    return (counts.diff(axis=1).dropna(axis=1)**2).sum()**.5
print(find_distance(risk_sections))

# Applications of TF-IDF

In [None]:
import numpy as np
#One thing we can do is take the log of the number of terms
tf_log = np.log(1 + counts)
print(tf_log)

In [None]:
#Inverse Document Frequency

#First find the number of documents with each term
n = (counts > 0).sum(axis=1)

#Then divide that by total number of documents and take the log of it
idf = np.log(5 / n)
print(idf)

In [None]:
tf_idf = tf_log.multiply(idf, axis=0)
print(tf_idf)

In [None]:
#Now we want to try cosine similarity to compare
#Notice the interesting pattern here of similarity getting smaller and smaller as the years are farther apart
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(tf_idf.transpose())
similarity = pd.DataFrame(similarity, index=[2018,2017,2016,2015,2014],columns=[2018,2017,2016,2015,2014])
print(similarity)

In [None]:
import matplotlib.pyplot as plt

for yr in similarity.index:
    similarity.loc[yr].plot(kind='bar', color='blue')
    plt.title("10-K Filing {}".format(yr))
    plt.ylabel("Cosine Similarity")
    plt.show()

In [None]:
yoy_similarity = pd.Series([similarity.loc[x,x+1] for x in [2017,2016,2015,2014]])
yoy_similarity.index = ['{}-{}'.format(x,x+1) for x in [2017,2016,2015,2014]]
print(yoy_similarity)

In [None]:
#Let's see the most impactful words that either came on in the latest or were much more frequent
#As well as the opposite
print(tf_idf.diff(axis=1).dropna(axis=1).iloc[:,0].sort_values().head(10))
print()
print()
print(tf_idf.diff(axis=1).dropna(axis=1).iloc[:,0].sort_values(ascending=False).head(10))

In [None]:
#We see these words were specific only to one filing
print(counts.loc['autonomous'])
print()
print(counts.loc['turkey'])

In [None]:
print(counts.loc['software'])
print()
print(tf_idf.loc['software'])

In [None]:
print(tf_log.loc['software'])
print()
print(idf.loc['software'])

# Risk Analysis

In [None]:
#An application of looking at these things is to see if there are companies that are becoming competitors
#Only in the most recent filing amazon comes up, but Oracle constantly mentions cloud
print(counts.loc["amazon"])
print(counts.loc["cloud"])
print()
print()
#Because cloud is mentioned in every documents it has no importance in the tf_idf
print(tf_idf.loc["amazon"])
print(tf_idf.loc["cloud"])

In [None]:
#Pull the risk sections of Amazon and Oracle
documents_oracle = pull_10K( "0001341439")
documents_amazon = pull_10K("0001018724")

risk_sections_oracle = [pull_risk_section(document) for document in documents_oracle]
risk_sections_amazon = [pull_risk_section(document) for document in documents_amazon]


In [None]:
#Parse each risk section
stemmer = nltk.stem.SnowballStemmer('english')
vectorizer = CountVectorizer(stop_words='english')
counts_oracle = vectorizer.fit_transform(risk_sections_oracle)
counts_oracle = pd.DataFrame(counts_oracle.toarray(),columns=vectorizer.get_feature_names()).transpose()
counts_oracle.columns = [2018,2017,2016,2015,2014]


counts_amazon = vectorizer.fit_transform(risk_sections_amazon)
counts_amazon = pd.DataFrame(counts_amazon.toarray(),columns=vectorizer.get_feature_names()).transpose()
counts_amazon.columns = [2018,2017,2016,2015,2014]


counts_amazon = counts_amazon.stack().reset_index()
counts_oracle = counts_oracle.stack().reset_index()

In [None]:
counts_amazon.columns = ["Word", "Time Period", "Count"]
counts_amazon["Company"] = "Amazon"
counts_oracle.columns = ["Word", "Time Period", "Count"]
counts_oracle["Company"] = "Oracle"
counts = pd.concat([counts_amazon, counts_oracle])
print(counts)

In [None]:
#Now that we are combining the two together, something to consider is how idf will be impacted
#Before, it was a huge difference when Oracle had Amazon added to the risk section
#Now, however, the word will look much less important if IDF is done over the full document
print(counts[counts["Word"] == "amazon"])
print()
print()
#Likewise, Oracle as a word will now be more important because it is not featured in Amazon's 10-K
print(counts[counts["Word"] == "oracle"])

In [None]:
counts = counts.set_index(["Company", "Time Period", "Word"])["Count"].unstack().transpose().fillna(0)
print(counts)

In [None]:
tf_log = np.log(counts + 1)

n = (counts > 0).sum(axis=1)

idf = np.log(len(counts.columns) / n)
tf_idf = tf_log.multiply(idf, axis=0)
print(tf_idf)

In [None]:
#Let's find cosine similarity
similarity = cosine_similarity(tf_idf.transpose())
similarity = pd.DataFrame(similarity, index=tf_idf.columns,columns=tf_idf.columns)
print(similarity)

In [None]:
#And for each time period find the similarity between the two
yearly_sim = pd.Series([similarity.loc[("Amazon",x),("Oracle", x)] for x in [2018,2017,2016,2015,2014]], index=[2018,2017,2016,2015,2014])
print(yearly_sim)

In [None]:
#We can see that amazon talks a lot more about the supply chain buzzwords
diff = tf_idf[("Amazon", 2018)] - tf_idf[("Oracle", 2018)]
print(diff.sort_values(ascending=False).head(10))
print()
print()
print(diff.sort_values().head(10))

In [None]:
#We see that two new words that amazon began using stores, and omnichannel became a big difference
#One limitation is that words are split
#So if amazon mentions whole foods it would be split losing the actual importance of it being a company
#rather than two distinct words
i = diff.sort_values(ascending=False).head(5).index
print(counts.loc[i])

In [None]:
#This analysis can be useful to assess country level threats
#And correlations in the currency markets
print(counts.loc[['yemen', 'turkey', 'china']])

# Sentiment Analysis

In [None]:
#The word list has multiple sheets with tone descriptions for different words
#Something to note is that a word can be in multiple lists!
import pandas as pd
word_list = pd.read_excel("Data/LM Word List.xlsx", sheet_name="Negative",header=None)
print(word_list)

In [None]:
#Create a matrix of word types and the words that match these types
word_list = []
for sentiment_class in ["Negative", "Positive", "Uncertainty", "Litigious",
                       "StrongModal", "WeakModal", "Constraining"]:
    sentiment_list = pd.read_excel("Data/LM Word List.xlsx", sheet_name=sentiment_class,header=None)
    sentiment_list.columns = ["Word"]
    sentiment_list["Word"] = sentiment_list["Word"].str.lower()
    sentiment_list[sentiment_class] = 1
    sentiment_list = sentiment_list.set_index("Word")[sentiment_class]
    word_list.append(sentiment_list)
word_list = pd.concat(word_list, axis=1, sort=True).fillna(0)
print(word_list)

In [None]:
#Examples of words which have multiple groups
print(word_list.loc[word_list.sum(axis=1) > 1])

In [None]:
#Let's reindex by negative words, as well as drop na, and see what negative words are mentioned
#We will look at percent frequency
tf_percent = counts / counts.sum()
negative_words = word_list[word_list["Negative"] == 1].index
negative_frequency = tf_percent.reindex(negative_words).dropna()
print(negative_frequency)

In [None]:
#There seems to be slightly higher negative term frequency for amazon, especially in the last few years
print("Year by Company negative word frequency")
print(negative_frequency.sum())
print()
print("Average company negative frequency")
print(negative_frequency.sum().groupby("Company").mean())

In [None]:
#Let's see what were the most common negative words in Amazon's latest risk section
negative_frequency.sort_values(by=("Amazon", 2018), ascending=False)

In [None]:
#And for all word types.....
l = []
for word_type in word_list.columns:
    word_type_list = word_list[word_list[word_type] == 1].index
    word_type_frequency = tf_percent.reindex(word_type_list).dropna().sum()
    l.append(word_type_frequency)
word_type_frequency = pd.concat(l, axis=1)
word_type_frequency.columns = word_list.columns
print(word_type_frequency)