In [1]:
import pandas as pd
import numpy as np
import os
from pathlib2 import Path
import re
import shutil

# preprocess filings
import string
from nltk import word_tokenize
from nltk.stem import PorterStemmer

# to vectorize filing
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
def get_project_dir():
    """Locate the project directory, make it the working directory, and return it.

    Two candidate locations are tried in order, both anchored at the root of
    the current working directory's filesystem:

    1. ``<root>/My Drive/Jotham/.../sec-sentiment``
    2. ``<root>/Volumes/GoogleDrive/My Drive/Jotham/.../sec-sentiment``

    Returns:
        Path: the first candidate that ``os.chdir`` accepted.

    Raises:
        OSError: if neither candidate can be entered (the error raised by the
            second attempt propagates to the caller).
    """
    # Joining a path with '/' discards everything but the root, so only the
    # anchor of the current working directory is ever relevant.
    root = Path(Path.cwd().anchor)
    project_parts = (
        'My Drive', 'Jotham', 'Personal Docs', 'ML for finance',
        'SEC Sentiment Analysis', 'sec-sentiment',
    )
    primary = root.joinpath(*project_parts)
    fallback = root.joinpath('Volumes', 'GoogleDrive', *project_parts)

    try:
        os.chdir(primary)
        return primary
    except OSError:
        # Only filesystem errors trigger the fallback; KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        os.chdir(fallback)
        return fallback

In [3]:
# Work from OPTICAL CABLE CORP's cleaned filings so the following cells can use bare file names.
# NOTE: this changes the process-wide working directory for every later cell.
os.chdir(os.path.join(get_project_dir(), 'sec-filings-downloaded', 'OPTICAL CABLE CORP', 'cleaned_filings'))

In [4]:
# List the cleaned filings in the current working directory (the recorded output is not sorted).
os.listdir()

['cleaned_2014-12-19_10-K',
 'cleaned_Q1_2015-03-10_10-Q',
 'cleaned_Q3_2014-09-10_10-Q',
 'cleaned_Q1_2014-03-17_10-Q',
 'cleaned_Q1_2016-03-14_10-Q',
 'cleaned_Q2_2016-06-07_10-Q',
 'cleaned_Q2_2014-06-11_10-Q',
 'cleaned_Q2_2015-06-12_10-Q',
 'cleaned_Q1_2017-03-08_10-Q',
 'cleaned_Q3_2015-09-11_10-Q',
 'cleaned_Q3_2016-09-13_10-Q',
 'cleaned_2016-01-28_10-K',
 'cleaned_Q3_2017-09-12_10-Q',
 'cleaned_2018-12-19_10-K',
 'cleaned_Q2_2017-06-13_10-Q',
 'cleaned_2017-12-20_10-K',
 'cleaned_Q2_2018-06-11_10-Q',
 'cleaned_Q3_2018-09-11_10-Q',
 'cleaned_2016-12-20_10-K',
 'cleaned_Q1_2018-03-13_10-Q']

In [5]:
# Open the filing by name instead of by position: os.listdir() returns entries
# in arbitrary order, so an index is not reproducible across runs or machines.
# This is the file that sat at index 0 in the recorded listing.
with open('cleaned_2014-12-19_10-K') as file1:
    # Only the first line is read; presumably each cleaned filing is stored on a single line (TODO confirm).
    data = file1.readline()

In [6]:
# Open the filing by name instead of by position: os.listdir() returns entries
# in arbitrary order, so an index is not reproducible across runs or machines.
# This is the file that sat at index 5 in the recorded listing.
with open('cleaned_Q2_2016-06-07_10-Q') as file2:
    # Only the first line is read; presumably each cleaned filing is stored on a single line (TODO confirm).
    data2 = file2.readline()

In [7]:
# Open the filing by name instead of by position: os.listdir() returns entries
# in arbitrary order, so an index is not reproducible across runs or machines.
# This is the file that sat at index 1 in the recorded listing.
with open('cleaned_Q1_2015-03-10_10-Q') as file3:
    # Only the first line is read; presumably each cleaned filing is stored on a single line (TODO confirm).
    data3 = file3.readline()

In [8]:
# Display the text read into data2 (a 10-Q, per the recorded output).
data2

' Table Of Contents UNITED STATES SECURITIES AND EXCHANGE COMMISSION Washington, D.C. 20549 FORM 10-Q [ X ] QUARTERLY REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 For the quarterly period ended April 30, 2016 OR [ ] TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 For the transition period from _____________ to ______________ Commission file number 0-27022 OPTICAL CABLE CORPORATION (Exact name of registrant as specified in its charter) 5290 Concourse Drive Roanoke, Virginia 24019 (Address of principal executive offices, including zip code) (540) 265-0690 (Registrant s telephone number, including area code) Indicate by check mark whether the registrant (1) has filed all reports required to be filed by Section 13 or 15(d) of the Securities Exchange Act of 1934 during the preceding 12 months (or for such shorter period that the registrant was required to file such reports), and (2) has been subject to such filing req

# Preprocessing

**Import stopwords from the Loughran-McDonald Master Dictionary**

In [9]:
def import_master_dict_stopwords(stopwords_file_dir=None):
    """Load the generic stop-word list as a frozenset of lower-case words.

    Args:
        stopwords_file_dir: directory containing ``StopWords_Generic.txt``.
            When omitted, ``<project dir>/master-dict`` is used; it is resolved
            at call time rather than when the function is defined.

    Returns:
        frozenset[str]: the stop words, stripped and lower-cased.

    Raises:
        OSError: if the stop-word file (or the default project dir) is missing.
    """
    if stopwords_file_dir is None:
        stopwords_file_dir = os.path.join(get_project_dir(), 'master-dict')

    # Read via the full path so the caller's working directory is left alone.
    stopwords_path = os.path.join(stopwords_file_dir, 'StopWords_Generic.txt')

    # dtype=str + keep_default_na=False keep every entry a string; otherwise
    # words such as "NULL" or "NA" are parsed as NaN and .lower() fails.
    words = pd.read_csv(
        stopwords_path, header=None, dtype=str, keep_default_na=False
    )[0]
    return frozenset(word.strip().lower() for word in words)

In [10]:
def preprocess_filing(text, stopwords=True, stemming=False):
    """Turn raw filing text into a list of tokens.

    Steps: delete ASCII punctuation, tokenise with NLTK's ``word_tokenize``,
    optionally lower-case and drop stop words, optionally Porter-stem.

    Args:
        text: the filing text.
        stopwords: when truthy, tokens are lower-cased and those found in the
            stop-word list from ``import_master_dict_stopwords`` are removed.
            When falsy, tokens keep their original case.
        stemming: when truthy, each token is replaced by its Porter stem.

    Returns:
        list[str]: the processed tokens, in document order.
    """
    # Delete every punctuation character in a single pass.
    text = text.translate(str.maketrans('', '', string.punctuation))

    tokens = word_tokenize(text)

    if stopwords:
        stopword_set = import_master_dict_stopwords()
        # Lower-case BEFORE filtering: the stop-word set is lower-case, so
        # filtering first would let capitalised stop words ("The") through.
        lowered = (word.lower() for word in tokens)
        tokens = [word for word in lowered if word not in stopword_set]

    if stemming:
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(word) for word in tokens]

    return tokens

In [28]:
# Fit a bag-of-words model on two of the loaded filings, tokenising with preprocess_filing.
# NOTE(review): this cell's execution count (28) is later than that of the cells shown below it,
# and it rebinds the same global `X` they use — confirm the intended run order.
vectorizer = CountVectorizer(tokenizer=preprocess_filing)
X = vectorizer.fit_transform([data2, data3])
# vectorizer.get_feature_names()

In [23]:
def vectorize_and_preprocess_filings(filings_list):
    """Build a token-count matrix for a list of filing texts.

    A fresh ``CountVectorizer`` is fitted on ``filings_list`` with
    ``preprocess_filing`` as its tokenizer; only the resulting matrix is
    returned (the fitted vectorizer itself is discarded).
    """
    return CountVectorizer(tokenizer=preprocess_filing).fit_transform(filings_list)

In [24]:
# Count matrix for the three loaded filings (one row per filing, per the recorded shape).
X = vectorize_and_preprocess_filings([data, data2, data3])

In [25]:
# (rows, columns) of the count matrix — presumably (number of filings, vocabulary size).
X.shape

(3, 2126)

In [26]:
# Number of columns in the count matrix. Read it from the shape instead of
# converting the whole matrix to a dense array just to measure one row.
X.shape[1]

2126

# Calculating Similarity

In [30]:
def calculate_consine_similarity(a, b):
    """Return the cosine similarity of two 1-D vectors.

    Computed as ``dot(a, b) / (||a|| * ||b||)``.

    Args:
        a, b: array-likes of equal length.

    Returns:
        The similarity as a float; ``0.0`` when either vector has zero norm
        (instead of NaN from a 0/0 division).
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        # An all-zero vector has no direction; treat it as sharing nothing.
        return 0.0
    return np.dot(a, b) / norm_product

In [31]:
# Cosine similarity between the first two rows of X.
# NOTE(review): which filings those rows are depends on which cell last assigned X — confirm.
calculate_consine_similarity(X.toarray()[0] ,X.toarray()[1])

0.8209976571458523

In [32]:
def calculate_similarities(filing_type=['10-K', '10-Q']):
    """Walk every company's ``cleaned_filings`` directory (work in progress).

    The function currently only changes the working directory into each
    company's cleaned-filings folder in turn; no similarity is computed yet
    and nothing is returned.

    Args:
        filing_type: filing types of interest. Not used by the current body.
    """
    project_dir = get_project_dir()
    filings_root = os.path.join(project_dir, 'sec-filings-downloaded')
    os.chdir(filings_root)

    for company in os.listdir(filings_root):
        cleaned_dir = os.path.join(filings_root, company, 'cleaned_filings')
        # Skip stray entries that are not company folders with cleaned filings.
        if not os.path.isdir(cleaned_dir):
            continue
        # The original called os.chdir() with no argument, which always raises
        # TypeError; enter the company's cleaned-filings folder instead.
        os.chdir(cleaned_dir)

In [59]:
# Index each company's cleaned filings by type.
# ten_k_dict maps 10-K file name -> filing year (int);
# ten_q_dict maps 10-Q file name -> '<quarter>_<year>' (e.g. 'Q1_2015').
# NOTE(review): both dicts are re-created for every company, so after the loop
# they describe only the last company processed — confirm this is intended.
project_dir = get_project_dir()
filings_root = os.path.join(project_dir, 'sec-filings-downloaded')
os.chdir(filings_root)
company_dir_list = os.listdir(filings_root)

# Every cleaned file name starts with this prefix; date fields follow it.
prefix_len = len('cleaned_')

for company in company_dir_list:
    company_dir = os.path.join(project_dir, 'sec-filings-downloaded', company)
    cleaned_dir = os.path.join(company_dir, 'cleaned_filings')
    # Skip stray entries that are not company folders with cleaned filings.
    if not os.path.isdir(cleaned_dir):
        continue
    os.chdir(cleaned_dir)

    ten_k_dict = {}
    ten_q_dict = {}

    for file in os.listdir(cleaned_dir):
        if file.endswith('10-K'):
            # 'cleaned_YYYY-MM-DD_10-K' -> YYYY
            filing_year = int(file[prefix_len:prefix_len + 4])
            ten_k_dict[file] = filing_year
        elif file.endswith('10-Q'):
            # 'cleaned_Qn_YYYY-MM-DD_10-Q' -> 'Qn' and 'YYYY'
            filing_quarter = file[prefix_len:prefix_len + 2]
            filing_year = file[prefix_len + 3:prefix_len + 7]
            ten_q_dict[file] = filing_quarter + '_' + filing_year
            
    

In [61]:
# Inspect the 10-K mapping left by the loop: file name -> filing year.
# It holds a single company's filings because the dicts are re-created per company.
ten_k_dict

{'cleaned_2014-02-13_10-K': 2014,
 'cleaned_2015-02-17_10-K': 2015,
 'cleaned_2016-02-12_10-K': 2016,
 'cleaned_2017-02-10_10-K': 2017,
 'cleaned_2018-02-12_10-K': 2018,
 'cleaned_2019-02-11_10-K': 2019}

In [65]:
# Print the 10-K file names whose filing year is after 2016.
for filing, year in ten_k_dict.items():
    if year > 2016:
        print(filing)

cleaned_2017-02-10_10-K
cleaned_2018-02-12_10-K
cleaned_2019-02-11_10-K


In [67]:
# Print the 10-Q file names labelled Q1 (values look like 'Q1_2015', so the first two characters are the quarter).
for filing, quarter_year in ten_q_dict.items():
    if quarter_year[0:2] == 'Q1': print(filing)
    

cleaned_Q1_2014-04-25_10-Q
cleaned_Q1_2015-04-29_10-Q
cleaned_Q1_2016-04-22_10-Q
cleaned_Q1_2018-04-27_10-Q
cleaned_Q1_2017-04-21_10-Q


In [68]:
# NOTE(review): dict.fromkeys needs an iterable of keys, so this call raises TypeError (see recorded output).
# The intent is unclear — possibly `ten_k_dict.keys()`; confirm or delete this cell.
ten_k_dict.fromkeys()

TypeError: fromkeys expected at least 1 arguments, got 0

In [34]:
# NOTE(review): `filing_dict` is not defined anywhere in the visible notebook — this cell
# appears to depend on leftover kernel state; confirm or remove.
filing_dict.values()

dict_values([])

In [35]:
# NOTE(review): the recorded shape has 2 rows, which matches the two-filing fit (data2, data3)
# rather than the three-filing one — the value of X depends on cell run order.
X.shape

(2, 1669)

**Financial Data Used**
- Monthly stock returns from the Center for Research in Security Prices (CRSP)
- Firm's Book value of equity and EPS from Compustat
- Analyst data from the Institutional Brokers' Estimate System (I/B/E/S)
- Sentiment category identifiers from Loughran and McDonald (2011)’s Master Dictionary.

**Similarity measures calculated** <br>
Quarter on quarter similarities between 10-Q and 10-K filings using the following four similarity measures:
- cosine similarity
- Jaccard similarity
- minimum edit distance
- simple similarity

Higher values indicate a higher degree of document similarity across years between the 10-Ks (or 10-Qs), while lower values indicate more changes across documents

**Mechanism Test**<br>
- Measure change of sentiment in document by counting the number of positive words minus number of negative words in the changes between the old and new document, normalized by the size of the changes


- compute the uncertainty and litigious nature of the change by counting the number of words categorized as uncertainty and litigious, respectively, normalized by the size of the changes


- Sentiment category identifiers (e.g., negative, positive, uncertainty, litigious) are taken from Loughran and McDonald (2011)’s Master Dictionary


- parse 10-K/Q documents for mentioning of CEO or CFO turnover and define two indicator variables Change CEO and Change CFO that take the value of 1 if the 10-K/Q documents mention a change in CEO or CFO


- Lastly, we obtain firms’ auditor information from AuditAnalytics

**Linking Similarity / Mechanism Measures to Stock Returns**<br><br>
Two methods were used: standard calendar-time portfolios, and Fama-MacBeth monthly cross-sectional regressions, which control for additional determinants of returns

**Calendar Time Portfolio returns**<br><br>
For each of the four similarity measures, quintiles were computed each month based on the prior month's distribution of similarity scores across all stocks.
- For firms with FYE Dec, for calendar quarter Q1, the firm's 10-Q was used, which is generally released in April or May
- For Q2, the 10-Q which is generally released in Jul or Aug was used
- For Q3, the 10-Q which is generally released in Oct or Nov was used
- For year end results, the 10-K was used, which is generally released in Jan or Feb

Similarity scores are computed relative to the prior year report that lines up in calendar time with the report in question (such that 2004 Q1 10-Qs are compared with 2005 Q1 10-Qs, for example)

Stocks enter the portfolio in the month after the public release of one of their reports, which induces a lag in our portfolio construction. Firms are held in the portfolio for 3 months. Portfolios are rebalanced monthly, and the returns are reported in Table IV.

# Immediate Next Steps

**Calculating document similarity across years**
- Document similarity will be compared YoY (e.g. 2019 10-K will be compared with 2018 10-K)
- First label the 10Q with their respective quarters. Extract year and quarters from the 10Q (done)

Jan - May: Q1
Jun - Aug: Q2
Sep - Dec: Q3

- convert each filing to vec (tokenize each file)
- apply stop words
- lemmatize / stem the filing if necessary