In [2]:
import re
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from nltk.util import bigrams, trigrams, ngrams

from collections import Counter

In [3]:
# Read the whole filing excerpt into memory. The `with` block guarantees the
# file handle is closed as soon as the read finishes (the bare
# `open(...).read()` form leaves the handle open until garbage collection).
with open('resources/full_text/AAPL_0000320193_10Q_20171230_AllItems_excerpt.txt', encoding='utf-8') as source_file:
    text_file = source_file.read()

# Work on the first 3000 characters only so the printed output stays small.
condensed_text = text_file[0:3000]

In [4]:
# Split the excerpt into word tokens: each token is a maximal run of ASCII
# letters, so digits, punctuation and whitespace are discarded.
tokenized = re.findall('[a-zA-Z]+', condensed_text)
print('Total tokenized words:', len(tokenized))

# Stem every token with the Porter stemmer (one output per input token).
stemmer = PorterStemmer()
stemmed = [stemmer.stem(token) for token in tokenized]
print('Total stemmed words:', len(stemmed))

# Lemmatize every token with the WordNet lemmatizer (one output per input token).
lemmatizer = WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(token) for token in tokenized]
print('Total lemmatized words:', len(lemmatized))

# Remove English stop words from the lemmatized tokens. The comparison is done
# on the lowercased token, but the surviving tokens keep their original case.
# `stop_words` stays the list NLTK returns; the set is only used so that each
# membership test is a hash lookup instead of a scan of the whole list.
stop_words = stopwords.words('english')
stop_word_lookup = set(stop_words)
new_lemmatized = [token for token in lemmatized if token.lower() not in stop_word_lookup]
print('Total tokenized (w/out stop words) words:', len(new_lemmatized))

# Bag-of-words: occurrence count of each remaining (case-sensitive) token.
bag_of_words = Counter(new_lemmatized)

print("Stemmed text:")
print(stemmed)
print("\nLemmatized text:")
print(lemmatized)
print("\nLemmatized (without stop words) text:")
print(new_lemmatized)


Total tokenized words: 408
Total stemmed words: 408
Total lemmatized words: 408
Total tokenized (w/out stop words) words: 284
Stemmed text:
['document', 'unit', 'state', 'secur', 'and', 'exchang', 'commiss', 'for', 'the', 'quarterli', 'period', 'end', 'decemb', 'or', 'for', 'the', 'transit', 'period', 'from', 'to', 'appl', 'inc', 'indic', 'by', 'check', 'mark', 'whether', 'the', 'registr', 'ha', 'file', 'all', 'report', 'requir', 'to', 'be', 'file', 'by', 'section', 'or', 'd', 'of', 'the', 'secur', 'exchang', 'act', 'of', 'dure', 'the', 'preced', 'month', 'or', 'for', 'such', 'shorter', 'period', 'that', 'the', 'registr', 'wa', 'requir', 'to', 'file', 'such', 'report', 'and', 'ha', 'been', 'subject', 'to', 'such', 'file', 'requir', 'for', 'the', 'past', 'day', 'indic', 'by', 'check', 'mark', 'whether', 'the', 'registr', 'ha', 'submit', 'electron', 'and', 'post', 'on', 'it', 'corpor', 'web', 'site', 'if', 'ani', 'everi', 'interact', 'data', 'file', 'requir', 'to', 'be', 'submit', 'and',

In [5]:
# Replace every run of non-word characters with a single space and lowercase
# the result before tokenizing. The pattern is a raw string so that `\W` is
# not treated as an (invalid) string escape sequence.
cleaned = re.sub(r'\W+', ' ', condensed_text).lower()
tokenized = word_tokenize(cleaned)

# Count the n-grams directly instead of binding the results to the names
# `bigrams` / `trigrams` / `ngrams`: rebinding those names would shadow the
# functions imported from nltk.util, and re-running this cell would then fail
# because the names no longer refer to callables.

# Frequency of each pair of adjacent tokens.
bigrams_frequency = Counter(bigrams(tokenized))

# Frequency of each run of three adjacent tokens.
trigrams_frequency = Counter(trigrams(tokenized))

# Frequency of each run of `ngram_size` adjacent tokens (any n > 3 works here).
ngram_size = 5
ngrams_frequency = Counter(ngrams(tokenized, ngram_size))

# Show the ten most frequent entries of each table.
# NOTE(review): the labels say "Looking Glass" although the text loaded in this
# notebook is an AAPL 10-Q excerpt -- presumably left over from a template;
# confirm before renaming the labels.
print("Looking Glass Bigrams:")
print(bigrams_frequency.most_common(10))

print("\nLooking Glass Trigrams:")
print(trigrams_frequency.most_common(10))

print("\nLooking Glass n-grams:")
print(ngrams_frequency.most_common(10))


Looking Glass Bigrams:
[(('condensed', 'consolidated'), 9), (('apple', 'inc'), 8), (('the', 'registrant'), 6), (('financial', 'statements'), 6), (('accelerated', 'filer'), 5), (('statements', 'apple'), 5), (('notes', 'to'), 5), (('to', 'condensed'), 5), (('consolidated', 'financial'), 5), (('for', 'the'), 4)]

Looking Glass Trigrams:
[(('financial', 'statements', 'apple'), 5), (('statements', 'apple', 'inc'), 5), (('notes', 'to', 'condensed'), 5), (('to', 'condensed', 'consolidated'), 5), (('condensed', 'consolidated', 'financial'), 5), (('consolidated', 'financial', 'statements'), 5), (('indicate', 'by', 'check'), 4), (('by', 'check', 'mark'), 4), (('check', 'mark', 'whether'), 4), (('mark', 'whether', 'the'), 4)]

Looking Glass n-grams:
[(('notes', 'to', 'condensed', 'consolidated', 'financial'), 5), (('to', 'condensed', 'consolidated', 'financial', 'statements'), 5), (('indicate', 'by', 'check', 'mark', 'whether'), 4), (('by', 'check', 'mark', 'whether', 'the'), 4), (('check', 'mark