In [22]:
import glob, re
import nltk
from nltk import FreqDist
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.collocations import *

import string
stopWords = set(stopwords.words('english') ) 

# files holds a list with elements each being a path to a html file
# assumes the folder is in the same folder as this jupyter notebook
files = glob.glob('letters_text/*.txt')

allFiles= []
for file in files[0:20]:
    # read file
    with open( file, encoding='utf-8') as f:
        content = f.read()
    file_tokens = [x for x in word_tokenize(content) if x.lower() not in stopWords and x not in string.punctuation]
    # now we can use nltk functions on the text
    fdist = FreqDist(file_tokens)
    print(fdist)
    print ('most common words in this letter', fdist.most_common(15) )
    # add it to the list of all files
    allFiles.append( file_tokens )

<FreqDist with 899 samples and 4215 outcomes>
most common words in this letter [('tax', 208), ('deferred', 137), ('assets', 100), ('TRA', 77), ('income', 67), ('million', 57), ('LP', 53), ('2016', 51), ('future', 44), ('amount', 43), ('taxable', 38), ('valuation', 38), ('allowance', 37), ('’', 35), ('Units', 33)]
<FreqDist with 377 samples and 821 outcomes>
most common words in this letter [('tax', 29), ('December', 16), ('31', 15), ('2015', 13), ('deferred', 11), ('taxable', 10), ('recognized', 10), ('year', 9), ('asset', 9), ('2016', 8), ('ended', 8), ('differences', 8), ('’', 7), ('future', 7), ('profits', 7)]
<FreqDist with 194 samples and 289 outcomes>
most common words in this letter [('tax', 10), ('Company', 7), ('’', 6), ('2017', 4), ('1', 4), ('response', 4), ('benefits', 4), ('Form', 3), ('10-K', 3), ('year', 3), ('ended', 3), ('December', 3), ('31', 3), ('2016', 3), ('letter', 3)]
<FreqDist with 310 samples and 573 outcomes>
most common words in this letter [('Company', 12),

In [15]:
allFiles[1]

['CORRESP',
 'September',
 '20',
 '2016',
 'VIA',
 'EDGAR',
 'Joel',
 'Parker',
 'Senior',
 'Assistant',
 'Chief',
 'Accountant',
 'Offices',
 'Beverages',
 'Apparel',
 'Mining',
 'Securities',
 'Exchange',
 'Commission',
 '100',
 'F',
 'Street',
 'N.E',
 'Washington',
 'D.C.',
 '20549',
 'Turquoise',
 'Hill',
 'Resources',
 'Ltd.',
 'Form',
 '40-F',
 'fiscal',
 'year',
 'ended',
 'December',
 '31',
 '2015',
 'Filed',
 'March',
 '17',
 '2016',
 'Response',
 'Dated',
 'July',
 '28',
 '2016',
 'File',
 '001-32403',
 'Dear',
 'Mr.',
 'Parker',
 'Turquoise',
 'Hill',
 'Resources',
 'Ltd.',
 '“',
 'Company',
 '”',
 'hereby',
 'acknowledges',
 'receipt',
 'comment',
 'letter',
 'dated',
 'August',
 '24',
 '2016',
 '“',
 'Comment',
 'Letter',
 '”',
 'containing',
 'comments',
 'respect',
 'July',
 '28',
 '2016',
 'response',
 'comment',
 'letter',
 'dated',
 'July',
 '1',
 '2016',
 'staff',
 '“',
 'Staff',
 '”',
 'Securities',
 'Exchange',
 'Commission',
 'concerning',
 'captioned',
 'Form',


In [8]:
allFiles[1][0:10]

['CORRESP',
 'September',
 '20',
 '2016',
 'VIA',
 'EDGAR',
 'Joel',
 'Parker',
 'Senior',
 'Assistant']

In [19]:
file_tokens

['Letterhead',
 'Wachtell',
 'Lipton',
 'Rosen',
 'Katz',
 'September',
 '13',
 '2016',
 'VIA',
 'HAND',
 'DELIVERY',
 'EDGAR',
 'Suzanne',
 'Hayes',
 'Assistant',
 'Director',
 'Office',
 'Healthcare',
 'Insurance',
 'Division',
 'Corporation',
 'Finance',
 'U.S.',
 'Securities',
 'Exchange',
 'Commission',
 '100',
 'F',
 'Street',
 'NE',
 'Washington',
 'D.C.',
 '20549',
 'Abbott',
 'Laboratories',
 'Amendment',
 '1',
 'Registration',
 'Statement',
 'Form',
 'S-4',
 'Filed',
 'August',
 '9',
 '2016',
 'File',
 '333-212002',
 'Dear',
 'Ms.',
 'Hayes',
 'behalf',
 'client',
 'Abbott',
 'Laboratories',
 '“',
 'Abbott',
 '”',
 '“',
 'Company',
 '”',
 'set',
 'forth',
 'responses',
 'Company',
 'comments',
 'Staff',
 '“',
 'Staff',
 '”',
 'Division',
 'Corporation',
 'Finance',
 'U.S.',
 'Securities',
 'Exchange',
 'Commission',
 '“',
 'Commission',
 '”',
 'set',
 'forth',
 'letter',
 'dated',
 'August',
 '9',
 '2016',
 'regarding',
 'Amendment',
 '1',
 'Company',
 '’',
 'Registration',
 

In [20]:
# example collocations (for one document)
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(file_tokens)
sorted(finder.nbest(bigram_measures.raw_freq, 10))

[('9', '2016'),
 ('Abbott', 'Laboratories'),
 ('Amendment', '2'),
 ('August', '9'),
 ('Exchange', 'Commission'),
 ('Guggenheim', 'Securities'),
 ('Registration', 'Statement'),
 ('Securities', 'Exchange'),
 ('Staff', '’'),
 ('Statement', 'revised')]

In [23]:
# five most frequent collocations (for each letter)
bigram_measures = nltk.collocations.BigramAssocMeasures()

for f in allFiles:
    # set finder
    finder = BigramCollocationFinder.from_words(f)
    # get 5 most frequent
    most = sorted(finder.nbest(bigram_measures.raw_freq, 5))
    print(most)
    

[('LP', 'Units'), ('deferred', 'tax'), ('tax', 'assets'), ('taxable', 'income'), ('valuation', 'allowance')]
[('31', '2015'), ('December', '31'), ('deferred', 'tax'), ('ended', 'December'), ('tax', 'asset')]
[('31', '2016'), ('Company', '’'), ('December', '31'), ('Form', '10-K'), ('tax', 'benefits')]
[('9', '2016'), ('Amendment', '2'), ('August', '9'), ('Guggenheim', 'Securities'), ('Registration', 'Statement')]
[('Amendment', '3'), ('CONSOL', 'Mining'), ('Company', 'respectfully'), ('Staff', '’'), ('disclosure', 'page')]
[("''", '100'), ("''", 'border=0'), ('100', "''"), ('style=', "''"), ('width=', "''")]
[('30', '2015'), ('Exchange', 'Commission'), ('Form', '10-K'), ('November', '30'), ('Securities', 'Exchange')]
[('December', '31'), ('Senior', 'Notes'), ('cash', 'cash'), ('cash', 'equivalents'), ('foreign', 'subsidiaries')]
[('January', '28'), ('attributable', 'countries'), ('countries', 'outside'), ('long-lived', 'assets'), ('outside', 'U.S.')]
[('Fund', '’'), ('Investment', 'Stra