In [32]:
from glob import glob
from collections import defaultdict, Counter

In [33]:
# path to the dataset
# update with a path pointing to the dataset on your own machine!
path = '/Users/hsh28/data/trump_stumps/'

In [34]:
# gathering all text filenames in the directory
fps = glob(path + '*.txt')
fps[:5]

['/Users/hsh28/data/trump_stumps/FayettevilleSep9_2019.txt',
 '/Users/hsh28/data/trump_stumps/TupeloNov1_2019.txt',
 '/Users/hsh28/data/trump_stumps/NewHampshireAug15_2019.txt',
 '/Users/hsh28/data/trump_stumps/HendersonSep13_2020.txt',
 '/Users/hsh28/data/trump_stumps/OhioSep21_2020.txt']

In [35]:
# where we'll store all the data of our speeches
data = {}
for f in fps:
    # name will be the direct filename, without prefix path
    name = f.split('/')[-1]
    
    # read the lines of the speech
    with open(f) as ff:
        lines = ff.readlines()
    
    # Understanding how many lines each speech has
    # Hint: It should just be 1
    print(f'{name} has {len(lines)} lines!')
        
    data[name] = lines

FayettevilleSep9_2019.txt has 1 lines!
TupeloNov1_2019.txt has 1 lines!
NewHampshireAug15_2019.txt has 1 lines!
HendersonSep13_2020.txt has 1 lines!
OhioSep21_2020.txt has 1 lines!
PhoenixFeb19_2020.txt has 1 lines!
BattleCreekDec19_2019.txt has 1 lines!
PittsburghSep22_2020.txt has 1 lines!
TexasSep23_2019.txt has 1 lines!
ColoradorSpringsFeb20_2020.txt has 1 lines!
LatrobeSep3_2020.txt has 1 lines!
DallasOct17_2019.txt has 1 lines!
DesMoinesJan30_2020.txt has 1 lines!
MinneapolisOct10_2019.txt has 1 lines!
YumaAug18_2020.txt has 1 lines!
ToledoJan9_2020.txt has 1 lines!
CharlotteMar2_2020.txt has 1 lines!
Winston-SalemSep8_2020.txt has 1 lines!
TulsaJun20_2020.txt has 1 lines!
NewMexicoSep16_2019.txt has 1 lines!
HersheyDec10_2019.txt has 1 lines!
NewHampshireFeb10_2020.txt has 1 lines!
LasVegasFeb21_2020.txt has 1 lines!
NewHampshireAug28_2020.txt has 1 lines!
MindenSep12_2020.txt has 1 lines!
FreelandSep10_2020.txt has 1 lines!
CharlestonFeb28_2020.txt has 1 lines!
MosineeSep17_202

In [36]:
# where we'll store all the data of our speeches
# a revision of above since we know that each file is 1 line
data = {}
for f in fps:
    # name will be the direct filename, without prefix path
    name = f.split('/')[-1]
    
    # read the lines of the speech
    with open(f) as ff:
        text = ff.readlines()[0]
    
    # Print the number of characters in each speech!
    print(f'{name:30}:\t{len(text)} chars')
        
    data[name] = text

FayettevilleSep9_2019.txt     :	50472 chars
TupeloNov1_2019.txt           :	51128 chars
NewHampshireAug15_2019.txt    :	55388 chars
HendersonSep13_2020.txt       :	49082 chars
OhioSep21_2020.txt            :	58316 chars
PhoenixFeb19_2020.txt         :	52640 chars
BattleCreekDec19_2019.txt     :	95760 chars
PittsburghSep22_2020.txt      :	65020 chars
TexasSep23_2019.txt           :	14610 chars
ColoradorSpringsFeb20_2020.txt:	63122 chars
LatrobeSep3_2020.txt          :	67534 chars
DallasOct17_2019.txt          :	57081 chars
DesMoinesJan30_2020.txt       :	64514 chars
MinneapolisOct10_2019.txt     :	63409 chars
YumaAug18_2020.txt            :	34470 chars
ToledoJan9_2020.txt           :	58727 chars
CharlotteMar2_2020.txt        :	37173 chars
Winston-SalemSep8_2020.txt    :	60811 chars
TulsaJun20_2020.txt           :	61155 chars
NewMexicoSep16_2019.txt       :	63291 chars
HersheyDec10_2019.txt         :	54664 chars
NewHampshireFeb10_2020.txt    :	36830 chars
LasVegasFeb21_2020.txt        :	

In [41]:
def tokenize(text):
    """
    Our simple function that takes a text and splits 
    it into token entities!
    """
    return [t.lower() for t in text.split()]

In [44]:
# What are Trump's most common words/tokens in these speeches? 
cnt = Counter()
for name, text in data.items():
    cnt += Counter(tokenize(text))

# as one would guess, these are pretty much just stop words!
cnt.most_common(20)

[('the', 14103),
 ('and', 11188),
 ('to', 9251),
 ('a', 7872),
 ('i', 7421),
 ('of', 6770),
 ('we', 6397),
 ('you', 6361),
 ('they', 5491),
 ('in', 4329),
 ('that', 4106),
 ('have', 3770),
 ('it', 3426),
 ('but', 2932),
 ('is', 2582),
 ('for', 2494),
 ('he', 2472),
 ('was', 2436),
 ("it's", 2412),
 ('going', 2260)]

In [50]:
# Are there words only mentioned in some speeches but never in any others?
word_forms = defaultdict(set)
for name, text in data.items():
    word_forms[name] = set(tokenize(text))

# A random sampling of some of the words in a speech
print(list(word_forms['FayettevilleSep9_2019.txt'])[:10])

unique_forms = defaultdict(set)
for speech_x, forms_x in word_forms.items():
    unique_forms[speech_x] = forms_x
    for speech_y, forms_y in word_forms.items():
        if speech_x == speech_y:
            continue
        
        unique_forms[speech_x] -= forms_y

['count.', 'important.', 'nothing', 'situation,', 'allies', 'said,', 'future,', '45', 'path,', 'compare']


In [52]:
# preview some of the unique terms!
for speech, forms in unique_forms.items():
    print(speech)
    print(f'Found {len(forms)} unique words.')
    print(f'Previewing upto 7 random ones: {list(forms)[:7]})')
    print('-' * 25)
    print()

FayettevilleSep9_2019.txt
Found 189 unique words.
Previewing upto 7 random ones: ['formation', '2026.', 'fixed,', 'bishop?', 'yep,', 'cutters?"', 'approval,'])
-------------------------

TupeloNov1_2019.txt
Found 251 unique words.
Previewing upto 7 random ones: ['nature.', 'museum.', 'elvis', 'misdeeds,', 'phil.', 'pardon.', 'a-flailing?'])
-------------------------

NewHampshireAug15_2019.txt
Found 216 unique words.
Previewing upto 7 random ones: ['weakened', 'time?"', 'tubes,', 'elected."', 'europe.".', 'rerun', 'infested."'])
-------------------------

HendersonSep13_2020.txt
Found 215 unique words.
Previewing upto 7 random ones: ['manifesto?', 'make…"', 'record-setting', 'pennant.', 'bars.', 'art.', 'prisons.'])
-------------------------

OhioSep21_2020.txt
Found 227 unique words.
Previewing upto 7 random ones: ['constitutionally', '4th', 'error.', 'kamilla?', '"no."', 'insurmountable', 'oakland.'])
-------------------------

PhoenixFeb19_2020.txt
Found 235 unique words.
Previewing