In [1]:
# Let's read in our document-term matrix
import pandas as pd
import pickle

data = pd.read_pickle('dtm_stop.pkl')
data

Unnamed: 0,ab,abbott,abc,abdelaziz,abducted,abe,abiding,ability,able,abolish,...,zealots,zeldin,zero,zeros,zone,zones,zoning,zucker,zuckerberg,zupancich
BattleCreekDec19_2019.txt,0,0,0,0,0,0,0,0,4,0,...,0,0,1,1,1,0,0,0,0,0
BemidjiSep18_2020.txt,0,1,2,0,0,0,0,0,3,0,...,0,0,0,0,1,0,0,0,0,1
CharlestonFeb28_2020.txt,0,0,0,0,0,0,1,0,1,0,...,0,0,1,0,1,1,0,0,0,0
CharlotteMar2_2020.txt,0,0,0,0,0,0,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0
CincinnatiAug1_2019.txt,0,0,0,0,0,0,1,0,3,0,...,0,0,0,0,0,3,0,0,0,0
ColoradorSpringsFeb20_2020.txt,0,0,0,0,0,2,1,0,3,0,...,0,0,1,0,2,3,0,0,1,0
DallasOct17_2019.txt,0,0,0,0,0,6,1,0,4,1,...,0,0,1,0,0,0,0,0,0,0
DesMoinesJan30_2020.txt,0,0,0,0,0,2,0,2,7,1,...,0,0,0,0,0,4,0,0,0,0
FayettevilleSep19_2020.txt,0,0,0,0,0,1,0,0,3,0,...,0,0,1,0,0,0,0,0,0,0
FayettevilleSep9_2019.txt,0,0,0,0,0,0,1,0,4,0,...,0,0,0,0,0,0,0,0,0,0


In [2]:
# Import the necessary modules for LDA with gensim
# Terminal / Anaconda Navigator: conda install -c conda-forge gensim
from gensim import matutils, models
import scipy.sparse

# import logging
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)



In [3]:
# One of the required inputs is a term-document matrix
tdm = data.transpose()
tdm.head()

Unnamed: 0,BattleCreekDec19_2019.txt,BemidjiSep18_2020.txt,CharlestonFeb28_2020.txt,CharlotteMar2_2020.txt,CincinnatiAug1_2019.txt,ColoradorSpringsFeb20_2020.txt,DallasOct17_2019.txt,DesMoinesJan30_2020.txt,FayettevilleSep19_2020.txt,FayettevilleSep9_2019.txt,...,OhioSep21_2020.txt,PhoenixFeb19_2020.txt,PittsburghSep22_2020.txt,TexasSep23_2019.txt,ToledoJan9_2020.txt,TulsaJun20_2020.txt,TupeloNov1_2019.txt,WildwoodJan28_2020.txt,Winston-SalemSep8_2020.txt,YumaAug18_2020.txt
ab,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abbott,0,1,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
abc,0,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abdelaziz,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abducted,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# We're going to put the term-document matrix into a new gensim format, from df --> sparse matrix --> gensim corpus
sparse_counts = scipy.sparse.csr_matrix(tdm)
corpus = matutils.Sparse2Corpus(sparse_counts)

In [5]:
# Gensim also requires dictionary of the all terms and their respective location in the term-document matrix
cv = pickle.load(open("cv_stop.pkl", "rb"))
id2word = dict((v, k) for k, v in cv.vocabulary_.items())

In [6]:
# Now that we have the corpus (term-document matrix) and id2word (dictionary of location: term),
# we need to specify two other parameters as well - the number of topics and the number of passes
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=2, passes=10)
lda.print_topics()

[(0,
  '0.005*"thing" + 0.004*"remember" + 0.004*"joe" + 0.004*"china" + 0.004*"guy" + 0.004*"didnt" + 0.004*"big" + 0.004*"job" + 0.004*"really" + 0.004*"tell"'),
 (1,
  '0.004*"democrats" + 0.004*"didnt" + 0.004*"really" + 0.004*"big" + 0.004*"state" + 0.004*"tell" + 0.004*"thing" + 0.004*"job" + 0.004*"trump" + 0.004*"believe"')]

In [7]:
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=3, passes=10)
lda.print_topics()

[(0,
  '0.005*"thing" + 0.005*"didnt" + 0.004*"really" + 0.004*"tell" + 0.004*"big" + 0.004*"remember" + 0.004*"job" + 0.004*"china" + 0.004*"guy" + 0.004*"okay"'),
 (1,
  '0.005*"job" + 0.004*"state" + 0.004*"democrats" + 0.004*"history" + 0.004*"incredible" + 0.004*"really" + 0.004*"believe" + 0.004*"carolina" + 0.004*"world" + 0.004*"states"'),
 (2,
  '0.005*"big" + 0.005*"didnt" + 0.005*"trump" + 0.004*"guy" + 0.004*"democrats" + 0.004*"remember" + 0.004*"thing" + 0.003*"tell" + 0.003*"beautiful" + 0.003*"won"')]

In [8]:
# LDA for num_topics = 4
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=4, passes=10)
lda.print_topics()

[(0,
  '0.005*"big" + 0.005*"didnt" + 0.004*"tell" + 0.004*"really" + 0.004*"democrats" + 0.004*"trump" + 0.004*"state" + 0.004*"billion" + 0.003*"year" + 0.003*"believe"'),
 (1,
  '0.005*"thing" + 0.005*"remember" + 0.004*"really" + 0.004*"joe" + 0.004*"guy" + 0.004*"win" + 0.004*"job" + 0.004*"big" + 0.004*"didnt" + 0.004*"tell"'),
 (2,
  '0.005*"thing" + 0.005*"didnt" + 0.004*"okay" + 0.004*"remember" + 0.004*"guy" + 0.004*"michigan" + 0.004*"tell" + 0.004*"job" + 0.004*"deal" + 0.004*"big"'),
 (3,
  '0.005*"state" + 0.005*"democrats" + 0.004*"job" + 0.004*"history" + 0.004*"believe" + 0.004*"states" + 0.004*"incredible" + 0.004*"really" + 0.004*"party" + 0.004*"world"')]

In [9]:
# Let's create a function to pull out nouns from a string of text
from nltk import word_tokenize, pos_tag

def nouns(text):
    '''Given a string of text, tokenize the text and pull out only the nouns.'''
    is_noun = lambda pos: pos[:2] == 'NN'
    tokenized = word_tokenize(text)
    all_nouns = [word for (word, pos) in pos_tag(tokenized) if is_noun(pos)] 
    return ' '.join(all_nouns)

In [10]:
# Read in the cleaned data, before the CountVectorizer step
data_clean = pd.read_pickle('data_clean.pkl')
data_clean

Unnamed: 0,transcript
BattleCreekDec19_2019.txt,thank you thank you thank you to vice presiden...
BemidjiSep18_2020.txt,theres a lot of people thats great thank you v...
CharlestonFeb28_2020.txt,thank you thank you thank you all i can say is...
CharlotteMar2_2020.txt,i want to thank you very much north carolina t...
CincinnatiAug1_2019.txt,thank you all thank you very much thank you to...
ColoradorSpringsFeb20_2020.txt,hello colorado we love colorado most beautiful...
DallasOct17_2019.txt,thank you thank you very much hello dallas its...
DesMoinesJan30_2020.txt,i worked so hard for this state i worked so ha...
FayettevilleSep19_2020.txt,what a crowd what a crowd get those people ove...
FayettevilleSep9_2019.txt,thank you everybody thank you and vice presid...


In [11]:
# Apply the nouns function to the transcripts to filter only on nouns
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
data_nouns = pd.DataFrame(data_clean.transcript.apply(nouns))
data_nouns

[nltk_data] Downloading package punkt to /Users/kritink/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/kritink/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Unnamed: 0,transcript
BattleCreekDec19_2019.txt,thank vice president pence guy job christmas m...
BemidjiSep18_2020.txt,lot people thats thank thats group people noti...
CharlestonFeb28_2020.txt,thank i news charleston im state carolina thou...
CharlotteMar2_2020.txt,i thank im city charlotte way convention i gue...
CincinnatiAug1_2019.txt,thank vice president pence cincinnati cincinna...
ColoradorSpringsFeb20_2020.txt,colorado place im colorado springs lot time ye...
DallasOct17_2019.txt,thank hello heart texas louie vuitton plant im...
DesMoinesJan30_2020.txt,i state i trade deals japan way lets japan chi...
FayettevilleSep19_2020.txt,crowd get people crowd country left crazies ca...
FayettevilleSep9_2019.txt,thank vice president pence thank hello fayette...


In [12]:
# Create a new document-term matrix using only nouns
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer

# Re-add the additional stop words since we are recreating the document-term matrix
add_stop_words = ['like', 'im', 'know', 'just', 'dont', 'thats', 'right', 'people',
                  'youre', 'got', 'gonna', 'time', 'think', 'yeah', 'said']
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

# Recreate a document-term matrix with only nouns
cvn = CountVectorizer(stop_words=stop_words)
data_cvn = cvn.fit_transform(data_nouns.transcript)
data_dtmn = pd.DataFrame(data_cvn.toarray(), columns=cvn.get_feature_names())
data_dtmn.index = data_nouns.index
data_dtmn

Unnamed: 0,abbott,abc,abdelaziz,abe,ability,abolish,abortion,abraham,abrams,absentee,...,yuma,zealots,zeldin,zero,zeros,zone,zones,zoning,zucker,zupancich
BattleCreekDec19_2019.txt,0,0,0,0,0,0,3,1,0,0,...,0,0,0,1,1,0,0,0,0,0
BemidjiSep18_2020.txt,1,2,0,0,0,0,0,4,0,0,...,0,0,0,0,0,1,0,0,0,1
CharlestonFeb28_2020.txt,0,0,0,0,0,0,3,1,0,0,...,0,0,0,1,0,1,1,0,0,0
CharlotteMar2_2020.txt,0,0,0,0,0,0,3,1,0,0,...,0,0,0,0,0,0,0,0,0,0
CincinnatiAug1_2019.txt,0,0,0,0,0,0,4,0,0,0,...,0,0,0,0,0,0,3,0,0,0
ColoradorSpringsFeb20_2020.txt,0,0,0,2,0,0,3,0,0,0,...,0,0,0,0,0,1,3,0,0,0
DallasOct17_2019.txt,0,0,0,5,0,0,3,1,0,0,...,0,0,0,0,0,0,0,0,0,0
DesMoinesJan30_2020.txt,0,0,0,1,2,0,3,2,0,0,...,0,0,0,0,0,0,4,0,0,0
FayettevilleSep19_2020.txt,0,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
FayettevilleSep9_2019.txt,0,0,0,0,0,0,3,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Create the gensim corpus
corpusn = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmn.transpose()))

# Create the vocabulary dictionary
id2wordn = dict((v, k) for k, v in cvn.vocabulary_.items())

In [14]:
# Let's start with 2 topics
ldan = models.LdaModel(corpus=corpusn, num_topics=2, id2word=id2wordn, passes=10)
ldan.print_topics()

[(0,
  '0.017*"years" + 0.014*"country" + 0.011*"lot" + 0.011*"way" + 0.010*"hes" + 0.010*"president" + 0.008*"theyre" + 0.008*"thing" + 0.008*"job" + 0.007*"world"'),
 (1,
  '0.012*"years" + 0.011*"president" + 0.011*"lot" + 0.010*"country" + 0.010*"hes" + 0.008*"guy" + 0.008*"way" + 0.007*"state" + 0.007*"thing" + 0.007*"theyre"')]

In [15]:
# Let's try topics = 3
ldan = models.LdaModel(corpus=corpusn, num_topics=3, id2word=id2wordn, passes=10)
ldan.print_topics()

[(0,
  '0.017*"years" + 0.013*"country" + 0.012*"lot" + 0.011*"way" + 0.010*"president" + 0.009*"theyre" + 0.008*"thing" + 0.008*"hes" + 0.007*"democrats" + 0.007*"guy"'),
 (1,
  '0.009*"country" + 0.006*"border" + 0.006*"states" + 0.006*"years" + 0.005*"minister" + 0.005*"thank" + 0.005*"india" + 0.005*"michigan" + 0.004*"lot" + 0.004*"world"'),
 (2,
  '0.015*"years" + 0.014*"country" + 0.012*"hes" + 0.011*"president" + 0.011*"lot" + 0.010*"way" + 0.008*"job" + 0.008*"theyre" + 0.008*"thing" + 0.007*"state"')]

In [16]:
# Let's try 4 topics
ldan = models.LdaModel(corpus=corpusn, num_topics=4, id2word=id2wordn, passes=10)
ldan.print_topics()

[(0,
  '0.016*"years" + 0.012*"country" + 0.012*"hes" + 0.012*"lot" + 0.011*"way" + 0.009*"president" + 0.008*"thing" + 0.008*"theyre" + 0.008*"job" + 0.007*"guy"'),
 (1,
  '0.012*"years" + 0.012*"hes" + 0.011*"lot" + 0.011*"guy" + 0.009*"thing" + 0.009*"way" + 0.008*"theyre" + 0.007*"joe" + 0.007*"minnesota" + 0.006*"state"'),
 (2,
  '0.016*"years" + 0.016*"country" + 0.013*"president" + 0.010*"way" + 0.010*"lot" + 0.009*"democrats" + 0.008*"theyre" + 0.008*"state" + 0.008*"hes" + 0.007*"america"'),
 (3,
  '0.013*"years" + 0.011*"country" + 0.010*"party" + 0.009*"theyre" + 0.007*"wall" + 0.007*"democrats" + 0.006*"lot" + 0.006*"jersey" + 0.005*"way" + 0.005*"farmers"')]

In [17]:
# Let's create a function to pull out nouns from a string of text
def nouns_adj(text):
    '''Given a string of text, tokenize the text and pull out only the nouns and adjectives.'''
    is_noun_adj = lambda pos: pos[:2] == 'NN' or pos[:2] == 'JJ'
    tokenized = word_tokenize(text)
    nouns_adj = [word for (word, pos) in pos_tag(tokenized) if is_noun_adj(pos)] 
    return ' '.join(nouns_adj)

In [18]:
# Apply the nouns function to the transcripts to filter only on nouns
data_nouns_adj = pd.DataFrame(data_clean.transcript.apply(nouns_adj))
data_nouns_adj

Unnamed: 0,transcript
BattleCreekDec19_2019.txt,thank vice president pence good guy great job ...
BemidjiSep18_2020.txt,lot people thats great thank much thats big gr...
CharlestonFeb28_2020.txt,thank i fake news hello charleston im great st...
CharlotteMar2_2020.txt,i much carolina thank much im great city charl...
CincinnatiAug1_2019.txt,thank vice president pence cincinnati cincinna...
ColoradorSpringsFeb20_2020.txt,hello colorado beautiful place im colorado spr...
DallasOct17_2019.txt,thank much hello great deep heart texas beauti...
DesMoinesJan30_2020.txt,i hard state i hard greatest trade deals japan...
FayettevilleSep19_2020.txt,crowd crowd get people big crowd big country r...
FayettevilleSep9_2019.txt,thank vice president pence thank incredible he...


In [19]:
# Create a new document-term matrix using only nouns and adjectives, also remove common words with max_df
cvna = CountVectorizer(stop_words=stop_words, max_df=.8)
data_cvna = cvna.fit_transform(data_nouns_adj.transcript)
data_dtmna = pd.DataFrame(data_cvna.toarray(), columns=cvna.get_feature_names())
data_dtmna.index = data_nouns_adj.index
data_dtmna

Unnamed: 0,abbott,abc,abdelaziz,abe,ability,abolish,abortion,abraham,abrams,absentee,...,zealots,zeldin,zero,zeros,zone,zones,zoning,zucker,zuckerberg,zupancich
BattleCreekDec19_2019.txt,0,0,0,0,0,0,3,2,0,0,...,0,0,1,1,0,0,0,0,0,0
BemidjiSep18_2020.txt,1,2,0,0,0,0,0,4,0,0,...,0,0,0,0,1,0,0,0,0,1
CharlestonFeb28_2020.txt,0,0,0,0,0,0,3,1,0,0,...,0,0,1,0,1,1,0,0,0,0
CharlotteMar2_2020.txt,0,0,0,0,0,0,3,1,0,0,...,0,0,0,0,0,0,0,0,0,0
CincinnatiAug1_2019.txt,0,0,0,0,0,0,4,0,0,0,...,0,0,0,0,0,3,0,0,0,0
ColoradorSpringsFeb20_2020.txt,0,0,0,2,0,0,3,1,0,0,...,0,0,0,0,1,3,0,0,1,0
DallasOct17_2019.txt,0,0,0,6,0,0,3,2,0,0,...,0,0,0,0,0,0,0,0,0,0
DesMoinesJan30_2020.txt,0,0,0,1,2,0,3,2,0,0,...,0,0,0,0,0,4,0,0,0,0
FayettevilleSep19_2020.txt,0,0,0,1,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
FayettevilleSep9_2019.txt,0,0,0,0,0,0,3,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# Create the gensim corpus
corpusna = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmna.transpose()))

# Create the vocabulary dictionary
id2wordna = dict((v, k) for k, v in cvna.vocabulary_.items())

In [21]:
# Let's start with 2 topics
ldana = models.LdaModel(corpus=corpusna, num_topics=2, id2word=id2wordna, passes=10)
ldana.print_topics()

[(0,
  '0.005*"carolina" + 0.004*"farmers" + 0.004*"michigan" + 0.004*"north" + 0.003*"billions" + 0.003*"texas" + 0.003*"ohio" + 0.003*"bernie" + 0.003*"pennsylvania" + 0.003*"mexico"'),
 (1,
  '0.004*"general" + 0.003*"minnesota" + 0.003*"russia" + 0.003*"police" + 0.002*"stuff" + 0.002*"air" + 0.002*"business" + 0.002*"texas" + 0.002*"plane" + 0.002*"mike"')]

In [22]:
# Let's try 3 topics
ldana = models.LdaModel(corpus=corpusna, num_topics=3, id2word=id2wordna, passes=10)
ldana.print_topics()

[(0,
  '0.005*"north" + 0.004*"carolina" + 0.004*"police" + 0.003*"bernie" + 0.003*"ohio" + 0.003*"texas" + 0.003*"york" + 0.003*"order" + 0.003*"general" + 0.003*"russia"'),
 (1,
  '0.005*"unemployment" + 0.004*"farmers" + 0.004*"carolina" + 0.004*"doctor" + 0.003*"iowa" + 0.003*"bernie" + 0.003*"hampshire" + 0.003*"south" + 0.003*"swamp" + 0.002*"child"'),
 (2,
  '0.009*"michigan" + 0.005*"texas" + 0.005*"mexico" + 0.004*"billions" + 0.004*"farmers" + 0.003*"weeks" + 0.002*"media" + 0.002*"water" + 0.002*"john" + 0.002*"hillary"')]

In [23]:
# Let's try 4 topics
ldana = models.LdaModel(corpus=corpusna, num_topics=4, id2word=id2wordna, passes=10)
ldana.print_topics()

[(0,
  '0.005*"michigan" + 0.004*"carolina" + 0.004*"farmers" + 0.003*"south" + 0.003*"iowa" + 0.003*"unemployment" + 0.003*"hampshire" + 0.003*"billions" + 0.002*"doctor" + 0.002*"nations"'),
 (1,
  '0.004*"police" + 0.003*"north" + 0.003*"texas" + 0.003*"bernie" + 0.003*"york" + 0.003*"order" + 0.003*"carolina" + 0.003*"russia" + 0.003*"business" + 0.003*"amendment"'),
 (2,
  '0.006*"farmers" + 0.005*"ohio" + 0.005*"billions" + 0.005*"texas" + 0.004*"kentucky" + 0.004*"tomorrow" + 0.004*"north" + 0.004*"doctor" + 0.004*"carolina" + 0.003*"dan"'),
 (3,
  '0.006*"mexico" + 0.006*"michigan" + 0.004*"unemployment" + 0.004*"jersey" + 0.004*"north" + 0.004*"ice" + 0.004*"carolina" + 0.004*"wisconsin" + 0.003*"pennsylvania" + 0.003*"impeachment"')]

In [24]:
# Our final LDA model (for now)
ldana = models.LdaModel(corpus=corpusna, num_topics=4, id2word=id2wordna, passes=80)
ldana.print_topics()

[(0,
  '0.006*"carolina" + 0.005*"unemployment" + 0.004*"north" + 0.004*"farmers" + 0.004*"doctor" + 0.003*"bernie" + 0.003*"iowa" + 0.003*"hampshire" + 0.003*"south" + 0.003*"immigration"'),
 (1,
  '0.005*"police" + 0.005*"texas" + 0.004*"north" + 0.003*"carolina" + 0.003*"york" + 0.003*"russia" + 0.003*"order" + 0.003*"pennsylvania" + 0.003*"general" + 0.003*"bernie"'),
 (2,
  '0.005*"michigan" + 0.005*"ohio" + 0.004*"billions" + 0.004*"farmers" + 0.003*"hunter" + 0.003*"steel" + 0.003*"theyd" + 0.003*"weeks" + 0.003*"ones" + 0.003*"nancy"'),
 (3,
  '0.009*"mexico" + 0.007*"michigan" + 0.006*"jersey" + 0.005*"pennsylvania" + 0.005*"impeachment" + 0.005*"elite" + 0.004*"ice" + 0.003*"billions" + 0.003*"unemployment" + 0.003*"russia"')]

In [81]:
# Let's take a look at which topics each transcript contains
corpus_transformed = ldana[corpusna]
list([a for [(a,b)] in corpus_transformed], data_dtmna.index)

ValueError: too many values to unpack (expected 1)

In [88]:
for (a,b) in zip(corpus_transformed, data_dtmna.index):
    print(a,b)
    

[(2, 0.99960667)] BattleCreekDec19_2019.txt
[(1, 0.9996191)] BemidjiSep18_2020.txt
[(0, 0.9993402)] CharlestonFeb28_2020.txt
[(0, 0.9991399)] CharlotteMar2_2020.txt
[(0, 0.26694724), (2, 0.73253316)] CincinnatiAug1_2019.txt
[(0, 0.99948585)] ColoradorSpringsFeb20_2020.txt
[(0, 0.26673418), (1, 0.43329626), (2, 0.29976028)] DallasOct17_2019.txt
[(0, 0.99945295)] DesMoinesJan30_2020.txt
[(1, 0.9995903)] FayettevilleSep19_2020.txt
[(0, 0.99927616)] FayettevilleSep9_2019.txt
[(1, 0.7307131), (3, 0.26887518)] FreelandSep10_2020.txt
[(0, 0.9993692)] GreenvilleJul17_2019.txt
[(1, 0.7245093), (3, 0.27503926)] HendersonSep13_2020.txt
[(3, 0.9993856)] HersheyDec10_2019.txt
[(0, 0.9995606)] LasVegasFeb21_2020.txt
[(1, 0.9995135)] LatrobeSep3_2020.txt
[(0, 0.7720773), (2, 0.22745883)] LexingtonNov4_2019.txt
[(0, 0.9993602)] MilwaukeeJan14_2020.txt
[(1, 0.9995331)] MindenSep12_2020.txt
[(2, 0.9994647)] MinneapolisOct10_2019.txt
[(1, 0.9995463)] MosineeSep17_2020.txt
[(0, 0.96744514), (1, 0.03210463