In [25]:
from collections import Counter
import pickle
from string import punctuation as punct

import pandas as pd
import spacy
import numpy as np

In [2]:
nlp = spacy.load('en_core_web_sm')
stopwords = spacy.lang.en.stop_words.STOP_WORDS

In [3]:
#create one big Doc object, pickle it for later use
tweets = open('../data/tweets/tweet_text.txt', 'r').read()
nlp.max_length = 2000000
doc = nlp(tweets)

In [4]:
# can comment out with statement after the pkl is written
# with open('../data/tweets/tweets_nlp_doc.pkl', 'wb') as file:
#     pickle.dump(doc, file)

doc = pickle.load(open('../data/tweets/tweets_nlp_doc.pkl', 'rb'))

In [5]:
# number of tokens in the entire tweet corpus
len(doc)

311160

In [81]:
# all tokens that arent stop words or punctuations
misc = ['\n', '\n  ', 'amp', ' ', '&amp;']

words = [token.text.lower().strip().strip(punct) for token in doc if token.is_stop != True 
         and token.is_punct != True 
         and token.text not in misc]

# noun tokens that arent stop words or punctuations
nouns = [token.text.lower().strip().strip(punct) for token in doc if token.is_stop != True 
         and token.is_punct != True 
         and token.pos_ == "NOUN"]

# five most common tokens
word_freq = Counter(words)
common_words = word_freq.most_common(5)
print(common_words)
print('\n')

# five most common noun tokens
noun_freq = Counter(nouns)
common_nouns = noun_freq.most_common(5)
print(common_nouns)

[('great', 2142), ('people', 1085), ('president', 874), ('country', 801), ('news', 758)]


[('people', 1045), ('country', 528), ('time', 507), ('today', 494), ('years', 462)]


In [79]:
# let's see how many words we need to build 50% of the tweet corpus
total = 0
words_50 = []
for word, count in word_freq.most_common():
    total += count
    words_50.append(word)
    if total/sum(word_freq.values()) > 0.5:
        break
words_50

['great',
 'people',
 'president',
 'country',
 'news',
 'democrats',
 'thank',
 'trump',
 'big',
 'fake',
 'border',
 'u.s',
 'new',
 'america',
 'time',
 'today',
 'good',
 'years',
 'want',
 'media',
 'united',
 'states',
 'american',
 'china',
 'job',
 'going',
 'like',
 'bad',
 'house',
 'vote',
 'military',
 'jobs',
 'election',
 'wall',
 'crime',
 'trade',
 'state',
 'dems',
 '',
 'deal',
 'way',
 'security',
 'russia',
 'win',
 'world',
 'hunt',
 'witch',
 'collusion',
 'said',
 'day',
 'working',
 'republican',
 'hard',
 'republicans',
 'north',
 'history',
 'year',
 'strong',
 'tax',
 'work',
 'look',
 'fbi',
 'long',
 'total',
 'obama',
 'know',
 'honor',
 'congress',
 'better',
 'economy',
 'korea',
 'far',
 'hillary',
 'mueller',
 'democrat',
 'congratulations',
 'campaign',
 'need',
 'mexico',
 'foxnews',
 'dollars',
 'looking',
 'national',
 'things',
 'getting',
 'administration',
 'help',
 'report',
 'got',
 'best',
 'coming',
 'illegal',
 'meeting',
 'come',
 'forward

In [80]:
# 401 words covers 50% of the entire corpus
len(words_50)

401

In [86]:
# let's look at the least frequent words
word_freq.most_common()[-5:]

[('esposito', 1),
 ('consulting', 1),
 ('donaldjtrumpjr', 1),
 ('triggered', 1),
 ('thrives', 1)]

In [84]:
# most frequent noun chunks 
noun_chunks = [chunk.text.lower() for chunk in doc.noun_chunks
              if chunk.text.lower() not in stopwords
              and chunk.text not in misc]
noun_chunk_freq = Counter(noun_chunks)
common_noun_chunks = noun_chunk_freq.most_common(5)
print(common_noun_chunks)

[('our country', 540), ('the democrats', 360), ('china', 358), ('people', 308), ('the united states', 285)]


In [54]:
proper_nouns = [word.text.lower() for word in doc if word.pos_ == 'PROPN'
               and word.text not in misc]
proper_noun_freq = Counter(proper_nouns)
common_proper_nouns = proper_noun_freq.most_common(5)
print(common_proper_nouns)

[('president', 835), ('democrats', 739), ('trump', 645), ('fake', 588), ('news', 579)]


In [55]:
verbs = [word.lemma_ for word in doc if word.pos_=='VERB'
       and word.text not in misc]
verb_freq = Counter(verbs)
common_verbs = verb_freq.most_common(5)
print(common_verbs)

[('make', 881), ('do', 789), ('go', 764), ('thank', 704), ('want', 633)]


In [85]:
# these values seem low... Entity tagging may not be working properly
ents = [ent.text.lower().strip().strip(punct) for ent in doc.ents if ent.label_=='PERSON']
ent_freq = Counter(ents)
common_ents = ent_freq.most_common(5)
print(common_ents)

[('trump', 210), ('witch hunt', 127), ('hillary clinton', 91), ('hillary', 71), ('obama', 66)]


-------------------

In [15]:
# let's just see how this stacks up to the whole document without spacy
text = open('../data/tweets/tweet_text.txt').read()

In [41]:
# considerably fewere words due to no tokenization here
all_words = [word.lower().strip().strip(punct) for word in text.split() 
             if word.lower() not in stopwords 
             and word.lower() not in punct
            and word.lower() not in misc]
len(all_words)

136414

In [42]:
# good to see that we get the same results here as above (see word_freq)
freq = Counter(all_words)
freq.most_common(5)

[('great', 2133),
 ('people', 1065),
 ('president', 802),
 ('country', 763),
 ('news', 753)]