In [9]:
import pandas as pd
pd.set_option('display.max_rows', 500)
import numpy as np
import datetime
import re
import string
import nltk
import sklearn
from nltk.tokenize import MWETokenizer 
from nltk import word_tokenize,sent_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

import warnings
warnings.filterwarnings('ignore')

In [2]:
#############################################################
#Data graciously taken from https://www.thetrumparchive.com/#
#############################################################
df_tweets = pd.read_csv('trump_tweets_raw.csv')
pd.to_datetime(df_tweets['date'])
df_tweets.sort_values('date',ascending = False, inplace = True,ignore_index = True)
df_tweets=df_tweets.rename(columns = {'date':'DateTime'})
df_tweets['DateTime'] = [datetime.datetime.strptime(d, "%Y-%m-%d %H:%M:%S") for d in df_tweets["DateTime"]]
df_tweets['Date'] = [datetime.datetime.date(d) for d in df_tweets['DateTime']] 
df_tweets['Time'] = [datetime.datetime.time(d) for d in df_tweets['DateTime']] 
#The Day trump became president
mask = (df_tweets['Date'] >= datetime.date(2017, 1, 20))
df_trump = df_tweets[mask]
len(df_trump)

24110

In [37]:
def clean_word_tokenize(text):
    '''In order: remove punctuation, make lowercase, remove numbers,
    tokenize by words and multi words, remove stop-words'''
    
        #Set your own stopwords here:
    stop_words = stopwords.words('english')
    additional_stop_words = ['rt']
    stop_words.extend(additional_stop_words)
    
        #Put your own group of words in the MWETokenizer
    mwe_tokenizer = MWETokenizer([('make','america','great','again'),('america','first')])
    
        #Begin Cleaning
    text_no_punctuation = re.sub('[%s]' % re.escape(string.punctuation),'',text)
    text_lower = text_no_punctuation.lower()
    text_no_num = re.sub('\w*\d\w*','',text_lower)
    tokenize_text = mwe_tokenizer.tokenize(word_tokenize(text_no_num))
    no_stop_text = [word for word in tokenize_text if not word in stop_words]
    return no_stop_text

In [32]:
df_trump['tokenized_text'] = df_trump['text'].apply(clean_word_tokenize)
df_trump.head()

Unnamed: 0,id,text,isRetweet,isDeleted,device,favorites,retweets,DateTime,Date,Time,tokenized_text
0,1320096006038380544,Just landed in Ohio. See you in a little while!,f,f,Twitter for iPhone,37662,4948,2020-10-24 20:13:08,2020-10-24,20:13:08,"[landed, ohio, see, little]"
1,1320095628546899968,Nobody is showing up for Obama’s hate laced sp...,f,f,Twitter for iPhone,57061,10961,2020-10-24 20:11:38,2020-10-24,20:11:38,"[nobody, showing, obama, ’, hate, laced, speec..."
2,1320076630065373184,AMERICA FIRST!,f,f,Twitter for iPhone,89661,14937,2020-10-24 18:56:09,2020-10-24,18:56:09,[america_first]
3,1320076502839459842,MAKE AMERICA GREAT AGAIN!,f,f,Twitter for iPhone,152417,24873,2020-10-24 18:55:38,2020-10-24,18:55:38,[make_america_great_again]
4,1320076289034850306,Joe Biden = Biggest Tax Increase In History an...,f,f,Twitter for iPhone,57012,11744,2020-10-24 18:54:47,2020-10-24,18:54:47,"[joe, biden, biggest, tax, increase, history, ..."


In [33]:
#Make a CountVectorizer DataFrame
cv = CountVectorizer(analyzer=lambda x:x)
vectorized_words = cv.fit_transform(df_trump['tokenized_text']).toarray()
col_names = cv.get_feature_names()
df_vectorized = pd.DataFrame(vectorized_words, columns = col_names)

In [34]:
df_vectorized.head()

Unnamed: 0,aaa,aap,aapsonline,aaron,ab,abaco,abandon,abandoned,abaondon…,abbas,...,🤦🏼‍♂️🤦🏼‍♂️🤦🏼‍♂️🤦🏼‍♂️🤦🏼‍♂️🤦🏼‍♂️👇🏻👇🏻,🤨,🤳,🥳,🥳🎂,🥳🙏🙌,🦃,🦅✈️🇺🇸,🧐🧐🧐🧐,🧵thread🧵
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
col_sums = df_vectorized.sum(axis = 0)
col_sums.head()

aaa           1
aap           9
aapsonline    1
aaron         1
ab            2
dtype: int64

In [36]:
col_sums.sort_values(ascending = False).head(50)

’                  5911
great              3447
amp                3095
“                  3095
”                  2811
president          2738
realdonaldtrump    2358
people             2003
trump              1996
democrats          1665
thank              1530
us                 1491
news               1459
country            1284
new                1265
big                1200
fake               1182
today              1072
get                1071
would              1048
many               1029
never              1016
american           1013
time                911
one                 873
want                870
years               866
media               859
biden               830
america             828
house               825
border              820
whitehouse          818
good                794
like                792
vote                780
much                778
even                777
states              769
back                749
going               741
china           

['lews', 'party', 'awesome']