In [1]:
%reset -fs

import pandas as pd
import numpy as np
from sklearn import metrics

from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation

# pd.set_option('max_columns', 100)
pd.set_option("display.max_colwidth", 1)

In [2]:
df = pd.read_csv('tweets_01-08-2021.csv', encoding='utf-8')

In [3]:
len(df)

56571

In [4]:
df.head(30)

Unnamed: 0,id,text,isRetweet,isDeleted,device,favorites,retweets,date,isFlagged
0,98454970654916608,Republicans and Democrats have both created our economic problems.,f,f,TweetDeck,49,255,2011-08-02 18:07:48,f
1,1234653427789070336,"I was thrilled to be back in the Great city of Charlotte, North Carolina with thousands of hardworking American Patriots who love our Country, cherish our values, respect our laws, and always put AMERICA FIRST! Thank you for a wonderful evening!! #KAG2020 https://t.co/dNJZfRsl9y",f,f,Twitter for iPhone,73748,17404,2020-03-03 01:34:50,f
2,1218010753434820614,RT @CBS_Herridge: READ: Letter to surveillance court obtained by CBS News questions where there will be further disciplinary action and cho…,t,f,Twitter for iPhone,0,7396,2020-01-17 03:22:47,f
3,1304875170860015617,"The Unsolicited Mail In Ballot Scam is a major threat to our Democracy, &amp; the Democrats know it. Almost all recent elections using this system, even though much smaller &amp; with far fewer Ballots to count, have ended up being a disaster. Large numbers of missing Ballots &amp; Fraud!",f,f,Twitter for iPhone,80527,23502,2020-09-12 20:10:58,f
4,1218159531554897920,RT @MZHemingway: Very friendly telling of events here about Comey's apparent leaking to compliant media. If you read those articles and tho…,t,f,Twitter for iPhone,0,9081,2020-01-17 13:13:59,f
5,1217962723234983937,RT @WhiteHouse: President @realDonaldTrump announced historic steps to protect the Constitutional right to pray in public schools! https://…,t,f,Twitter for iPhone,0,25048,2020-01-17 00:11:56,f
6,1223640662689689602,Getting a little exercise this morning! https://t.co/fyAAcbhbgk,f,f,Twitter for iPhone,285863,30209,2020-02-01 16:14:02,f
7,1319501865625784320,https://t.co/4qwCKQOiOw,f,f,Twitter for iPhone,130822,19127,2020-10-23 04:52:14,f
8,1319500520126664705,https://t.co/VlEu8yyovv,f,f,Twitter for iPhone,153446,20275,2020-10-23 04:46:53,f
9,1319500501269041154,https://t.co/z5CRqHO8vg,f,f,Twitter for iPhone,102150,14815,2020-10-23 04:46:49,f


In [5]:
df.groupby('isFlagged').count()

Unnamed: 0_level_0,id,text,isRetweet,isDeleted,device,favorites,retweets,date
isFlagged,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
f,56267,56267,56267,56267,56267,56267,56267,56267
t,304,304,304,304,304,304,304,304


In [6]:
# Text preprocessing steps - remove numbers, URL, captial letters and punctuation
import re
import string

alphanumeric = lambda x: re.sub('\w*\d\w*', ' ', x)
punc_lower = lambda x: re.sub(r'[^\w\s]', ' ',str(x).lower())
no_url = lambda x: re.sub(r'http\S+', '', x)
clean = lambda x: x.replace('\n', '')
english_only = lambda string: re.sub(r'[^\x00-\x7f]', "", string)

df['text'] = df.text.map(alphanumeric).map(no_url).map(punc_lower).map(clean).map(english_only)
df.head(50)

Unnamed: 0,id,text,isRetweet,isDeleted,device,favorites,retweets,date,isFlagged
0,98454970654916608,republicans and democrats have both created our economic problems,f,f,TweetDeck,49,255,2011-08-02 18:07:48,f
1,1234653427789070336,i was thrilled to be back in the great city of charlotte north carolina with thousands of hardworking american patriots who love our country cherish our values respect our laws and always put america first thank you for a wonderful evening,f,f,Twitter for iPhone,73748,17404,2020-03-03 01:34:50,f
2,1218010753434820614,rt cbs_herridge read letter to surveillance court obtained by cbs news questions where there will be further disciplinary action and cho,t,f,Twitter for iPhone,0,7396,2020-01-17 03:22:47,f
3,1304875170860015617,the unsolicited mail in ballot scam is a major threat to our democracy amp the democrats know it almost all recent elections using this system even though much smaller amp with far fewer ballots to count have ended up being a disaster large numbers of missing ballots amp fraud,f,f,Twitter for iPhone,80527,23502,2020-09-12 20:10:58,f
4,1218159531554897920,rt mzhemingway very friendly telling of events here about comey s apparent leaking to compliant media if you read those articles and tho,t,f,Twitter for iPhone,0,9081,2020-01-17 13:13:59,f
5,1217962723234983937,rt whitehouse president realdonaldtrump announced historic steps to protect the constitutional right to pray in public schools,t,f,Twitter for iPhone,0,25048,2020-01-17 00:11:56,f
6,1223640662689689602,getting a little exercise this morning,f,f,Twitter for iPhone,285863,30209,2020-02-01 16:14:02,f
7,1319501865625784320,,f,f,Twitter for iPhone,130822,19127,2020-10-23 04:52:14,f
8,1319500520126664705,,f,f,Twitter for iPhone,153446,20275,2020-10-23 04:46:53,f
9,1319500501269041154,,f,f,Twitter for iPhone,102150,14815,2020-10-23 04:46:49,f


In [7]:
df['text'] = df['text'].apply(lambda x: x.strip()).replace('', np.nan) # fill empty cells w/ Null values

In [8]:
from nltk.stem import WordNetLemmatizer
 
lemmatizer = WordNetLemmatizer()

In [9]:
df['text'] =  df['text'].apply(lambda sentence: [lemmatizer.lemmatize for w in nltk.word_tokenize(sentence)])

In [10]:
df.head(50)

Unnamed: 0,id,text,isRetweet,isDeleted,device,favorites,retweets,date,isFlagged
0,98454970654916608,republicans and democrats have both created our economic problems,f,f,TweetDeck,49,255,2011-08-02 18:07:48,f
1,1234653427789070336,i was thrilled to be back in the great city of charlotte north carolina with thousands of hardworking american patriots who love our country cherish our values respect our laws and always put america first thank you for a wonderful evening,f,f,Twitter for iPhone,73748,17404,2020-03-03 01:34:50,f
2,1218010753434820614,rt cbs_herridge read letter to surveillance court obtained by cbs news questions where there will be further disciplinary action and cho,t,f,Twitter for iPhone,0,7396,2020-01-17 03:22:47,f
3,1304875170860015617,the unsolicited mail in ballot scam is a major threat to our democracy amp the democrats know it almost all recent elections using this system even though much smaller amp with far fewer ballots to count have ended up being a disaster large numbers of missing ballots amp fraud,f,f,Twitter for iPhone,80527,23502,2020-09-12 20:10:58,f
4,1218159531554897920,rt mzhemingway very friendly telling of events here about comey s apparent leaking to compliant media if you read those articles and tho,t,f,Twitter for iPhone,0,9081,2020-01-17 13:13:59,f
5,1217962723234983937,rt whitehouse president realdonaldtrump announced historic steps to protect the constitutional right to pray in public schools,t,f,Twitter for iPhone,0,25048,2020-01-17 00:11:56,f
6,1223640662689689602,getting a little exercise this morning,f,f,Twitter for iPhone,285863,30209,2020-02-01 16:14:02,f
7,1319501865625784320,,f,f,Twitter for iPhone,130822,19127,2020-10-23 04:52:14,f
8,1319500520126664705,,f,f,Twitter for iPhone,153446,20275,2020-10-23 04:46:53,f
9,1319500501269041154,,f,f,Twitter for iPhone,102150,14815,2020-10-23 04:46:49,f


In [11]:
df = df.dropna() #drop null rows
df.head(50)

Unnamed: 0,id,text,isRetweet,isDeleted,device,favorites,retweets,date,isFlagged
0,98454970654916608,republicans and democrats have both created our economic problems,f,f,TweetDeck,49,255,2011-08-02 18:07:48,f
1,1234653427789070336,i was thrilled to be back in the great city of charlotte north carolina with thousands of hardworking american patriots who love our country cherish our values respect our laws and always put america first thank you for a wonderful evening,f,f,Twitter for iPhone,73748,17404,2020-03-03 01:34:50,f
2,1218010753434820614,rt cbs_herridge read letter to surveillance court obtained by cbs news questions where there will be further disciplinary action and cho,t,f,Twitter for iPhone,0,7396,2020-01-17 03:22:47,f
3,1304875170860015617,the unsolicited mail in ballot scam is a major threat to our democracy amp the democrats know it almost all recent elections using this system even though much smaller amp with far fewer ballots to count have ended up being a disaster large numbers of missing ballots amp fraud,f,f,Twitter for iPhone,80527,23502,2020-09-12 20:10:58,f
4,1218159531554897920,rt mzhemingway very friendly telling of events here about comey s apparent leaking to compliant media if you read those articles and tho,t,f,Twitter for iPhone,0,9081,2020-01-17 13:13:59,f
5,1217962723234983937,rt whitehouse president realdonaldtrump announced historic steps to protect the constitutional right to pray in public schools,t,f,Twitter for iPhone,0,25048,2020-01-17 00:11:56,f
6,1223640662689689602,getting a little exercise this morning,f,f,Twitter for iPhone,285863,30209,2020-02-01 16:14:02,f
11,1319496349092511744,rt erictrump,t,f,Twitter for iPhone,0,8921,2020-10-23 04:30:19,f
12,1236502342121541632,rt gopchairwoman the economic boom continues jobs added in february jobs added since realdonaldtrump was electedunemploym,t,f,Twitter for iPhone,0,8681,2020-03-08 04:01:46,f
13,1225835449379258368,rt tomfitton vindman s behavior is a scandal he should be removed from the realdonaldtrump white house asap to protect our foreign poli,t,f,Twitter for iPhone,0,7679,2020-02-07 17:35:20,f


In [12]:
from datetime import datetime as dt

In [13]:
df["date"] = pd.to_datetime(df.date , format="%Y/%m/%d %H:%M:%S") #create date_time column

In [14]:
for tweet in df['text'].sample(10).values:
    print(tweet)

rt  senpatroberts  master sgt  david royer is nothing short of a hero after taking action into his own hands when he stopped an active shoo
i will be announcing my supreme court nominee on saturday  at the white house  exact time tba
realdonaldtrump congrats on all your success  you are a great role model      thanks
thejbish  billmaher you are right but i always believe in hitting back  and thanks
obama spied on our campaign   and got caught
the democrats are fixers  and they are working overtime to fix the impeachment  process  in order to hurt the republican party and me  nancy pelosi should instead fix her broken district and corrupt adam should clean up  amp  manage the california forests which are always burning
rt  teamtrump  white house press secretary  kayleighmcenany  president  realdonaldtrump stands against defunding our police  caving to mob
fix it  and fast  the current recipient has no chance  and won t even try  as i have often said  baltimore is last in everything  wha

In [15]:
corpus = df['text']

In [16]:
stop_words = ENGLISH_STOP_WORDS

In [159]:
tfidf = TfidfVectorizer(stop_words=stop_words,max_df=0.4,min_df=10)
tweet_word_matrix = tfidf.fit_transform(corpus)
vocab = tfidf.get_feature_names_out()

In [160]:
tweet_word_matrix.shape

(55247, 5924)

In [161]:
nmf = NMF(n_components=30)

nmf.fit(tweet_word_matrix)



NMF(n_components=30)

In [162]:
tweet_topic_matrix = nmf.transform(tweet_word_matrix)

In [163]:
tweet_topic_matrix_df = pd.DataFrame(tweet_topic_matrix).add_prefix('topic_')
tweet_topic_matrix_df['tweets'] = df['text']
tweet_topic_matrix_df.head()

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,...,topic_21,topic_22,topic_23,topic_24,topic_25,topic_26,topic_27,topic_28,topic_29,tweets
0,0.0,0.0,0.0,0.0,0.003177,0.0,0.000133,8e-05,0.0,0.0,...,0.0,0.00286,0.0,4.9e-05,0.007383,0.001913,0.002879,0.001479,0.033061,republicans and democrats have both created our economic problems
1,0.0,0.0,0.02844,0.021907,0.000493,0.0,0.0,0.00026,0.0,0.024933,...,0.047638,0.006257,0.0,0.000122,0.0,0.0,0.0,0.003054,0.0,i was thrilled to be back in the great city of charlotte north carolina with thousands of hardworking american patriots who love our country cherish our values respect our laws and always put america first thank you for a wonderful evening
2,0.0,3.3e-05,0.0,0.000137,0.030336,0.0,0.000423,0.0,0.000107,0.0,...,0.0,0.000535,0.000347,0.001441,0.0,0.0,0.0,0.000329,0.0,rt cbs_herridge read letter to surveillance court obtained by cbs news questions where there will be further disciplinary action and cho
3,0.0,0.0,0.0,0.0,0.001158,0.0,0.000232,0.0,5.4e-05,0.0,...,0.0,0.001184,3.5e-05,0.001443,0.00294,0.000427,0.0,0.0,0.020035,the unsolicited mail in ballot scam is a major threat to our democracy amp the democrats know it almost all recent elections using this system even though much smaller amp with far fewer ballots to count have ended up being a disaster large numbers of missing ballots amp fraud
4,3e-06,9.5e-05,0.0,6.1e-05,0.030486,0.000211,0.001203,0.0,0.000366,0.0,...,0.0,0.0,0.000297,0.00103,0.001519,0.000169,0.0,0.0,0.0,rt mzhemingway very friendly telling of events here about comey s apparent leaking to compliant media if you read those articles and tho


In [164]:
word_topic_matrix_df = pd.DataFrame(nmf.components_, columns=vocab).T.add_prefix('topic_')
# word_topic_matrix_df.tail(5)

In [165]:
# for tweet in tweet_topic_matrix_df.sort_values(by='topic_0', ascending=False).head(10)['tweets'].values:
#     print(tweet)
#     print()

In [166]:
word_topic_matrix_df.sort_values(by='topic_0', ascending=False).head(10)

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,...,topic_20,topic_21,topic_22,topic_23,topic_24,topic_25,topic_26,topic_27,topic_28,topic_29
realdonaldtrump,9.438829,0.0,0.0,0.0,0.018993,0.0,0.0,0.021412,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
man,0.351734,0.034222,0.009116,0.028294,0.0,0.041095,0.135874,0.0,0.042168,0.0,...,0.047501,0.06068,0.0,0.099462,0.019837,0.002264,0.0,0.001867,0.0,0.009385
best,0.290143,0.032493,0.010969,0.0,0.0,0.187867,0.379132,0.0,0.0,0.01637,...,0.105545,0.009796,0.0,0.016461,0.0,0.0,0.704675,0.014881,0.0,0.048272
wait,0.257981,0.008446,0.0,0.015384,0.0,0.016361,0.0,0.0,0.0,0.0,...,0.009496,0.041621,0.006849,0.0,0.030762,0.0,0.0,0.0,0.004925,0.000419
awesome,0.236499,0.041085,0.000351,0.006054,0.0,0.035808,0.04197,0.0,0.0,0.0,...,0.001564,0.025236,0.0,0.0,0.016937,0.0,0.010716,0.0,0.006494,0.0
yes,0.231798,0.024677,0.0,0.005797,0.0,0.000237,0.007084,0.0,0.0,0.0,...,0.0,0.0,0.015329,0.026314,0.0,0.022061,0.001337,0.0,0.0,0.0
apprenticenbc,0.209675,0.001174,0.0,0.009877,0.0,0.0,0.032618,0.0,0.005274,0.0,...,0.0128,0.208487,0.0,0.0,0.0,0.0,0.05946,0.0,0.0,0.016223
need,0.204374,0.0,0.0,0.0,0.0,0.109515,0.387165,0.0,0.051364,0.075063,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03346,0.015787
agree,0.195283,0.025678,0.003196,0.0,0.0,0.053886,0.013602,0.0,0.000891,0.0,...,0.0,0.0,0.0,0.030576,0.0,0.065932,0.001964,0.0,0.0,0.021859
apprentice,0.152598,0.0,0.0,0.025504,0.0,0.0,0.0,0.0,0.011741,0.0,...,0.01121,0.245027,0.009457,0.016869,0.000305,0.0,0.060067,0.0,0.0,0.042937


In [167]:
def top_tweets(tweet_topic_matrix_df, topic, n_tweets):
    return (tweet_topic_matrix_df
            .sort_values(by=topic, ascending=False)
            .head(n_tweets)['tweets']
            .values)

def top_words(word_topic_matrix_df, topic, n_words):
    return (word_topic_matrix_df
            .sort_values(by=topic, ascending=False)
            .head(n_words))[topic]

In [168]:
# for tweet in top_tweets(tweet_topic_matrix_df, 'topic_1', 15):
#     print(tweet)
#     print()

In [169]:
top_words(word_topic_matrix_df, 'topic_0', 10)

realdonaldtrump    9.438829
man                0.351734
best               0.290143
wait               0.257981
awesome            0.236499
yes                0.231798
apprenticenbc      0.209675
need               0.204374
agree              0.195283
apprentice         0.152598
Name: topic_0, dtype: float64

In [170]:
top_words(word_topic_matrix_df, 'topic_1', 10)

thanks       4.945975
nice         0.119879
luck         0.042967
awesome      0.041085
billmaher    0.039915
man          0.034222
best         0.032493
amazing      0.026891
agree        0.025678
yes          0.024677
Name: topic_1, dtype: float64

In [171]:
top_words(word_topic_matrix_df, 'topic_2', 10)

thank           4.802150
maga            0.253064
nice            0.161843
support         0.078770
pennsylvania    0.077294
americafirst    0.055215
carolina        0.053202
florida         0.053165
rating          0.052342
honor           0.050981
Name: topic_2, dtype: float64

In [29]:
# for tweet in top_tweets(tweet_topic_matrix_df, 'topic_2', 5):
#     print(tweet)

In [30]:
# for tweet in top_tweets(tweet_topic_matrix_df, 'topic_2', 5):
#     print(tweet)
#     print()

In [31]:
df_before = df[(df['date'] <= "2017-01-19")]
df_before

Unnamed: 0,id,text,isRetweet,isDeleted,device,favorites,retweets,date,isFlagged
0,98454970654916608,republicans and democrats have both created our economic problems,f,f,TweetDeck,49,255,2011-08-02 18:07:48,f
1601,336155055575150594,yes,f,f,Twitter for Android,0,1,2013-05-19 16:23:05,f
1615,274563166384582657,thanks you will love it,f,f,Twitter Web Client,1,4,2012-11-30 17:18:54,f
1620,274562988646727681,celebapprentice bretmichaels you won t be disappointed,f,f,Twitter Web Client,0,3,2012-11-30 17:18:12,f
1696,284759862976581632,wandasalley thanks have fun,f,f,Twitter Web Client,1,2,2012-12-28 20:36:56,f
...,...,...,...,...,...,...,...,...,...
45874,815449868739211265,rt donaldjtrumpjr happy new year everyone newyear family vacation familytime,t,f,Twitter for iPhone,0,5548,2017-01-01 06:49:33,f
45875,815433444591304704,rt erictrump was such an incredible year for our entire family my beautiful wife laraleatrump made it even better,t,f,Twitter for iPhone,0,5601,2017-01-01 05:44:17,f
45876,815433217595547648,rt reince happy new year god s blessings to you all looking forward to incredible things in realdonaldtrump will make america,t,f,Twitter for iPhone,0,5811,2017-01-01 05:43:23,f
45877,815432169464197121,rt danscavino on behalf of our next potus amp teamtrump happynewyear america t co,t,f,Twitter for iPhone,0,4562,2017-01-01 05:39:13,f


In [32]:
df_after = df[(df['date'] > "2017-01-20")]
df_after

Unnamed: 0,id,text,isRetweet,isDeleted,device,favorites,retweets,date,isFlagged
1,1234653427789070336,i was thrilled to be back in the great city of charlotte north carolina with thousands of hardworking american patriots who love our country cherish our values respect our laws and always put america first thank you for a wonderful evening,f,f,Twitter for iPhone,73748,17404,2020-03-03 01:34:50,f
2,1218010753434820614,rt cbs_herridge read letter to surveillance court obtained by cbs news questions where there will be further disciplinary action and cho,t,f,Twitter for iPhone,0,7396,2020-01-17 03:22:47,f
3,1304875170860015617,the unsolicited mail in ballot scam is a major threat to our democracy amp the democrats know it almost all recent elections using this system even though much smaller amp with far fewer ballots to count have ended up being a disaster large numbers of missing ballots amp fraud,f,f,Twitter for iPhone,80527,23502,2020-09-12 20:10:58,f
4,1218159531554897920,rt mzhemingway very friendly telling of events here about comey s apparent leaking to compliant media if you read those articles and tho,t,f,Twitter for iPhone,0,9081,2020-01-17 13:13:59,f
5,1217962723234983937,rt whitehouse president realdonaldtrump announced historic steps to protect the constitutional right to pray in public schools,t,f,Twitter for iPhone,0,25048,2020-01-17 00:11:56,f
...,...,...,...,...,...,...,...,...,...
56566,1319485303363571714,rt randpaul i don t know why joebiden thinks he can continue to lie about this he wants to ban fracking and end all fossil fuels like,t,f,Twitter for iPhone,0,20683,2020-10-23 03:46:25,f
56567,1319484210101379072,rt elisestefanik president realdonaldtrump excels at communicating directly to the american people joe biden communicates to the dc b,t,f,Twitter for iPhone,0,9869,2020-10-23 03:42:05,f
56568,1319444420861829121,rt teamtrump live presidential debate text vote to,t,f,Twitter for iPhone,0,8197,2020-10-23 01:03:58,f
56569,1319384118849949702,just signed an order to support the workers of delphi corporation and make sure that we protect the pensions of all american workers obama biden failed american workers and failed the workers of delphi i always put american workers first,f,f,Twitter for iPhone,176289,36001,2020-10-22 21:04:21,f


In [33]:
df_before2 = df_before['text']
df_before2.iloc[0]

'republicans and democrats have both created our economic problems'

In [34]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [35]:
analyzer = SentimentIntensityAnalyzer()


In [36]:
for i in 

SyntaxError: invalid syntax (3996493703.py, line 1)

In [None]:
score = analyzer.polarity_scores(df_before2.iloc[0])

In [None]:
print(score)