In [1]:
%reset -fs

import pandas as pd
import numpy as np
from sklearn import metrics

from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation

# pd.set_option('max_columns', 100)
pd.set_option("display.max_colwidth", 1)

In [2]:
df = pd.read_csv('tweets_01-08-2021.csv', encoding='utf-8')

In [3]:
len(df)

56571

In [4]:
df.head(30)

Unnamed: 0,id,text,isRetweet,isDeleted,device,favorites,retweets,date,isFlagged
0,98454970654916608,Republicans and Democrats have both created our economic problems.,f,f,TweetDeck,49,255,2011-08-02 18:07:48,f
1,1234653427789070336,"I was thrilled to be back in the Great city of Charlotte, North Carolina with thousands of hardworking American Patriots who love our Country, cherish our values, respect our laws, and always put AMERICA FIRST! Thank you for a wonderful evening!! #KAG2020 https://t.co/dNJZfRsl9y",f,f,Twitter for iPhone,73748,17404,2020-03-03 01:34:50,f
2,1218010753434820614,RT @CBS_Herridge: READ: Letter to surveillance court obtained by CBS News questions where there will be further disciplinary action and cho…,t,f,Twitter for iPhone,0,7396,2020-01-17 03:22:47,f
3,1304875170860015617,"The Unsolicited Mail In Ballot Scam is a major threat to our Democracy, &amp; the Democrats know it. Almost all recent elections using this system, even though much smaller &amp; with far fewer Ballots to count, have ended up being a disaster. Large numbers of missing Ballots &amp; Fraud!",f,f,Twitter for iPhone,80527,23502,2020-09-12 20:10:58,f
4,1218159531554897920,RT @MZHemingway: Very friendly telling of events here about Comey's apparent leaking to compliant media. If you read those articles and tho…,t,f,Twitter for iPhone,0,9081,2020-01-17 13:13:59,f
5,1217962723234983937,RT @WhiteHouse: President @realDonaldTrump announced historic steps to protect the Constitutional right to pray in public schools! https://…,t,f,Twitter for iPhone,0,25048,2020-01-17 00:11:56,f
6,1223640662689689602,Getting a little exercise this morning! https://t.co/fyAAcbhbgk,f,f,Twitter for iPhone,285863,30209,2020-02-01 16:14:02,f
7,1319501865625784320,https://t.co/4qwCKQOiOw,f,f,Twitter for iPhone,130822,19127,2020-10-23 04:52:14,f
8,1319500520126664705,https://t.co/VlEu8yyovv,f,f,Twitter for iPhone,153446,20275,2020-10-23 04:46:53,f
9,1319500501269041154,https://t.co/z5CRqHO8vg,f,f,Twitter for iPhone,102150,14815,2020-10-23 04:46:49,f


In [5]:
df.groupby('isFlagged').count()

Unnamed: 0_level_0,id,text,isRetweet,isDeleted,device,favorites,retweets,date
isFlagged,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
f,56267,56267,56267,56267,56267,56267,56267,56267
t,304,304,304,304,304,304,304,304


In [6]:
# Text preprocessing steps - remove numbers, URL, captial letters and punctuation
import re
import string

alphanumeric = lambda x: re.sub('\w*\d\w*', ' ', x)
punc_lower = lambda x: re.sub(r'[^\w\s]', ' ',str(x).lower())
no_url = lambda x: re.sub(r'http\S+', '', x)
clean = lambda x: x.replace('\n', '')

df['text'] = df.text.map(alphanumeric).map(no_url).map(punc_lower).map(clean)
df.head(50)

Unnamed: 0,id,text,isRetweet,isDeleted,device,favorites,retweets,date,isFlagged
0,98454970654916608,republicans and democrats have both created our economic problems,f,f,TweetDeck,49,255,2011-08-02 18:07:48,f
1,1234653427789070336,i was thrilled to be back in the great city of charlotte north carolina with thousands of hardworking american patriots who love our country cherish our values respect our laws and always put america first thank you for a wonderful evening,f,f,Twitter for iPhone,73748,17404,2020-03-03 01:34:50,f
2,1218010753434820614,rt cbs_herridge read letter to surveillance court obtained by cbs news questions where there will be further disciplinary action and cho,t,f,Twitter for iPhone,0,7396,2020-01-17 03:22:47,f
3,1304875170860015617,the unsolicited mail in ballot scam is a major threat to our democracy amp the democrats know it almost all recent elections using this system even though much smaller amp with far fewer ballots to count have ended up being a disaster large numbers of missing ballots amp fraud,f,f,Twitter for iPhone,80527,23502,2020-09-12 20:10:58,f
4,1218159531554897920,rt mzhemingway very friendly telling of events here about comey s apparent leaking to compliant media if you read those articles and tho,t,f,Twitter for iPhone,0,9081,2020-01-17 13:13:59,f
5,1217962723234983937,rt whitehouse president realdonaldtrump announced historic steps to protect the constitutional right to pray in public schools,t,f,Twitter for iPhone,0,25048,2020-01-17 00:11:56,f
6,1223640662689689602,getting a little exercise this morning,f,f,Twitter for iPhone,285863,30209,2020-02-01 16:14:02,f
7,1319501865625784320,,f,f,Twitter for iPhone,130822,19127,2020-10-23 04:52:14,f
8,1319500520126664705,,f,f,Twitter for iPhone,153446,20275,2020-10-23 04:46:53,f
9,1319500501269041154,,f,f,Twitter for iPhone,102150,14815,2020-10-23 04:46:49,f


In [7]:
df['text'] = df['text'].apply(lambda x: x.strip()).replace('', np.nan) # fill empty cells w/ Null values

In [8]:
df.head(50)

Unnamed: 0,id,text,isRetweet,isDeleted,device,favorites,retweets,date,isFlagged
0,98454970654916608,republicans and democrats have both created our economic problems,f,f,TweetDeck,49,255,2011-08-02 18:07:48,f
1,1234653427789070336,i was thrilled to be back in the great city of charlotte north carolina with thousands of hardworking american patriots who love our country cherish our values respect our laws and always put america first thank you for a wonderful evening,f,f,Twitter for iPhone,73748,17404,2020-03-03 01:34:50,f
2,1218010753434820614,rt cbs_herridge read letter to surveillance court obtained by cbs news questions where there will be further disciplinary action and cho,t,f,Twitter for iPhone,0,7396,2020-01-17 03:22:47,f
3,1304875170860015617,the unsolicited mail in ballot scam is a major threat to our democracy amp the democrats know it almost all recent elections using this system even though much smaller amp with far fewer ballots to count have ended up being a disaster large numbers of missing ballots amp fraud,f,f,Twitter for iPhone,80527,23502,2020-09-12 20:10:58,f
4,1218159531554897920,rt mzhemingway very friendly telling of events here about comey s apparent leaking to compliant media if you read those articles and tho,t,f,Twitter for iPhone,0,9081,2020-01-17 13:13:59,f
5,1217962723234983937,rt whitehouse president realdonaldtrump announced historic steps to protect the constitutional right to pray in public schools,t,f,Twitter for iPhone,0,25048,2020-01-17 00:11:56,f
6,1223640662689689602,getting a little exercise this morning,f,f,Twitter for iPhone,285863,30209,2020-02-01 16:14:02,f
7,1319501865625784320,,f,f,Twitter for iPhone,130822,19127,2020-10-23 04:52:14,f
8,1319500520126664705,,f,f,Twitter for iPhone,153446,20275,2020-10-23 04:46:53,f
9,1319500501269041154,,f,f,Twitter for iPhone,102150,14815,2020-10-23 04:46:49,f


In [9]:
df = df.dropna() #drop null rows
df.head(50)

Unnamed: 0,id,text,isRetweet,isDeleted,device,favorites,retweets,date,isFlagged
0,98454970654916608,republicans and democrats have both created our economic problems,f,f,TweetDeck,49,255,2011-08-02 18:07:48,f
1,1234653427789070336,i was thrilled to be back in the great city of charlotte north carolina with thousands of hardworking american patriots who love our country cherish our values respect our laws and always put america first thank you for a wonderful evening,f,f,Twitter for iPhone,73748,17404,2020-03-03 01:34:50,f
2,1218010753434820614,rt cbs_herridge read letter to surveillance court obtained by cbs news questions where there will be further disciplinary action and cho,t,f,Twitter for iPhone,0,7396,2020-01-17 03:22:47,f
3,1304875170860015617,the unsolicited mail in ballot scam is a major threat to our democracy amp the democrats know it almost all recent elections using this system even though much smaller amp with far fewer ballots to count have ended up being a disaster large numbers of missing ballots amp fraud,f,f,Twitter for iPhone,80527,23502,2020-09-12 20:10:58,f
4,1218159531554897920,rt mzhemingway very friendly telling of events here about comey s apparent leaking to compliant media if you read those articles and tho,t,f,Twitter for iPhone,0,9081,2020-01-17 13:13:59,f
5,1217962723234983937,rt whitehouse president realdonaldtrump announced historic steps to protect the constitutional right to pray in public schools,t,f,Twitter for iPhone,0,25048,2020-01-17 00:11:56,f
6,1223640662689689602,getting a little exercise this morning,f,f,Twitter for iPhone,285863,30209,2020-02-01 16:14:02,f
11,1319496349092511744,rt erictrump,t,f,Twitter for iPhone,0,8921,2020-10-23 04:30:19,f
12,1236502342121541632,rt gopchairwoman the economic boom continues jobs added in february jobs added since realdonaldtrump was electedunemploym,t,f,Twitter for iPhone,0,8681,2020-03-08 04:01:46,f
13,1225835449379258368,rt tomfitton vindman s behavior is a scandal he should be removed from the realdonaldtrump white house asap to protect our foreign poli,t,f,Twitter for iPhone,0,7679,2020-02-07 17:35:20,f


In [10]:
from datetime import datetime as dt

In [11]:
df["date"] = pd.to_datetime(df.date , format="%Y/%m/%d %H:%M:%S") #create date_time column

In [12]:
for tweet in df['text'].sample(10).values:
    print(tweet)

rt  nhc_atlantic  hurricane force winds are expected to extend well inland over portions of the florida panhandle and portions of southeast
lkdusa   lkdusa now you need not wonder why we are attracted to a strong leader like  realdonaldtrump  the rest don t cut it  all wimps
rt  seanmdav  lawmakers in both chambers have demanded that ic ig michael atkinson explain why he backdated to august secret changes he mad
boardroom time  which team do you think had the best presentation   celebapprentice
realdonaldtrump happy the apprentice is back to two hours having my weekly apprentice party   you will love it tonight
even though i am not mandated by law to do so  i will be leaving my busineses before january   so that i can focus full time on the
melania and i offer our deepest condolences to the family of otto warmbier  full statement
we will soon be at a point with our incompetent politicians where we will be treating illegal immigrants better than our veterans
just had a wonderful convers

In [13]:
corpus = df['text']

In [14]:
stop_words = ENGLISH_STOP_WORDS

In [16]:
tfidf = TfidfVectorizer(stop_words=stop_words)
tweet_word_matrix = tfidf.fit_transform(corpus)
vocab = tfidf.get_feature_names_out()

In [17]:
nmf = NMF(n_components=5)

nmf.fit(tweet_word_matrix)



NMF(n_components=5)

In [18]:
tweet_topic_matrix = nmf.transform(tweet_word_matrix)

In [20]:
tweet_topic_matrix_df = pd.DataFrame(tweet_topic_matrix).add_prefix('topic_')
tweet_topic_matrix_df['tweets'] = df['text']
tweet_topic_matrix_df.head()

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,tweets
0,0.0,0.0,0.0,0.0,0.018889,republicans and democrats have both created our economic problems
1,0.0,0.034618,0.043237,0.000482,0.013938,i was thrilled to be back in the great city of charlotte north carolina with thousands of hardworking american patriots who love our country cherish our values respect our laws and always put america first thank you for a wonderful evening
2,0.010155,0.0,0.0,0.0,0.008845,rt cbs_herridge read letter to surveillance court obtained by cbs news questions where there will be further disciplinary action and cho
3,0.0,0.0,0.0,0.0,0.035309,the unsolicited mail in ballot scam is a major threat to our democracy amp the democrats know it almost all recent elections using this system even though much smaller amp with far fewer ballots to count have ended up being a disaster large numbers of missing ballots amp fraud
4,0.01044,0.0,0.0,0.0,0.005565,rt mzhemingway very friendly telling of events here about comey s apparent leaking to compliant media if you read those articles and tho


In [23]:
word_topic_matrix_df = pd.DataFrame(nmf.components_, columns=vocab).T.add_prefix('topic_')
word_topic_matrix_df.tail(50)

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4
今日は,0.000219,0.0,0.0,0.0,0.0
令和初の国賓としてお迎えしたトランプ大統領と千葉でゴルフです,0.00072,0.0,0.0,0.0,0.0
初の,0.000425,0.0,0.0,0.0,0.0
初めてとなる日米印三か国による首脳会談を行いました,0.000236,0.0,0.0,0.0,0.0
北朝鮮問題への対応,0.000285,0.0,0.0,0.0,0.0
史上初めてのことです,0.000157,0.0,0.0,0.0,0.0
大半を北朝鮮問題に費やし,0.000219,0.0,0.0,0.0,0.0
天皇,0.000217,0.0,0.0,0.0,6.9e-05
新しい令和の時代も日米同盟をさらに揺るぎないものとしていきたいと考えています,0.00072,0.0,0.0,0.0,0.0
日本のために最善となるようベストを尽くす,0.000219,0.0,0.0,0.0,0.0


In [25]:
for tweet in tweet_topic_matrix_df.sort_values(by='topic_0', ascending=False).head(10)['tweets'].values:
    print(tweet)
    print()

thank you

rt  paulsperry_  comey refutes senators  assertion he spied on trump  when in fact he sent pientka to surveil trump at a security briefing

rt  jim_jordan  yesterday the democrats impeached our president today they passed a bill to raise your taxes maybe next year they ll tr

rt  teamtrump  watch  video footage from georgia shows suitcases filled with ballots pulled from under a table after supervisors told poll

under my leadership  we achieved the most secure border in u s  history  my opponent s insane immigration plan completely eliminates u s  borders by implementing nationwide catch and release  joe biden would make every community into a sanctuary city for violent criminals   vote

biden wants to lockdown our country  maybe for years  crazy  there will be no lockdowns  the great american comeback is underway

rt  breitbartnews   they are like that one girlfriend you had that s just an idiot that believes every lie some guy tells her at a bar   ht

rt  greggjarrett  br

In [26]:
word_topic_matrix_df.sort_values(by='topic_0', ascending=False).head(10)

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4
realdonaldtrump,4.235146,0.063229,0.096904,0.235844,0.0
rt,3.484097,0.0,0.0,0.0,0.0
president,1.040853,0.0,0.0,0.0,0.872159
run,0.405699,0.0,0.0,0.039007,0.321345
whitehouse,0.399999,0.0,0.01047,0.0,0.010486
danscavino,0.258477,0.0,0.0,0.0,0.0
teamtrump,0.217946,0.0,0.0,0.0,0.025089
love,0.165735,0.104446,0.024814,0.104105,0.15115
live,0.159775,0.0,0.002083,0.0,0.063161
vote,0.148766,0.038601,0.071457,0.013231,0.404754


In [29]:
def top_tweets(tweet_topic_matrix_df, topic, n_tweets):
    return (tweet_topic_matrix_df
            .sort_values(by=topic, ascending=False)
            .head(n_tweets)['tweets']
            .values)

def top_words(word_topic_matrix_df, topic, n_words):
    return (word_topic_matrix_df
            .sort_values(by=topic, ascending=False)
            .head(n_words))[topic]

In [30]:
for tweet in top_tweets(tweet_topic_matrix_df, 'topic_1', 15):
    print(tweet)
    print()

in a letter to me sent by kim jong un  he stated  very nicely  that he would like to meet and start negotiations as soon as the joint u s  south korea joint exercise are over  it was a long letter  much of it complaining about the ridiculous and expensive exercises  it was

realdonaldtrump rubio is irresponsible on finances and doesn t show up for work  who would hire him  not the american people

rt  sbagov  what s the process for applying for an sba disaster loan and what documentation is required  this fact sheet breaks it down  ht

georgeptransue  bw  trumpwinery  erictrump  thank you

joe biden cannot lead our country  because he does not believe in our country  at biden s convention  they decried america as a wicked  sinful nation  destined for a fate of doom and despair  they even removed the words  under god  from the pledge of allegiance   twice

senate intelligence panel found absolutely no evidence of collusion  there was no evidence that the trump campaign conspired with th

In [31]:
top_words(word_topic_matrix_df, 'topic_1', 10)

thank                    4.097301
makeamericagreatagain    0.346896
maga                     0.325483
nice                     0.159639
love                     0.104446
support                  0.095562
new                      0.087258
carolina                 0.086792
pennsylvania             0.083558
honor                    0.079850
Name: topic_1, dtype: float64

In [32]:
for tweet in top_tweets(tweet_topic_matrix_df, 'topic_2', 5):
    print(tweet)
    print()

rt  donaldjtrumpjr  imagine what these crooks would have done to the people they were investing in their hoax had they done the same

rt  governorlittle  i am a strong advocate of healthy lands and waters  and i support president trump s plans to update the national enviro

rt  danscavino

thank you joe  so good for our workers

the media has not covered my long shot great finish in iowa fairly  brought in record voters and got second highest vote total in history



In [33]:
top_words(word_topic_matrix_df, 'topic_2', 10)

great      3.026110
america    1.742726
make       1.435488
job        0.222848
honor      0.167891
let        0.134409
work       0.116424
day        0.115719
book       0.114949
country    0.111274
Name: topic_2, dtype: float64

In [None]:
df_before = df[(df['date'] <= "2017-01-19")]
df_before

In [None]:
df_after = df[(df['date'] > "2017-01-20")]
df_after

In [None]:
df_before2 = df_before['text']
df_before2.iloc[0]

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [None]:
analyzer = SentimentIntensityAnalyzer()


In [None]:
for i in 

In [None]:
score = analyzer.polarity_scores(df_before2.iloc[0])

In [None]:
print(score)