In [3]:
# Colab-only step: files.upload() opens a file-picker widget in the notebook and
# returns a dict describing whatever was uploaded.
# NOTE(review): the data files read by the later cells are presumably supplied
# through this upload - confirm.
from google.colab import files
files.upload()

{}

In [4]:
import pandas as pd

## Objective

For the purpose of this analysis, I will attempt to measure the sentiment of tweets to learn whether tweets impact the number of Covid-19 cases and deaths in the United States. 

To create the dataset, I used the TWINT library to collect all tweets from January 1, 2020 until July 10, 2020. I then made various subsets of the tweets. For example, to measure the impact of tweets by public leaders viewed as polar opposites regarding their response to the pandemic, I collected tweets by President Trump and the Governor of New York, Andrew Cuomo. Another subset of tweets, which I labeled as the baseline, consists of tweets by the New York Times and the Washington Post - two of America's leading journalism outlets.

The purpose of creating these subsets is that the baseline tweets can be considered to be those that communicate mainly facts. While these outlets have op-ed columnists, we can assume that most tweets from their news reporting divisions provide factual updates on the Covid response. By considering the two polar opposites, Trump and Cuomo, we can measure Covid outcomes, in terms of cases, after the tweets have been consumed by the public. Finally, the main Covid collection will allow us to see whether more individuals subscribed to the Trump or the Cuomo tweets and how Covid cases changed, for better or worse, in their region.

## Obtaining Data

The TWINT queries used to gather the tweets are in the Covid Data Queries notebook in the repo. The JSON files produced by those queries are loaded into DataFrames below.

In [5]:
# Read each TWINT export; the files are newline-delimited JSON (lines=True).
All_Covid_tweets = pd.read_json('Covid_tweets3.json', lines=True)
Trump_Covid_tweets = pd.read_json('Trump_Covid_tweets3.json', lines=True)
Cuomo_Covid_tweets = pd.read_json('Cuomo_Covid_tweets3.json', lines=True)

# Baseline = New York Times + Washington Post tweets, stacked row-wise.
NYTimes_tweets = pd.read_json('Nytimes_Covid_tweets3.json', lines=True)
WashingtonPost_tweets = pd.read_json('Washpost_tweets3.json', lines=True)
Baseline_tweets = pd.concat([NYTimes_tweets, WashingtonPost_tweets], axis=0)

# Copy the lowercase 'date' column into a capitalised 'Date' column on every
# frame; 'Date' is the key the later merges with the case/death table join on.
for tweet_frame in (All_Covid_tweets, Trump_Covid_tweets, Cuomo_Covid_tweets, Baseline_tweets):
    tweet_frame['Date'] = tweet_frame['date']

Data for Covid Cases and Deaths was collected from The COVID Tracking Project.

In [6]:
# Confirmed-case time series, restricted to rows whose Country/Region is 'US'.
covid_cases = (
    pd.read_csv('time_series_covid_19_confirmed.csv')
    .loc[lambda frame: frame['Country/Region'] == 'US']
)

# Death time series, restricted to the US rows in the same way.
covid_deaths = (
    pd.read_csv('time_series_covid_19_deaths.csv')
    .loc[lambda frame: frame['Country/Region'] == 'US']
)


In [7]:
# Stack the US cases frame on top of the US deaths frame, then transpose so the
# original column labels become the row index. Reading the result left to
# right, the first column is cases and the second is deaths; the columns have
# not been renamed at this point.
us_rows = [covid_cases, covid_deaths]
covid_data = pd.concat(us_rows, axis=0).T

In [8]:
# Remove the four non-date metadata rows left over from the transpose so that
# only the per-date rows remain. `index=` makes explicit that row labels are
# being dropped, and `errors='ignore'` keeps the cell safe to re-run: it
# reassigns covid_data, so a second run would otherwise raise KeyError because
# the labels are already gone.
covid_data = covid_data.drop(
    index=['Province/State', 'Country/Region', 'Lat', 'Long'],
    errors='ignore',
)

In [9]:
# Preview the first rows: the index holds the date labels, the first column is
# cases and the second is deaths (the columns have not been renamed yet).
covid_data.head()

Unnamed: 0,225,225.1
1/22/20,1,0
1/23/20,1,0
1/24/20,2,0
1/25/20,2,0
1/26/20,5,0


### Adding Case/Death Data on Day of the Tweet

In [10]:
# Load the case/death table whose columns were renamed by hand in Excel
# (Date, Cases, Deaths) so that it can be merged with the tweet frames on 'Date'.
# NOTE(review): this file is produced outside the notebook; it presumably holds
# the same values as the covid_data frame built in the earlier cells - confirm.
covid_data_formatted = pd.read_excel('covid_data_date.xlsx')
covid_data_formatted.head()

Unnamed: 0,Date,Cases,Deaths
0,1/22/20,1,0
1,1/23/20,1,0
2,1/24/20,2,0
3,1/25/20,2,0
4,1/26/20,5,0


In [11]:
# Run the 'Date' join key through pd.to_datetime on the case/death table and on
# every tweet frame, so both sides of the upcoming merge are parsed the same way.
covid_data_formatted['Date'] = pd.to_datetime(covid_data_formatted['Date'])
for tweet_frame in (All_Covid_tweets, Trump_Covid_tweets, Cuomo_Covid_tweets, Baseline_tweets):
    tweet_frame['Date'] = pd.to_datetime(tweet_frame['Date'])

In [12]:
# Attach the same-day case/death counts to each tweet frame by joining on 'Date'.
# The merge uses pandas' default inner join, so tweets whose date does not
# appear in covid_data_formatted are absent from the result.
All_Covid_tweets_case_data = All_Covid_tweets.merge(covid_data_formatted, on='Date')
Trump_Covid_tweets_case_data = Trump_Covid_tweets.merge(covid_data_formatted, on='Date')
Cuomo_Covid_tweets_case_data = Cuomo_Covid_tweets.merge(covid_data_formatted, on='Date')
Baseline_tweets_case_data = Baseline_tweets.merge(covid_data_formatted, on='Date')

### Adding case/death data for two weeks after original tweet

In [13]:
# Build the '14 days' key: the date two weeks after each tweet, used to look up
# the case/death counts at that later date.
from datetime import datetime, timedelta

N = 14
two_week_offset = timedelta(days=N)

# Same offset applied to the un-merged frame; not read by the cells shown here.
days_N_from_now = All_Covid_tweets['Date'] + two_week_offset

for case_frame in (
    All_Covid_tweets_case_data,
    Trump_Covid_tweets_case_data,
    Cuomo_Covid_tweets_case_data,
    Baseline_tweets_case_data,
):
    case_frame['14 days'] = case_frame['Date'] + two_week_offset

In [14]:
# Load the lookup table for the two-week merge: its date column is named
# '14 days' so it can be joined on the tweets' '14 days' key.
# NOTE(review): the preview shows the same leading rows as covid_data_date.xlsx,
# so this is presumably the same per-date table with the date column renamed -
# confirm.
covid_data_two_week = pd.read_excel('covid_data_14days.xlsx')
covid_data_two_week.head()

Unnamed: 0,14 days,Cases,Deaths
0,1/22/20,1,0
1,1/23/20,1,0
2,1/24/20,2,0
3,1/25/20,2,0
4,1/26/20,5,0


In [15]:
# Run the '14 days' join key through pd.to_datetime on the lookup table and on
# every tweet/case frame before merging on it.
covid_data_two_week['14 days'] = pd.to_datetime(covid_data_two_week['14 days'])
for case_frame in (
    All_Covid_tweets_case_data,
    Trump_Covid_tweets_case_data,
    Cuomo_Covid_tweets_case_data,
    Baseline_tweets_case_data,
):
    case_frame['14 days'] = pd.to_datetime(case_frame['14 days'])

In [16]:
# Attach the case/death counts from two weeks after each tweet by joining on
# '14 days'. Both sides already have 'Cases' and 'Deaths' columns, so pandas'
# default suffixes apply: the day-of-tweet counts become Cases_x / Deaths_x and
# the two-week counts become Cases_y / Deaths_y. The default inner join drops
# tweets whose '14 days' date is not in covid_data_two_week.
All_Covid_tweets_case_data = All_Covid_tweets_case_data.merge(covid_data_two_week, on='14 days')
Trump_Covid_tweets_case_data = Trump_Covid_tweets_case_data.merge(covid_data_two_week, on='14 days')
Cuomo_Covid_tweets_case_data = Cuomo_Covid_tweets_case_data.merge(covid_data_two_week, on='14 days')
Baseline_tweets_case_data = Baseline_tweets_case_data.merge(covid_data_two_week, on='14 days')

### Adding Case/Death Data for four weeks after original tweet

In [17]:
# Load the lookup table for the four-week merge: its date column is named
# '28 days' so it can be joined on the tweets' '28 days' key.
# NOTE(review): the preview shows the same leading rows as covid_data_date.xlsx,
# so this is presumably the same per-date table with the date column renamed -
# confirm.
covid_data_four_week = pd.read_excel('covid_data_28days.xlsx')
covid_data_four_week.head()

Unnamed: 0,28 days,Cases,Deaths
0,1/22/20,1,0
1,1/23/20,1,0
2,1/24/20,2,0
3,1/25/20,2,0
4,1/26/20,5,0


In [18]:
# Build the '28 days' key: the date four weeks after each tweet, used to look up
# the case/death counts at that later date.
from datetime import datetime, timedelta

N = 28
four_week_offset = timedelta(days=N)

# Same offset applied to the un-merged frame; not read by the cells shown here.
days_N_from_now = All_Covid_tweets['Date'] + four_week_offset

for case_frame in (
    All_Covid_tweets_case_data,
    Trump_Covid_tweets_case_data,
    Cuomo_Covid_tweets_case_data,
    Baseline_tweets_case_data,
):
    case_frame['28 days'] = case_frame['Date'] + four_week_offset

In [19]:
# Run the '28 days' join key through pd.to_datetime on the lookup table and on
# every tweet/case frame before merging on it.
covid_data_four_week['28 days'] = pd.to_datetime(covid_data_four_week['28 days'])
for case_frame in (
    All_Covid_tweets_case_data,
    Trump_Covid_tweets_case_data,
    Cuomo_Covid_tweets_case_data,
    Baseline_tweets_case_data,
):
    case_frame['28 days'] = pd.to_datetime(case_frame['28 days'])

In [20]:
# Attach the case/death counts from four weeks after each tweet by joining on
# '28 days'. The left frames no longer have plain 'Cases' / 'Deaths' columns
# (the two-week merge suffixed them), so the four-week counts arrive under the
# unsuffixed names 'Cases' and 'Deaths'. The default inner join drops tweets
# whose '28 days' date is not in covid_data_four_week.
All_Covid_tweets_case_data = All_Covid_tweets_case_data.merge(covid_data_four_week, on='28 days')
Trump_Covid_tweets_case_data = Trump_Covid_tweets_case_data.merge(covid_data_four_week, on='28 days')
Cuomo_Covid_tweets_case_data = Cuomo_Covid_tweets_case_data.merge(covid_data_four_week, on='28 days')
Baseline_tweets_case_data = Baseline_tweets_case_data.merge(covid_data_four_week, on='28 days')

In [21]:
# Preview the merged baseline frame. Count columns read as: Cases_x / Deaths_x =
# day of the tweet, Cases_y / Deaths_y = 14 days later, Cases / Deaths = 28 days
# later.
Baseline_tweets_case_data.head()

Unnamed: 0,cashtags,conversation_id,created_at,date,geo,hashtags,id,likes_count,link,mentions,name,near,photos,place,quote_url,replies_count,reply_to,retweet,retweet_date,retweet_id,retweets_count,source,time,timezone,trans_dest,trans_src,translate,tweet,urls,user_id,user_rt,user_rt_id,username,video,Date,Cases_x,Deaths_x,14 days,Cases_y,Deaths_y,28 days,Cases,Deaths
0,[],1270707306578264064,2020-06-10 13:20:05,2020-06-10,,[],1270707306578264065,209,https://twitter.com/nytimes/status/12707073065...,[],The New York Times,,[],,,33,"[{'user_id': '807095', 'username': 'nytimes'}]",False,,,61,,09:20:05,EDT,,,,After months of Amazon workers becoming ill wi...,[https://nyti.ms/3hbN3Qt],807095,,,nytimes,0,2020-06-10,2000702,113631,2020-06-24,2382426,122604,2020-07-08,3054699,132300
1,[],1270636833555308544,2020-06-10 08:40:03,2020-06-10,,[],1270636833555308544,857,https://twitter.com/nytimes/status/12706368335...,[],The New York Times,,[],,,33,"[{'user_id': '807095', 'username': 'nytimes'}]",False,,,325,,04:40:03,EDT,,,,"Haidari Wujodi, the mystic Afghan poet who for...",[https://nyti.ms/3cMzLX5],807095,,,nytimes,0,2020-06-10,2000702,113631,2020-06-24,2382426,122604,2020-07-08,3054699,132300
2,[],1270815442173603840,2020-06-10 20:29:46,2020-06-10,,[],1270815442173603841,159,https://twitter.com/washingtonpost/status/1270...,[],The Washington Post,,[],,,28,"[{'user_id': '2467791', 'username': 'washingto...",False,,,40,,16:29:46,EDT,,,,Salesforce chair and CEO Marc Benioff joins Th...,[https://twitter.com/i/broadcasts/1OdKrWPPbByGX],2467791,,,washingtonpost,0,2020-06-10,2000702,113631,2020-06-24,2382426,122604,2020-07-08,3054699,132300
3,[],1270541216308957184,2020-06-10 02:20:06,2020-06-09,,[],1270541216308957184,382,https://twitter.com/nytimes/status/12705412163...,[nytmag],The New York Times,,[],,,40,"[{'user_id': '807095', 'username': 'nytimes'},...",False,,,161,,22:20:06,EDT,,,,Rarely has a vaccine been developed in less th...,[https://nyti.ms/2YgXABq],807095,,,nytimes,0,2020-06-09,1979908,112714,2020-06-23,2347491,121847,2020-07-07,2996098,131480
4,[],1270470755889840128,2020-06-09 21:40:07,2020-06-09,,[],1270470755889840134,404,https://twitter.com/nytimes/status/12704707558...,[nytmag],The New York Times,,[],,,33,"[{'user_id': '807095', 'username': 'nytimes'},...",False,,,216,,17:40:07,EDT,,,,Rarely has a vaccine been developed in less th...,[https://nyti.ms/2UsjMXQ],807095,,,nytimes,0,2020-06-09,1979908,112714,2020-06-23,2347491,121847,2020-07-07,2996098,131480


### Combined Tweet DataFrame

In [22]:
# Stack the four tweet/case frames (all Covid tweets, Trump, Cuomo, baseline)
# into one master frame.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it each
# source frame keeps its own 0..n-1 labels, so the combined index contains
# duplicates; label-based selection then returns several rows, and
# index-aligned column assignment can fail.
Master_Tweet_df = pd.concat(
    [
        All_Covid_tweets_case_data,
        Trump_Covid_tweets_case_data,
        Cuomo_Covid_tweets_case_data,
        Baseline_tweets_case_data,
    ],
    ignore_index=True,
)

# SCRUBBING OF TWEETS

In [44]:
pip install tweet-preprocessor

Collecting tweet-preprocessor
  Downloading https://files.pythonhosted.org/packages/17/9d/71bd016a9edcef8860c607e531f30bd09b13103c7951ae73dd2bf174163c/tweet_preprocessor-0.6.0-py3-none-any.whl
Installing collected packages: tweet-preprocessor
Successfully installed tweet-preprocessor-0.6.0


In [52]:
import preprocessor as p

# Configure which token types the preprocessor library handles when cleaning:
# URLs, emojis, reserved words, smileys, mentions and hashtags.
p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.RESERVED, p.OPT.SMILEY, p.OPT.MENTION, p.OPT.HASHTAG)

# Clean every raw tweet and store the results, in row order, in a new column.
tweets = Master_Tweet_df['tweet']
clean_tweets = [p.clean(raw_text) for raw_text in tweets]
Master_Tweet_df['clean_tweets'] = clean_tweets

In [59]:
# Sentiment analysis: score each cleaned tweet with TextBlob and keep its
# `.sentiment` value (displayed in the output as a pair of numbers per tweet).
from textblob import TextBlob

tweets = Master_Tweet_df['clean_tweets']
Sentiment = [TextBlob(cleaned_text).sentiment for cleaned_text in tweets]
Master_Tweet_df['Sentiment'] = Sentiment

In [60]:
# Preview the first 15 rows of the master frame, including the new
# 'clean_tweets' and 'Sentiment' columns.
Master_Tweet_df.head(15)

Unnamed: 0,cashtags,conversation_id,created_at,date,geo,hashtags,id,likes_count,link,mentions,name,near,photos,place,quote_url,replies_count,reply_to,retweet,retweet_date,retweet_id,retweets_count,source,time,timezone,trans_dest,trans_src,translate,tweet,urls,user_id,user_rt,user_rt_id,username,video,Date,Cases_x,Deaths_x,14 days,Cases_y,Deaths_y,28 days,Cases,Deaths,clean_tweets,Sentiment
0,[],1265462056494477312,2020-05-27 01:57:20,2020-05-26,,[],1265462056494477312,0,https://twitter.com/WhiteWindLandon/status/126...,[],Whitewind Landon,,[],,,0,"[{'user_id': '1015727943665029121', 'username'...",False,,,0,,21:57:20,EDT,,,,"Two of the United States leading news sources,...",[https://time.com/5833945/hydroxychloroquine-c...,1015727943665029121,,,whitewindlandon,0,2020-05-26,1689162,99952,2020-06-09,1979908,112714,2020-06-23,2347491,121847,"Two of the United States leading news sources,...","(0.0, 0.0)"
1,[],1265462056494477312,2020-05-27 01:57:20,2020-05-26,,[],1265462056494477312,0,https://twitter.com/WhiteWindLandon/status/126...,[],Whitewind Landon,,[],,,0,"[{'user_id': '1015727943665029121', 'username'...",False,,,0,,21:57:20,EDT,,,,"Two of the United States leading news sources,...",[https://time.com/5833945/hydroxychloroquine-c...,1015727943665029121,,,whitewindlandon,0,2020-05-26,1689162,99952,2020-06-09,1979908,112714,2020-06-23,2347491,121847,"Two of the United States leading news sources,...","(0.0, 0.0)"
2,[],1265422039940292608,2020-05-26 23:18:19,2020-05-26,,[#covid],1265422039940292608,0,https://twitter.com/ResCon1/status/12654220399...,[],John Guardiano,,[],,https://twitter.com/ResCon1/status/12654204976...,0,"[{'user_id': '96828033', 'username': 'ResCon1'}]",False,,,0,,19:18:19,EDT,,,,…Unless U’re a physician or a nurse in a surgi...,"[https://bit.ly/3deUnbu, https://twitter.com/R...",96828033,,,rescon1,0,2020-05-26,1689162,99952,2020-06-09,1979908,112714,2020-06-23,2347491,121847,Unless Ure a physician or a nurse in a surgica...,"(0.0, 0.0)"
3,[],1265335385187549184,2020-05-26 23:00:14,2020-05-26,,[],1265417488705224705,0,https://twitter.com/nealhead/status/1265417488...,"[kydeplorable97, pinkacreisnuts, jordynturner1...",Neal Head,,[],,,2,"[{'user_id': '39495915', 'username': 'nealhead...",False,,,0,,19:00:14,EDT,,,,The reality is that Andy Beshear didn't create...,[],39495915,,,nealhead,0,2020-05-26,1689162,99952,2020-06-09,1979908,112714,2020-06-23,2347491,121847,The reality is that Andy Beshear didn't create...,"(0.0, 0.3458333333333333)"
4,[],1265410926553059328,2020-05-26 22:34:09,2020-05-26,,[],1265410926553059330,1,https://twitter.com/6121El/status/126541092655...,[],Elman,,[https://pbs.twimg.com/media/EY-khoXWoAABxIX.jpg],,,0,"[{'user_id': '544477762', 'username': '6121El'}]",False,,,1,,18:34:09,EDT,,,,"In large countries such as the United States, ...",[],544477762,,,6121el,0,2020-05-26,1689162,99952,2020-06-09,1979908,112714,2020-06-23,2347491,121847,"In large countries such as the United States, ...","(-0.03380952380952381, 0.419047619047619)"
5,[],1265119572702760960,2020-05-26 22:22:16,2020-05-26,,[#waytogojoe],1265407935611498497,0,https://twitter.com/roswell32/status/126540793...,[terrymoran],Max,,[],,,0,"[{'user_id': '1376876486', 'username': 'roswel...",False,,,0,,18:22:16,EDT,,,,"Right now, it's a very Good look for a Preside...",[],1376876486,,,roswell32,0,2020-05-26,1689162,99952,2020-06-09,1979908,112714,2020-06-23,2347491,121847,"Right now, it's a very Good look for a Preside...","(0.2976190476190476, 0.6776190476190477)"
6,[],1265401616879894528,2020-05-26 21:57:10,2020-05-26,,[],1265401616879894529,13,https://twitter.com/gary_lyman/status/12654016...,[],Gary Lyman,,[],,,3,"[{'user_id': '2535654342', 'username': 'gary_l...",False,,,6,,17:57:10,EDT,,,,United States has officially surpassed the gri...,[],2535654342,,,gary_lyman,0,2020-05-26,1689162,99952,2020-06-09,1979908,112714,2020-06-23,2347491,121847,United States has officially surpassed the gri...,"(-0.3327272727272727, 0.6409090909090909)"
7,[],1265390741800587264,2020-05-26 21:13:57,2020-05-26,,[],1265390741800587267,0,https://twitter.com/oldnavy1968/status/1265390...,[realdonaldtrump],Randall D. York,,[],,,0,"[{'user_id': '747094959786983424', 'username':...",False,,,0,,17:13:57,EDT,,,,The United States has more confirmed COVID-19 ...,[],747094959786983424,,,oldnavy1968,0,2020-05-26,1689162,99952,2020-06-09,1979908,112714,2020-06-23,2347491,121847,The United States has more confirmed COVID-19 ...,"(0.2333333333333333, 0.4166666666666667)"
8,[],1265383558295797760,2020-05-26 20:45:24,2020-05-26,,[],1265383558295797761,0,https://twitter.com/DaBlazinJr/status/12653835...,[],Da Blazin Jr,,[],,,0,"[{'user_id': '3071297554', 'username': 'DaBlaz...",False,,,0,,16:45:24,EDT,,,,The average age of deceased and COVID-19 posit...,[],3071297554,,,dablazinjr,0,2020-05-26,1689162,99952,2020-06-09,1979908,112714,2020-06-23,2347491,121847,The average age of deceased and COVID-19 posit...,"(-0.007575757575757579, 0.3818181818181818)"
9,[],1264956000089636864,2020-05-26 18:15:33,2020-05-26,,[],1265345847556026370,0,https://twitter.com/SmartPe53402672/status/126...,[aclu],Smart Person,,[],,,0,"[{'user_id': '1261365461440233480', 'username'...",False,,,0,,14:15:33,EDT,,,,You are some of the stupidest people how do yo...,[],1261365461440233480,,,smartpe53402672,0,2020-05-26,1689162,99952,2020-06-09,1979908,112714,2020-06-23,2347491,121847,You are some of the stupidest people how do yo...,"(-0.57125, 0.64)"
