In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import pickle

## Obtaining Tweet Data

For the notebooks that contain the queries for the tweets gathered on TWINT, please refer to the Covid Data Queries notebook in the repo. The JSON files for these queries were used to create DataFrames.

In [2]:
#All Covid tweets
All_Covid_tweets = pd.read_json('tweets/Covid_tweets4.json',lines=True)

#All Trump tweets
Trump_Covid_tweets = pd.read_json('tweets/Trump_Covid_tweets4.json', lines=True)

#All Cuomo tweets
Cuomo_Covid_tweets = pd.read_json('tweets/Cuomo_Covid_tweets4.json',lines=True)

#Baseline Tweets
NYTimes_tweets = pd.read_json('tweets/Nytimes_Covid_tweets4.json',lines=True)
#print( len(NYTimes_tweets))
WashingtonPost_tweets = pd.read_json('tweets/Washpost_tweets4.json',lines=True)
#print( len(Washpost_tweets3.json))

#combining NYTimes and Washington Post to get Baseline Tweets
Baseline_tweets = pd.concat([NYTimes_tweets,WashingtonPost_tweets],axis=0)

#Reformatting Date columns for later merge
All_Covid_tweets['date'] = pd.to_datetime(All_Covid_tweets['date'], format='%Y%m%d')
Trump_Covid_tweets['date'] = pd.to_datetime(Trump_Covid_tweets['date'], format='%Y%m%d')
Cuomo_Covid_tweets['date'] = pd.to_datetime(Cuomo_Covid_tweets['date'], format='%Y%m%d')
Baseline_tweets['date'] = pd.to_datetime(Baseline_tweets['date'], format='%Y%m%d')

In [3]:
#Combining all Tweet DFs into one
Master_Tweet_df = pd.concat([All_Covid_tweets,Trump_Covid_tweets,
                             Cuomo_Covid_tweets,Baseline_tweets],axis=0)

number_of_tweets = len(Master_Tweet_df)
print('The number of tweets in the dataset is:',number_of_tweets)

The number of tweets in the dataset is: 17659


## Obtaining Covid Data

Data for Covid Cases and Deaths was collected from The COVID Tracking Project.

In [4]:
# Covid data set
covid_data = pd.read_excel('covid data/data.8.08.2020.xls')

# formatting the date column to datetime
covid_data['date'] = pd.to_datetime(covid_data['date'], format='%Y%m%d')

In [5]:
covid_data = covid_data[['date','positive','death']]
covid_data.head()

Unnamed: 0,date,positive,death
0,2020-08-07,4913663,152816.0
1,2020-08-06,4852143,151483.0
2,2020-08-05,4797959,150232.0
3,2020-08-04,4745694,148807.0
4,2020-08-03,4694126,147631.0


### Adding Case/Death Data on Day of the Tweet

In [6]:
#merging the two datasets for all Covid data in one place
Master_Tweet_dataset = pd.merge(Master_Tweet_df,covid_data,on='date')
Master_Tweet_dataset.head()

Unnamed: 0,id,conversation_id,created_at,date,time,timezone,user_id,username,name,place,...,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,translate,trans_src,trans_dest,positive,death
0,1291523532363636736,1291523532363636736,2020-08-06 23:56:20,2020-08-06,19:56:20,EDT,228022886,acai_w,Angelo Wijdh,,...,,,,"[{'user_id': '228022886', 'username': 'ACAI_W'}]",,,,,4852143,151483.0
1,1291522763233071105,1291451746724741120,2020-08-06 23:53:16,2020-08-06,19:53:16,EDT,1134860643528577026,brittaswenson,Britta Swenson,,...,,,,"[{'user_id': '1134860643528577026', 'username'...",,,,,4852143,151483.0
2,1291520828148871168,1291520828148871168,2020-08-06 23:45:35,2020-08-06,19:45:35,EDT,1080245442732974081,sallywo42411402,Sally Wong,,...,,,,"[{'user_id': '1080245442732974081', 'username'...",,,,,4852143,151483.0
3,1291520253357326338,1291520253357326336,2020-08-06 23:43:18,2020-08-06,19:43:18,EDT,2431027482,ultimate1us,DENSMORE,,...,,,,"[{'user_id': '2431027482', 'username': 'ultima...",,,,,4852143,151483.0
4,1291519814339629059,1291518619197874176,2020-08-06 23:41:33,2020-08-06,19:41:33,EDT,2974733727,taffygeek,Rob Chappell,,...,,,,"[{'user_id': '2974733727', 'username': 'taffyg...",,,,,4852143,151483.0


### Dropping Unneeded Columns

In [7]:
#dropping unnecessary columns

Master_Tweet_dataset = Master_Tweet_dataset.drop(['cashtags', 'conversation_id', 'hashtags',
       'id','link', 'mentions', 'name', 'near', 'photos',
       'place', 'quote_url','reply_to', 'retweet',
       'retweet_date', 'retweet_id','source', 'time',
       'timezone', 'trans_dest', 'trans_src', 'translate','urls',
       'user_id', 'user_rt', 'user_rt_id',],axis=1)

### SCRUBBING OF TWEETS

In [8]:
#pip install textfeatures
import textfeatures as tf
Master_Tweet_dataset.columns

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jamaalsmith/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Index(['created_at', 'date', 'username', 'tweet', 'replies_count',
       'retweets_count', 'likes_count', 'video', 'geo', 'positive', 'death'],
      dtype='object')

In [9]:
#Getting ride of duplicative column
Master_Tweet_dataset=Master_Tweet_dataset.drop('created_at',axis=1)
Master_Tweet_df.head()

Unnamed: 0,id,conversation_id,created_at,date,time,timezone,user_id,username,name,place,...,geo,source,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,translate,trans_src,trans_dest
0,1291523532363636736,1291523532363636736,2020-08-06 23:56:20,2020-08-06,19:56:20,EDT,228022886,acai_w,Angelo Wijdh,,...,,,,,,"[{'user_id': '228022886', 'username': 'ACAI_W'}]",,,,
1,1291522763233071105,1291451746724741120,2020-08-06 23:53:16,2020-08-06,19:53:16,EDT,1134860643528577026,brittaswenson,Britta Swenson,,...,,,,,,"[{'user_id': '1134860643528577026', 'username'...",,,,
2,1291520828148871168,1291520828148871168,2020-08-06 23:45:35,2020-08-06,19:45:35,EDT,1080245442732974081,sallywo42411402,Sally Wong,,...,,,,,,"[{'user_id': '1080245442732974081', 'username'...",,,,
3,1291520253357326338,1291520253357326336,2020-08-06 23:43:18,2020-08-06,19:43:18,EDT,2431027482,ultimate1us,DENSMORE,,...,,,,,,"[{'user_id': '2431027482', 'username': 'ultima...",,,,
4,1291519814339629059,1291518619197874176,2020-08-06 23:41:33,2020-08-06,19:41:33,EDT,2974733727,taffygeek,Rob Chappell,,...,,,,,,"[{'user_id': '2974733727', 'username': 'taffyg...",,,,


In [10]:
#Using textfeatures library for some text preprocessing
tf.word_count(Master_Tweet_dataset,"tweet",'word_count')
tf.avg_word_length(Master_Tweet_dataset,'tweet','avg_word_length')
tf.stopwords_count(Master_Tweet_dataset,'tweet','stopwords_count')
tf.char_count(Master_Tweet_dataset,'tweet','char_count')
tf.stopwords(Master_Tweet_dataset,'tweet','stopwords')
tf.clean(Master_Tweet_dataset,'tweet','clean_text')
Master_Tweet_dataset.head()

Unnamed: 0,date,username,tweet,replies_count,retweets_count,likes_count,video,geo,positive,death,word_count,avg_word_length,stopwords_count,char_count,stopwords,clean_text
0,2020-08-06,acai_w,"WoW, such a bold and honest statement and brou...",0,0,0,0,,4852143,151483.0,55,4.781818,27,318,"[such, a, and, and, to, you, with, a, of, an, ...",bold honest statement brought integrity please...
1,2020-08-06,brittaswenson,Trump lift the ban so he won’t be to blame for...,0,0,0,0,,4852143,151483.0,43,4.395349,23,231,"[the, so, he, be, to, for, the, and, from, in,...",trump lift wont blame coming bankruptcies majo...
2,2020-08-06,sallywo42411402,"Proves Narvarro right and Dr. Fauci wrong, rig...",0,0,1,0,,4852143,151483.0,46,5.854167,13,331,"[and, was, in, had, with, in, the, of, on, wha...",proves narvarro right fauci wrong right fauci ...
3,2020-08-06,ultimate1us,In 2017 the USA became the victim of a systemi...,0,0,0,1,,4852143,151483.0,53,4.692308,17,296,"[the, the, of, a, in, and, is, very, of, have,...",became victim systemic brain cancer cancer sta...
4,2020-08-06,taffygeek,How is the Dow still rising. Just looking at t...,7,0,1,0,,4852143,151483.0,36,4.583333,15,200,"[is, the, at, that, and, only, of, the, with, ...",still rising looking screen covid deaths covid...


In [11]:
#Removing stopwords and punctuation

clean_tweet = Master_Tweet_dataset['clean_text']
#Tweet Tokenizer 
from nltk.tokenize import TweetTokenizer
ttknz = TweetTokenizer()

#creation of the corpus
#corpus = Master_Tweet_df['clean_tweets'].astype(str)
#corpus.dtypes

#tokenizing corpus
tok_corp = []
for sent in clean_tweet:
    toked = ttknz.tokenize(sent)
    tok_corp.append(toked)

In [12]:
#saving tok list or later use

with open('tok_corp_8_8.pickle','wb') as f:
    pickle.dump(tok_corp,f)

In [13]:
#Sentiment Analysis 
from textblob import TextBlob
from textblob.sentiments import PatternAnalyzer, NaiveBayesAnalyzer
#from twitter_nlp_toolkit.tweet_sentiment_classifier import tweet_sentiment_classifier

#tweets = Master_Tweet_df['clean_tweets']

tweets = Master_Tweet_dataset['clean_text']

Sentiment = []
for tweet in tweets:
    #Classifier = tweet_sentiment_classifier.SentimentAnalyzer()
    #sentiment = Classifier.predict_proba(tweet)
    blob = TextBlob(tweet,analyzer=PatternAnalyzer())
    rating = blob.sentiment.polarity
    Sentiment.append(rating)

Master_Tweet_dataset['Sentiment'] = Sentiment
#Master_Tweet_df['Sentiment'] = Master_Tweet_df['Sentiment'].astype(int)
#Master_Tweet_df['Sentiment'].round(decimals = 4)

#Master_Tweet_df['Sentiment'].head()

In [33]:
#Saving as CSV for later uploads to different notebooks
Master_Tweet_dataset.to_csv('data/Master_Tweet_dataset_textprocessing.csv')


0    1579651200000000000
1    1579651200000000000
2    1579651200000000000
3    1579651200000000000
4    1579651200000000000
Name: date, dtype: int64

## Adding Poll Info

In [15]:
#Uploading Poll data
poll_data = pd.read_csv('data/poll_data_dates.csv')
poll_data = poll_data.dropna()

#Converting Date columns to integer so merge will work
Master_Tweet_dataset['date'] = pd.to_datetime(Master_Tweet_dataset['date'])
Master_Tweet_dataset['date'] = Master_Tweet_dataset['date'].astype(int)
poll_data['date'] = pd.to_datetime(poll_data['date'])
poll_data['date'] = poll_data['date'].astype(int)

In [16]:
#Merging with Poll Data

#poll_data = pd.read_csv('data/poll_data_dates.csv')
#pd.to_datetime(poll_data['Date']) #converting to datetime object for merge purposes
#pd.to_datetime(Master_Tweet_df['Date']) #converting to datetime object for merge purposes

left = Master_Tweet_dataset.sort_values(by='date')
right = poll_data.sort_values(by='date')

Master_Tweet_dataset = pd.merge_asof(left,right,on='date',allow_exact_matches=False)

In [32]:
#Master_Tweet_df = Master_Tweet_df.drop('Date')
#Master_Tweet_df = Master_Tweet_df.drop('Date',axis=1)


0    1579651200000000000
1    1579651200000000000
2    1579651200000000000
3    1579651200000000000
4    1579651200000000000
Name: date, dtype: int64

## Creation of Target Column

Victory Spread dataframe is Trump poll figure subtracted from Biden poll figure. If spread is positive, that indicates how much Biden is leading by. Should it be negative, that represents how much Trump is ahead by.

In [18]:
# Calculating Victory Spread
Master_Tweet_dataset['Spread'] = Master_Tweet_dataset['Biden (D)'] - Master_Tweet_dataset['Trump (R)']


In [19]:
#Assiging topics to each tweet  -code inspired by stackabuse
from sklearn.feature_extraction.text import CountVectorizer

#Vectorizing docs
count_vect = CountVectorizer(max_df=0.8, min_df=2, stop_words='english')
doc_term_matrix = count_vect.fit_transform(clean_tweet)

#fitting LDA Model
from sklearn.decomposition import LatentDirichletAllocation

LDA = LatentDirichletAllocation(n_components=10, random_state=42)
LDA.fit(doc_term_matrix)

#transforming to get topic numbers
topic_values = LDA.transform(doc_term_matrix)
topic_values.shape

#creating column of Topics
Master_Tweet_dataset['Topic'] = topic_values.argmax(axis=1)

In [20]:
# top words for each topic
for i,topic in enumerate(LDA.components_):
    print(f'Top 10 words for topic #{i}:')
    print([count_vect.get_feature_names()[i] for i in topic.argsort()[-10:]])
    print('\n')

Top 10 words for topic #0:
['countries', 'death', 'amazon', 'rate', 'country', 'world', 'brazil', 'cases', 'india', 'covid']


Top 10 words for topic #1:
['international', 'medical', 'rate', 'virus', 'pandemic', 'doctors', 'patients', 'economy', 'death', 'covid']


Top 10 words for topic #2:
['second', 'strong', 'france', 'endorsement', 'military', 'total', 'italy', 'spain', 'complete', 'covid']


Top 10 words for topic #3:
['death', 'like', 'need', 'dont', 'country', 'americans', 'trump', 'world', 'people', 'covid']


Top 10 words for topic #4:
['wear', 'mask', 'masks', 'businesses', 'today', 'home', 'stay', 'safe', 'vaccine', 'covid']


Top 10 words for topic #5:
['countries', 'number', 'population', 'world', 'coronavirus', 'trump', 'million', 'cases', 'deaths', 'covid']


Top 10 words for topic #6:
['biden', 'america', 'fake', 'people', 'president', 'news', 'thank', 'covid', 'trump', 'great']


Top 10 words for topic #7:
['doesnt', 'economy', 'pandemic', 'virus', 'distancing', 'wear

In [21]:
#Dictionary to create Target Column
category_dict = {}
for key in [0,4,5,8,1,3]:
    category_dict[key] = 'Bad Response'
for key in [10,6,2,7,9]:
    category_dict[key] = 'Good Response'
    
#Creation of Target Column
Master_Tweet_dataset['Target'] = Master_Tweet_dataset['Topic'].map(category_dict)

In [22]:
Master_Tweet_dataset.head()

Unnamed: 0,date,username,tweet,replies_count,retweets_count,likes_count,video,geo,positive,death,...,Poll,Start Date,End Date,Sample,MoE,Biden (D),Trump (R),Spread,Topic,Target
0,1579651200000000000,realdonaldtrump,Making great progress in @Davos. Tremendous nu...,9465,17624,88225,0,,2,,...,ABC News/Wash PostABC/WP,2020-01-20,2020-01-23,880 RV,4,50.0,46.0,4.0,0,Bad Response
1,1579651200000000000,realdonaldtrump,"Sorry, if you come you will be immediately sen...",8643,24619,98960,0,,2,,...,ABC News/Wash PostABC/WP,2020-01-20,2020-01-23,880 RV,4,50.0,46.0,4.0,3,Bad Response
2,1579651200000000000,realdonaldtrump,See you on Friday...Big Crowd! https://twitter...,7035,24342,97513,0,,2,,...,ABC News/Wash PostABC/WP,2020-01-20,2020-01-23,880 RV,4,50.0,46.0,4.0,0,Bad Response
3,1579651200000000000,realdonaldtrump,True! https://twitter.com/RandPaul/status/1220...,3436,12031,50605,0,,2,,...,ABC News/Wash PostABC/WP,2020-01-20,2020-01-23,880 RV,4,50.0,46.0,4.0,9,Good Response
4,1579651200000000000,realdonaldtrump,“NO PRESSURE”,18086,19899,122408,0,,2,,...,ABC News/Wash PostABC/WP,2020-01-20,2020-01-23,880 RV,4,50.0,46.0,4.0,5,Bad Response


## Final DF Housekeeping

In [23]:
Master_Tweet_dataset.columns

Index(['date', 'username', 'tweet', 'replies_count', 'retweets_count',
       'likes_count', 'video', 'geo', 'positive', 'death', 'word_count',
       'avg_word_length', 'stopwords_count', 'char_count', 'stopwords',
       'clean_text', 'Sentiment', 'Poll', 'Start Date', 'End Date', 'Sample',
       'MoE', 'Biden (D)', 'Trump (R)', 'Spread', 'Topic', 'Target'],
      dtype='object')

In [24]:
EDA_df = Master_Tweet_dataset.copy()
EDA_df.to_csv('data/data_8_08.csv')

In [25]:
#creating dataframe for just text data
#Tweet_df = Master_Tweet_dataset[['date','tweet','stopwords','clean_text']]
#creating dataframe for poll data 
#Poll_df = Master_Tweet_dataset[['Start Date','End Date','Sample','MoE','Poll','Biden (D)', 'Trump (R)']]


# couldn't find any value from these columns
Master_Tweet_dataset = Master_Tweet_dataset.drop(['tweet','Poll',
       'Start Date', 'End Date', 'Sample', 'MoE', 'Biden (D)', 'Trump (R)',
       'Spread'],axis=1)

#converting date column to integer for modeling purposes
#def datetime_to_int(dt):
    #return int(dt.strftime("%Y%m%d"))

#Master_Tweet_dataset['date'] = Master_Tweet_df['date'].apply(lambda x: datetime_to_int(x))



In [31]:
#Making sure datatype is ok for modeling
Master_Tweet_dataset['date']

dtype('int64')

In [27]:
#Checking for null values
Master_Tweet_dataset.isnull().sum()
Master_Tweet_dataset['death'] = Master_Tweet_dataset['death'].fillna(method='backfill')

In [28]:
#Master_Tweet_dataset.to_csv('data/data_8_08.csv')

## Saving to CSV

In [29]:
#saving df to csv for upload in other notebooks
Master_Tweet_dataset.to_csv('data/Master_Tweet_modelready.csv')
#Tweet_df.to_csv('data/Tweet_text_columns.csv')

In [30]:
len(Master_Tweet_dataset['Target'])

17481