In [40]:
#Libraries
import pandas as pd
import numpy as np
from datetime import datetime
import pickle
import textfeatures as tf
from nltk.tokenize import TweetTokenizer
from textblob import TextBlob
from textblob.sentiments import PatternAnalyzer, NaiveBayesAnalyzer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

## Obtaining Tweet Data

Creating DataFrames from the JSON files produced by the `Twitter_dataqueries` notebook.

In [41]:
# Each tweet collection is stored as line-delimited JSON (one record per line).
All_Covid_tweets = pd.read_json('tweets/Covid_tweets5.json', lines=True)
Trump_Covid_tweets = pd.read_json('tweets/Trump_Covid_tweets4.json', lines=True)
Cuomo_Covid_tweets = pd.read_json('tweets/Cuomo_Covid_tweets5.json', lines=True)

# Baseline tweets = NYTimes + Washington Post, stacked row-wise.
NYTimes_tweets = pd.read_json('tweets/Nytimes_Covid_tweets5.json', lines=True)
WashingtonPost_tweets = pd.read_json('tweets/Washpost_tweets5.json', lines=True)
Baseline_tweets = pd.concat([NYTimes_tweets, WashingtonPost_tweets])

# Give every frame a datetime 'date' column so the later merges share one key type.
for tweet_frame in (All_Covid_tweets, Trump_Covid_tweets,
                    Cuomo_Covid_tweets, Baseline_tweets):
    tweet_frame['date'] = pd.to_datetime(tweet_frame['date'], format='%Y%m%d')

In [42]:
# Stack all four tweet sources row-wise into one master frame.
Master_Tweet_df = pd.concat(
    [All_Covid_tweets, Trump_Covid_tweets, Cuomo_Covid_tweets, Baseline_tweets]
)

# Row count of the combined frame.
number_of_tweets = Master_Tweet_df.shape[0]
print('The number of tweets in the dataset is:', number_of_tweets)

The number of tweets in the dataset is: 24681


## Obtaining Covid Data

Data for Covid Cases and Deaths was collected from The COVID Tracking Project.

In [43]:
# Covid data set, with the 'date' column parsed from YYYYMMDD into datetime.
covid_data = pd.read_csv('data/daily.8.22.csv').assign(
    date=lambda daily: pd.to_datetime(daily['date'], format='%Y%m%d')
)

In [44]:
# Preview the first five rows of the Covid data (all columns kept).
covid_data.head(n=5)

Unnamed: 0,date,states,positive,negative,pending,hospitalizedCurrently,hospitalizedCumulative,inIcuCurrently,inIcuCumulative,onVentilatorCurrently,...,lastModified,total,totalTestResults,posNeg,deathIncrease,hospitalizedIncrease,negativeIncrease,positiveIncrease,totalTestResultsIncrease,hash
0,2020-08-22,56,5639613,65442552,4154.0,39905.0,358613.0,8218.0,16657.0,2205.0,...,2020-08-22T00:00:00Z,71086319,71082165,71082165,1024,1520,699089,46295,745384,10f85ba760e3a61eb34ab55eb1ccc3ec21eed56f
1,2020-08-21,56,5593318,64743463,4199.0,40951.0,357093.0,8349.0,16563.0,2286.0,...,2020-08-21T00:00:00Z,70340980,70336781,70336781,1123,1817,685099,46821,731920,cd07637ded4b265fc45cb2c204784d5cb8c70085
2,2020-08-20,56,5546497,64058364,4170.0,41988.0,355276.0,8483.0,16487.0,2335.0,...,2020-08-20T00:00:00Z,69609031,69604861,69604861,1134,2010,621696,43740,665436,5e7ec5926f868e83e4fd901d5b6b7e3c0c9f161b
3,2020-08-19,56,5502757,63436668,4374.0,43330.0,353266.0,8744.0,16377.0,2371.0,...,2020-08-19T00:00:00Z,68943799,68939425,68939425,1420,2032,630559,44933,675492,64c8b61e3c52baa7b1bcb0a6bbaa8dee87f3e71c
4,2020-08-18,56,5457824,62806109,4412.0,43747.0,351234.0,8866.0,16123.0,2468.0,...,2020-08-18T00:00:00Z,68268345,68263933,68263933,1195,2273,602356,40458,642814,e41905ca050e7958f710b87063f436f7e8f1a9aa


### Adding Case/Death Data on Day of the Tweet

In [45]:
# Attach the same-day Covid figures to every tweet (inner join on 'date').
# NOTE(review): the preview below shows identical tweet rows repeated; presumably
# the same tweet was captured by more than one query -- confirm whether
# de-duplication is wanted before modeling.
Master_Tweet_dataset = Master_Tweet_df.merge(covid_data, on='date')
Master_Tweet_dataset.head()

Unnamed: 0,id,conversation_id,created_at,date,time,timezone,user_id,username,name,place,...,lastModified,total,totalTestResults,posNeg,deathIncrease,hospitalizedIncrease,negativeIncrease,positiveIncrease,totalTestResultsIncrease,hash
0,1228397478954795009,1228397478954795008,2020-02-14 19:15:56,2020-02-14,14:15:56,EDT,143132365,nycemswatch,NYC EMS Watch,,...,2020-02-14T00:00:00Z,21,21,21,0,0,0,3,3,28ded527706588122c2398a311e818b7473f77c9
1,1228397478954795009,1228397478954795008,2020-02-14 19:15:56,2020-02-14,14:15:56,EDT,143132365,nycemswatch,NYC EMS Watch,,...,2020-02-14T00:00:00Z,21,21,21,0,0,0,3,3,28ded527706588122c2398a311e818b7473f77c9
2,1228397478954795009,1228397478954795008,2020-02-14 19:15:56,2020-02-14,14:15:56,EDT,143132365,nycemswatch,NYC EMS Watch,,...,2020-02-14T00:00:00Z,21,21,21,0,0,0,3,3,28ded527706588122c2398a311e818b7473f77c9
3,1231683951896383488,1231683951896383488,2020-02-23 20:55:12,2020-02-23,15:55:12,EDT,44984619,bobfoolery,Bob Cooper,,...,2020-02-23T00:00:00Z,122,122,122,0,0,0,16,16,bbd18b10d469b0a14ea8b7eb24edf43527c8fbb7
4,1231683951896383488,1231683951896383488,2020-02-23 20:55:12,2020-02-23,15:55:12,EDT,44984619,bobfoolery,Bob Cooper,,...,2020-02-23T00:00:00Z,122,122,122,0,0,0,16,16,bbd18b10d469b0a14ea8b7eb24edf43527c8fbb7


### Dropping Unneeded Columns

In [46]:
# Tweet metadata columns that are not used in the rest of the notebook.
unneeded_columns = [
    'cashtags', 'conversation_id', 'hashtags', 'id', 'link',
    'mentions', 'name', 'near', 'photos', 'place',
    'quote_url', 'reply_to', 'retweet', 'retweet_date', 'retweet_id',
    'source', 'time', 'timezone', 'trans_dest', 'trans_src',
    'translate', 'urls', 'user_id', 'user_rt', 'user_rt_id',
]
Master_Tweet_dataset = Master_Tweet_dataset.drop(columns=unneeded_columns)

### SCRUBBING OF TWEETS

In [47]:
# Dependency reminder: pip install textfeatures (imported at the top as `tf`).
# Remove the duplicative 'created_at' column, then list what remains.
Master_Tweet_dataset = Master_Tweet_dataset.drop(columns='created_at')
Master_Tweet_dataset.columns

Index(['date', 'username', 'tweet', 'replies_count', 'retweets_count',
       'likes_count', 'video', 'geo', 'states', 'positive', 'negative',
       'pending', 'hospitalizedCurrently', 'hospitalizedCumulative',
       'inIcuCurrently', 'inIcuCumulative', 'onVentilatorCurrently',
       'onVentilatorCumulative', 'recovered', 'dateChecked', 'death',
       'hospitalized', 'lastModified', 'total', 'totalTestResults', 'posNeg',
       'deathIncrease', 'hospitalizedIncrease', 'negativeIncrease',
       'positiveIncrease', 'totalTestResultsIncrease', 'hash'],
      dtype='object')

In [48]:
# Text preprocessing with the textfeatures library.
# Each step reads the 'tweet' column and is given the name of the column it adds;
# the steps run in the order listed.
text_feature_steps = [
    (tf.word_count, 'word_count'),
    (tf.avg_word_length, 'avg_word_length'),
    (tf.stopwords_count, 'stopwords_count'),
    (tf.char_count, 'char_count'),
    (tf.stopwords, 'stopwords'),
    (tf.clean, 'clean_text'),
]
for add_feature, new_column in text_feature_steps:
    add_feature(Master_Tweet_dataset, 'tweet', new_column)

Master_Tweet_dataset.head()

Unnamed: 0,date,username,tweet,replies_count,retweets_count,likes_count,video,geo,states,positive,...,negativeIncrease,positiveIncrease,totalTestResultsIncrease,hash,word_count,avg_word_length,stopwords_count,char_count,stopwords,clean_text
0,2020-02-14,nycemswatch,Everything you need to know about AMR right he...,3,12,15,0,,2,21,...,0,3,3,28ded527706588122c2398a311e818b7473f77c9,37,9.780488,12,442,"[you, to, about, who, for, have, after, a, wit...",everything need know right emts work american ...
1,2020-02-14,nycemswatch,Everything you need to know about AMR right he...,3,12,15,0,,2,21,...,0,3,3,28ded527706588122c2398a311e818b7473f77c9,37,9.780488,12,442,"[you, to, about, who, for, have, after, a, wit...",everything need know right emts work american ...
2,2020-02-14,nycemswatch,Everything you need to know about AMR right he...,3,12,15,0,,2,21,...,0,3,3,28ded527706588122c2398a311e818b7473f77c9,37,9.780488,12,442,"[you, to, about, who, for, have, after, a, wit...",everything need know right emts work american ...
3,2020-02-23,bobfoolery,If/when COVID-19 becomes epidemic in the Unite...,0,0,0,0,,2,122,...,0,16,16,bbd18b10d469b0a14ea8b7eb24edf43527c8fbb7,46,5.021739,15,276,"[in, the, will, down, as, has, been, in, and, ...",ifwhen covid becomes epidemic united states am...
4,2020-02-23,bobfoolery,If/when COVID-19 becomes epidemic in the Unite...,0,0,0,0,,2,122,...,0,16,16,bbd18b10d469b0a14ea8b7eb24edf43527c8fbb7,46,5.021739,15,276,"[in, the, will, down, as, has, been, in, and, ...",ifwhen covid becomes epidemic united states am...


In [49]:
# Tokenize the cleaned tweets into a corpus (one token list per tweet).
clean_tweet = Master_Tweet_dataset['clean_text']

# Tweet-aware tokenizer from NLTK.
ttknz = TweetTokenizer()

tok_corp = [ttknz.tokenize(sent) for sent in clean_tweet]

In [50]:
# Persist the tokenized corpus for later use.
with open('tok_corp_8_8.pickle', 'wb') as pickle_file:
    pickle.dump(tok_corp, pickle_file)

In [51]:
# Sentiment analysis: add a polarity rating for every cleaned tweet.
tweets = Master_Tweet_dataset['clean_text']

# One TextBlob per tweet, scored with the pattern-based analyzer; only the
# polarity component of the sentiment result is kept.
Sentiment = [
    TextBlob(text, analyzer=PatternAnalyzer()).sentiment.polarity
    for text in tweets
]

Master_Tweet_dataset['Sentiment'] = Sentiment

In [52]:
# Write the text-processed dataset to CSV so other notebooks can load it.
Master_Tweet_dataset.to_csv(path_or_buf='data/Master_Tweet_dataset_textprocessing.csv')


## Adding Poll Info

In [53]:
# Load the poll data and discard rows with any missing value.
poll_data = pd.read_csv('data/poll_data_dates.csv').dropna()

# Convert both 'date' columns to integer nanoseconds since the Unix epoch so the
# as-of merge compares like with like.
#  - 'datetime64[ns]' pins the resolution: the two columns are parsed from
#    different sources, and an inferred non-nanosecond unit on either side would
#    make the integers incomparable.
#  - 'int64' is spelled out because plain `int` is platform-dependent and can
#    resolve to a 32-bit type, which cannot hold these values.
Master_Tweet_dataset['date'] = (
    pd.to_datetime(Master_Tweet_dataset['date'])
    .astype('datetime64[ns]')
    .astype('int64')
)
poll_data['date'] = (
    pd.to_datetime(poll_data['date'])
    .astype('datetime64[ns]')
    .astype('int64')
)

In [54]:
# Merge tweets with poll data.
# merge_asof needs both inputs sorted on the key. With the default backward
# search and allow_exact_matches=False, each tweet is paired with the most recent
# poll row whose 'date' is strictly earlier than the tweet's.
left = Master_Tweet_dataset.sort_values('date')
right = poll_data.sort_values('date')

Master_Tweet_dataset = pd.merge_asof(
    left=left,
    right=right,
    on='date',
    allow_exact_matches=False,
)

In [55]:
# NOTE(review): every line in this cell is commented out, so the cell executes
# nothing; consider deleting it.
#Master_Tweet_df = Master_Tweet_df.drop('Date')
#Master_Tweet_df = Master_Tweet_df.drop('Date',axis=1)


## Creation of Target Column

For the purpose of this project, our goal is to see if we can classify how individuals view the response to the Covid pandemic in the United States. The cells below group the tweets into 10 topics using a Latent Dirichlet Allocation model.

In [56]:
# Victory spread: Biden's poll share minus Trump's poll share.
Master_Tweet_dataset['Spread'] = Master_Tweet_dataset['Biden (D)'].sub(
    Master_Tweet_dataset['Trump (R)']
)


In [77]:
# Assigning a topic to each tweet -- code inspired by stackabuse

# Load the previously pickled topic model. The `with` block closes the file
# handle, which the bare pickle.load(open(...)) form never did.
# NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
with open('LDA_model.pickle', 'rb') as model_file:
    tweet_lda_model = pickle.load(model_file)
# NOTE(review): tweet_lda_model is only loaded here; the topic numbers assigned
# below come from the scikit-learn model fitted in this cell. Confirm the two
# models describe the same topics before interpreting the topic numbers.

# Vectorize the cleaned tweets in the CURRENT row order of Master_Tweet_dataset.
# The frame was re-sorted by the poll merge, so a text series captured earlier in
# the notebook would attach topics to the wrong rows.
count_vect = CountVectorizer(max_df=0.8, min_df=2, stop_words='english')
doc_term_matrix = count_vect.fit_transform(Master_Tweet_dataset['clean_text'])

# Fit the 10-topic LDA model and get the per-tweet topic distribution in one step.
# Previously LDA and doc_term_matrix were never defined in this notebook, so the
# cell raised NameError on a fresh kernel. random_state keeps the fit reproducible.
LDA = LatentDirichletAllocation(n_components=10, random_state=42)
topic_values = LDA.fit_transform(doc_term_matrix)

# Column of topics: index of the highest-weighted topic for each tweet (0-9).
Master_Tweet_dataset['Topic'] = topic_values.argmax(axis=1)

In [78]:
# Print the top ten words of each topic in the loaded model, one topic per
# paragraph (a blank line follows every topic).
num_topics = 10
topic_listing = tweet_lda_model.show_topics(
    formatted=True, num_topics=num_topics, num_words=10
)
for i, topic in topic_listing:
    print(f"{i}: {topic}")
    print()

0: 0.030*"great" + 0.020*"complete" + 0.019*"total" + 0.018*"endorsement" + 0.017*"russia" + 0.017*"strong" + 0.016*"military" + 0.016*"south" + 0.015*"india" + 0.013*"need"

1: 0.023*"great" + 0.014*"border" + 0.011*"foxnews" + 0.009*"canada" + 0.009*"york" + 0.008*"testing" + 0.008*"today" + 0.008*"closed" + 0.007*"keep" + 0.007*"borders"

2: 0.034*"trump" + 0.017*"president" + 0.016*"people" + 0.015*"great" + 0.010*"would" + 0.010*"americans" + 0.009*"nothing" + 0.009*"america" + 0.009*"good" + 0.008*"like"

3: 0.036*"news" + 0.032*"fake" + 0.017*"world" + 0.016*"media" + 0.014*"country" + 0.014*"people" + 0.014*"like" + 0.012*"dont" + 0.009*"please" + 0.008*"trump"

4: 0.029*"countries" + 0.023*"death" + 0.020*"rate" + 0.009*"italy" + 0.009*"brazil" + 0.009*"virus" + 0.009*"deaths" + 0.009*"perspective" + 0.009*"much" + 0.008*"germany"

5: 0.033*"thank" + 0.020*"house" + 0.017*"white" + 0.014*"people" + 0.012*"coronavirus" + 0.011*"party" + 0.010*"workers" + 0.010*"country" + 0.010

In [73]:
# Topic number -> response label used as the modeling target.
# NOTE(review): key 10 looks unreachable if 'Topic' comes from an argmax over ten
# topics (values 0-9) -- confirm and drop it if so.
category_dict = {
    **dict.fromkeys([0, 4, 5, 8, 1, 3], 'Bad Response'),
    **dict.fromkeys([10, 6, 2, 7, 9], 'Good Response'),
}

# Target column: label looked up from each tweet's topic.
Master_Tweet_dataset['Target'] = Master_Tweet_dataset['Topic'].map(category_dict)

## Final DF Housekeeping

In [60]:
# Display the column labels of the assembled dataset.
Master_Tweet_dataset.columns

Index(['date', 'username', 'tweet', 'replies_count', 'retweets_count',
       'likes_count', 'video', 'geo', 'states', 'positive', 'negative',
       'pending', 'hospitalizedCurrently', 'hospitalizedCumulative',
       'inIcuCurrently', 'inIcuCumulative', 'onVentilatorCurrently',
       'onVentilatorCumulative', 'recovered', 'dateChecked', 'death',
       'hospitalized', 'lastModified', 'total', 'totalTestResults', 'posNeg',
       'deathIncrease', 'hospitalizedIncrease', 'negativeIncrease',
       'positiveIncrease', 'totalTestResultsIncrease', 'hash', 'word_count',
       'avg_word_length', 'stopwords_count', 'char_count', 'stopwords',
       'clean_text', 'Sentiment', 'Poll', 'Start Date', 'End Date', 'Sample',
       'MoE', 'Biden (D)', 'Trump (R)', 'Spread', 'Topic', 'Target'],
      dtype='object')

In [61]:
# Snapshot the full dataset (before any further columns are dropped) and save it.
EDA_df = Master_Tweet_dataset.copy(deep=True)
EDA_df.to_csv(path_or_buf='data/data_8_08.csv')

In [62]:
# Couldn't find any value from these columns, so they are removed from the
# modeling frame.
low_value_columns = ['tweet', 'Poll', 'Start Date', 'End Date', 'Sample', 'MoE']
Master_Tweet_dataset = Master_Tweet_dataset.drop(columns=low_value_columns)



In [63]:
# Inspect the 'date' column to confirm it is in a form suitable for modeling.
Master_Tweet_dataset.loc[:, 'date']

0        1580601600000000000
1        1580601600000000000
2        1580688000000000000
3        1580688000000000000
4        1581638400000000000
                ...         
24676    1597881600000000000
24677    1597881600000000000
24678    1597881600000000000
24679    1597881600000000000
24680    1597881600000000000
Name: date, Length: 24681, dtype: int64

In [64]:
# Fill gaps in 'death' with the next available value further down the column.
# Series.bfill() replaces fillna(method='backfill'), whose `method` argument is
# deprecated in recent pandas releases.
# NOTE(review): back-filling copies a later row's value onto earlier rows;
# confirm this is intended rather than filling with 0.
Master_Tweet_dataset['death'] = Master_Tweet_dataset['death'].bfill()

# Null counts per column after the fill. This is the cell's last expression so
# the result is actually displayed; as a mid-cell statement it was discarded.
Master_Tweet_dataset.isnull().sum()

In [65]:
# NOTE(review): this cell contains only commented-out code and executes nothing;
# consider deleting it.
#Master_Tweet_dataset.to_csv('data/data_8_08.csv')

## Saving to CSV

In [66]:
# Save the model-ready frame to CSV for upload in other notebooks.
Master_Tweet_dataset.to_csv(path_or_buf='data/Master_Tweet_modelready.csv')

In [67]:
# Number of rows in the 'Target' column (missing values included).
Master_Tweet_dataset['Target'].shape[0]

24681