In [1]:
#Libraries
import pandas as pd
import numpy as np
from datetime import datetime
import pickle
import textfeatures as tf
from nltk.tokenize import TweetTokenizer
from textblob import TextBlob
from textblob.sentiments import PatternAnalyzer, NaiveBayesAnalyzer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jamaalsmith/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Obtaining Tweet Data

Create DataFrames from the JSON files produced by the `Twitter_dataqueries` notebook.

In [2]:
# Load each tweet dump (one JSON record per line) into its own DataFrame.
All_Covid_tweets = pd.read_json('tweets/Covid_tweets4.json', lines=True)        # all Covid tweets
Trump_Covid_tweets = pd.read_json('tweets/Trump_Covid_tweets4.json', lines=True)  # Trump tweets
Cuomo_Covid_tweets = pd.read_json('tweets/Cuomo_Covid_tweets4.json', lines=True)  # Cuomo tweets

# Baseline tweets: NYTimes and Washington Post stacked row-wise.
NYTimes_tweets = pd.read_json('tweets/Nytimes_Covid_tweets4.json', lines=True)
WashingtonPost_tweets = pd.read_json('tweets/Washpost_tweets4.json', lines=True)
Baseline_tweets = pd.concat([NYTimes_tweets, WashingtonPost_tweets])

# Give every frame a datetime 'date' column so the later merge on 'date' lines up.
for tweet_frame in (All_Covid_tweets, Trump_Covid_tweets,
                    Cuomo_Covid_tweets, Baseline_tweets):
    tweet_frame['date'] = pd.to_datetime(tweet_frame['date'], format='%Y%m%d')

In [3]:
# Stack the four tweet collections row-wise into a single master frame
# (the original per-frame row indices are kept, so index labels repeat).
tweet_frames = [All_Covid_tweets, Trump_Covid_tweets,
                Cuomo_Covid_tweets, Baseline_tweets]
Master_Tweet_df = pd.concat(tweet_frames)

# Report the size of the combined dataset.
number_of_tweets = Master_Tweet_df.shape[0]
print('The number of tweets in the dataset is:', number_of_tweets)

The number of tweets in the dataset is: 121600


## Obtaining Covid Data

Data for Covid Cases and Deaths was collected from The COVID Tracking Project.

In [4]:
# Daily US totals, read straight from the COVID Tracking Project repository.
covid_data = pd.read_csv(
    'https://raw.githubusercontent.com/COVID19Tracking/covid-tracking-data/master/data/us_daily.csv'
)

# Parse the YYYYMMDD 'date' values into datetimes so they match the tweet frames.
covid_data = covid_data.assign(
    date=lambda frame: pd.to_datetime(frame['date'], format='%Y%m%d')
)

In [5]:
# Optional narrowing to date / cumulative positives / cumulative deaths.
# Left disabled, so every column of covid_data is carried into the merge below.
#covid_data = covid_data[['date','positive','death']]
# Preview the first rows of the Covid dataset.
covid_data.head()

Unnamed: 0,date,states,positive,negative,pending,hospitalizedCurrently,hospitalizedCumulative,inIcuCurrently,inIcuCumulative,onVentilatorCurrently,...,lastModified,total,totalTestResults,posNeg,deathIncrease,hospitalizedIncrease,negativeIncrease,positiveIncrease,totalTestResultsIncrease,hash
0,2020-08-12,56,5172216,58543332,4174.0,47919.0,340097.0,9559.0,15524.0,2604.0,...,2020-08-12T00:00:00Z,63719722,63715548,63715548,1485,3035,407549,55742,463291,8b0e0ba647da128cf066056e3e1bcf4a94120c8d
1,2020-08-11,56,5116474,58135783,4118.0,48500.0,337062.0,9136.0,15331.0,2415.0,...,2020-08-11T00:00:00Z,63256375,63252257,63252257,1326,2715,683489,55594,739083,4b53c5c61a1b558e1b41cc8e6327f7359c17b4b1
2,2020-08-10,56,5060880,57452294,3966.0,48612.0,334347.0,9216.0,15158.0,2533.0,...,2020-08-10T00:00:00Z,62517140,62513174,62513174,426,1654,674422,41807,716229,80e59c48dcdce8c0fa8760d93a4d1bf0c1a58c35
3,2020-08-09,56,5019073,56777872,3871.0,49048.0,332693.0,9303.0,15081.0,2507.0,...,2020-08-09T00:00:00Z,61800816,61796945,61796945,616,838,661522,51319,712841,83d72910d9f712693eee3f8ca13182a53c81547a
4,2020-08-08,56,4967754,56116350,3888.0,50016.0,331855.0,9652.0,15024.0,2566.0,...,2020-08-08T00:00:00Z,61087992,61084104,61084104,1089,1431,614455,54091,668546,b538b3af9fdc175ec448e61facfd4b323fa32452


### Adding Case/Death Data on Day of the Tweet

In [6]:
# Attach that day's Covid figures to every tweet by joining on 'date'.
# Inner join (the pandas default): tweets whose date has no Covid row are dropped.
Master_Tweet_dataset = Master_Tweet_df.merge(covid_data, how='inner', on='date')
Master_Tweet_dataset.head()

Unnamed: 0,id,conversation_id,created_at,date,time,timezone,user_id,username,name,place,...,lastModified,total,totalTestResults,posNeg,deathIncrease,hospitalizedIncrease,negativeIncrease,positiveIncrease,totalTestResultsIncrease,hash
0,1288979300402761729,1288973180745453568,2020-07-30 23:26:27,2020-07-30,19:26:27,EDT,826617118833655809,the1triplej,Triple J,,...,2020-07-30T00:00:00Z,54650948,54647001,54647001,1259,3383,736388,69466,805854,7be08ec9befba2afef926c3c2d79da3e1fe5022a
1,1288978980435984386,1288880289532407808,2020-07-30 23:25:11,2020-07-30,19:25:11,EDT,1276535008094597121,oglesbykisha,Kisha Sharon Oglesby,,...,2020-07-30T00:00:00Z,54650948,54647001,54647001,1259,3383,736388,69466,805854,7be08ec9befba2afef926c3c2d79da3e1fe5022a
2,1288972333919633409,1288815049604292608,2020-07-30 22:58:47,2020-07-30,18:58:47,EDT,1233601848629698561,victors10855858,Little Wolf,,...,2020-07-30T00:00:00Z,54650948,54647001,54647001,1259,3383,736388,69466,805854,7be08ec9befba2afef926c3c2d79da3e1fe5022a
3,1288972076808888322,1288935893013876736,2020-07-30 22:57:45,2020-07-30,18:57:45,EDT,1055623358719713280,jerryspiegler,Jerry Spiegler@🏡,,...,2020-07-30T00:00:00Z,54650948,54647001,54647001,1259,3383,736388,69466,805854,7be08ec9befba2afef926c3c2d79da3e1fe5022a
4,1288967146123464706,1288907916020461568,2020-07-30 22:38:10,2020-07-30,18:38:10,EDT,340312944,phlphlyest,Joy,,...,2020-07-30T00:00:00Z,54650948,54647001,54647001,1259,3383,736388,69466,805854,7be08ec9befba2afef926c3c2d79da3e1fe5022a


### Dropping Unneeded Columns

In [7]:
# Tweet metadata columns that are removed before any text processing.
unneeded_columns = [
    'cashtags', 'conversation_id', 'hashtags', 'id', 'link',
    'mentions', 'name', 'near', 'photos', 'place',
    'quote_url', 'reply_to', 'retweet', 'retweet_date', 'retweet_id',
    'source', 'time', 'timezone', 'trans_dest', 'trans_src',
    'translate', 'urls', 'user_id', 'user_rt', 'user_rt_id',
]

Master_Tweet_dataset = Master_Tweet_dataset.drop(columns=unneeded_columns)

### SCRUBBING OF TWEETS

In [8]:
# Requires the textfeatures package (pip install textfeatures) for the next cell.
# Getting rid of the duplicative timestamp column; 'date' is kept.
Master_Tweet_dataset = Master_Tweet_dataset.drop(columns='created_at')

# Show which columns remain.
Master_Tweet_dataset.columns

Index(['date', 'username', 'tweet', 'replies_count', 'retweets_count',
       'likes_count', 'video', 'geo', 'states', 'positive', 'negative',
       'pending', 'hospitalizedCurrently', 'hospitalizedCumulative',
       'inIcuCurrently', 'inIcuCumulative', 'onVentilatorCurrently',
       'onVentilatorCumulative', 'recovered', 'dateChecked', 'death',
       'hospitalized', 'lastModified', 'total', 'totalTestResults', 'posNeg',
       'deathIncrease', 'hospitalizedIncrease', 'negativeIncrease',
       'positiveIncrease', 'totalTestResultsIncrease', 'hash'],
      dtype='object')

In [9]:
# Text preprocessing with the textfeatures library.
# Each helper reads the raw 'tweet' column and writes its result into the named
# new column of Master_Tweet_dataset (the calls' return values are not used).
feature_steps = (
    (tf.word_count, 'word_count'),
    (tf.avg_word_length, 'avg_word_length'),
    (tf.stopwords_count, 'stopwords_count'),
    (tf.char_count, 'char_count'),
    (tf.stopwords, 'stopwords'),
    (tf.clean, 'clean_text'),
)
for extract_feature, output_column in feature_steps:
    extract_feature(Master_Tweet_dataset, 'tweet', output_column)

Master_Tweet_dataset.head()

Unnamed: 0,date,username,tweet,replies_count,retweets_count,likes_count,video,geo,states,positive,...,negativeIncrease,positiveIncrease,totalTestResultsIncrease,hash,word_count,avg_word_length,stopwords_count,char_count,stopwords,clean_text
0,2020-07-30,the1triplej,You sure have; America 1st in the world for to...,0,0,1,0,,56,4467852,...,736388,69466,805854,7be08ec9befba2afef926c3c2d79da3e1fe5022a,36,4.638889,13,202,"[in, the, for, between, or, and, the, will, be...",sure america world total covid cases making fa...
1,2020-07-30,oglesbykisha,Watching Fox news on the State of the 2020 rac...,0,0,0,0,,56,4467852,...,736388,69466,805854,7be08ec9befba2afef926c3c2d79da3e1fe5022a,48,4.734694,22,280,"[on, the, of, the, the, why, they, for, the, o...",watching news state race amid covidask voters ...
2,2020-07-30,victors10855858,Do you mean Covid-19 or perhaps SARS-COV-2? Be...,0,0,0,0,,56,4467852,...,736388,69466,805854,7be08ec9befba2afef926c3c2d79da3e1fe5022a,48,4.666667,22,271,"[you, or, is, and, if, we, to, a, we, should, ...",mean covid perhaps sarscov china virus racist ...
3,2020-07-30,jerryspiegler,You think the Russians or the Chinese are just...,0,1,7,0,,56,4467852,...,736388,69466,805854,7be08ec9befba2afef926c3c2d79da3e1fe5022a,41,4.560976,17,227,"[the, or, the, are, just, with, to, the, don't...",think russians chinese waiting baited breath a...
4,2020-07-30,phlphlyest,He did not die of cancer. He was in remission....,0,0,5,0,,56,4467852,...,736388,69466,805854,7be08ec9befba2afef926c3c2d79da3e1fe5022a,43,4.72093,17,245,"[did, not, of, was, in, had, a, and, an, or, a...",cancer remission compromised immune system att...


In [10]:
# Tokenizing the corpus: one token list per cleaned tweet.

# Cleaned tweet text produced by the textfeatures step.
clean_tweet = Master_Tweet_dataset['clean_text']

# Tweet-aware tokenizer from NLTK.
ttknz = TweetTokenizer()

# tok_corp[k] holds the tokens of the k-th cleaned tweet, in dataset order.
tok_corp = [ttknz.tokenize(cleaned) for cleaned in clean_tweet]

In [11]:
# Persist the tokenized corpus so it can be reloaded later without re-tokenizing.

with open('tok_corp_8_8.pickle', mode='wb') as pickle_file:
    pickle.dump(tok_corp, file=pickle_file)

In [12]:
# Sentiment analysis - adds a polarity score to each tweet.
# The score is TextBlob's PatternAnalyzer polarity of the cleaned tweet text.

tweets = Master_Tweet_dataset['clean_text']

# Build the analyzer once and share it across all tweets instead of
# instantiating a new PatternAnalyzer on every loop iteration.
pattern_analyzer = PatternAnalyzer()

Sentiment = [
    TextBlob(tweet, analyzer=pattern_analyzer).sentiment.polarity
    for tweet in tweets
]

# One score per row, in the same order as the 'clean_text' column it was computed from.
Master_Tweet_dataset['Sentiment'] = Sentiment

In [13]:
# Checkpoint: write the text-processed dataset to CSV so other notebooks can load it.
Master_Tweet_dataset.to_csv(path_or_buf='data/Master_Tweet_dataset_textprocessing.csv')


## Adding Poll Info

In [14]:
# Load the poll data and discard any row with a missing value.
poll_data = pd.read_csv('data/poll_data_dates.csv')
poll_data = poll_data.dropna()

# Convert both 'date' columns to integers (nanoseconds since the epoch) so the
# as-of merge in the next cell compares keys of the same dtype.
# 'int64' is spelled out on purpose: plain `int` resolves to a 32-bit integer on
# some platforms (e.g. Windows with NumPy < 2), where converting datetime64 fails.
Master_Tweet_dataset['date'] = pd.to_datetime(Master_Tweet_dataset['date']).astype('int64')
poll_data['date'] = pd.to_datetime(poll_data['date']).astype('int64')

In [15]:
# Merging with poll data.
# merge_asof needs both sides sorted by the key, so sort each frame by 'date' first.
left = Master_Tweet_dataset.sort_values('date')
right = poll_data.sort_values('date')

# Default direction is backward and exact matches are excluded, so each tweet is
# paired with the latest poll row whose date is strictly before the tweet's date.
# The result comes back ordered by 'date' with a fresh index.
Master_Tweet_dataset = pd.merge_asof(
    left=left,
    right=right,
    on='date',
    allow_exact_matches=False,
)

In [16]:
#Master_Tweet_df = Master_Tweet_df.drop('Date')
#Master_Tweet_df = Master_Tweet_df.drop('Date',axis=1)


## Creation of Target Column

For the purpose of this project, our goal is to see if we can classify how individuals view the response to the Covid pandemic in the United States. The cells below group the tweets into 10 topics using a Latent Dirichlet Allocation (LDA) model; each topic is then labeled as a "Good Response" or a "Bad Response" to form the target column.

In [17]:
# Victory spread: Biden's poll figure minus Trump's (positive means Biden leads).
biden_share = Master_Tweet_dataset['Biden (D)']
trump_share = Master_Tweet_dataset['Trump (R)']
Master_Tweet_dataset['Spread'] = biden_share.sub(trump_share)


In [18]:
# Assigning a topic to each tweet - code inspired by stackabuse.

# Vectorize the cleaned text taken from the dataset in its CURRENT row order.
# The earlier `clean_tweet` variable was captured before the poll merge sorted the
# frame by date and rebuilt its index; using it here would attach each topic to
# the wrong tweet, because the argmax result below is assigned by position.
count_vect = CountVectorizer(max_df=0.8, min_df=2, stop_words='english')
doc_term_matrix = count_vect.fit_transform(Master_Tweet_dataset['clean_text'])

# Fit a 10-topic LDA model and get the per-document topic distribution
# (one row per tweet, one column per topic).
LDA = LatentDirichletAllocation(n_components=10, random_state=42)
topic_values = LDA.fit_transform(doc_term_matrix)

# Guard the positional assignment below: exactly one row of topic weights per tweet.
assert topic_values.shape[0] == len(Master_Tweet_dataset)

# Each tweet's topic is the index (0-9) of its highest-weighted topic.
Master_Tweet_dataset['Topic'] = topic_values.argmax(axis=1)

KeyboardInterrupt: 

In [None]:
# Top words for each topic.

# Look the vocabulary up once rather than once per printed word.
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# get_feature_names_out() and fall back to the old name on older releases.
if hasattr(count_vect, 'get_feature_names_out'):
    feature_names = count_vect.get_feature_names_out()
else:
    feature_names = count_vect.get_feature_names()

for i, topic in enumerate(LDA.components_):
    print(f'Top 10 words for topic #{i}:')
    # argsort is ascending, so the last 10 positions are the 10 heaviest words.
    print([feature_names[word_idx] for word_idx in topic.argsort()[-10:]])

In [None]:
# Hand-assigned label for every topic id; used to build the Target column.
bad_response_topics = [0, 4, 5, 8, 1, 3]
# NOTE(review): with n_components=10 the Topic column holds ids 0-9, so the
# key 10 below looks unreachable - confirm whether another id was intended.
good_response_topics = [10, 6, 2, 7, 9]

category_dict = {topic_id: 'Bad Response' for topic_id in bad_response_topics}
category_dict.update(
    {topic_id: 'Good Response' for topic_id in good_response_topics}
)

# Target column: each tweet's topic id translated to its response label.
Master_Tweet_dataset['Target'] = Master_Tweet_dataset['Topic'].map(category_dict)

## Final DF Housekeeping

In [None]:
# List the column labels of the dataset before the final cleanup steps.
Master_Tweet_dataset.columns

In [None]:
# Keep an independent snapshot for exploratory analysis before more columns are
# dropped, and write that snapshot to disk.
EDA_df = Master_Tweet_dataset.copy(deep=True)
EDA_df.to_csv(path_or_buf='data/data_8_08.csv')

In [None]:
# Earlier idea (disabled): split the text columns and the poll columns into
# separate frames.
#Tweet_df = Master_Tweet_dataset[['date','tweet','stopwords','clean_text']]
#Poll_df = Master_Tweet_dataset[['Start Date','End Date','Sample','MoE','Poll','Biden (D)', 'Trump (R)']]

# Couldn't find any value in these columns, so they are removed before modeling.
no_value_columns = ['tweet', 'Poll', 'Start Date', 'End Date', 'Sample', 'MoE']
Master_Tweet_dataset = Master_Tweet_dataset.drop(columns=no_value_columns)

# Earlier idea (disabled): turn 'date' into a YYYYMMDD integer for modeling.
#def datetime_to_int(dt):
    #return int(dt.strftime("%Y%m%d"))
#Master_Tweet_dataset['date'] = Master_Tweet_df['date'].apply(lambda x: datetime_to_int(x))



In [None]:
#Making sure date is ok for modeling
# Displays the 'date' column so its values and dtype can be inspected by eye.
Master_Tweet_dataset['date']

In [None]:
# Checking for null values: count the nulls per column BEFORE imputing, and keep
# the result so it can be shown as this cell's output.
null_counts = Master_Tweet_dataset.isnull().sum()

# Fill missing 'death' values from the next non-null row (backward fill).
# .bfill() replaces fillna(method='backfill'), which is deprecated in recent pandas.
Master_Tweet_dataset['death'] = Master_Tweet_dataset['death'].bfill()

# Show the pre-imputation null counts.
null_counts

In [None]:
#Master_Tweet_dataset.to_csv('data/data_8_08.csv')

## Saving to CSV

In [None]:
# Write the model-ready dataset to CSV for use in other notebooks.
Master_Tweet_dataset.to_csv(path_or_buf='data/Master_Tweet_modelready.csv')
# Disabled: separate export of the text-only frame.
#Tweet_df.to_csv('data/Tweet_text_columns.csv')

In [None]:
# Number of entries in the Target column (counts every row, including any nulls).
len(Master_Tweet_dataset['Target'])