In [44]:
#!pip install -U tweet-preprocessor
#!pip install -U vaderSentiment

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.1-py2.py3-none-any.whl (125 kB)
[K     |████████████████████████████████| 125 kB 15.1 MB/s eta 0:00:01
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.1


In [46]:
import pandas as pd

# text preprocessing
import preprocessor as p
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
import re

# sentiment analysis
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [41]:
# Fetch the NLTK resources used further down in this notebook.
nltk.download('stopwords')  # stop-word lists read via stopwords.words('english')
nltk.download('punkt')      # tokenizer models backing word_tokenize
nltk.download('wordnet')    # lexical database backing WordNetLemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jupyter/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [47]:
# Shared NLP helpers reused by every cell below:
# a WordNet lemmatizer for text cleaning and a VADER analyser for sentiment scoring.
lemmatizer, analyser = WordNetLemmatizer(), SentimentIntensityAnalyzer()

## Stop Words

In [48]:
# NLTK's English stop words plus corpus-specific terms for these coronavirus tweets,
# stored as a set for fast membership checks.
stop_words = {
    *stopwords.words('english'),
    'coronavirus',
    'covid',
    'covidー',
    'coronavirusoutbreak',
    'coronaviruspandemic',
}

## Clean Text

In [39]:
def cleanText(text):
    """Normalize a Series of raw tweets into cleaned, lemmatized strings.

    Each tweet goes through the following steps, in order:
      1. strip URLs, @mentions, emojis and smileys (tweet-preprocessor)
      2. expand 'U.S.' to 'United States' so it survives punctuation removal
      3. remove punctuation, then digits
      4. lowercase
      5. drop stand-alone single letters and collapse whitespace
      6. tokenize, drop stop words, lemmatize, and re-join with single spaces

    Parameters
    ----------
    text : pandas.Series
        Series of tweet strings.

    Returns
    -------
    pandas.Series
        Series with the same index, holding one cleaned string per tweet.
    """
    # removes URLs, @Mentions, Emojis, Smileys
    p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.EMOJI, p.OPT.SMILEY)

    # translation table that deletes every ASCII punctuation character
    punctuation_table = str.maketrans('', '', string.punctuation)

    def clean_one(tweet):
        """Run the whole cleaning pipeline on a single tweet string."""
        tweet = p.clean(tweet)

        # prevent acronym for United States from losing meaning
        tweet = tweet.replace('U.S.', 'United States')

        tweet = tweet.translate(punctuation_table)  # remove punctuation
        tweet = re.sub(r'\d+', '', tweet)           # remove numbers
        tweet = tweet.lower()                       # convert to lowercase

        # Remove all single characters. Word boundaries are used instead of
        # surrounding whitespace so that consecutive single letters
        # ("x a b y") and letters at the very start/end of the tweet are all
        # removed; a whitespace-delimited pattern consumes the separator and
        # therefore skips every second one.
        tweet = re.sub(r'\b[a-zA-Z]\b', ' ', tweet)

        # substitute multiple spaces with single space
        tweet = re.sub(r'\s+', ' ', tweet)

        # NOTE(review): punctuation is stripped before stop-word filtering, so
        # contractions lose their apostrophe ("don't" -> "dont") and are only
        # filtered if stop_words contains the apostrophe-free spelling —
        # confirm this is intended.
        tokens = word_tokenize(tweet)
        tokens = [word for word in tokens if word not in stop_words]
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
        return ' '.join(tokens)

    # single pass over the Series instead of one pass per cleaning step
    return text.map(clean_one)

In [42]:
# Add a cleaned version of each tweet alongside the raw text.
# NOTE(review): `df` is not created by any cell visible above this one; this cell
# relies on a DataFrame with a 'full_text' column already existing in the kernel —
# confirm which file it was loaded from.
df['processed_text'] = cleanText(df['full_text'])

## Sentiment
The `sentiment` column stores VADER's compound score. To interpret it, we follow the typical threshold values cited in the VADER documentation:
* positive sentiment: compound score >= 0.05
* neutral sentiment: (compound score > -0.05) and (compound score < 0.05)
* negative sentiment: compound score <= -0.05

In [51]:
# Score the raw (uncleaned) tweet text; each entry is the score dict from VADER.
df['sentiment_distribution'] = df['full_text'].apply(analyser.polarity_scores)

In [56]:
# Keep only the compound score as the tweet's overall sentiment value.
df['sentiment'] = df['sentiment_distribution'].map(lambda scores: scores['compound'])

In [62]:
# States to process; each name is substituted into the input/output file paths.
state_list = [
    'california',
    'colorado',
    'florida',
    'georgia',
    'idaho',
    'illinois',
    'louisiana',
    'massachusetts',
    'newyork',
    'tennessee',
    'texas',
    'washington',
]

In [65]:
# Run the full pipeline for every state: load the tweets, clean the text,
# score sentiment on the raw text, and write the enriched table back to disk.
for state in state_list:
    df = pd.read_csv(
        f'tweets/tweets_updated/coronavirus_{state}_tweets_updated',
        engine='python',
    )

    # cleaned text is stored alongside the original tweet
    df['processed_text'] = cleanText(df['full_text'])

    # full VADER score dict first, then just its compound value
    df['sentiment_distribution'] = df['full_text'].apply(analyser.polarity_scores)
    df['sentiment'] = df['sentiment_distribution'].map(lambda scores: scores['compound'])

    df.to_csv(
        f'tweets/tweets_processed/coronavirus_{state}_tweets_processed',
        index=False,
    )
    print(f'Finished processing and analyzing sentiment for {state}!')

Finished processing and analyzing sentiment for newyork!
