### Imports

In [35]:
# general
import pandas as pd
import numpy as np
import re

# plotting
import plotly.express as px

# twitter scraping
import snscrape.modules.twitter as sntwitter

# RoBERTa
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax
import torch

# nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# stats
from statsmodels.stats.proportion import proportions_ztest

### Twitter scraping

In [36]:
# Scrape configuration
limit = 5000  # maximum number of tweets to collect
tweets = []  # accumulator: one list of fields per scraped tweet
# Twitter search string: exact phrase "prop 30", bounded by the until:/since: date operators
query = '"prop 30"  until:2022-11-08 since:2022-07-01'

In [37]:
# Walk the scraper's result stream, storing one [date, user, text, likes, retweets] row per tweet
scraper = sntwitter.TwitterSearchScraper(query)
for tweet in scraper.get_items():
    if len(tweets) == limit:  # stop as soon as the cap is reached
        break
    tweets.append([
        tweet.date,
        tweet.user.username,
        tweet.content,
        tweet.likeCount,
        tweet.retweetCount,
    ])

In [38]:
# Build the tweet table: one row per scraped tweet, one named column per stored field
tweets_df = pd.DataFrame(
    data=tweets,
    columns=['Date', 'User', 'Content', 'Likes', 'Retweets'],
)

In [39]:
# display the scraped tweets (the notebook renders the cell's last expression)
tweets_df

Unnamed: 0,Date,User,Content,Likes,Retweets
0,2022-11-07 23:01:03+00:00,YousefBaig,"Newsom claimed Prop. 30 ""puts corporate welfar...",1,4
1,2022-11-07 22:59:20+00:00,ClimateResolve,#Prop30 is pivotal to help our state combat da...,5,6
2,2022-11-07 22:56:18+00:00,trader_mtg,@ecommerceshares Wait until Prop 30 gets shot ...,1,0
3,2022-11-07 22:47:35+00:00,LByock,@dhere I voted no on 27 (which is in conflict ...,1,1
4,2022-11-07 22:39:45+00:00,SFBayPSR,"VOTE YES on Prop 30! If we don’t act, Californ...",10,7
...,...,...,...,...,...
2607,2022-07-02 01:01:34+00:00,alfred_twu,Prop 30: Electric Vehicles &amp; Wildfire Prev...,23,3
2608,2022-07-01 22:57:49+00:00,wickedmitch_,Tentative votes:\nProp 1: yes\nProp 26: yes\nP...,0,0
2609,2022-07-01 21:49:09+00:00,davidaguilar92,California's Secretary of State just released ...,0,0
2610,2022-07-01 20:22:45+00:00,RL_Miller,"Prop 1: OH HELL YES\nProp 26: Maybe, but not w...",1,0


### Data Preprocessing Function

In [26]:
def preprocess(text: str) -> str:
    '''
    Performs basic preprocessing on text to meet specifications of roBERTa model.
    References to other users ('@johnsmith123') will be replaced with '@user'.
    Any links ('https://www.google.com/') will be replaced with 'http'.

    Parameters:
        text: raw tweet text.

    Returns:
        The text with every space-separated element that is a mention or a link
        replaced by its placeholder; all other elements are left untouched.
    '''
    elements = [] # processed tweet elements, in their original order
    for element in text.split(' '): # split on single spaces only, so the join below restores the spacing exactly
        if element.startswith('@') and len(element) > 1: # a lone '@' is not a mention
            element = '@user'
        elif element.startswith('http'):
            # placeholder must be 'http' (it was misspelled 'hhtp', contradicting the docstring)
            element = 'http'
        elements.append(element)
    return ' '.join(elements)

### Load in NLTK's [VADER](https://www.nltk.org/_modules/nltk/sentiment/vader.html) sentiment analyzer

In [24]:
# Instantiate NLTK's VADER analyzer once so that every vader() call reuses the same object.
# NOTE(review): presumably requires the 'vader_lexicon' NLTK resource to be available locally — confirm.
nltk_sia = SentimentIntensityAnalyzer() # load in VADER model

### Load in [RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta) model from [Hugging Face](https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest?text=Covid+cases+are+increasing+fast%21)

In [40]:
# Hugging Face Hub id of the pretrained Twitter sentiment checkpoint
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
# Fetch the tokenizer and the sequence-classification model that belong to that checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Perform sentiment analysis with VADER and RoBERTa

In [41]:
def roberta(text: str):
    '''
    Scores a tweet with the RoBERTa sentiment model.
    The text is preprocessed, tokenized and passed through the model; the raw model
    scores are then converted into probabilities with softmax.

    Parameters:
        text: raw tweet text.

    Returns:
        A dictionary holding the probabilities under 'Negative', 'Neutral' and 'Positive',
        plus 'Method': 'RoBERTa'.
    '''
    text = preprocess(text) # replace mentions and links with the placeholders
    # tokenize into PyTorch tensors; truncate over-long inputs instead of letting the model fail on them
    # NOTE(review): 512 is the usual input limit for RoBERTa-base checkpoints — confirm against the model config
    encoded_input = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    with torch.no_grad(): # inference only, so skip building the autograd graph
        output = model(**encoded_input)
    scores = output[0][0].detach().numpy() # raw scores of the single input as a numpy array
    scores = softmax(scores) # convert raw scores into probabilities
    # NOTE(review): assumes the model's outputs are ordered negative, neutral, positive — confirm against the model config
    key_names = ['Negative', 'Neutral', 'Positive']
    scores_dict = dict(zip(key_names, scores)) # pair each label with its probability

    scores_dict['Method'] = 'RoBERTa' # record which model produced the scores
    return scores_dict

In [32]:
def vader(text: str):
    '''
    Scores a tweet with NLTK's VADER analyzer.

    Parameters:
        text: raw tweet text (it is run through preprocess() first).

    Returns:
        A dictionary with the keys 'Negative', 'Neutral', 'Positive', 'Method' ('VADER')
        and 'Compound' (VADER's compound score).
    '''
    raw_scores = nltk_sia.polarity_scores(preprocess(text))
    compound = raw_scores.pop('compound') # set aside so only the three class scores remain

    # labels are paired with the remaining scores by position
    # NOTE(review): assumes polarity_scores yields negative, neutral, positive in that order — confirm
    key_names = ['Negative', 'Neutral', 'Positive']
    scores_dict = {label: value for label, value in zip(key_names, raw_scores.values())}
    scores_dict.update({'Method': 'VADER', 'Compound': compound})

    return scores_dict

In [53]:
def get_sentiment_vader(scores):
    '''
    Turns a VADER compound score into a sentiment label.
    The 'Compound' entry is removed from the given dictionary as a side effect.

    Returns 'Positive' for a compound score of at least 0.05, 'Negative' for a score
    of at most -0.05 and 'Neutral' for anything in between.
    '''
    compound_score = scores.pop('Compound') # deliberately mutates the caller's dict

    if compound_score >= 0.05:
        return 'Positive'
    if compound_score <= -0.05:
        return 'Negative'
    return 'Neutral'

In [43]:
def get_sentiment_roberta(scores):
    '''
    Returns the label whose probability is the largest.
    The 'Method' entry is taken out while searching and put back afterwards,
    so it ends up as the last key of the given dictionary.
    '''
    model_name = scores.pop('Method') # keep the non-numeric entry out of the comparison
    best_label, _ = max(scores.items(), key=lambda item: item[1])
    scores['Method'] = model_name # restore the model name
    return best_label

In [54]:
def sentiment_analysis(text:str, method:str):
    '''
    First checks if tweet contains various phrases that result in a trivial sentiment determination.
    Otherwise, performs sentiment analysis with the requested model ('RoBERTa' or 'VADER').
    Returns a dictionary containing the score of each sentiment, the resulting sentiment and the method name.
    For our analysis, Yes = 'Positive' and No = 'Negative'.

    Parameters:
        text: raw tweet text.
        method: 'RoBERTa' or 'VADER'; also used to label phrase-based results.

    Raises:
        ValueError: if method is not one of the supported models.
    '''
    # fail fast: previously an unknown method returned an error string (and only when no
    # phrase matched), which would end up as a row of the results instead of stopping the run
    if method not in ('RoBERTa', 'VADER'):
        raise ValueError(f'unknown model: {method}')

    positive_regex = 'yeson.{0,5}(prop)?.?30|prop.?30.?yes' # regex expression to check for positive sentiment
    negative_regex = 'noon.{0,5}(prop)?.?30|prop.?30.?no' # regex expression to check for negative sentiment
    # NOTE(review): with spaces removed these patterns have no word boundaries, e.g. 'prop 30 not ...'
    # also matches negative_regex — confirm this is acceptable for the analysis

    lower_text = text.lower().replace(' ','') # lowercase text and remove spaces for regex expressions
    # contains positive phrases
    if re.search(positive_regex, lower_text):
        return {'Negative': 0, 'Neutral': 0, 'Positive': 1, 'Sentiment': 'Positive', 'Method': method}
    # contains negative phrases
    if re.search(negative_regex, lower_text):
        return {'Negative': 1, 'Neutral': 0, 'Positive': 0, 'Sentiment': 'Negative', 'Method': method}

    if method == 'RoBERTa':
        # use RoBERTa model
        sentiment_scores = roberta(text)
        sentiment_scores['Sentiment'] = get_sentiment_roberta(sentiment_scores) # label of the highest probability
    else:
        # use VADER model
        sentiment_scores = vader(text)
        sentiment_scores['Sentiment'] = get_sentiment_vader(sentiment_scores) # label derived from the compound score

    return sentiment_scores

#### Run sentiment analysis on all tweets

In [55]:
# Score every tweet once with each model, keeping the two result sets separate
results_r = [] # RoBERTa results, one dict per tweet
results_v = [] # VADER results, one dict per tweet
for content in tweets_df['Content']:
    results_r.append(sentiment_analysis(content, 'RoBERTa'))
    results_v.append(sentiment_analysis(content, 'VADER'))

# one dataframe per model, then RoBERTa rows stacked on top of the VADER rows
sentiment_r_df = pd.DataFrame(results_r)
sentiment_v_df = pd.DataFrame(results_v)
sentiment_combined_df = pd.concat([sentiment_r_df, sentiment_v_df])

In [56]:
# display the stacked RoBERTa + VADER results
sentiment_combined_df

Unnamed: 0,Negative,Neutral,Positive,Method,Sentiment
0,0.699021,0.268417,0.032562,RoBERTa,Negative
1,0.000000,0.000000,1.000000,RoBERTa,Positive
2,0.767167,0.220800,0.012033,RoBERTa,Negative
3,0.000000,0.000000,1.000000,RoBERTa,Positive
4,0.000000,0.000000,1.000000,RoBERTa,Positive
...,...,...,...,...,...
2607,0.057000,0.943000,0.000000,VADER,Negative
2608,0.000000,0.000000,1.000000,VADER,Positive
2609,0.000000,1.000000,0.000000,VADER,Neutral
2610,0.131000,0.668000,0.201000,VADER,Positive


#### Merge tweets_df and sentiment_combined_df

In [57]:
# Stack the tweet table on top of itself so there is one copy of every tweet per sentiment method
tweets_df_double = pd.concat([tweets_df] * 2)

In [58]:
# Attach the sentiment columns to the duplicated tweet rows, side by side (axis=1).
# NOTE(review): both frames are built by stacking two halves, so each index value occurs twice;
# this column-wise concat appears to rely on the two indexes being identical — confirm before
# changing how either frame is built.
sa_df = pd.concat([tweets_df_double, sentiment_combined_df], axis=1)
sa_df

Unnamed: 0,Date,User,Content,Likes,Retweets,Negative,Neutral,Positive,Method,Sentiment
0,2022-11-07 23:01:03+00:00,YousefBaig,"Newsom claimed Prop. 30 ""puts corporate welfar...",1,4,0.699021,0.268417,0.032562,RoBERTa,Negative
1,2022-11-07 22:59:20+00:00,ClimateResolve,#Prop30 is pivotal to help our state combat da...,5,6,0.000000,0.000000,1.000000,RoBERTa,Positive
2,2022-11-07 22:56:18+00:00,trader_mtg,@ecommerceshares Wait until Prop 30 gets shot ...,1,0,0.767167,0.220800,0.012033,RoBERTa,Negative
3,2022-11-07 22:47:35+00:00,LByock,@dhere I voted no on 27 (which is in conflict ...,1,1,0.000000,0.000000,1.000000,RoBERTa,Positive
4,2022-11-07 22:39:45+00:00,SFBayPSR,"VOTE YES on Prop 30! If we don’t act, Californ...",10,7,0.000000,0.000000,1.000000,RoBERTa,Positive
...,...,...,...,...,...,...,...,...,...,...
2607,2022-07-02 01:01:34+00:00,alfred_twu,Prop 30: Electric Vehicles &amp; Wildfire Prev...,23,3,0.057000,0.943000,0.000000,VADER,Negative
2608,2022-07-01 22:57:49+00:00,wickedmitch_,Tentative votes:\nProp 1: yes\nProp 26: yes\nP...,0,0,0.000000,0.000000,1.000000,VADER,Positive
2609,2022-07-01 21:49:09+00:00,davidaguilar92,California's Secretary of State just released ...,0,0,0.000000,1.000000,0.000000,VADER,Neutral
2610,2022-07-01 20:22:45+00:00,RL_Miller,"Prop 1: OH HELL YES\nProp 26: Maybe, but not w...",1,0,0.131000,0.668000,0.201000,VADER,Positive


##### Save data to CSV

In [59]:
# Write the combined tweet + sentiment table to a CSV file (relative path, so it lands in the working directory)
sa_df.to_csv('STA141B_Project_Sentiment_Analysis_Data.csv')