### Imports

In [41]:
# general
import pandas as pd
import numpy as np
import re
import plotnine as p9
import plotly.express as px

# twitter scraping
import snscrape.modules.twitter as sntwitter

# roBERTa
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
from scipy.special import softmax
import torch

# nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# stats
from statsmodels.stats.proportion import proportions_ztest

### Twitter scraping

In [2]:
query = '"prop 30"  until:2022-11-08 since:2022-07-01'
tweets = []
limit = 5000

In [3]:
for tweet in sntwitter.TwitterSearchScraper(query).get_items():
    if len(tweets) == limit:
        break
    tweet_data = [tweet.date, tweet.user.username, tweet.content, tweet.likeCount, tweet.retweetCount]
    tweets.append(tweet_data)

In [4]:
tweets_df = pd.DataFrame(tweets, columns = ['Date', 'User', 'Content', 'Likes', 'Retweets'])

In [5]:
tweets_df

Unnamed: 0,Date,User,Content,Likes,Retweets
0,2022-11-07 23:01:03+00:00,YousefBaig,"Newsom claimed Prop. 30 ""puts corporate welfar...",1,4
1,2022-11-07 22:59:20+00:00,ClimateResolve,#Prop30 is pivotal to help our state combat da...,5,6
2,2022-11-07 22:56:18+00:00,trader_mtg,@ecommerceshares Wait until Prop 30 gets shot ...,1,0
3,2022-11-07 22:47:35+00:00,LByock,@dhere I voted no on 27 (which is in conflict ...,1,1
4,2022-11-07 22:39:45+00:00,SFBayPSR,"VOTE YES on Prop 30! If we don’t act, Californ...",10,7
...,...,...,...,...,...
2674,2022-07-02 01:01:34+00:00,alfred_twu,Prop 30: Electric Vehicles &amp; Wildfire Prev...,23,3
2675,2022-07-01 22:57:49+00:00,wickedmitch_,Tentative votes:\nProp 1: yes\nProp 26: yes\nP...,0,0
2676,2022-07-01 21:49:09+00:00,davidaguilar92,California's Secretary of State just released ...,0,0
2677,2022-07-01 20:22:45+00:00,RL_Miller,"Prop 1: OH HELL YES\nProp 26: Maybe, but not w...",1,0


### Data Preprocessing Function

In [6]:
def preprocess(text: str) -> str:
    '''
    Performs basic preprocessing on text to meet specifications of roBERTa model.
    References to other users ('@user123') will be replaced with '@user'.
    Any links ('https://www.google.com/') will be replaced with 'http'
    '''
    elements = []
    for element in text.split(' '):
        if element.startswith('@') and len(element) > 1:
            element = '@user'
        elif element.startswith('http'):
            element = 'hhtp'
        elements.append(element)
    return ' '.join(elements)

### Load in NLTK's [VADER](https://www.nltk.org/_modules/nltk/sentiment/vader.html) sentiment analyzer

In [13]:
nltk_sia = SentimentIntensityAnalyzer()

### Load in [RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta) model from [Hugging Face](https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest?text=Covid+cases+are+increasing+fast%21)

In [14]:
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Perform sentiment analysis with VADER and RoBERTa

In [43]:
def roberta(text: str):
    text = preprocess(text)
    encoded_input = tokenizer(text, return_tensors='pt')
    output = model(**encoded_input)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    labels = config.id2label.values()
    scores_dict = dict(zip(labels, scores))
    
    scores_dict['Method'] = 'RoBERTa'
    return scores_dict

In [44]:
def vader(text: str):
    text = preprocess(text)
    scores = nltk_sia.polarity_scores(text)
    scores.pop('compound')
    scores = list(scores.values())

    key_names = ['Negative', 'Neutral', 'Positive']
    scores_dict = dict(zip(key_names, scores))
    scores_dict['Method'] = 'VADER'
    return scores_dict

In [45]:
def get_sentiment(scores):
    method = scores.pop('Method')
    sentiment = max(scores, key = scores.get)
    scores['Method'] = method
    return sentiment

In [50]:
def sentiment_analysis(text:str, method:str):
    '''
    First checks if tweet contains various phrases that result in a trivial sentiment determination.
    Otherwise, performs roBERTa sentiment analysis.
    Returns a dictionary containing the probabilities of each sentiment and the sentiment with the greatest probability.
    For our analysis, Yes = 'Positive' and No = 'Negative'.
    '''
    
    positive_regex = 'yeson.{0,5}(prop)?.?30|prop.?30.?yes'
    negative_regex = 'noon.{0,5}(prop)?.?30|prop.?30.?no'

    lower_text = text.lower()
    # contains positive phrases
    if bool(re.search(positive_regex, lower_text.replace(' ',''))):
        sentiment_scores = {'Negative': 0, 'Neutral': 0, 'Positive': 1, 'Sentiment': 'Positive', 'Method': method}
    # contains negative phrases
    elif bool(re.search(negative_regex, lower_text.lower().replace(' ',''))):
        sentiment_scores = {'Negative': 1, 'Neutral': 0, 'Positive': 0, 'Sentiment': 'Negative', 'Method': method}
    else:
        if method == 'RoBERTa':
            sentiment_scores = roberta(text)

        elif method == 'VADER':
            sentiment_scores = vader(text)
        else:
            return f'unknown model: {method}'
          
        sentiment = get_sentiment(sentiment_scores)
        sentiment_scores['Sentiment'] = sentiment
    
    return sentiment_scores

#### Run sentiment analysis on all tweets

In [51]:
results_r = []
results_v = []
for index, row in tweets_df.iterrows():
    sentiment_scores_roberta = sentiment_analysis(row['Content'], 'RoBERTa')
    sentiment_scores_vader = sentiment_analysis(row['Content'], 'VADER')
    results_r.append(sentiment_scores_roberta)
    results_v.append(sentiment_scores_vader)
sentiment_r_df = pd.DataFrame(results_r)
sentiment_v_df = pd.DataFrame(results_v)
sentiment_combined_df = pd.concat([sentiment_r_df, sentiment_v_df])

In [52]:
sentiment_r_df

Unnamed: 0,Negative,Neutral,Positive,Method,Sentiment
0,0.699021,0.268417,0.032562,RoBERTa,Negative
1,0.000000,0.000000,1.000000,RoBERTa,Positive
2,0.767167,0.220800,0.012033,RoBERTa,Negative
3,0.000000,0.000000,1.000000,RoBERTa,Positive
4,0.000000,0.000000,1.000000,RoBERTa,Positive
...,...,...,...,...,...
2674,0.057230,0.804284,0.138487,RoBERTa,Neutral
2675,0.000000,0.000000,1.000000,RoBERTa,Positive
2676,0.120726,0.858503,0.020771,RoBERTa,Neutral
2677,0.143333,0.739530,0.117137,RoBERTa,Neutral


#### Merge tweets_df and sentiment_df

In [53]:
sentiment_combined_df

Unnamed: 0,Negative,Neutral,Positive,Method,Sentiment
0,0.699021,0.268417,0.032562,RoBERTa,Negative
1,0.000000,0.000000,1.000000,RoBERTa,Positive
2,0.767167,0.220800,0.012033,RoBERTa,Negative
3,0.000000,0.000000,1.000000,RoBERTa,Positive
4,0.000000,0.000000,1.000000,RoBERTa,Positive
...,...,...,...,...,...
2674,0.057000,0.943000,0.000000,VADER,Neutral
2675,0.000000,0.000000,1.000000,VADER,Positive
2676,0.000000,1.000000,0.000000,VADER,Neutral
2677,0.131000,0.668000,0.201000,VADER,Neutral


In [54]:
tweets_df_double = pd.concat([tweets_df, tweets_df])

In [55]:
sa_df = pd.concat([tweets_df_double, sentiment_combined_df], axis=1)
sa_df

Unnamed: 0,Date,User,Content,Likes,Retweets,Negative,Neutral,Positive,Method,Sentiment
0,2022-11-07 23:01:03+00:00,YousefBaig,"Newsom claimed Prop. 30 ""puts corporate welfar...",1,4,0.699021,0.268417,0.032562,RoBERTa,Negative
1,2022-11-07 22:59:20+00:00,ClimateResolve,#Prop30 is pivotal to help our state combat da...,5,6,0.000000,0.000000,1.000000,RoBERTa,Positive
2,2022-11-07 22:56:18+00:00,trader_mtg,@ecommerceshares Wait until Prop 30 gets shot ...,1,0,0.767167,0.220800,0.012033,RoBERTa,Negative
3,2022-11-07 22:47:35+00:00,LByock,@dhere I voted no on 27 (which is in conflict ...,1,1,0.000000,0.000000,1.000000,RoBERTa,Positive
4,2022-11-07 22:39:45+00:00,SFBayPSR,"VOTE YES on Prop 30! If we don’t act, Californ...",10,7,0.000000,0.000000,1.000000,RoBERTa,Positive
...,...,...,...,...,...,...,...,...,...,...
2674,2022-07-02 01:01:34+00:00,alfred_twu,Prop 30: Electric Vehicles &amp; Wildfire Prev...,23,3,0.057000,0.943000,0.000000,VADER,Neutral
2675,2022-07-01 22:57:49+00:00,wickedmitch_,Tentative votes:\nProp 1: yes\nProp 26: yes\nP...,0,0,0.000000,0.000000,1.000000,VADER,Positive
2676,2022-07-01 21:49:09+00:00,davidaguilar92,California's Secretary of State just released ...,0,0,0.000000,1.000000,0.000000,VADER,Neutral
2677,2022-07-01 20:22:45+00:00,RL_Miller,"Prop 1: OH HELL YES\nProp 26: Maybe, but not w...",1,0,0.131000,0.668000,0.201000,VADER,Neutral


### Plots

In [69]:
counts_df = sa_df.groupby('Method')['Sentiment'].value_counts().to_frame()
counts_df.rename(columns = {'Sentiment': 'Count'}, inplace=True)
counts_df.reset_index(inplace=True)
total_counts = counts_df.groupby('Method')['Count'].sum().to_list()
total_counts = np.repeat(total_counts, 3)
counts_df['Percentage'] = counts_df['Count'] / total_counts

In [70]:
counts_df

Unnamed: 0,Method,Sentiment,Count,Percentage
0,RoBERTa,Neutral,1106,0.412841
1,RoBERTa,Negative,933,0.348264
2,RoBERTa,Positive,640,0.238895
3,VADER,Neutral,1970,0.735349
4,VADER,Positive,436,0.162747
5,VADER,Negative,273,0.101904


In [85]:
fig_all = px.bar(data_frame=counts_df, 
                 x = 'Sentiment', 
                 y = 'Count', 
                 color = 'Method', 
                 barmode = 'group', 
                 title = 'Count of Prop. 30 Sentiments on Twitter', 
                 text = 'Count')
fig_all.update_traces(textposition = 'inside', textfont_color = 'white')
fig_all.update_layout(xaxis = {'categoryorder': 'array', 'categoryarray': ['Negative', 'Neutral', 'Positive']},
                      font_size = 14)
fig_all.show()

In [133]:
def format_percent(val: float) -> str:
    trunc_val = round(val, 4) * 100
    format_val = f'{trunc_val:.4}%'
    return format_val

In [134]:
counts_no_neutral_df = counts_df[counts_df['Sentiment'] != 'Neutral']
counts_no_neutral_df = counts_no_neutral_df.drop('Percentage', axis=1)
total_counts_no_neutral = counts_no_neutral_df.groupby('Method')['Count'].sum().to_list()
total_counts_no_neutral = np.repeat(total_counts_no_neutral, 2)
counts_no_neutral_df['Percentage'] = counts_no_neutral_df['Count'] / total_counts_no_neutral
counts_no_neutral_df['Str Percentage'] = counts_no_neutral_df['Percentage'].apply(format_percent)
counts_no_neutral_df

Unnamed: 0,Method,Sentiment,Count,Percentage,Str Percentage
1,RoBERTa,Negative,933,0.593134,59.31%
2,RoBERTa,Positive,640,0.406866,40.69%
4,VADER,Positive,436,0.614951,61.5%
5,VADER,Negative,273,0.385049,38.5%


In [99]:
fig_no_neutral = px.bar(data_frame=counts_no_neutral_df, 
                 x = 'Sentiment', 
                 y = 'Count', 
                 color = 'Method', 
                 barmode = 'group', 
                 title = 'Count of Prop. 30 Sentiments on Twitter (excluding Neutral)', 
                 text = 'Count')
fig_no_neutral.update_traces(textposition = 'inside', textfont_color = 'white')
fig_no_neutral.update_layout(xaxis = {'categoryorder': 'array', 'categoryarray': ['Negative', 'Positive']},
                      font_size = 14)
fig_no_neutral.show()

In [132]:
fig_perc_no_neutral = px.bar(data_frame=counts_no_neutral_df, 
                 x = 'Sentiment', 
                 y = 'Percentage', 
                 color = 'Method', 
                 barmode = 'group', 
                 title = 'Percentage of Prop. 30 Sentiments on Twitter (excluding Neutral)', 
                 text = 'Str Percentage')
fig_perc_no_neutral.update_traces(textposition = 'inside', textfont_color = 'white')
fig_perc_no_neutral.update_layout(xaxis = {'categoryorder': 'array', 'categoryarray': ['Negative', 'Positive']},
                      font_size = 14)
fig_perc_no_neutral.show()

In [138]:
counts_no_neutral_df

Unnamed: 0,Method,Sentiment,Count,Percentage,Str Percentage
1,RoBERTa,Negative,933,0.593134,59.31%
2,RoBERTa,Positive,640,0.406866,40.69%
4,VADER,Positive,436,0.614951,61.5%
5,VADER,Negative,273,0.385049,38.5%


##### Poll counts and sample size

In [36]:
poll_negative_count = 5_953_218
poll_positive_count = 4_365_003
poll_sample = poll_negative_count + poll_positive_count

In [141]:
counts_poll_df = pd.DataFrame({'Method': ['Poll', 'Poll'], 'Sentiment': ['Negative', 'Positive'], 'Count': [poll_negative_count, poll_positive_count]})
counts_poll_df['Percentage'] = counts_poll_df['Count'] / poll_sample
counts_poll_df['Str Percentage'] = counts_poll_df['Percentage'].apply(format_percent)
counts_poll_df

Unnamed: 0,Method,Sentiment,Count,Percentage,Str Percentage
0,Poll,Negative,5953218,0.576962,57.7%
1,Poll,Positive,4365003,0.423038,42.3%


In [146]:
counts_combined_df = pd.concat([counts_no_neutral_df, counts_poll_df])
counts_combined_df

Unnamed: 0,Method,Sentiment,Count,Percentage,Str Percentage
1,RoBERTa,Negative,933,0.593134,59.31%
2,RoBERTa,Positive,640,0.406866,40.69%
4,VADER,Positive,436,0.614951,61.5%
5,VADER,Negative,273,0.385049,38.5%
0,Poll,Negative,5953218,0.576962,57.7%
1,Poll,Positive,4365003,0.423038,42.3%


In [147]:
fig_combined = px.bar(data_frame=counts_combined_df, 
                 x = 'Sentiment', 
                 y = 'Percentage', 
                 color = 'Method', 
                 barmode = 'group', 
                 title = 'Percentage of Prop. 30 Sentiments on Twitter (excluding Neutral) compared with Poll results', 
                 text = 'Str Percentage')
fig_combined.update_traces(textposition = 'inside', textfont_color = 'white')
fig_combined.update_layout(xaxis = {'categoryorder': 'array', 'categoryarray': ['Negative', 'Positive']},
                      font_size = 14)
fig_combined.show()

### Hypothesis testing

We wil be comparing the proportion of positive/negative _sentiments_ ($P_S$) and the proportion of Yes/No votes on the _real_ ballot ($P_R$) by performing a **two-proportion z-test**.

Our null hypothesis is that the two proportion have a difference of $0$: $H_0:$ $P_S - P_R = 0$.

Our alternative hypothesis is that the two proportions are different: $H_A:$ $P_S - P_R \ne 0$.

We will be using a significance level of $\alpha = 0.05$.

In [34]:
alpha = 0.05

In [32]:
def HTest_Response(pvalue, alpha):
    if pvalue <= alpha:
        return f'{pvalue} is less than or equal to {alpha}, therefore reject the null hypothesis.'
    else:
        return f'{pvalue} is greater than {alpha}, therefore fail to reject the null hypothesis.'

##### Twitter counts and sample sizes

In [35]:
r_negative_count, r_positive_count = counts_df[(counts_df['Method'] == 'R')]['Count'].to_list()
v_negative_count, v_positive_count = counts_df[(counts_df['Method'] == 'V')]['Count'].to_list()
r_sample = r_negative_count + r_positive_count
v_sample = v_negative_count + v_positive_count

**RoBERTa**

In [37]:
# comparing Negative and No
z_stat_r_neg, p_value_r_neg = proportions_ztest(count = [r_negative_count, poll_negative_count], nobs = [r_sample, poll_sample], value = 0.0, alternative= 'two-sided')
HTest_Response(p_value_r_neg, alpha)

'0.19421551764494482 is greater than 0.05, therefore fail to reject the null hypothesis.'

In [38]:
# comparing Positive and Yes
z_stat_r_pos, p_value_r_pos = proportions_ztest(count = [r_positive_count, poll_positive_count], nobs = [r_sample, poll_sample], value = 0.0, alternative= 'two-sided')
HTest_Response(p_value_r_pos, alpha)

'0.19421551764494338 is greater than 0.05, therefore fail to reject the null hypothesis.'

Based these two p-values, we reject the null hypothesis for both of these tests. Thus, there is sufficient evidence to conclude that the proportion of positive and negative sentiments on Twitter using the RoBERTa model is equal to the proportions of Yes and No votes on the actual ballot respectively. 

**VADER**

In [39]:
# comparing Negative and No
z_stat_v_neg, p_value_v_neg = proportions_ztest(count = [v_negative_count, poll_negative_count], nobs = [v_sample, poll_sample], value = 0.0, alternative= 'two-sided')
HTest_Response(p_value_v_neg, alpha)

'0.04061882890303825 is less than or equal to 0.05, therefore reject the null hypothesis.'

In [40]:
# comparing Positive and Yes
z_stat_v_pos, p_value_v_pos = proportions_ztest(count = [v_positive_count, poll_positive_count], nobs = [v_sample, poll_sample], value = 0.0, alternative= 'two-sided')
HTest_Response(p_value_v_pos, alpha)

'0.04061882890303825 is less than or equal to 0.05, therefore reject the null hypothesis.'

Based these two p-values, we fail to reject the null hypothesis for both of these tests. Thus, there is not sufficient evidence to conclude that the proportion of positive and negative sentiments on Twitter using the RoBERTa model is equal to the proportions of Yes and No votes on the actual ballot respectively. 