### Imports

In [4]:
# general
import pandas as pd

# plotting
import plotly.express as px
import numpy as np

# stats
from statsmodels.stats.proportion import proportions_ztest

### Load in data

In [5]:
sa_df = pd.read_csv('STA141B_Project_Sentiment_Analysis_Data.csv')

### Plots

In [6]:
counts_df = sa_df.groupby('Method')['Sentiment'].value_counts().to_frame()
counts_df.rename(columns = {'Sentiment': 'Count'}, inplace=True)
counts_df.reset_index(inplace=True)
total_counts = counts_df.groupby('Method')['Count'].sum().to_list()
total_counts = np.repeat(total_counts, 3)
counts_df['Percentage'] = counts_df['Count'] / total_counts

In [7]:
counts_df

Unnamed: 0,Method,Sentiment,Count,Percentage
0,RoBERTa,Neutral,1089,0.415966
1,RoBERTa,Negative,891,0.340336
2,RoBERTa,Positive,638,0.243697
3,VADER,Neutral,1936,0.739496
4,VADER,Positive,436,0.166539
5,VADER,Negative,246,0.093965


In [8]:
fig_all = px.bar(data_frame=counts_df, 
                 x = 'Sentiment', 
                 y = 'Count', 
                 color = 'Method', 
                 barmode = 'group', 
                 title = 'Count of Prop. 30 Sentiments on Twitter', 
                 text = 'Count')
fig_all.update_traces(textposition = 'inside', textfont_color = 'white')
fig_all.update_layout(xaxis = {'categoryorder': 'array', 'categoryarray': ['Negative', 'Neutral', 'Positive']},
                      font_size = 14)
fig_all.show()

In [9]:
def format_percent(val: float) -> str:
    trunc_val = round(val, 4) * 100
    format_val = f'{trunc_val:.4}%'
    return format_val

In [10]:
counts_no_neutral_df = counts_df[counts_df['Sentiment'] != 'Neutral']
counts_no_neutral_df = counts_no_neutral_df.drop('Percentage', axis=1)
total_counts_no_neutral = counts_no_neutral_df.groupby('Method')['Count'].sum().to_list()
total_counts_no_neutral = np.repeat(total_counts_no_neutral, 2)
counts_no_neutral_df['Percentage'] = counts_no_neutral_df['Count'] / total_counts_no_neutral
counts_no_neutral_df['Str Percentage'] = counts_no_neutral_df['Percentage'].apply(format_percent)
counts_no_neutral_df

Unnamed: 0,Method,Sentiment,Count,Percentage,Str Percentage
1,RoBERTa,Negative,891,0.582734,58.27%
2,RoBERTa,Positive,638,0.417266,41.73%
4,VADER,Positive,436,0.639296,63.93%
5,VADER,Negative,246,0.360704,36.07%


In [11]:
fig_no_neutral = px.bar(data_frame=counts_no_neutral_df, 
                 x = 'Sentiment', 
                 y = 'Count', 
                 color = 'Method', 
                 barmode = 'group', 
                 title = 'Count of Prop. 30 Sentiments on Twitter (excluding Neutral)', 
                 text = 'Count')
fig_no_neutral.update_traces(textposition = 'inside', textfont_color = 'white')
fig_no_neutral.update_layout(xaxis = {'categoryorder': 'array', 'categoryarray': ['Negative', 'Positive']},
                      font_size = 14)
fig_no_neutral.show()

In [12]:
fig_perc_no_neutral = px.bar(data_frame=counts_no_neutral_df, 
                 x = 'Sentiment', 
                 y = 'Percentage', 
                 color = 'Method', 
                 barmode = 'group', 
                 title = 'Percentage of Prop. 30 Sentiments on Twitter (excluding Neutral)', 
                 text = 'Str Percentage')
fig_perc_no_neutral.update_traces(textposition = 'inside', textfont_color = 'white')
fig_perc_no_neutral.update_layout(xaxis = {'categoryorder': 'array', 'categoryarray': ['Negative', 'Positive']},
                      font_size = 14)
fig_perc_no_neutral.show()

In [13]:
counts_no_neutral_df

Unnamed: 0,Method,Sentiment,Count,Percentage,Str Percentage
1,RoBERTa,Negative,891,0.582734,58.27%
2,RoBERTa,Positive,638,0.417266,41.73%
4,VADER,Positive,436,0.639296,63.93%
5,VADER,Negative,246,0.360704,36.07%


### Poll counts and sample sizes

In [37]:
poll_negative_count = 6_152_636
poll_positive_count = 4_517_942
poll_sample = poll_negative_count + poll_positive_count

In [38]:
counts_poll_df = pd.DataFrame({'Method': ['Poll', 'Poll'], 'Sentiment': ['Negative', 'Positive'], 'Count': [poll_negative_count, poll_positive_count]})
counts_poll_df['Percentage'] = counts_poll_df['Count'] / poll_sample
counts_poll_df['Str Percentage'] = counts_poll_df['Percentage'].apply(format_percent)
counts_poll_df

Unnamed: 0,Method,Sentiment,Count,Percentage,Str Percentage
0,Poll,Negative,6152636,0.576598,57.66%
1,Poll,Positive,4517942,0.423402,42.34%


In [39]:
counts_combined_df = pd.concat([counts_no_neutral_df, counts_poll_df])
counts_combined_df

Unnamed: 0,Method,Sentiment,Count,Percentage,Str Percentage
1,RoBERTa,Negative,891,0.582734,58.27%
2,RoBERTa,Positive,638,0.417266,41.73%
4,VADER,Positive,436,0.639296,63.93%
5,VADER,Negative,246,0.360704,36.07%
0,Poll,Negative,6152636,0.576598,57.66%
1,Poll,Positive,4517942,0.423402,42.34%


In [40]:
fig_combined = px.bar(data_frame=counts_combined_df, 
                 x = 'Sentiment', 
                 y = 'Percentage', 
                 color = 'Method', 
                 barmode = 'group', 
                 title = 'Percentage of Prop. 30 Sentiments on Twitter (excluding Neutral) compared with Poll results', 
                 text = 'Str Percentage')
fig_combined.update_traces(textposition = 'inside', textfont_color = 'white')
fig_combined.update_layout(xaxis = {'categoryorder': 'array', 'categoryarray': ['Negative', 'Positive']},
                      font_size = 14)
fig_combined.show()

### Hypothesis testing

We wil be comparing the proportion of positive/negative _sentiments_ ($P_S$) and the proportion of Yes/No votes on the _real_ ballot ($P_R$) by performing a **two-proportion z-test**.

Our null hypothesis is that the two proportion have a difference of $0$: $H_0:$ $P_S - P_R = 0$.

Our alternative hypothesis is that the two proportions are different: $H_A:$ $P_S - P_R \ne 0$.

We will be using a significance level of $\alpha = 0.05$.

In [18]:
alpha = 0.05

In [42]:
r_negative_count, r_positive_count = counts_no_neutral_df[(counts_no_neutral_df['Method'] == 'RoBERTa')]['Count'].to_list()
v_positive_count, v_negative_count = counts_no_neutral_df[(counts_no_neutral_df['Method'] == 'VADER')]['Count'].to_list()
r_sample = r_negative_count + r_positive_count
v_sample = v_negative_count + v_positive_count

**RoBERTa**

In [43]:
# comparing Negative and No
z_stat_r_neg, p_value_r_neg = proportions_ztest(count = [r_negative_count, poll_negative_count], nobs = [r_sample, poll_sample], value = 0.0, alternative= 'two-sided')
p_value_r_neg

0.6272987780888049

In [44]:
# comparing Positive and Yes
z_stat_r_pos, p_value_r_pos = proportions_ztest(count = [r_positive_count, poll_positive_count], nobs = [r_sample, poll_sample], value = 0.0, alternative= 'two-sided')
p_value_r_pos

0.6272987780888112

Based these two p-values, we reject the null hypothesis for both of these tests. Thus, there is sufficient evidence to conclude that the proportion of positive and negative sentiments on Twitter using the RoBERTa model is equal to the proportions of Yes and No votes on the actual ballot respectively. 

**VADER**

In [45]:
# comparing Negative and No
z_stat_v_neg, p_value_v_neg = proportions_ztest(count = [v_negative_count, poll_negative_count], nobs = [v_sample, poll_sample], value = 0.0, alternative= 'two-sided')
p_value_v_neg

3.70581522961325e-30

In [46]:
# comparing Positive and Yes
z_stat_v_pos, p_value_v_pos = proportions_ztest(count = [v_positive_count, poll_positive_count], nobs = [v_sample, poll_sample], value = 0.0, alternative= 'two-sided')
p_value_v_pos


3.7058152296131435e-30

Based these two p-values, we fail to reject the null hypothesis for both of these tests. Thus, there is not sufficient evidence to conclude that the proportion of positive and negative sentiments on Twitter using the RoBERTa model is equal to the proportions of Yes and No votes on the actual ballot respectively. 