### Imports

In [48]:
# general
import pandas as pd
import numpy as np

# plotting
import plotly.express as px
import plotly.graph_objects as go

# stats
from statsmodels.stats.proportion import proportions_ztest

### Load in data

In [70]:
sa_df = pd.read_csv('STA141B_Project_Sentiment_Analysis_Data.csv')

### Plots

In [71]:
counts_df = sa_df.groupby('Method')['Sentiment'].value_counts().to_frame() # get counts of each sentiment and turn into dataframe
counts_df.rename(columns = {'Sentiment': 'Count'}, inplace=True) # rename columns
counts_df.reset_index(inplace=True) # reset index
total_counts = counts_df.groupby('Method')['Count'].sum().to_list() # count total number of sentiment
total_counts = np.repeat(total_counts, 3) # repeat total counts to get correct dimension
counts_df['Percentage'] = counts_df['Count'] / total_counts # make new column containing percent of each sentiment

ValueError: operands could not be broadcast together with shapes (5,) (6,) 

In [52]:
# bar plot containing all 3 sentiments for the two models
fig_all = px.bar(data_frame=counts_df, 
                 x = 'Sentiment', 
                 y = 'Count', 
                 color = 'Method', 
                 barmode = 'group', 
                 title = 'Count of Prop. 30 Sentiments on Twitter', 
                 text = 'Count')
fig_all.update_traces(textposition = 'inside', textfont_color = 'white')
fig_all.update_layout(xaxis = {'categoryorder': 'array', 'categoryarray': ['Negative', 'Neutral', 'Positive']},
                      font_size = 14)
fig_all.show()

In [53]:
# function to format floats into percent format
def format_percent(val: float) -> str:
    '''
    Given a float value between 0 and 1, returns the value as a string in percent format with 2 decimal places.
    Ex: 0.2356 -> '23.56%'
    '''
    trunc_val = round(val, 4) * 100
    format_val = f'{trunc_val:.4}%'
    return format_val

In [54]:
counts_no_neutral_df = counts_df[counts_df['Sentiment'] != 'Neutral'] # remove rows with neutral sentiment
counts_no_neutral_df = counts_no_neutral_df.drop('Percentage', axis=1) # drop the percentage column. will be remade
total_counts_no_neutral = counts_no_neutral_df.groupby('Method')['Count'].sum().to_list() # get total count of sentiments for each model 
total_counts_no_neutral = np.repeat(total_counts_no_neutral, 2) # repeat total counts to get correct dimension
counts_no_neutral_df['Percentage'] = counts_no_neutral_df['Count'] / total_counts_no_neutral # make new column containing percent of each sentiment
counts_no_neutral_df['Str Percentage'] = counts_no_neutral_df['Percentage'].apply(format_percent) # convert floats into percent-formatted strings

In [55]:
# bar plot of positive and negative sentiments for each model
fig_no_neutral = px.bar(data_frame=counts_no_neutral_df, 
                 x = 'Sentiment', 
                 y = 'Count', 
                 color = 'Method', 
                 barmode = 'group', 
                 title = 'Count of Prop. 30 Sentiments on Twitter (excluding Neutral)', 
                 text = 'Count')
fig_no_neutral.update_traces(textposition = 'inside', textfont_color = 'white')
fig_no_neutral.update_layout(xaxis = {'categoryorder': 'array', 'categoryarray': ['Negative', 'Positive']},
                      font_size = 14)
fig_no_neutral.show()

In [56]:
# bar plot of percentages of positive and negative sentiments for each model
fig_perc_no_neutral = px.bar(data_frame=counts_no_neutral_df, 
                 x = 'Sentiment', 
                 y = 'Percentage', 
                 color = 'Method', 
                 barmode = 'group', 
                 title = 'Percentage of Prop. 30 Sentiments on Twitter (excluding Neutral)', 
                 text = 'Str Percentage')
fig_perc_no_neutral.update_traces(textposition = 'inside', textfont_color = 'white')
fig_perc_no_neutral.update_layout(xaxis = {'categoryorder': 'array', 'categoryarray': ['Negative', 'Positive']},
                      font_size = 14)
fig_perc_no_neutral.show()

### Poll counts and sample sizes

In [57]:
# assign variables for actual poll results and sample size
poll_negative_count = 6_161_978
poll_positive_count = 4_524_334
poll_sample = poll_negative_count + poll_positive_count

In [58]:
# create dataframe containig poll data
counts_poll_df = pd.DataFrame({'Method': ['Poll', 'Poll'], 'Sentiment': ['Negative', 'Positive'], 'Count': [poll_negative_count, poll_positive_count]})
counts_poll_df['Percentage'] = counts_poll_df['Count'] / poll_sample # percentage of each sentiment
counts_poll_df['Str Percentage'] = counts_poll_df['Percentage'].apply(format_percent) # string representation of percentages

In [59]:
# combina sentiment analysis data and poll data
counts_combined_df = pd.concat([counts_no_neutral_df, counts_poll_df])

In [60]:
# bar plot of percentages of positive and negative sentiment of each model and poll results
fig_combined = px.bar(data_frame=counts_combined_df, 
                 x = 'Sentiment', 
                 y = 'Percentage', 
                 color = 'Method', 
                 barmode = 'group', 
                 title = 'Percentage of Prop. 30 Sentiments on Twitter (excluding Neutral) compared with Poll results', 
                 text = 'Str Percentage')
fig_combined.update_traces(textposition = 'inside', textfont_color = 'white')
fig_combined.update_layout(xaxis = {'categoryorder': 'array', 'categoryarray': ['Negative', 'Positive']},
                      font_size = 14)
fig_combined.show()

### Hypothesis testing

We wil be comparing the proportion of positive/negative _sentiments_ ($P_S$) and the proportion of Yes/No votes on the _real_ ballot ($P_R$) by performing a **two-proportion z-test**.

Our null hypothesis is that the two proportion have a difference of $0$: $H_0:$ $P_S - P_R = 0$.

Our alternative hypothesis is that the two proportions are different: $H_A:$ $P_S - P_R \ne 0$.

We will be using a significance level of $\alpha = 0.05$.

In [61]:
# define significance value
alpha = 0.05

In [62]:
# get positive and negative counts for each model, as well as sample size
r_negative_count, r_positive_count = counts_no_neutral_df[(counts_no_neutral_df['Method'] == 'RoBERTa')]['Count'].to_list()
v_positive_count, v_negative_count = counts_no_neutral_df[(counts_no_neutral_df['Method'] == 'VADER')]['Count'].to_list()
r_sample = r_negative_count + r_positive_count
v_sample = v_negative_count + v_positive_count

**RoBERTa**

In [65]:
# comparing Negative and No
z_stat_r_neg, p_value_r_neg = proportions_ztest(count = [r_negative_count, poll_negative_count], nobs = [r_sample, poll_sample], value = 0.0, alternative= 'two-sided')

In [64]:
# comparing Positive and Yes
z_stat_r_pos, p_value_r_pos = proportions_ztest(count = [r_positive_count, poll_positive_count], nobs = [r_sample, poll_sample], value = 0.0, alternative= 'two-sided')

Based on these two p-values, we fail to reject the null hypothesis for both of these tests. Thus, there is not sufficient evidence to conclude that the proportion of positive and negative sentiments on Twitter using the RoBERTa model is different than the proportions of Yes and No votes on the actual ballot respectively. 

**VADER**

In [66]:
# comparing Negative and No
z_stat_v_neg, p_value_v_neg = proportions_ztest(count = [v_negative_count, poll_negative_count], nobs = [v_sample, poll_sample], value = 0.0, alternative= 'two-sided')

In [67]:
# comparing Positive and Yes
z_stat_v_pos, p_value_v_pos = proportions_ztest(count = [v_positive_count, poll_positive_count], nobs = [v_sample, poll_sample], value = 0.0, alternative= 'two-sided')


Based on these two p-values, we reject the null hypothesis for both of these tests. Thus, there is sufficient evidence to conclude that the proportion of positive and negative sentiments on Twitter using the VADER model is different than the proportions of Yes and No votes on the actual ballot respectively. 

#### P-value Table

In [68]:
methods = ['RoBERTa', 'RoBERTa','VADER', 'VADER'] # list of methods
positions = ['Positive', 'Negative', 'Positive', 'Negative'] # list of positions
pvalues = [round(p_value_r_pos, 3), round(p_value_r_neg, 3), float(f'{p_value_v_pos:.3e}'), float(f'{p_value_v_neg:.3e}')] # list of pvalues
conclusions = ['Reject' if p <= alpha else 'Fail to Reject' for p in pvalues] # list of conclusions

In [69]:
# make table containing p-value data
pvalue_table = go.Figure(data = [go.Table(
    header = dict(values = ['<b>Method</b>', '<b>Sentiment', '<b>P-value</b>', '<b>Conclusion</b>'],
                  fill_color = 'grey',
                  line_color = 'darkslategray',
                  font = dict(color = 'white', size = 14),
                  align = 'left'),
    cells = dict(values = [methods, positions, pvalues, conclusions],
                 fill_color = [['white', 'lightgrey'] * 2],
                 line_color = 'darkslategray',
                 align = 'left')
)])
pvalue_table.show()