In [None]:
#!pip install stanfordcorenlp

In [1]:
# importing relevant libraries
import pandas as pd
import numpy as np

import nltk
#import stanfordcorenlp

from scipy.stats import ttest_ind, f_oneway
import statsmodels.api as sm
from statsmodels.formula.api import ols

> "For every comment, the body text is converted to lower-case, but not lemmatized
> given the lexicon’s inclusion of different word morphologies. The valence and
> dominance score of each in-corpus word in the text is summated and averaged over the text’s
> number of in-corpus words to determine the comment’s average valence and dominance. The
> score is averaged over in-corpus word count as other studies have found that only a portion of
> words in a text can be expected to be covered [33, 57]; this coverage is expected to be even
> smaller on social media text. The calculation of these averages is followed by student t-tests to
> detect statistical significance with a critical value of .01. In the case of cross-partisan analysis,
> two-way ANOVAs are performed, followed by post-hoc Tukey HSD tests [58], to find significant
> differences in means of valence and dominance scores for male and female politicians.
> Cohen’s D is used as a measure of effect size."

> "[...] output of a state-of-the-art pretrained sentiment
> classifier on the comment text. We rely on a RoBERTa-based sentiment classifier that
> outputs a positive or negative sentiment label to a maximum 512 token input text [59]. We
> chose this model due to its high reliability across data-sets and tolerance for long input. The
> authors report its 93.2% average accuracy across 15 evaluation data-sets (each extracted from
> different text sources). We then assess for a significant difference in the categorical output of
> this model across genders using a chi-square test. Cramer’s V is used to measure the strength
> of the association. Cross-partisan analyses use log-linear analyses to find the most parsimonious
> model. The final model is then analysed further using chi-square tests, Cramer’s V, and
> odds-ratio comparisons, depending on the significance and strength of the associations."


## NOTES:

- https://saifmohammad.com/WebPages/nrc-vad.html

- Cite all relevant papers in report!

- I am not sure whether I can replicate the exact analysis, because the 'original' version of the lexicon may no longer be available. I am also not sure whether what we wanted to do differs at all from what the authors have already done.

## Reading in Data and Preprocessing

In [2]:
# importing reddit-sample
data = pd.read_csv("/kaggle/input/all-comments-sample/all_comments_sample_137K.csv")

# Read the NRC-VAD lexicon (tab-separated, no header row) into a DataFrame.
# pandas' default NA parsing converts literal tokens such as "null" or "nan"
# into NaN; if the lexicon contains such words they would lose their string
# key and could never be matched. Only an empty field is treated as missing
# here, so every entry in the 'Word' column stays a string.
vad_nrc = pd.read_csv(
    '/kaggle/input/nrc-vad-english/NRC-VAD-Lexicon.txt',
    sep='\t',
    header=None,
    names=['Word', 'Valence', 'Arousal', 'Dominance'],
    keep_default_na=False,
    na_values=[''],
)
print(vad_nrc.head())

       Word  Valence  Arousal  Dominance
0  aaaaaaah    0.479    0.606      0.291
1     aaaah    0.520    0.636      0.282
2  aardvark    0.427    0.490      0.437
3     aback    0.385    0.407      0.288
4    abacus    0.510    0.276      0.485


In [3]:
# list every column available in the comment sample
print(data.columns.tolist())

['id', 'body', 'subreddit', 'to_type', 'NEL', 'Names', 'created_utc', 'sex', 'ethnicity', 'origin', 'DOB', 'highest_position', 'party', 'entity_given_name', 'entity_family_name', 'given_name_used', 'family_name_used', 'full_name_used', 'nickname_used', 'Adjectives', 'Verbs', 'Nouns', 'Descriptors_parsed', 'Verbs_parsed', 'Relation', 'Valence', 'Arousal', 'Dominance']


In [4]:
# Lower-case every comment body before the lexicon lookup
# (the text is deliberately not lemmatized).
lowercased_body = data['body'].str.lower()
data['body'] = lowercased_body

# Sentimental Gender Bias: replication of PAPER NAME (TODO: insert the paper's title and citation)

In [5]:
# Index the lexicon by word once, then derive one word -> score mapping per
# dimension so each token lookup is a plain dictionary access.
lexicon_by_word = vad_nrc.set_index('Word')
valence_dict = lexicon_by_word['Valence'].to_dict()
dominance_dict = lexicon_by_word['Dominance'].to_dict()
arousal_dict = lexicon_by_word['Arousal'].to_dict()

## Computation of Average Arousal, Dominance and Valence per Post

In [6]:
# calculate VAD based on their dictionary
def calculate_vad_scores(text, valence_dict, dominance_dict, arousal_dict,
                         strip_punctuation=False):
    """Average the lexicon scores of all in-lexicon words of one text.

    The text is split on whitespace and every token found in a lexicon
    contributes its score; the sum is divided by the number of matched
    tokens (not by the total token count).

    Parameters
    ----------
    text : str
        The text to score. Any non-string value (e.g. the NaN/None that
        pandas uses for a missing body) is treated as "no words".
    valence_dict, dominance_dict, arousal_dict : dict
        Mappings from word to its valence / dominance / arousal score.
    strip_punctuation : bool, default False
        If True, leading and trailing punctuation is removed from each
        token before lookup, so that e.g. "good," matches "good". The
        default keeps the plain whitespace tokenization.

    Returns
    -------
    tuple
        ``(avg_valence, avg_dominance, avg_arousal)``; an element is None
        when no token of the text occurs in the corresponding lexicon.
    """
    # Missing bodies have no `.split()`; report "no score" instead of raising.
    if not isinstance(text, str):
        return None, None, None

    words = text.split()
    if strip_punctuation:
        import string
        words = [word.strip(string.punctuation) for word in words]

    def _mean_score(lexicon):
        # Mean over matched tokens only; None when nothing matched.
        scores = [lexicon[word] for word in words if word in lexicon]
        return sum(scores) / len(scores) if scores else None

    avg_valence = _mean_score(valence_dict)
    avg_dominance = _mean_score(dominance_dict)
    avg_arousal = _mean_score(arousal_dict)

    return avg_valence, avg_dominance, avg_arousal


In [7]:
# Assess each reddit post: score every body, then expand the
# (valence, dominance, arousal) tuples into three columns in one step.
# Building the frame from a list avoids `.apply(pd.Series)`, which creates
# one Series object per row and is very slow on a sample of this size.
vad_columns = ['avg_valence', 'avg_dominance', 'avg_arousal']
vad_scores = data['body'].map(
    lambda body: calculate_vad_scores(body, valence_dict, dominance_dict, arousal_dict)
)
data[vad_columns] = pd.DataFrame(vad_scores.tolist(), index=data.index, columns=vad_columns)

In [8]:
# check results: show the three averages of the first comment
# (the stray ']' that used to follow each value has been removed)
print(f"Average valence score: {data['avg_valence'].iloc[0]}")
print(f"Average arousal score: {data['avg_arousal'].iloc[0]}")
print(f"Average dominance score: {data['avg_dominance'].iloc[0]}")

Average valence score: 0.5617741935483872]
Average arousal score: 0.427967741935484]
Average dominance score: 0.5668709677419353]


**TODO: The replication appears to have worked — check and compare the results against the paper if possible.**

## test statistics

In [9]:
# Partition the comments on the `sex` column; rows with any other value
# end up in neither subset.
is_male = data['sex'].eq('male')
is_female = data['sex'].eq('female')
male_comments = data.loc[is_male]
female_comments = data.loc[is_female]

# report how many comments landed in each group
print(f"Number of posts that discuss men: {male_comments.shape[0]}")
print(f"Number of posts that discuss women: {female_comments.shape[0]}")

# nearly every comment was assigned to one of the two groups

Number of posts that discuss men: 119631
Number of posts that discuss women: 17917


In [10]:
# Independent two-sample t-tests (male vs. female comments), as in the paper.
def _gender_ttest(score_column):
    """Run `ttest_ind` on one score column, dropping missing scores per group."""
    male_scores = male_comments[score_column].dropna()
    female_scores = female_comments[score_column].dropna()
    return ttest_ind(male_scores, female_scores)


ttest_valence = _gender_ttest('avg_valence')
ttest_arousal = _gender_ttest('avg_arousal')
ttest_dominance = _gender_ttest('avg_dominance')

print('t-test results for valence:', ttest_valence)
print('t-test results for arousal:', ttest_arousal)
print('t-test results for dominance:', ttest_dominance)

t-test results for valence: TtestResult(statistic=-6.5485452064822605, pvalue=5.830884169077744e-11, df=134195.0)
t-test results for arousal: TtestResult(statistic=-1.5491996733351003, pvalue=0.1213360861515814, df=134195.0)
t-test results for dominance: TtestResult(statistic=-0.15773808481127355, pvalue=0.8746634261452003, df=134195.0)


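Hmm, the only significant result (at the .01 level) shows up for valence. The t statistic is negative, i.e. comments about women have the higher mean valence, which would speak for benevolent sexism. TODO: check against the paper's findings.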

In [15]:
# One-way ANOVA of each averaged score on the `sex` factor.
# NOTE(review): the recorded output reports df = 3 for C(sex), i.e. four sex
# categories enter these models, whereas the t-tests compare only 'male' and
# 'female' -- confirm that this difference is intended.
def _sex_anova(score_column):
    """Fit `<score_column> ~ C(sex)` by OLS; return (fitted model, type-II ANOVA table)."""
    fitted = ols(f'{score_column} ~ C(sex)', data=data).fit()
    return fitted, sm.stats.anova_lm(fitted, typ=2)


def _print_anova(title, table, trailing_gap=True):
    """Print a titled ANOVA table with the blank-line spacing used in this notebook."""
    print(title)
    print('\n')
    print(table)
    if trailing_gap:
        print('\n')


anova_model_valence, anova_table_valence = _sex_anova('avg_valence')
_print_anova('ANOVA Valence:', anova_table_valence)

anova_model_arousal, anova_table_arousal = _sex_anova('avg_arousal')
_print_anova('ANOVA arousal:', anova_table_arousal)

anova_model_dominance, anova_table_dominance = _sex_anova('avg_dominance')
_print_anova('ANOVA Dominance:', anova_table_dominance, trailing_gap=False)


ANOVA Valence:


               sum_sq        df          F        PR(>F)
C(sex)       0.469123       3.0  15.780799  2.950748e-10
Residual  1329.799498  134199.0        NaN           NaN


ANOVA arousal:


              sum_sq        df         F    PR(>F)
C(sex)      0.019816       3.0  1.066365  0.361939
Residual  831.271039  134199.0       NaN       NaN


ANOVA Dominance:


              sum_sq        df        F    PR(>F)
C(sex)      0.004559       3.0  0.21978  0.882723
Residual  927.914723  134199.0      NaN       NaN


# Our own method