In [42]:
import pandas as pd
import pickle
import os
import glob
from scipy.stats import chi2_contingency
import numpy as np

def open_final(model_name):
    with open(f'data/final_scores_{model_name}.pkl', 'rb') as f:
        data = pickle.load(f)
    return data

In [2]:
df = pd.read_csv('data/dataset.csv', quoting=1, escapechar='\\', doublequote=True)

## Extending dataframe by adding emotions and sentiment based on mood labels

In [3]:
emomap = {'Sadness': ['Sad', 'Lonely'],
         'Anger': ['Angry', 'Annoyed', 'Frustrated', 'Furious'],
         'Fear': ['Anxious', 'Stressed', 'Afraid', 'Nervous', 'Worried'],
         'Affection': ['Loving', 'Caring', 'Supportive'],
         'Happiness': ['Happy', 'Excited']}

emomap_r = {}
for emo in emomap:
    for label in emomap[emo]:
        emomap_r[label] = emo

sentimap = {'Sadness': 'negative', 'Anger': 'negative', 'Fear': 'negative', 'Affection': 'positive', 'Happiness': 'positive'}

In [4]:
df['emotion'] = df['mood'].map(emomap_r)
df['sentiment'] = df['emotion'].map(sentimap)

In [5]:
df.to_csv('data/dataset_extended.csv', index=False, quoting=1, escapechar='\\', doublequote=True)

## Computing errors
Note that total error is not equal to valence error plus salience error. If for example the actual
sentiment is negative but it is predicted to be positive in 33.3% of cases and neutral in 33.3% of cases
then the valence error is 50%, the salience error is 50%, and the total error is 66.6%.
The goal of this definition is to separate salience and valence errors.

In [43]:
from scipy.stats import chi2_contingency
import numpy as np

def compute_valence_metrics_by_gender(non_neutral_df):
    """
    Calculate valence error rates by gender and perform chi-square test
    """
    # Initialize results
    result = {
        'valence_error_0': 0,  # Error rate for sex=0
        'valence_error_1': 0,  # Error rate for sex=1
        'valence_p': np.nan    # p-value for gender comparison
    }
    
    if len(non_neutral_df) == 0:
        return result
    
    # Create contingency table [gender, correct/incorrect]
    contingency = np.zeros((2, 2))
    
    # Calculate error rates and fill contingency table for each gender
    for i, sex_value in enumerate([0, 1]):
        sex_df = non_neutral_df[non_neutral_df['sex'] == sex_value]
        
        # Calculate error rate for this gender
        if len(sex_df) > 0:
            error_rate = sum(sex_df['is_valence_error']) / len(sex_df)
            result[f'valence_error_{sex_value}'] = error_rate
            
            # Fill contingency table
            # Correct predictions (column 0)
            contingency[i, 0] = sum(~sex_df['is_valence_error'])
            # Incorrect predictions (column 1)
            contingency[i, 1] = sum(sex_df['is_valence_error'])
    
    # Perform chi-square test if possible
    if np.all(contingency > 0):
        _, p_value, _, _ = chi2_contingency(contingency)
        result['valence_p'] = p_value
    elif np.any(contingency > 0):
        result['valence_p'] = 1.0
    
    return result

def compute_salience_metrics_by_gender(potential_correct_df):
    """
    Calculate salience error rates by gender and perform chi-square test
    """
    # Initialize results
    result = {
        'salience_error_0': 0,  # Error rate for sex=0
        'salience_error_1': 0,  # Error rate for sex=1
        'salience_p': np.nan    # p-value for gender comparison
    }
    
    if len(potential_correct_df) == 0:
        return result
    
    # Create contingency table [gender, non-neutral/neutral]
    contingency = np.zeros((2, 2))
    
    # Calculate error rates and fill contingency table for each gender
    for i, sex_value in enumerate([0, 1]):
        sex_df = potential_correct_df[potential_correct_df['sex'] == sex_value]
        
        # Calculate error rate for this gender
        if len(sex_df) > 0:
            error_rate = sum(sex_df['is_neutral_pred']) / len(sex_df)
            result[f'salience_error_{sex_value}'] = error_rate
            
            # Fill contingency table
            # Non-neutral predictions (column 0)
            contingency[i, 0] = sum(~sex_df['is_neutral_pred'])
            # Neutral predictions (column 1)
            contingency[i, 1] = sum(sex_df['is_neutral_pred'])
    
    # Perform chi-square test if possible
    if np.all(contingency > 0):
        _, p_value, _, _ = chi2_contingency(contingency)
        result['salience_p'] = p_value
    elif np.any(contingency > 0):
        result['salience_p'] = 1.0
    
    return result

def compute_errors_by_gender(df, scores):
    """
    Compute valence and salience errors by gender with statistical comparison
    """
    # Filter df to only include rows where id is in scores
    valid_df = df[df['id'].isin(scores.keys())].copy()
    
    # If no valid rows, return empty results
    if len(valid_df) == 0:
        return {
            'valence_error_0': 0,
            'valence_error_1': 0,
            'valence_p': np.nan,
            'salience_error_0': 0,
            'salience_error_1': 0,
            'salience_p': np.nan
        }
    
    # Add predictions to the dataframe
    valid_df['predicted'] = valid_df['id'].map(scores)
    
    # Create flags for different types of predictions
    valid_df['is_neutral_pred'] = valid_df['predicted'] == 'neutral'
    valid_df['is_valence_error'] = (valid_df['sentiment'] != valid_df['predicted']) & (~valid_df['is_neutral_pred'])
    
    # Calculate valence metrics
    non_neutral_df = valid_df[~valid_df['is_neutral_pred']]
    valence_results = compute_valence_metrics_by_gender(non_neutral_df)
    
    # Calculate salience metrics
    potential_correct_df = valid_df[~valid_df['is_valence_error']]
    salience_results = compute_salience_metrics_by_gender(potential_correct_df)
    
    # Combine results
    return {**valence_results, **salience_results}

In [44]:
def compute_group_errors(df, scores):
    # Create empty list to store results
    result_data = []
    
    # Define the levels and their possible values
    levels = {
        'sentiment': ['positive', 'negative'],
        'emotion': df['emotion'].unique().tolist(),  # Get all unique emotion values
        'mood': df['mood'].unique().tolist()         # Get all unique mood values
    }
    
    # Iterate through levels (sentiment, emotion, mood)
    for level, level_values in levels.items():
        # Iterate through each possible value within the level
        for level_value in level_values:
            # Filter the dataframe for this level value
            df_filtered = df[df[level] == level_value]
            
            # Skip if empty
            if df_filtered.empty:
                continue
                
            # Compute errors with gender comparison for this subset
            errors = compute_errors_by_gender(df_filtered, scores)
            
            # Add valence error for each gender
            for sex_value in [0, 1]:
                # Add valence error
                result_data.append({
                    'sex': sex_value,
                    'level': level,
                    'level_value': level_value,
                    'error_type': 'valence',
                    'error_value': errors[f'valence_error_{sex_value}'],
                    'p_value': errors['valence_p']
                })
                
                # Add salience error
                result_data.append({
                    'sex': sex_value,
                    'level': level,
                    'level_value': level_value,
                    'error_type': 'salience',
                    'error_value': errors[f'salience_error_{sex_value}'],
                    'p_value': errors['salience_p']
                })
    
    # Create DataFrame from the collected data
    result_df = pd.DataFrame(result_data)
    
    return result_df

Note that 0 is man and 1 is woman

In [45]:
df = pd.read_csv('data/dataset_extended.csv', quoting=1, escapechar='\\', doublequote=True)

model_files = glob.glob('data/final_scores_*.pkl')

result = []

for model_file in model_files:
    print(model_file)
    model_name = model_file.split('final_scores_')[1].split('.pkl')[0]
    
    scores = open_final(model_name)
    
    errors = compute_group_errors(df, scores)
    errors['model'] = model_name
    
    result.append(errors)

result = pd.concat(result, ignore_index=True)

data/final_scores_vader.pkl
data/final_scores_olmo.pkl
data/final_scores_hartman.pkl
data/final_scores_falcon.pkl
data/final_scores_pysentimiento_senti.pkl
data/final_scores_hartmann.pkl
data/final_scores_leia.pkl
data/final_scores_cardif.pkl
data/final_scores_llama8.pkl
data/final_scores_siebert.pkl
data/final_scores_mistral.pkl
data/final_scores_granite.pkl
data/final_scores_pysentimiento_emo.pkl
data/final_scores_nrc.pkl
data/final_scores_qwen.pkl
data/final_scores_liwc.pkl
data/final_scores_llama70.pkl


In [46]:
model_types = {
    'dictionary': ['liwc', 'nrc', 'vader'],
    'llm': ['mistral', 'falcon', 'llama8', 'llama70', 'olmo', 'qwen', 'granite'],
    'ml': ['cardif', 'hartmann', 'leia', 'pysentimiento_emo', 'pysentimiento_senti', 'siebert']
}

model_type_map = {}
for model_type, models in model_types.items():
    for model in models:
        model_type_map[model] = model_type

In [47]:
result['model_type'] = result['model'].map(model_type_map)

In [48]:
result.head()

Unnamed: 0,sex,level,level_value,error_type,error_value,p_value,model,model_type
0,0,sentiment,positive,valence,0.179742,1.4886150000000001e-197,vader,dictionary
1,0,sentiment,positive,salience,0.227838,0.0,vader,dictionary
2,1,sentiment,positive,valence,0.13824,1.4886150000000001e-197,vader,dictionary
3,1,sentiment,positive,salience,0.170416,0.0,vader,dictionary
4,0,sentiment,negative,valence,0.417493,0.0,vader,dictionary


In [51]:
# Results for LLMs are computed on limited sample size and is not part of preregistration
result.loc[result['model_type'] == 'llm', 'p_value'] = np.nan

# Salience error is available only for a few models
result.loc[result['error_type'] == 'salience', 'p_value'] = np.nan

In [59]:
len(set(result[result['level'] == 'mood']['level_value']))

16

In [60]:
result['significance'] = np.nan

# When p_value is not NA, compute significance based on level
mask_valid_p = result['p_value'].notna()

#Compute significance using pre-registered thresholds:

mask_sentiment = (mask_valid_p) & (result['level'] == 'sentiment')
result.loc[mask_sentiment, 'significance'] = result.loc[mask_sentiment, 'p_value'] < 0.001

mask_emotion = (mask_valid_p) & (result['level'] == 'emotion')
result.loc[mask_emotion, 'significance'] = result.loc[mask_emotion, 'p_value'] < (0.001 / 5)

mask_mood = (mask_valid_p) & (result['level'] == 'mood')
result.loc[mask_mood, 'significance'] = result.loc[mask_mood, 'p_value'] < (0.001 / 16)

  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  result.loc[mask_sentiment, 'significance'] = result.loc[mask_sentiment, 'p_value'] < 0.001


In [62]:
result.to_csv('data/errors.csv')

In [63]:
# Compute basic stat
# Removing not relevant rows and considering only one gender as p-values are the same
stats = result[(mask_valid_p) & (result['sex'] == 1)]

In [76]:
for level in ['sentiment', 'emotion', 'mood']:
    print(level)
    level_data = stats[stats['level'] == level]
    total = len(level_data)
    non_sig = len(level_data[level_data['significance'] == False])
    sign_data = level_data[level_data['significance'] == True]
    men_bias = len(sign_data[sign_data['error_value'] > 0])
    women_bias = len(sign_data[sign_data['error_value'] < 0])
    print('Men bias', men_bias, '/', total)
    print('Women bias', women_bias, '/', total)
    print('Not significant', non_sig, '/', total)
    print()

sentiment
Men bias 20 / 20
Women bias 0 / 20
Not significant 0 / 20

emotion
Men bias 44 / 50
Women bias 0 / 50
Not significant 6 / 50

mood
Men bias 115 / 160
Women bias 0 / 160
Not significant 45 / 160

