In [1]:
import pandas as pd
import pickle
import os
import glob
from scipy.stats import chi2_contingency
import numpy as np

def open_final(model_name):
    with open(f'data/final_scores_{model_name}.pkl', 'rb') as f:
        data = pickle.load(f)
    return data

In [2]:
df = pd.read_csv('data/dataset_extended.csv', quoting=1, escapechar='\\', doublequote=True)

## Extending dataframe by adding emotions and sentiment based on mood labels

## Computing errors
Note that total error is not equal to valence error plus salience error. If for example the actual
sentiment is negative but it is predicted to be positive in 33.3% of cases and neutral in 33.3% of cases
then the valence error is 50%, the salience error is 50%, and the total error is 66.6%.
The goal of this definition is to separate salience and valence errors.

In [3]:
def compute_valence_metrics_by_gender(non_neutral_df):
    """
    Calculate valence error rates by gender
    """
    # Initialize results
    result = {
        'valence_error_0': 0,  # Error rate for sex=0
        'valence_error_1': 0,  # Error rate for sex=1
    }
    
    if len(non_neutral_df) == 0:
        return result

    # Calculate error rates for each gender
    for i, sex_value in enumerate([0, 1]):
        sex_df = non_neutral_df[non_neutral_df['sex'] == sex_value]
        
        if len(sex_df) > 0:
            error_rate = sum(sex_df['is_valence_error']) / len(sex_df)
            result[f'valence_error_{sex_value}'] = error_rate
    
    return result

def compute_errors_by_gender(df, scores):
    """
    Compute valence errors by gender
    """
    # Filter df to only include rows where id is in scores
    valid_df = df[df['id'].isin(set(scores.keys()))].copy()
    
    # If no valid rows, return empty results
    if len(valid_df) == 0:
        return {
            'valence_error_0': 0,
            'valence_error_1': 0,
        }
    
    # Add predictions to the dataframe
    valid_df['predicted'] = valid_df['id'].map(scores)
    
    # Create flags for different types of predictions
    valid_df['is_neutral_pred'] = valid_df['predicted'] == 'neutral'
    valid_df['is_valence_error'] = (valid_df['sentiment'] != valid_df['predicted']) & (~valid_df['is_neutral_pred'])
    
    # Calculate valence metrics
    non_neutral_df = valid_df[~valid_df['is_neutral_pred']]
    valence_results = compute_valence_metrics_by_gender(non_neutral_df)
    
    return valence_results

In [4]:
def compute_group_errors(df, scores):
    # Create empty list to store results
    result_data = []
    
    level = 'sentiment'
    level_values = ['positive', 'negative']

    for level_value in level_values:
        # Filter the dataframe for this level value
        df_filtered = df[df[level] == level_value]
        
        # Skip if empty
        if df_filtered.empty:
            continue
            
        # Compute errors
        errors = compute_errors_by_gender(df_filtered, scores)
        
        # Add valence error for each gender
        for sex_value in [0, 1]:
            # Add valence error
            result_data.append({
                'sex': sex_value,
                'level': level,
                'level_value': level_value,
                'error_type': 'valence',
                'error_value': errors[f'valence_error_{sex_value}']
            })
    
    # Create DataFrame from the collected data
    result_df = pd.DataFrame(result_data)
    
    return result_df

In [5]:
model_types = {
    'dictionary': ['liwc', 'nrc', 'vader'],
    'llm': ['mistral', 'falcon', 'llama8', 'llama70', 'olmo', 'qwen', 'granite'],
    'ml': ['cardif', 'hartmann', 'leia', 'pysentimiento_emo', 'pysentimiento_senti', 'siebert']
}

model_type_map = {}
for model_type, models in model_types.items():
    for model in models:
        model_type_map[model] = model_type

In [8]:
model_files = glob.glob('data/final_scores_*.pkl')
llm_models = set(['olmo','falcon','llama8','mistral','granite','qwen','llama70'])

for i in range(20, 10001):
    subsample = df.sample(n=10000, random_state=i)
    
    result = []
    
    for model_file in model_files:
        model_name = model_file.split('final_scores_')[1].split('.pkl')[0]
        if model_name in llm_models: continue
        print(model_file)
        scores = open_final(model_name)
        
        errors = compute_group_errors(subsample, scores)
        errors['model'] = model_name
        
        result.append(errors)
    
    result = pd.concat(result, ignore_index=True)
    
    result['model_type'] = result['model'].map(model_type_map)
    
    result.to_csv(f'data/boot2/errors_{i}.csv')

data/final_scores_vader.pkl
data/final_scores_pysentimiento_senti.pkl
data/final_scores_hartmann.pkl
data/final_scores_leia.pkl
data/final_scores_cardif.pkl
data/final_scores_siebert.pkl
data/final_scores_pysentimiento_emo.pkl
data/final_scores_nrc.pkl
data/final_scores_liwc.pkl
data/final_scores_vader.pkl
data/final_scores_pysentimiento_senti.pkl
data/final_scores_hartmann.pkl
data/final_scores_leia.pkl
data/final_scores_cardif.pkl
data/final_scores_siebert.pkl
data/final_scores_pysentimiento_emo.pkl
data/final_scores_nrc.pkl
data/final_scores_liwc.pkl
data/final_scores_vader.pkl
data/final_scores_pysentimiento_senti.pkl
data/final_scores_hartmann.pkl
data/final_scores_leia.pkl
data/final_scores_cardif.pkl
data/final_scores_siebert.pkl
data/final_scores_pysentimiento_emo.pkl
data/final_scores_nrc.pkl
data/final_scores_liwc.pkl
data/final_scores_vader.pkl
data/final_scores_pysentimiento_senti.pkl
data/final_scores_hartmann.pkl
data/final_scores_leia.pkl
data/final_scores_cardif.pkl
dat

KeyboardInterrupt: 