Notebook to calculate evaluation metrics for downstream models trained with different data aggregation methods

In [1]:
import pandas as pd
import numpy as np
import os
from ast import literal_eval
from scipy.stats import entropy
from numpy.linalg import norm
from sklearn.metrics import f1_score

In [2]:
# read in lists from csv file
def parse_list(value):
    """Convert one CSV cell into the Python literal it encodes.

    Parameters
    ----------
    value : object
        A cell read from a CSV file. Typically the string form of a list
        (e.g. ``"[0.2, 0.8]"``), an empty/blank string, or a missing value.

    Returns
    -------
    object
        ``np.nan`` for missing values and blank strings; the result of
        ``ast.literal_eval`` for non-blank strings; any other value is
        returned unchanged.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``literal_eval`` when a non-blank string is not a
        valid Python literal.
    """
    if isinstance(value, str):
        text = value.strip()
        # parse the stripped text so surrounding whitespace cannot upset literal_eval
        return literal_eval(text) if text else np.nan

    # Already-parsed containers are passed through untouched. This keeps the
    # function idempotent (re-running a notebook cell on a parsed column) and
    # avoids pd.isna() returning an array whose truth value is ambiguous.
    if isinstance(value, (list, tuple, np.ndarray)):
        return value

    if pd.isna(value):
        return np.nan

    # any other non-string scalar has nothing to parse
    return value

In [3]:
# calculate jensen shannon divergence
def JSD(P, Q):
    """Return the Jensen-Shannon divergence between two distributions.

    Both inputs are first rescaled by their L1 norm, so unnormalised weights
    are accepted. The result is the average of the two relative entropies
    (as computed by ``scipy.stats.entropy`` with its default base) of each
    rescaled input against their midpoint mixture.

    Parameters
    ----------
    P, Q : array-like
        Weight vectors of equal length.

    Returns
    -------
    float
        The divergence value.
    """
    # rescale each input so its absolute values sum to one
    p_mass = norm(P, ord=1)
    p_hat = P / p_mass
    q_mass = norm(Q, ord=1)
    q_hat = Q / q_mass

    # midpoint mixture of the two rescaled distributions
    mixture = (p_hat + q_hat) * 0.5

    divergence_p = entropy(p_hat, mixture)
    divergence_q = entropy(q_hat, mixture)
    return (divergence_p + divergence_q) * 0.5

In [None]:
## MAIN EVALUATION LOOP
#
# For every task, aggregation method, subpopulation and category this cell
# compares the predicted label distribution against the observed one and
# records the mean Jensen-Shannon divergence and the F1 of the argmax labels.
# Predictions and observations are always paired on instance_id, never on
# row position.


def _load_predictions(file_name):
    """Read one prediction CSV, keep one row per instance and parse its list columns."""
    frame = pd.read_csv(os.path.join('../data/model_predictions', file_name))
    frame = frame.drop_duplicates(subset=['instance_id']).reset_index(drop=True)

    # every column from the third onwards is parsed as a list; the first two
    # are presumably the identifier columns (instance_id, text) — TODO confirm
    for column in frame.columns[2:]:
        frame[column] = frame[column].apply(parse_list)
    return frame


def _melt_individual_predictions(pred_df):
    """Reshape per-annotator prediction columns into one row per (instance, user)."""
    long_df = pred_df.melt(id_vars=["instance_id", "text"], var_name="user_id", value_name="pred").dropna()
    long_df['user_id'] = long_df['user_id'].astype('int64')
    return long_df


def _pair_by_instance(test_df, label_column, pred_source, pred_column):
    """Return a frame with `observed` and `predicted` columns joined on instance_id.

    Only instances that have both an observed label distribution and a
    prediction are kept, so the two columns are guaranteed to line up.
    """
    has_label = test_df[label_column].notna()
    observed = test_df.loc[has_label, ["instance_id", label_column]].rename(
        columns={label_column: "observed"})
    predicted = pred_source[["instance_id", pred_column]].rename(
        columns={pred_column: "predicted"}).dropna(subset=["predicted"])
    return observed.merge(predicted, on="instance_id", how="inner")


# list to store evaluations
eval_data = []

# for each task
for task in ['popquorn_politeness', 'popquorn_offensiveness']:

    # read in test set
    raw_test_df = pd.read_csv(os.path.join('../data/training_data', task + '_test.csv'))

    # Demographic lookup per annotator. It is built BEFORE collapsing to one row
    # per instance so that no annotator is lost, and de-duplicated per user so
    # the merge below cannot multiply rows.
    # NOTE(review): assumes the test CSV has one row per annotation — confirm.
    user_groups = raw_test_df.drop_duplicates(subset=['user_id'])

    test_df = raw_test_df.drop_duplicates(subset=['instance_id']).reset_index(drop=True)

    # ensure that all distributions are correctly typed as lists
    for column in [column for column in test_df.columns if "distribution" in column]:
        test_df[column] = test_df[column].apply(parse_list)

    # for each data aggregation method
    methods = ["individual", "NUTMEG", "MACE", "majority", "majority_subpop"]

    for method in methods:

        # these methods have a single prediction file per task
        if method != "NUTMEG" and method != "majority_subpop":
            pred_df = _load_predictions(task + '_pred_' + method + '.csv')

        # the reshape does not depend on the subpopulation, so do it once
        if method == "individual":
            long_pred_df = _melt_individual_predictions(pred_df)

        # for each subpopulation
        for subpopulation in ['education', 'gender', 'race', 'age']:

            # NUTMEG and majority_subpop predictions live in one file per subpopulation
            if method == "NUTMEG":
                pred_df = _load_predictions(task + '_pred_' + method + "_" + subpopulation + '.csv')
            elif method == "majority_subpop":
                pred_df = _load_predictions(task + '_pred_majority_' + subpopulation + '.csv')

            # when evaluating unaggregated predictions, attach each annotator's
            # category and average the predictions per (instance, category)
            if method == "individual":
                melted_pred_df = long_pred_df.merge(user_groups[["user_id", subpopulation]], on="user_id")
                melted_pred_df = (
                    melted_pred_df.groupby(["instance_id", subpopulation])["pred"]
                    .agg(lambda preds: np.mean(list(preds), axis=0).tolist())
                    .reset_index()
                )

            # for each category in a subpopulation (e.g., Woman in gender);
            # missing categories are skipped because they have no label column
            for category in test_df[subpopulation].dropna().unique():

                label_column = f"{category}_distribution_label"

                # pick the frame/column holding the prediction (dependent on method)
                if method == "MACE" or method == "majority":
                    # use probability output from single label
                    pred_source, pred_column = pred_df, "label"

                elif method == "individual":
                    # use predicted distribution from individuals in the relevant category
                    pred_source = melted_pred_df[melted_pred_df[subpopulation] == category]
                    pred_column = "pred"

                elif method == "NUTMEG":
                    # use predicted distribution for relevant category
                    pred_source, pred_column = pred_df, f"{category}_label"

                elif method == "majority_subpop":
                    # use predicted distribution for relevant category
                    pred_source, pred_column = pred_df, category

                # keep only items with both an observed label and a prediction,
                # aligned on instance_id so they can be fairly compared
                paired = _pair_by_instance(test_df, label_column, pred_source, pred_column)

                if paired.empty:
                    # nothing comparable for this category: record missing metrics
                    eval_data.append([task, method, subpopulation, category, np.nan, np.nan])
                    continue

                # calculate the average JSD between the predicted distribution and the observed distribution
                mean_jsd = pd.Series(
                    [JSD(pred, obs) for pred, obs in zip(paired["predicted"], paired["observed"])]
                ).mean()

                # calculate the F1 between the observed label and the predicted label (based on argmax);
                # arguments follow the (y_true, y_pred) convention.
                # NOTE(review): the default averaging of f1_score expects binary labels — confirm.
                f1 = f1_score(paired["observed"].apply(np.argmax), paired["predicted"].apply(np.argmax))

                # add metrics to collection
                eval_data.append([task, method, subpopulation, category, mean_jsd, f1])

# combine collected data into a dataframe
eval_df = pd.DataFrame(eval_data, columns=['task', 'method', 'subpopulation', 'category', 'mean_jsd', 'f1'])

In [5]:
# write the collected evaluation metrics to disk (row index is not written)
results_path = '../results/figure_data/downstream_model_comparison_temp.csv'
eval_df.to_csv(results_path, index=False)