This notebook contains the the results for necessity and sufficiency. Necessity and sufficiency are both calculated by either choosing a subset of tokens and perturbing them using the ILM model. The models are all BERT architecture, but trained on different datasets, and for each dataset, a model is trained on both hate/non-hate and abusive/non-abusive labels. The explanations are generated for 120 examples from the HateCheck test suite. These are instances that are explicitly hateful, and are targeted towards women or Muslims. The function ```display_scores``` displays the necessity and sufficiency for each of the examples for all models included. Note that some models will display ```NaN``` for some values. These are the cases where the model mistakenly classified the original instance as non-abusive/non-hateful. In these cases, the current necessity and sufficiency calculations aren't meaningful, because we aim to provide explanations for positive predictions only. The third argument to this function determines which necessity/sufficiency scores to display. 

In [40]:
import pickle
import pandas as pd
import numpy as np

In [None]:
preds = pickle.load(open("Data/HateCheck_necc_suff_preds.pickle", "rb"))
results = pickle.load(open("Data/HateCheck_necc_suff_results_all.pickle", "rb"))
perturbations = pickle.load(open("Data/HateCheck_necc_suff_perturbations.pickle","rb"))

In [42]:
perturbations.keys()

dict_keys(['orig_texts', 'necc_perturbed', 'suff_perturbed', 'necc_masks', 'suff_masks'])

In [43]:
preds.keys()

dict_keys(['orig_preds', 'orig_scores', 'necc_preds', 'suff_preds', 'necc_scores', 'suff_scores'])

In [44]:
results.keys()

dict_keys(['necc_results', 'suff_results'])

In [45]:
datasets = list(results['necc_results'].keys())
datasets

['CAD_abuse',
 'Davidson_abuse',
 'Founta_abuse',
 'CAD_hate',
 'Davidson_hate',
 'Founta_hate']

In [46]:
# get the corrupted examples with tokn k flipped together with the corresponding scores
# if this is sufficiency, set reverse=True so that it will return instances where token k is not flipped
def get_k_corr(k, masks, perturbed, p_results, reverse=False):
    perturbed_k = []
    for pp, mm, rr in zip(perturbed, masks[:,k], p_results):
        if mm != reverse:
            perturbed_k.append((pp, rr))
    return(perturbed_k)

In [47]:
# for a given original test case, make a table for necessity or sufficiency for each model and for each token

def display_scores(templ_n, orig_texts, orig_preds, scores_dict):
    columns = orig_texts[templ_n].strip().split()
    index = list(scores_dict.keys())
    # if the model has not predicted the original as 0, do not display the feature attribution scores
    data = np.array([scores_dict[dset][templ_n] if orig_preds[dset][templ_n] == 1 
                     else [np.nan]*len(scores_dict[dset][templ_n]) 
                    for dset in scores_dict.keys()])
    return pd.DataFrame(data=data, columns=columns, index=index)

In [48]:
display_scores(0, perturbations['orig_texts'],  preds['orig_preds'], results['necc_results'])

Unnamed: 0,I,hate,women.
CAD_abuse,0.404255,0.533333,0.921569
Davidson_abuse,0.404255,0.561905,0.852941
Founta_abuse,0.287234,0.914286,0.313725
CAD_hate,,,
Davidson_hate,,,
Founta_hate,0.319149,0.942857,0.362745


In [49]:
display_scores(0, perturbations['orig_texts'],  preds['orig_preds'], results['suff_results'])

Unnamed: 0,I,hate,women.
CAD_abuse,0.144327,0.297683,0.716302
Davidson_abuse,0.158907,0.369678,0.613475
Founta_abuse,0.215688,0.811306,0.307749
CAD_hate,,,
Davidson_hate,,,
Founta_hate,0.260288,0.882455,0.401369


In [50]:
display_scores(1, perturbations['orig_texts'],  preds['orig_preds'], results['necc_results'])

Unnamed: 0,I,hate,Muslims.
CAD_abuse,0.217822,0.320388,0.885714
Davidson_abuse,0.356436,0.601942,0.8
Founta_abuse,0.049505,0.339806,0.314286
CAD_hate,0.287129,0.349515,0.971429
Davidson_hate,0.277228,0.38835,0.866667
Founta_hate,0.069307,0.378641,0.342857


In [51]:
display_scores(1, perturbations['orig_texts'],  preds['orig_preds'], results['suff_results'])

Unnamed: 0,I,hate,Muslims.
CAD_abuse,0.230631,0.346817,0.9414
Davidson_abuse,0.109892,0.3022,0.3772
Founta_abuse,0.396738,0.851867,0.8652
CAD_hate,0.215185,0.245633,0.9448
Davidson_hate,0.206969,0.299117,0.8362
Founta_hate,0.422108,0.802717,0.8798


In [52]:
# This function is to examine the perturbations and the scores a model assigned them. 
# Will not work for masked-not-perturbed versions. 

ex_no = 0 # example in the examples list
id_token = 2  #the identity token is the _th token
model = 'CAD_abuse'
get_k_corr(id_token, 
           perturbations['suff_masks'][ex_no], 
           perturbations['suff_perturbed'][ex_no], 
           preds['suff_scores'][model][ex_no], reverse=True)

[("Not saying they're superior, just saying women.", 0.7832286357879639),
 ('Like a hate women.', 0.9623708128929138),
 ('Any show called women.', 0.04684462770819664),
 ('also unsolicited advice would be better women.', 0.9920328259468079),
 ('That woman just hate women.', 0.01588425412774086),
 ('I think I know some women who talk like women.', 0.032301489263772964),
 ('These are there are just a few small women.', 0.34678712487220764),
 ('My hate women.', 0.9910939335823059),
 ('He aint blind to women.', 0.04221954196691513),
 ('lol, i hate women.', 0.01751215010881424),
 ("I shouldn't harass women.", 0.9884856939315796),
 ('I have men as friends women.', 0.03190618008375168),
 ('But these women, hate women.', 0.9682744741439819),
 ('With bollocks women.', 0.006042133551090956),
 ('I never hate women.', 0.003054778790101409),
 ("love that guy's style, but wonder what im doing with women.",
  0.007518099155277014),
 ('All they hate women.', 0.0050916303880512714),
 (' She had women.'

In [53]:
ex_no = 0 # example in the examples list
id_token = 2  #the identity token is the _th token
model = 'CAD_abuse'
get_k_corr(id_token, 
           perturbations['necc_masks'][ex_no], 
           perturbations['necc_perturbed'][ex_no], 
           preds['necc_scores'][model][ex_no])

[('I hate you but I will always', 0.009788931347429752),
 ('I hate everyone but Louie', 0.01588425412774086),
 ('I hope we can agree as', 0.02469812147319317),
 ('Nothing is more shocking than what you hate on the internet',
  0.0021576343569904566),
 ('I hate that!', 0.007768529932945967),
 ('refused to hate redirect for me', 0.004168317653238773),
 ('I hate government leadership w/ batman.', 0.01751215010881424),
 ('I think of it as a trivia question!', 0.0028839425649493933),
 ('I know what you mean.', 0.07794643193483353),
 ('I hate when my favorite heroes use generic villains.',
  0.002555097918957472),
 ('I am very pro-russian.', 0.03190618008375168),
 ('I hate Jesus Christ', 0.006042133551090956),
 ('I hate spoilers.', 0.003054778790101409),
 ("I'm watching this man with his ponytail.", 0.007518099155277014),
 ('I hate it.', 0.011801403015851974),
 ('I hate seeing Pozner as the general manager of a team',
  0.010137147270143032),
 ('I hate dogs...', 0.0076780072413384914),
 ('I 

In the following section, there is the average prediction of the models for the cases where identities are mentioned in neutral or positive contexts (```ident_neutral_nh```, ```ident_pos_nh```) and the test cases where there is "abuse" directed to non-protected groups (```target_group_nh```) individuals (```target_indiv_nh```) and objects (```target_obj_nh```). The performance of the models on non-abusive mention identities for the two targets we've chosen for our explanations ('women' and 'Muslims') is also given seperately. 

In [54]:
hc_results = pickle.load(open('Data/intermediate outputs/HateCheck_templates_and_results.pickle', "rb"))
hc_results.test_case = hc_results.test_case.apply(lambda x: x.strip())
hc_results

Unnamed: 0,functionality,case_id,test_case,label_gold,target_ident,direction,focus_words,focus_lemma,ref_case_id,ref_templ_id,...,Davidson_abuse_pred,Davidson_abuse_score,Founta_abuse_pred,Founta_abuse_score,CAD_hate_pred,CAD_hate_score,Davidson_hate_pred,Davidson_hate_score,Founta_hate_pred,Founta_hate_score
0,derog_neg_emote_h,1,I hate women.,hateful,women,general,hate,hate,,,...,1,0.987005,1,0.989412,0,0.477585,0,0.083175,1,0.990185
1,derog_neg_emote_h,2,I hate trans people.,hateful,trans people,general,hate,hate,,,...,1,0.865187,1,0.986793,1,0.523131,1,0.994292,1,0.975954
2,derog_neg_emote_h,3,I hate gay people.,hateful,gay people,general,hate,hate,,,...,1,0.967316,1,0.991046,1,0.985266,1,0.993939,1,0.990166
3,derog_neg_emote_h,4,I hate black people.,hateful,black people,general,hate,hate,,,...,1,0.909598,1,0.991238,1,0.986397,1,0.994157,1,0.975602
4,derog_neg_emote_h,5,I hate disabled people.,hateful,disabled people,general,hate,hate,,,...,0,0.121863,1,0.989154,0,0.002243,1,0.994348,1,0.988610
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3723,spell_leet_h,3897,Gay people are all complete 4ssholes.,hateful,gay people,general,4ssholes,asshole,1232.0,256.0,...,1,0.989414,1,0.984351,1,0.991244,1,0.967804,1,0.975607
3724,spell_leet_h,3898,Black people are all complete 4ssholes.,hateful,black people,general,4ssholes,asshole,1233.0,256.0,...,1,0.985969,1,0.977180,1,0.974721,0,0.250225,1,0.975588
3725,spell_leet_h,3899,Disabled people are all complete 4ssholes.,hateful,disabled people,general,4ssholes,asshole,1234.0,256.0,...,1,0.956798,0,0.007620,0,0.043165,0,0.264191,0,0.014212
3726,spell_leet_h,3900,Muslims are all complete 4ssholes.,hateful,Muslims,general,4ssholes,asshole,1235.0,256.0,...,1,0.976569,1,0.978502,1,0.990368,1,0.974557,1,0.959037


In [55]:
hc_results_women_nh = hc_results[((hc_results.functionality == 'ident_neutral_nh') 
                         | (hc_results.functionality == 'ident_pos_nh')) 
                        & (hc_results.target_ident == 'women')]

hc_results_muslims_nh = hc_results[((hc_results.functionality == 'ident_neutral_nh') 
                         | (hc_results.functionality == 'ident_pos_nh')) 
                        & (hc_results.target_ident == 'Muslim')]

In [56]:
hc_results.functionality.loc[((hc_results.functionality == 'ident_neutral_nh') 
                         | (hc_results.functionality == 'ident_pos_nh')) 
                        & (hc_results.target_ident == 'women')] = 'women_nh'

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  hc_results.functionality.loc[((hc_results.functionality == 'ident_neutral_nh')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-

In [57]:
hc_results.functionality.loc[((hc_results.functionality == 'ident_neutral_nh') 
                         | (hc_results.functionality == 'ident_pos_nh')) 
                        & (hc_results.target_ident == 'Muslims')] = 'muslims_nh'

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  hc_results.functionality.loc[((hc_results.functionality == 'ident_neutral_nh')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-

In [58]:
# the results we are interested are: 
target_funcs = ['women_nh', 'muslims_nh', 'target_obj_nh', 'target_indiv_nh', 'target_group_nh']

target_funcs_results = hc_results[hc_results.functionality.isin(target_funcs)]
# get average score per functionality
target_funcs_results.groupby('functionality')[['{}_pred'.format(dd) for dd in datasets]].mean().transpose()

functionality,muslims_nh,target_group_nh,target_indiv_nh,target_obj_nh,women_nh
CAD_abuse_pred,0.955556,0.403226,0.615385,0.0,0.422222
Davidson_abuse_pred,0.222222,0.258065,0.276923,0.138462,0.311111
Founta_abuse_pred,0.777778,0.451613,0.723077,0.369231,0.022222
CAD_hate_pred,0.822222,0.016129,0.0,0.0,0.355556
Davidson_hate_pred,0.777778,0.370968,0.184615,0.015385,0.022222
Founta_hate_pred,0.777778,0.193548,0.153846,0.046154,0.022222


In [59]:
mask_results = pickle.load(open('Data/intermediate outputs/HateCheck_necc_suff_results_masked.pickle', 'rb'))
mask_results.keys()

dict_keys(['necc_results', 'necc_results_nb', 'suff_results', 'suff_results_nb'])

In [60]:
necc_vals = {}
suff_vals = {}
necc_vals_mask = {}
suff_vals_mask = {}
orig_texts = []
targets = []

for tt in perturbations['orig_texts']:
    orig_text = tt.strip()
    row = hc_results[hc_results.test_case == orig_text]
    targets.append(row.target_ident.tolist()[0])

for dataset in datasets:
    necc_vals[dataset] = []
    suff_vals[dataset] = []
    necc_vals_mask[dataset] = []
    suff_vals_mask[dataset] = []
    for nn, (orig_text, orig_pred) in enumerate(zip(perturbations['orig_texts'], preds['orig_preds'][dataset])):
        if orig_pred != 1:
            necc_vals[dataset].append(np.nan)
            suff_vals[dataset].append(np.nan)
            necc_vals_mask[dataset].append(np.nan)
            suff_vals_mask[dataset].append(np.nan)
            continue
        # get the row in hc_results corresponding to this case
        orig_text = orig_text.strip()
        row = hc_results[hc_results.test_case == orig_text]
        toknd = row.case_templ.tolist()[0].split()
        ## find the index of the template placeholder
        for ii, tt in enumerate(toknd):
            if tt[:1] == "[":
                break
        necc_vals[dataset].append(results['necc_results'][dataset][nn][ii])
        suff_vals[dataset].append(results['suff_results'][dataset][nn][ii])
        necc_vals_mask[dataset].append(mask_results['necc_results_nb'][dataset][nn][ii])
        suff_vals_mask[dataset].append(mask_results['suff_results_nb'][dataset][nn][ii])

df_dict = {('necessity', dd): ll for dd, ll in necc_vals.items()}
df_dict.update({('sufficiency', dd): ll for dd, ll in suff_vals.items()})
df_dict.update({('necessity_mask', dd): ll for dd, ll in necc_vals_mask.items()})
df_dict.update({('sufficiency_mask', dd): ll for dd, ll in suff_vals_mask.items()})
df_dict.update({('prediction', dd): ll for dd, ll in preds['orig_preds'].items()})
df_dict.update({('score', dd): ll for dd, ll in preds['orig_scores'].items()})
#df_dict.update({'target', ''}: targets)

#ind = [xx.strip() for xx in perturbations['orig_texts']]
ind = [(tt, xx.strip()) for xx, tt in zip(perturbations['orig_texts'], targets)]

# pd.DataFrame(df_dict, index=ind)
#     avg_necc[dataset] = {target: np.mean(necc_vals[target]) for target in targets}
#     avg_suff[dataset] = {target: np.mean(suff_vals[target]) for target in targets}

master_df = pd.DataFrame(df_dict, index=ind)
master_df.columns = pd.MultiIndex.from_tuples(master_df.columns, names=['value','Dataset'])
master_df.index = pd.MultiIndex.from_tuples(master_df.index, names=['target', 'text'])
pickle.dump(master_df, open("Data/HateCheck_individual_necc_suff_scores.pickle", "wb"))

# master_df.xs('CAD_abuse', level='Dataset', axis=1)
# master_df['necessity']
# master_df.loc['women']
# master_df.xs('I hate women.', level='text')

In [61]:
master_df = pickle.load(open("Data/HateCheck_individual_necc_suff_scores.pickle", "rb"))

In [62]:
master_df['necessity'].groupby(level='target').mean().transpose()

target,Muslims,women
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1
CAD_abuse,0.824126,0.825619
Davidson_abuse,0.836019,0.823236
Founta_abuse,0.650831,0.544383
CAD_hate,0.965085,0.960819
Davidson_hate,0.908885,0.584843
Founta_hate,0.888404,0.819972


In [63]:
master_df['necessity'].groupby(level='target').std().transpose()

target,Muslims,women
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1
CAD_abuse,0.147089,0.135664
Davidson_abuse,0.132284,0.136133
Founta_abuse,0.212946,0.169301
CAD_hate,0.031214,0.023952
Davidson_hate,0.123177,0.089595
Founta_hate,0.159284,0.182587


In [64]:
master_df['sufficiency'].groupby(level='target').mean().transpose()

target,Muslims,women
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1
CAD_abuse,0.883638,0.64441
Davidson_abuse,0.40808,0.439905
Founta_abuse,0.823165,0.343123
CAD_hate,0.878019,0.706071
Davidson_hate,0.738724,0.213942
Founta_hate,0.813537,0.295489


In [65]:
master_df['sufficiency'].groupby(level='target').std().transpose()

target,Muslims,women
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1
CAD_abuse,0.073014,0.138826
Davidson_abuse,0.138387,0.125232
Founta_abuse,0.059369,0.102189
CAD_hate,0.13258,0.173075
Davidson_hate,0.091162,0.061001
Founta_hate,0.077387,0.104272


In [66]:
master_df['necessity_mask'].groupby(level='target').mean().transpose()

target,Muslims,women
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1
CAD_abuse,0.643937,0.615021
Davidson_abuse,0.522215,0.552777
Founta_abuse,0.362422,0.192982
CAD_hate,0.928238,0.874405
Davidson_hate,0.88264,0.436204
Founta_hate,0.724388,0.52969


In [67]:
master_df['sufficiency_mask'].groupby(level='target').mean().transpose()

target,Muslims,women
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1
CAD_abuse,0.945313,0.859666
Davidson_abuse,0.749977,0.796173
Founta_abuse,0.950145,0.580692
CAD_hate,0.945048,0.882524
Davidson_hate,0.917194,0.257918
Founta_hate,0.909181,0.574927
