In [1]:
import numpy as np
import pickle
import pprint
import pandas as pd
import torch
from tqdm.notebook import tqdm
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
from random import shuffle
from perturbation_functions import get_preds_and_scores, calc_suff, calc_necc

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
perts = pickle.load(open("Data/HateCheck_necc_suff_perturbations_3.pickle","rb"))
perts['orig_texts'] = [tt.strip(' \n') for tt in perts['orig_texts']]
perts.keys()

dict_keys(['orig_texts', 'necc_perturbed', 'suff_perturbed', 'necc_masks', 'suff_masks'])

In [3]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# add special tokens for URLs, emojis and mentions (--> see pre-processing)
special_tokens_dict = {'additional_special_tokens': ['[USER]','[EMOJI]','[URL]']}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

datasets = ['CAD_abuse', 
            'Davidson_abuse', 
            # 'Founta_abuse',
            'CAD_hate',
            'Davidson_hate',
            # 'Founta_hate'
           ]

In [None]:

orig_preds = {}
orig_scores = {}
necc_preds = {}
necc_scores = {}
suff_preds = {}
suff_scores = {}


for dataset in datasets:
    print("Classifying HateCheck perturbations with {}.".format(dataset))
    model = BertForSequenceClassification.from_pretrained("Models/{}".format(dataset))
    model.resize_token_embeddings(len(tokenizer))
    model.eval()
    
    total_len = len(perts['orig_texts']) + sum(len(nn) for nn in perts['necc_perturbed']) + sum(len(nn) for nn in perts['suff_perturbed'])
 
    with tqdm(total=total_len) as pbar:
        orig_preds[dataset], orig_scores[dataset] = get_preds_and_scores(perts['orig_texts'], tokenizer, model, pbar)
        
        necc_preds[dataset] = []
        necc_scores[dataset] = []
    
        for tt in perts['necc_perturbed']:
            pp, ss = get_preds_and_scores(tt, tokenizer, model, pbar)
            necc_preds[dataset].append(pp)
            necc_scores[dataset].append(ss)
            
        suff_preds[dataset] = []
        suff_scores[dataset] = []
    
        for tt in perts['suff_perturbed']:
            pp, ss = get_preds_and_scores(tt, tokenizer, model, pbar)
            suff_preds[dataset].append(pp)
            suff_scores[dataset].append(ss)
            
        
final_results = {
                'orig_preds': orig_preds,
                'orig_scores': orig_scores,
                'necc_preds': necc_preds,
                'necc_scores': necc_scores,
                'suff_preds': suff_preds,
                'suff_scores': suff_scores,
                }


Classifying HateCheck perturbations with CAD_abuse.


  0%|          | 0/66240 [00:00<?, ?it/s]

Classifying HateCheck perturbations with Davidson_abuse.


  0%|          | 0/66240 [00:00<?, ?it/s]

In [None]:
pickle.dump(final_results, open("Data/HateCheck_necc_suff_preds_2.pickle", "wb"))

In [4]:
final_results = pickle.load(open("Data/HateCheck_necc_suff_preds_2.pickle","rb"))

In [5]:
with open("Data/ILM/compound_dataset/train.txt", "r") as ff:
    compound_dataset = ff.read().split("\n\n\n")
compound_dataset = [tt.strip(" :`.,") for tt in compound_dataset]
shuffle(compound_dataset)
compound_dataset = compound_dataset[:5000]

In [6]:
datasets = ['CAD_abuse', 
            'Davidson_abuse', 
            # 'Founta_abuse',
            'CAD_hate',
            'Davidson_hate',
            # 'Founta_hate'
           ]

baseline_preds = {}
baseline_scores = {}

for dataset in datasets: 
    model = BertForSequenceClassification.from_pretrained("Models/{}".format(dataset))
    model.resize_token_embeddings(len(tokenizer))
    model.eval()
    preds, scores = get_preds_and_scores(compound_dataset, tokenizer, model)
    baseline_preds[dataset] = sum(preds)/len(preds)
    baseline_scores[dataset] = sum(scores)/len(scores)

In [7]:
pickle.dump({'baseline_preds':baseline_preds, 'baseline_scores':baseline_scores}, open("Classifier_baselines_2.pickle", "wb"))

In [8]:
necc_results = {}
necc_results_nb = {}
suff_results = {}
suff_results_nb = {}

for dataset in datasets:
    
    ## NECCESSITY CALCULATIONS
    neccs = []
    for oo, pp, mm in zip(final_results['orig_preds'][dataset], 
                          final_results['necc_preds'][dataset], 
                          perts['necc_masks']):
        pp = np.array(pp)
        neccs.append(calc_necc(oo, pp, mm))
    necc_results[dataset] = neccs 
    
    neccs_nb = []
    for oo, pp, mm in zip(final_results['orig_scores'][dataset], 
                          final_results['necc_scores'][dataset], 
                          perts['necc_masks']):
        pp = np.array(pp)
        neccs_nb.append(calc_necc(oo, pp, mm))
    necc_results_nb[dataset] = neccs_nb
    
    ## SUFFICIENCY CALCULATIONS
    baseline_pred = baseline_preds[dataset]
    baseline_score = baseline_scores[dataset]
    
    suffs = []
    for pp, mm in zip(final_results['suff_preds'][dataset], perts['suff_masks']):
        pp = np.array(pp)
        suffs.append(calc_suff(baseline_pred, pp, mm))
    suff_results[dataset] = suffs 
    
    suffs_nb = []
    for pp, mm in zip(final_results['suff_scores'][dataset], perts['suff_masks']):
        pp = np.array(pp)
        suffs_nb.append(calc_suff(baseline_score, pp, mm))
    suff_results_nb[dataset] = suffs_nb     
    

In [9]:
hatecheck_necc_suff_results = {
    'necc_results': necc_results,
    'necc_results_nb': necc_results_nb,
    'suff_results': suff_results, 
    'suff_results_nb': suff_results_nb
}

pickle.dump(hatecheck_necc_suff_results, open('Data/HateCheck_necc_suff_results_all_2.pickle', 'wb'))

In [10]:
# Now get the predictions for all models for the entire hatecheck suite
hc_test_cases_all = pd.read_csv("hatecheck-data/test_suite_cases.csv")

In [11]:
print(hc_test_cases_all)

      Unnamed: 0      functionality  case_id  \
0              0  derog_neg_emote_h        1   
1              1  derog_neg_emote_h        2   
2              2  derog_neg_emote_h        3   
3              3  derog_neg_emote_h        4   
4              4  derog_neg_emote_h        5   
...          ...                ...      ...   
3723        3723       spell_leet_h     3897   
3724        3724       spell_leet_h     3898   
3725        3725       spell_leet_h     3899   
3726        3726       spell_leet_h     3900   
3727        3727       spell_leet_h     3901   

                                        test_case label_gold     target_ident  \
0                                  I hate women.     hateful            women   
1                           I hate trans people.     hateful     trans people   
2                             I hate gay people.     hateful       gay people   
3                           I hate black people.     hateful     black people   
4                 

In [12]:
hc_test_cases_all_l = hc_test_cases_all.test_case.tolist()
hc_preds = {}
hc_scores = {}
for dataset in datasets: 
    model = BertForSequenceClassification.from_pretrained("Models/{}".format(dataset))
    model.resize_token_embeddings(len(tokenizer))
    model.eval()
    preds, scores = get_preds_and_scores(hc_test_cases_all_l, tokenizer, model)
    hc_preds[dataset] = preds
    hc_scores[dataset] = scores

pickle.dump({'preds': hc_preds, 'scores':hc_scores}, open('Data/HateCheck_results_all_models_2.pickle', "wb"))

In [13]:
for dataset in datasets:
    hc_test_cases_all['{}_pred'.format(dataset)] = hc_preds[dataset]
    hc_test_cases_all['{}_score'.format(dataset)] = hc_scores[dataset]

In [14]:
pickle.dump(hc_test_cases_all, open('Data/HateCheck_templates_and_results_2.pickle', "wb"))