In [1]:
'''
A (hopefully) Simple API for serving explanation score requests.

input_string = (
    f"{question} answer: {gold_label}. "
    + f" explanation: {abstr_expl}."
)

here are some example input strings:

If you feel like everything is spinning while climbing you are experiencing what? answer: vertigo. explanation: Vertigo is often experienced while climbing or at heights.
Where do you get clothes in a shopping bag? answer: retail store. explanation: For any large item where convenience is beneficial, one might go to a retail store, either a regular one or a big-box store like walmart.
Where should a cat be in a house? answer: floor. explanation: A cat should be on the floor, not on a rug.
'''
import pdb
import argparse
import torch
import transformers
import os
import tqdm
import numpy as np

_model, _tokenizer = None, None

model2url = {
    'large': 'https://storage.googleapis.com/ai2-mosaic-public/projects/few-shot-explanations/pretrained_models/commonsense_qa/valloss%3D0.28665~model%3Dt5-large~lr%3D0.0001~seed%3D1~labelagg%3D0_just_weights.pt',
    '3b': 'https://storage.googleapis.com/ai2-mosaic-public/projects/few-shot-explanations/pretrained_models/commonsense_qa/valloss%3D0.28925~model%3Dt5-3b~lr%3D0.0001~seed%3D1~labelagg%3D0_just_weights.pt',
    '11b': 'https://storage.googleapis.com/ai2-mosaic-public/projects/few-shot-explanations/pretrained_models/commonsense_qa/cose_deepspeed_valloss%3D0.00000~model%3Dt5-11b~lr%3D0.00001~seed%3D1~labelagg%3D0.pt',
}

def get_model(model_type, device=None):
    global _model, model2url
    if model_type not in {'11b', '3b', 'large'}:
        raise NotImplementedError('{} is not a valid model please use "3b" or "large"'.format(model_type))

    if _model is None:
        hf_model_name = 't5-' + model_type
        print('Loading model: this will run only once.')

        if model_type == 'large':
            model_path = 'csqa_models/t5-large.pt'
        elif model_type == '3b':
            model_path = 'csqa_models/valloss=0.28925~model=t5-3b~lr=0.0001~seed=1~labelagg=0_just_weights.pt'
        elif model_type == '11b':
            model_path = 'csqa_models/cose_deepspeed_valloss=0.00000~model=t5-11b~lr=0.00001~seed=1~labelagg=0.pt'

        if not os.path.exists(model_path):
            print('Please download weights for {} model and put in current directory.'.format(model_path))
            print('for example, wget {}'.format(model2url[model_type]))
            quit()

        state = torch.load(model_path)
        if 'model_state_dict' in state:
            state = state['model_state_dict']

        _model = transformers.AutoModelForSeq2SeqLM.from_pretrained(hf_model_name)
        if model_type == '11b': # need to resize due to deepspeed, these entires are not accessed.
            _model.resize_token_embeddings(len(transformers.AutoTokenizer.from_pretrained(hf_model_name)))
        _model.load_state_dict(state)
        _model.eval()
        if device is not None:
            _model = _model.to(device)

    return _model


def get_tokenizer(model_type):
    global _tokenizer
    if model_type not in {'3b', 'large', '11b'}:
        raise NotImplementedError('{} is not a valid model please use "3b" or "large" or "11b"'.format(model_type))

    if _tokenizer is None:
        hf_model_name = 't5-' + model_type
        _tokenizer = transformers.T5TokenizerFast.from_pretrained(hf_model_name)

    return _tokenizer


class T5Dataset(torch.utils.data.Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        res = self.tokenizer(self.data[idx]['input'], truncation=True)
        res['labels'] = self.tokenizer(self.data[idx]['label']).input_ids
        return res

    def __len__(self):
        return len(self.data)


def get_scores(inputs, model_type, device=None, batch_size=32, verbose=False):
    '''
    Inputs:
      - a list of explanations to score, e.g.,:
        premise: A man getting a tattoo on his back. hypothesis: A woman is getting a tattoo. answer: contradiction. explanation: Because the tattoo artist is a man, the person getting the tattoo is not a woman.
      - model type, either "3b" or "large" or "11b"
      - device: which torch device to load model on, e.g., "cuda:3"
    Outputs:
      - P(good explanation); higher is better
    '''
    assert model_type in {'large', '3b', '11b'}

    if isinstance(inputs, str):
        inputs = [inputs]

    model = get_model(model_type, device=device)
    tokenizer = get_tokenizer(model_type)

    score_itr = T5Dataset([{'input': inp, 'label': 'x'} for inp in inputs], tokenizer) # dummy labels for inference
    data_collator = transformers.DataCollatorForSeq2Seq(
        tokenizer,
        model=model,
        label_pad_token_id=-100,
#         return_tensors='pt'
    )
    score_itr = torch.utils.data.DataLoader(score_itr, shuffle=False, collate_fn=data_collator, batch_size=batch_size)
    score_itr = score_itr if not verbose else tqdm.tqdm(score_itr, total=len(score_itr))

    good_idx, bad_idx = tokenizer('good').input_ids[0], tokenizer('bad').input_ids[0]
    scores = []
    
    with torch.no_grad():
        for batch in score_itr:
            if device is not None:
                input_ids, attention_mask, targets = batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['labels'].to(device)
            model_output = model(input_ids=input_ids, attention_mask=attention_mask, labels=targets)
            logits_pos = model_output['logits'][:, 0, good_idx].cpu().numpy()
            logits_neg = model_output['logits'][:, 0, bad_idx].cpu().numpy()
            exp_logit_pos, exp_logit_neg = np.exp(logits_pos), np.exp(logits_neg)
            score = list([float(x) for x in exp_logit_pos / (exp_logit_pos + exp_logit_neg)])
            #pdb.set_trace()
            scores.extend(score)
    return scores


# def parse_args():
#     '''
#     Optional args for main function, mostly just to test.
#     '''
#     parser = argparse.ArgumentParser()
#     parser.add_argument(
#         'model_type',
#         default='large',
#         choices={'large', '3b', '11b'})
#     parser.add_argument(
#         '--batch_size',
#         default=32,
#         type=int)

#     args = parser.parse_args(['--batch_size', '1'])
#     return args

In [2]:
# args = parse_args()
# parser = argparse.ArgumentParser()
# parser.add_argument(
#     'model_type',
#     default='large',
#     choices={'large', '3b', '11b'})
# parser.add_argument(
#     '--batch_size',
#     default=32,
#     type=int)

# args = parser.parse_args(["--model_type", "3b"])
# args.device = 'cpu'#'cuda' if torch.cuda.is_available() else 'cpu'
np.random.seed(1)
import os 
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
# scores = get_scores(
#     ['If you feel like everything is spinning while climbing you are experiencing what? answer: vertigo. explanation: Vertigo is often experienced while climbing or at heights.',
#      'Where do you get clothes in a shopping bag? answer: retail store. explanation: For any large item where convenience is beneficial, one might go to a retail store, either a regular one or a big-box store like walmart.',
#      'Where should a cat be in a house? answer: floor. explanation: A cat should be on the floor, not on a rug.'],
#     'large',
#     device='cuda:0',
#     batch_size=1,
#     verbose=False)
# print(scores)


In [3]:
import json
from tqdm import tqdm
# with open("../../scripts/results/dev_rationale_pair.json") as f:
#     rationale_pair_dev_data = json.load(f)
import json
file_path = "../../scripts/results/48shots_cose_t5_base_authorwritten_rationales_generator_test_rationale_pair_iter.json"
with open(file_path, 'r') as f:
    rationale_pair_dev_data = json.load(f)
    

In [4]:
rationale_pair_dev_data[0].keys(), len(rationale_pair_dev_data)

(dict_keys(['id', 'question', 'choices', 'answer', 'abstractive_explanation', 'extractive_explanation', 'our_explanation', 'input_ids', 'attention_mask', 'labels', 'decoder_attention_mask', 'question_encoding', 'common_expl_list', 'generated_explanation']),
 201)

In [5]:
# tokenizer = get_tokenizer('3b')
# input_list = []
# for da in rationale_pair_dev_data:
#     input_list.append(tokenizer.decode(da['input_ids']))
# input_list

In [6]:
from tqdm import tqdm
qae_list = []
score_list = []
for da in tqdm(rationale_pair_dev_data, total=len(rationale_pair_dev_data)):
    qae = "{} answer: {} explanation: {}".format(da['question'], 
                                                 da['answer'], 
                                                 da['generated_explanation'])

    scores = get_scores(
        [qae],
        '3b',
        device='cuda:0',
        batch_size=1,
        verbose=False)
    score_list.append(scores[0])
#     if scores[0] > 0.7 or scores[0] < 0.2:
#         print("question: {}".format(da['question']))
#         print("answer: {}".format(da['answer']))
#         print("common_expl_list: {}".format(da['common_expl_list']))
#         print("generated_explanation: {}".format(da['generated_explanation']))
#         print("score: {}".format(scores[0]))

  0%|                                                                                                                                                                               | 0/201 [00:00<?, ?it/s]

Loading model: this will run only once.


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 201/201 [04:29<00:00,  1.34s/it]


In [7]:
score_list

[0.6884971261024475,
 0.6227219104766846,
 0.5261895060539246,
 0.27511972188949585,
 0.5329437255859375,
 0.36122140288352966,
 0.34961655735969543,
 0.36385947465896606,
 0.3343210220336914,
 0.39046651124954224,
 0.6152206063270569,
 0.5902165174484253,
 0.5245025753974915,
 0.7386172413825989,
 0.32149502635002136,
 0.5769733190536499,
 0.38756734132766724,
 0.4060823619365692,
 0.3855283856391907,
 0.5028019547462463,
 0.6197306513786316,
 0.541288435459137,
 0.5104238986968994,
 0.6357200741767883,
 0.6421171426773071,
 0.35788172483444214,
 0.5899034738540649,
 0.3866366744041443,
 0.5736709237098694,
 0.7055420875549316,
 0.2371165007352829,
 0.35953831672668457,
 0.5461113452911377,
 0.6109650135040283,
 0.6706572771072388,
 0.348745197057724,
 0.4980373680591583,
 0.5597338080406189,
 0.627579391002655,
 0.5462840795516968,
 0.7089242339134216,
 0.4219684600830078,
 0.4028230905532837,
 0.7221417427062988,
 0.5612038969993591,
 0.4468894302845001,
 0.37806111574172974,
 0.237

In [8]:
np.mean(score_list), np.median(score_list)

(0.5307908779501322, 0.5462840795516968)

## evaluate generated rationale with bert-score

In [9]:
import datasets
import numpy as np
bertscore_metric = datasets.load_metric("bertscore")
rouge_metric = datasets.load_metric('rouge')
bleu_metric = datasets.load_metric('sacrebleu')

In [10]:
import pdb

bert_scores = []
bleu_scores = []
rouge1_scores = []
rouge2_scores = []
rougeL_scores = []

for da in tqdm(rationale_pair_dev_data, total=len(rationale_pair_dev_data)):
    generated_expl = da['generated_explanation']
    common_expl_list = da['common_expl_list']
    pred_expl = generated_expl.split("<extra_id_0> ")[1].split("<extra_id_1>")[0]
    list_gold_expl = [l.lower() for l in common_expl_list]
    
    bert_score = bertscore_metric.compute(predictions=[pred_expl.lower()], references=[list_gold_expl], lang="en")["f1"][0]*100
    bleu_score = bleu_metric.compute(predictions=[pred_expl.lower()], references=[list_gold_expl])['score']
    rouge_score = rouge_metric.compute(predictions=[pred_expl.lower()]*len(list_gold_expl), references=list_gold_expl)
    rouge1_score = rouge_score["rouge1"].mid.fmeasure
    rouge2_score = rouge_score["rouge2"].mid.fmeasure
    rougeL_score = rouge_score["rougeL"].mid.fmeasure
    bert_scores.append(bert_score)
    bleu_scores.append(bleu_score)
    rouge1_scores.append(rouge1_score)
    rouge2_scores.append(rouge2_score)
    rougeL_scores.append(rougeL_score)
    
    
#     #print(generated_expl)
#     #print(generated_expl.split("<extra_id_0> ")[1].split("<extra_id_1>")[0])
#     instance_bertscores = []
#     for gold_expl in list_gold_expl: 
#         score = bertscore_metric.compute(predictions=[pred_expl.lower()]*len(), references=[gold_expl.lower()], lang="en")["f1"][0]*100
#         instance_bertscores.append(score)
#     bertscores.append(np.mean(instance_bertscores))
    
#     bleuscore = bleu_score(pred_expl, list_gold_expl)
#     bleuscores.append(bleuscore)
    
#     rougescore = rouge(pred_expl, list_gold_expl)
#     rouge1_scores.append(rougescore['rouge1_fmeasure'].numpy()[0])
#     rouge2_scores.append(rougescore['rouge2_fmeasure'].numpy()[0])
#     rougeL_scores.append(rougescore['rougeL_fmeasure'].numpy()[0])
    
    


    #pdb.set_trace()

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 201/201 [01:04<00:00,  3.13it/s]


In [11]:
print("bert_score: {}".format(np.mean(bert_scores)))
print("bleu_score: {}".format(np.mean(bleu_scores)))
print("rouge1_score: {}".format(np.mean(rouge1_scores)))
print("rouge2_score: {}".format(np.mean(rouge2_scores)))
print("rougeL_score: {}".format(np.mean(rougeL_scores)))


bert_score: 87.13736676458103
bleu_score: 8.279347274492993
rouge1_score: 0.24301870812973278
rouge2_score: 0.06932347196905293
rougeL_score: 0.19653377841211916


In [12]:

import numpy as np
bertscore = np.mean(bertscores)


NameError: name 'bertscores' is not defined

In [None]:
bertscore

In [None]:
bertscores

In [None]:
np.mean(score_list)