In [1]:
import wandb
wandb.login()

%env WANDB_PROJECT=evaluate_LM_with_rationalization

[34m[1mwandb[0m: Currently logged in as: [33mdengdenghuang[0m ([33mcuhk_lavilab[0m). Use [1m`wandb login --relogin`[0m to force relogin


env: WANDB_PROJECT=evaluate_LM_with_rationalization


In [2]:
# import gpt3
import logging
import math
import os

from typing import List, Dict, Any, NewType

InputDataClass = NewType("InputDataClass", Any)
os.environ["CUDA_VISIBLE_DEVICES"] = ""
from transformers import (
    T5Config,
    T5ForConditionalGeneration,
    T5Tokenizer,
    HfArgumentParser,
    TrainingArguments,
    set_seed,
    EarlyStoppingCallback
)
from transformers.trainer_utils import EvaluationStrategy
from transformers.integrations import TensorBoardCallback
import transformers
from transformers import Trainer

#from feature_conversion_methods import format_instance

from custom_args import (
    DataTrainingArguments,
    ModelArguments
)
from metrics import evaluate
import torch
import datasets
import git
import time
from datetime import datetime
import sys
from tqdm import trange
import random 
import pandas as pd 
import jsonlines
from copy import deepcopy 

logger = logging.getLogger(__name__)
transformers.logging.set_verbosity_info()
import re
def set_global_logging_level(level=logging.ERROR, prefices=[""]):
    """
    Override logging levels of different modules based on their name as a prefix.
    It needs to be invoked after the modules have been loaded so that their loggers have been initialized.

    Args:
        - level: desired level. e.g. logging.INFO. Optional. Default is logging.ERROR
        - prefices: list of one or more str prefices to match (e.g. ["transformers", "torch"]). Optional.
          Default is `[""]` to match all active loggers.
          The match is a case-sensitive `module_name.startswith(prefix)`
    """
    prefix_re = re.compile(fr'^(?:{ "|".join(prefices) })')
    for name in logging.root.manager.loggerDict:
        if re.match(prefix_re, name):
            logging.getLogger(name).setLevel(level)
set_global_logging_level(logging.ERROR, ["datasets"])


CONFIG_MAPPING = {"t5": T5Config}
MODEL_MAPPING = {"t5": T5ForConditionalGeneration}
TOKENIZER_MAPPING = {"t5": T5Tokenizer}


def set_other_seeds(seed):
    torch.backends.cudnn.benchmark = False
    #torch.backends.cudnn.deterministic = True
    os.environ['PYTHONHASHSEED'] = str(seed)

# inspired by DefaultDataCollator from:
# https://github.com/huggingface/transformers/blob/master/src/transformers/data/data_collator.py
# modified to perform batch-level padding.
class SequenceCollator:
    def __init__(self, model, pad_token):
        self.model = model
        self.pad_token_mapping = {
            "labels": -100,
            "attention_mask": 0,
            "decoder_attention_mask": 0,
            "input_ids": pad_token,
        }

        self.columns = [
            "input_ids",
            "attention_mask",
            "labels",
            "decoder_attention_mask",
        ]

    def __call__(self, examples: List[Dict[str, InputDataClass]]) -> Dict[str, torch.Tensor]:
        # re-format inputs for training
        batch = {}
        for key in examples[0].keys():
            if key in self.columns:
                tmp_list = []
                for item in examples:
                    tmp_list.append(item[key])

                # pad lists to max length
                if isinstance(tmp_list[0], list):
                    max_length = max(map(len, tmp_list))
                    tmp_list = [
                        el + [self.pad_token_mapping[key]] * (max_length - len(el))
                        for el in tmp_list
                    ]

                batch[key] = torch.tensor(tmp_list, dtype=torch.long)
        return batch


In [3]:
from collections import defaultdict
import random
"""
Example-to-Feature conversion methods
Modified from
https://github.com/salesforce/cos-e/blob/master/code/generation/train_commonsenseqa_v1.0.py and ""_v1.11.py (identical)
as well as Tensorflow code for WTF?: 
https://github.com/google-research/google-research/blob/master/wt5/wt5/preprocessors.py
"""
# This code is based on https://github.com/allenai/label_rationale_association/blob/main/feature_conversion_methods.py

unified_qa_esnli_label_mapping = {0: 'yes', 1: 'maybe', 2: 'no'}
unified_qa_esnli_label_mapping_upper = {0: 'Yes', 1: 'Maybe', 2: 'No'} 
wt5_esnli_label_mapping = {0: 'entailment', 1: 'neutral', 2: 'contradiction'} 
unified_qa_sbic_label_mapping = {"offensive": 'Yes', "not offensive": 'No'}

def format_instance(
        example,
        tokenizer,
        explanation_sep,
        max_seq_length=None,
        datasource=None,
        io_format=None, 
):
    assert datasource in {"cos_e", "esnli", "sbic", "sensemaking", "ecqa"}

    if datasource in ["cos_e", "ecqa"]:
        input_string, answer_string = cqa_formatting(example, io_format, explanation_sep, datasource)
    elif datasource == "esnli":
        input_string, answer_string = esnli_formatting(example, io_format, explanation_sep)
    elif datasource == 'sbic':
        input_string, answer_string = sbic_formatting(example, io_format, explanation_sep)
    elif datasource == 'sensemaking':
        input_string, answer_string = sensemaking_formatting(example, io_format, explanation_sep)
    else:
        raise ValueError("Unknown task. Currently supported: esnli, cos_e, sbic, sensemaking, ecqa.")
    
    if 'unified' in io_format and 'unifew' not in io_format:
        input_string += '</s>'

    input_string = ' '.join(input_string.split())
    answer_string = ' '.join(answer_string.split())

    input_string = ' '.join(input_string.split())
    answer_string = ' '.join(answer_string.split())

    encodings = tokenizer.encode_plus(
        input_string,
        max_length=max_seq_length,
        pad_to_max_length=False,
        return_token_type_ids=False,
        return_attention_mask=True,
    )


    # note even with "lm_labels.shift_right()", the decoder attention mask length is still correct since we remove the last token
    dec = tokenizer.encode_plus(
        answer_string,
        max_length=max_seq_length,
        pad_to_max_length=False,
        return_token_type_ids=False,
        return_attention_mask=True,
    )

    encodings["labels"] = dec["input_ids"]
    encodings["decoder_attention_mask"] = dec["attention_mask"]
    encodings["question_encoding"] = encodings["input_ids"]

    #return encodings
    return {**example, **encodings}

#这里很简单 定义好输入输出string就可以的
def cqa_formatting(item, io_format, explanation_sep, datasource):
    question = item["question"]
    answer = item["answer"]
    abstr_expl = item["abstractive_explanation"].lower() if datasource == 'cos_e' else item["explanation"].lower()


    if io_format == 't5_fewshot_infilling_with_choices':
        input_string = f"explain {datasource} question: {question} choice: " + " choice: ".join(item["choices"]) + f" <extra_id_0> {explanation_sep} <extra_id_1>"
        answer_string = f"<extra_id_0> {answer} <extra_id_1> {abstr_expl} <extra_id_2>"
    elif io_format == 't5_fewshot_infilling_more_natural':
        input_string = f"explain {datasource} question: {question} choice: " + " choice: ".join(item["choices"]) + f" The answer is <extra_id_0> {explanation_sep} <extra_id_1>"
        answer_string = f"<extra_id_0> {answer} <extra_id_1> {abstr_expl} <extra_id_2>"
    elif io_format == "squad": 
        input_string = f"explain {datasource} question: {question} context: " + ', '.join(item['choices']) # explain cos_e question: When getting in shape you need to have this in between workouts? context: give up, period of recovery, jogging
        answer_string = f"{answer} {explanation_sep} {abstr_expl}" # period of recovery because without a period of recovery you will not get any gains.
    elif io_format == "record": 
        # might not work because cos_e doesn't have a passage 
        input_string = f"explain {datasource} query: {question} entities: " + ', '.join(item['choices']) # explain cos_e query: When getting in shape you need to have this in between workouts? entities: give up, period of recovery, jogging
        answer_string = f"{answer} {explanation_sep} {abstr_expl}" # period of recovery because without a period of recovery you will not get any gains.
    elif io_format == 'unifiedqa_matching':
        choice_ids = ['(A)', '(B)', '(C)', '(D)', '(E)']
        input_string = f'explain {question.lower()} \\n'
        for choice_id, choice in zip(choice_ids, item["choices"]):
            input_string += f' {choice_id} {choice.lower()}'
        answer_string = f"{answer.lower()} {explanation_sep} {abstr_expl.lower()}"
        answer_string = answer_string.lower()
    elif io_format == 't5_fewshot_infilling_without_choices_use_refined_expl':
        input_string = f"explain {datasource} question: {question} choice: " + " choice: ".join(item["choices"]) + f" <extra_id_0> {explanation_sep} <extra_id_1>"
        input_string = f"explain {datasource} question: {question} answer: {answer}" + f" {explanation_sep} <extra_id_0>"
        answer_string = f"<extra_id_0> {item['our_explanation']} <extra_id_1>"

    else:
        raise ValueError("The IO format is not supported. Choose `standard` or `masked_cause_generate`.")
    
    return input_string, answer_string


def esnli_formatting(item, io_format, explanation_sep):

    premise = item["premise"]
    hypothesis = item["hypothesis"]
    answer = unified_qa_esnli_label_mapping[item["label"]] if 'unified' in io_format else wt5_esnli_label_mapping[item["label"]]
    abstr_expl = item["explanation_1"].lower() 
    # Dev/test instances have more than one explanation annotated; merge them into one sequence separated by [SEP] 
    for k in [2,3]:
        if f"explanation_{k}" in item and item[f'explanation_{k}']!='': 
            abstr_expl += f" [SEP] {item[f'explanation_{k}'].lower()}"

    if io_format == 'standard':
        input_string = f"explain nli hypothesis: {hypothesis} premise: {premise}"
        answer_string = f"{answer} {explanation_sep} {abstr_expl}"
    elif io_format == 't5_fewshot_infilling':
        input_string = f"explain nli hypothesis: {hypothesis} premise: {premise} <extra_id_0> {explanation_sep} <extra_id_1>"
        answer_string = f"<extra_id_0> {answer} <extra_id_1> {abstr_expl} <extra_id_2>"
    elif io_format == 't5_fewshot_infilling_more_natural':
        input_string = f"explain nli hypothesis: {hypothesis} premise: {premise} This is <extra_id_0> {explanation_sep} <extra_id_1>"
        answer_string = f"<extra_id_0> {answer} <extra_id_1> {abstr_expl} <extra_id_2>"
    elif io_format == "squad": 
        input_string = f"explain nli question: Is this entailment? context: {hypothesis} {premise}"  
        answer_ynm = unified_qa_esnli_label_mapping[item["label"]]
        answer_string = f"{answer_ynm} {explanation_sep} {abstr_expl}" 
    elif io_format == "squad_endswith_what":
        input_string = f"explain nli question: What is this? context: {hypothesis} {premise}"  
        answer_string = f"{answer} {explanation_sep} {abstr_expl}"  
    elif io_format == "squad_nli_mix": 
        input_string = f"explain nli question: Is this entailment? context: hypothesis: {hypothesis} premise: {premise}"  
        answer_ynm = unified_qa_esnli_label_mapping[item["label"]]
        answer_string = f"{answer_ynm} {explanation_sep} {abstr_expl}"  
    elif io_format == "squad_nli_mix_endswith_what":  
        input_string = f"explain nli question: What is this? context: hypothesis: {hypothesis} premise: {premise}"  
        answer_string = f"{answer} {explanation_sep} {abstr_expl}"   
    elif io_format == 'unifiedqa_unifew':
        hypothesis = hypothesis.lower().rstrip('.')
        unified_qa_esnli_label_mapping_upper = {0: 'Yes', 1: 'Maybe', 2: 'No'}
        answer = unified_qa_esnli_label_mapping_upper[item["label"]]
        input_string = f'explain {premise} Is {hypothesis}? \\n (A) Yes (B) Maybe (C) No'
        answer_string = f"{answer} {explanation_sep} {abstr_expl}"  
    elif io_format == 'unifiedqa_unifew_nli_mix':
        premise = premise.lower().rstrip('.')
        unified_qa_esnli_label_mapping_upper = {0: 'Yes', 1: 'Maybe', 2: 'No'}
        input_string = f'explain hypothesis: {hypothesis} Is premise: {premise}? \\n (A) Yes (B) Maybe (C) No'
        answer_string = f"{answer} {explanation_sep} {abstr_expl}"  
    elif io_format == 'unifiedqa_ynm': 
        input_string = f'explain is this entailment? \\n {hypothesis.lower()} {premise.lower()}'  
        answer = unified_qa_esnli_label_mapping[item["label"]]
        answer_string = f"{answer} {explanation_sep} {abstr_expl.lower()}"  
    elif io_format == 'unifiedqa_snli_mix_ynm': 
        input_string = f'explain is this entailment? \\n hypothesis: {hypothesis.lower()} premise: {premise.lower()}' 
        answer = unified_qa_esnli_label_mapping[item["label"]]
        answer_string = f"{answer} {explanation_sep} {abstr_expl.lower()}"  
    elif io_format == 'unifiedqa_snli_mix_ynm_with_choices': 
        input_string = f'explain is this entailment? \\n (A) yes (B) maybe (C) no \\n hypothesis: {hypothesis.lower()} premise: {premise.lower()}'  
        answer = unified_qa_esnli_label_mapping[item["label"]]
        answer_string = f"{answer} {explanation_sep} {abstr_expl.lower()}"  
    elif io_format == 'unifiedqa_what_v2': 
        input_string = f'explain what is this? \\n {hypothesis.lower()} {premise.lower()}'  
        answer = wt5_esnli_label_mapping[item["label"]]
        answer_string = f"{answer} {explanation_sep} {abstr_expl.lower()}"  
    elif io_format == 'unifiedqa_snli_mix_what_v2': 
        input_string = f'explain what is this? \\n hypothesis: {hypothesis.lower()} premise: {premise.lower()}'  
        answer = wt5_esnli_label_mapping[item["label"]]
        answer_string = f"{answer} {explanation_sep} {abstr_expl.lower()}"  
    elif io_format == 'unifiedqa_snli_mix_what_with_choices_v2': 
        input_string = f'explain what is this? \\n (A) entailment (B) neutral (C) contradiction \\n hypothesis: {hypothesis.lower()} premise: {premise.lower()}'  
        answer = wt5_esnli_label_mapping[item["label"]]
        answer_string = f"{answer} {explanation_sep} {abstr_expl.lower()}"     
    else:
        raise ValueError("The IO format is not supported.")

    return input_string, answer_string


def sbic_formatting(item, io_format, explanation_sep):
    # We pre-processed the SBIC dataset such that we join multiple implied statements with the [SEP] token for dev/test instances 
    # Each annotation in the training split is a separate instance
    post = item['post']
    answer = unified_qa_sbic_label_mapping[item["offensiveYN"]] if 'unified' in io_format else item["offensiveYN"].replace("not offensive", "not_offensive")
    abstr_expl = item["targetStereotype"]
    

    if io_format == 'cola':
        input_string = f"explain sbic sentence: {post}"
        answer_string = f"{answer} {explanation_sep} {abstr_expl}"  
    elif io_format == 't5_fewshot_infilling':
        input_string = f"explain sbic sentence: {post} <extra_id_0> {explanation_sep} <extra_id_1>"
        answer_string = f"<extra_id_0> {answer} <extra_id_1> {abstr_expl} <extra_id_2>"
    elif io_format == 't5_fewshot_infilling_bool':
        answer_bool = 'True' if item["offensiveYN"] == 'offensive' else 'False'
        input_string = f"explain sbic sentence: {post} It is <extra_id_0> that this is offensive {explanation_sep} <extra_id_1>"
        answer_string = f"<extra_id_0> {answer} <extra_id_1> {abstr_expl} <extra_id_2>"
    elif io_format == 't5_fewshot_infilling_more_natural':
        input_string = f"explain sbic sentence: {post} This is <extra_id_0> {explanation_sep} <extra_id_1>"
        answer_string = f"<extra_id_0> {answer} <extra_id_1> {abstr_expl} <extra_id_2>"
    elif io_format == 'squad_yn': 
        input_string = f"explain sbic question: Is this offensive? context: {post}" # explain sbic question: Is this offensive? context: It's only happened to you like two times.
        answer_yn = unified_qa_sbic_label_mapping[item["offensiveYN"]] 
        answer_string = f"{answer_yn} {explanation_sep} {abstr_expl}" #  No because this post does not imply anything offensive
    elif io_format == 'squad_yn_with_tags': 
        input_string = f"explain sbic question: Is this offensive? context: post: {post}" # explain sbic question: Is this offensive? context: It's only happened to you like two times.
        answer_yn = unified_qa_sbic_label_mapping[item["offensiveYN"]] 
        answer_string = f"{answer_yn} {explanation_sep} {abstr_expl}" #  No because this post does not imply anything offensive
    elif io_format == 'squad_what': 
        input_string = f"explain sbic question: What is this? context: {post}" # explain sbic question: This is what? context: It's only happened to you like two times.
        answer = item["offensiveYN"].replace("not offensive", "not_offensive")        
        answer_string = f"{answer} {explanation_sep} {abstr_expl}" # not_offensive because this post does not imply anything offensive    
    elif io_format == 'squad_what_with_tags': 
        input_string = f"explain sbic question: What is this? context: post: {post}" # explain sbic question: This is what? context: It's only happened to you like two times.
        answer = item["offensiveYN"].replace("not offensive", "not_offensive")        
        answer_string = f"{answer} {explanation_sep} {abstr_expl}" # not_offensive because this post does not imply anything offensive    
    elif io_format == 'unified_qa_yn':
        input_string = f'explain is this offensive? \\n {post.lower()}'
        answer_string = f"{answer} {explanation_sep} {abstr_expl.lower()}" 
    elif io_format == 'unified_qa_yn_with_tags':
        input_string = f'explain is this offensive? \\n post: {post.lower()}'
        answer_string = f"{answer} {explanation_sep} {abstr_expl.lower()}"  
    elif io_format == 'unified_qa_yn_with_choices':
        input_string = f'explain is this offensive? \\n (A) yes (B) no \\n {post.lower()}'
        answer_string = f"{answer} {explanation_sep} {abstr_expl.lower()}"
    elif io_format == 'unified_qa_yn_with_choices_and_tags':
        input_string = f'explain is this offensive? \\n (A) yes (B) no \\n post: {post.lower()}'
        answer_string = f"{answer} {explanation_sep} {abstr_expl.lower()}"  
    elif io_format == 'unified_qa_what':
        input_string = f'explain what is this? \\n {post.lower()}'
        answer = item["offensiveYN"].replace("not offensive", "not_offensive")
        answer_string = f"{answer} {explanation_sep} {abstr_expl.lower()}"  
    elif io_format == 'unified_qa_what_with_tags':
        input_string = f'explain what is this? \\n post: {post.lower()}'
        answer = item["offensiveYN"].replace("not offensive", "not_offensive")
        answer_string = f"{answer} {explanation_sep} {abstr_expl.lower()}"  
    elif io_format == 'unified_qa_what_with_choices':
        input_string = f'explain what is this? \\n (A) offensive (B) not_offensive \\n {post.lower()}'
        answer = item["offensiveYN"].replace("not offensive", "not_offensive")
        answer_string = f"{answer} {explanation_sep} {abstr_expl.lower()}"
    elif io_format == 'unified_qa_what_with_choices_and_tags':
        input_string = f'explain what is this? \\n (A) offensive (B) not_offensive \\n post: {post.lower()}'
        answer = item["offensiveYN"].replace("not offensive", "not_offensive")
        answer_string = f"{answer} {explanation_sep} {abstr_expl.lower()}"  
    elif io_format == 'unifiedqa_unifew':
        input_string = f"Topic? \\n (A) offensive (B) not_offensive \\n {post}"
        answer = item["offensiveYN"].replace("not offensive", "not_offensive")
        answer_string = f"{answer} {explanation_sep} {abstr_expl}"
    else:
        raise ValueError("The IO format is not supported. Choose `standard` or `masked_cause_generate`.")

    input_string = ' '.join(input_string.split())
    answer_string = ' '.join(answer_string.split())
    return input_string, answer_string

def sensemaking_formatting(item, io_format, explanation_sep):
    # TODO: explore whether removing periods makes difference? 
    sent0 = item['sent0']
    sent1 = item['sent1']
    nonsensical_sentence = str(int(item['label'])+1)
    explanation = item['explanation'].lower()

    if io_format == 'copa_with_question':
        input_string = f"explain sensemaking choice1: {sent0} choice2: {sent1} question: nonsensical"
        answer_string = f"choice{nonsensical_sentence} {explanation_sep} {explanation}"
    elif io_format == 'copa_bool':  
        answer_bool = str(bool(int(item['label']))) # True if choice2 is more nonsensical    
        input_string = f"explain sensemaking choice1: {sent0} choice2: {sent1} Less common is choice2"
        answer_string = f"{answer_bool} {explanation_sep} {explanation}"
    elif io_format == 't5_fewshot_infilling':  
        input_string = f"explain sensemaking choice1: {sent0} choice2: {sent1} <extra_id_0> {explanation_sep} <extra_id_1>"
        answer_string = f"<extra_id_0> choice{nonsensical_sentence} <extra_id_1> {explanation} <extra_id_2>"
    elif io_format == 't5_fewshot_infilling_bool':  
        answer_bool = str(bool(int(item['label']))) # True if choice2 is more nonsensical    
        input_string = f"explain sensemaking choice1: {sent0} choice2: {sent1} It is <extra_id_0> that choice2 is less common {explanation_sep} <extra_id_1>"
        answer_string = f"<extra_id_0> {answer_bool} <extra_id_1> {explanation} <extra_id_2>"
    elif io_format == "squad_yn": 
        input_string = f"explain sensemaking question: Is choice2 more nonsensical? context: choice1: {sent0} choice2: {sent1}" # explain sensemaking question: What is nonsensical, choice1 or choice2? context: choice1: All state flowers are the scarlet carnation. choice2: The New Jersey state flower is the scarlet carnation
        answer = "Yes" if bool(int(item['label'])) else "No"
        answer_string = f"{answer} {explanation_sep} {explanation}" #  choice1 because state flowers are unique to each state.  
    elif io_format == "squad_yn_no_tags": 
        input_string = f"explain sensemaking question: Is choice2 more nonsensical? context: {sent0} {sent1}" # explain sensemaking question: What is nonsensical, choice1 or choice2? context: choice1: All state flowers are the scarlet carnation. choice2: The New Jersey state flower is the scarlet carnation
        answer = "Yes" if bool(int(item['label'])) else "No"
        answer_string = f"{answer} {explanation_sep} {explanation}" #  choice1 because state flowers are unique to each state.  
    elif io_format == "squad_what": 
        input_string = f"explain sensemaking question: What is more nonsensical? context: choice1: {sent0} choice2: {sent1}" # explain sensemaking question: What is nonsensical, choice1 or choice2? context: choice1: All state flowers are the scarlet carnation. choice2: The New Jersey state flower is the scarlet carnation
        answer_string = f"choice{nonsensical_sentence} {explanation_sep} {explanation}" #  choice1 because state flowers are unique to each state.  
    elif io_format == "squad_what_no_tags": 
        input_string = f"explain sensemaking question: What is more nonsensical? context: {sent0} {sent1}" # explain sensemaking question: What is nonsensical, choice1 or choice2? context: choice1: All state flowers are the scarlet carnation. choice2: The New Jersey state flower is the scarlet carnation
        answer_string = f"choice{nonsensical_sentence} {explanation_sep} {explanation}" #  choice1 because state flowers are unique to each state.  
    elif io_format == "record": 
        input_string = f"explain sensemaking query: What is more nonsensical? entities: choice1, choice2 passage: choice1: {sent0} choice2: {sent1}" # explain sensemaking query: What is nonsensical? entities: choice1, choice2 passage: choice1: All state flowers are the scarlet carnation. choice2: The New Jersey state flower is the scarlet carnation.
        answer_string = f"choice{nonsensical_sentence} {explanation_sep} {explanation}" # choice1 because state flowers are unique to each state.
    elif io_format == 'unifiedqa_yn_with_choices':
        answer = "yes" if bool(int(item['label'])) else "no"
        input_string = f'explain is choice2 more nonsensical? \\n (A) yes (B) no \\n choice1: {sent0.lower()} choice2: {sent1.lower()}'
        answer_string = f"{answer} {explanation_sep} {explanation.lower()}" 
    elif io_format == 'unifiedqa_yn':
        answer = "yes" if bool(int(item['label'])) else "no"
        input_string = f'explain is choice2 more nonsensical? \\n choice1: {sent0.lower()} choice2: {sent1.lower()}'
        answer_string = f"{answer} {explanation_sep} {explanation.lower()}"  
    elif io_format == 'unifiedqa_yn_no_tags':
        answer = "yes" if bool(int(item['label'])) else "no"
        input_string = f'explain is choice2 more nonsensical? \\n {sent0.lower()} {sent1.lower()}'
        answer_string = f"{answer} {explanation_sep} {explanation.lower()}"  
    elif io_format == 'unifiedqa_what_with_choices':
        nonsensical_sentence = str(int(item['label'])+1)
        input_string = f'explain what is more nonsensical? \\n (A) choice1 (B) choice2 \\n choice1: {sent0.lower()} choice2: {sent1.lower()}'
        answer_string = f"choice{nonsensical_sentence} {explanation_sep} {explanation.lower()}"  # use " BECAUSE "
    elif io_format == 'unifiedqa_what':
        nonsensical_sentence = str(int(item['label'])+1)
        input_string = f'explain what is more nonsensical? \\n choice1: {sent0.lower()} choice2: {sent1.lower()}'
        answer_string = f"choice{nonsensical_sentence} {explanation_sep} {explanation.lower()}"  # use " BECAUSE "
    elif io_format == 'unifiedqa_what_no_tags':
        nonsensical_sentence = str(int(item['label'])+1)
        input_string = f'explain what is more nonsensical? \\n {sent0.lower()} {sent1.lower()}'
        answer_string = f"choice{nonsensical_sentence} {explanation_sep} {explanation.lower()}"  # use " BECAUSE "


    input_string = ' '.join(input_string.split())
    answer_string = ' '.join(answer_string.split())
    return input_string, answer_string

In [4]:
og_start_time = time.time()

#parser = HfArgumentParser(
#    (ModelArguments, DataTrainingArguments, TrainingArguments)
#)
parser = HfArgumentParser(
    (ModelArguments, DataTrainingArguments, TrainingArguments)
)

model_args, data_args, training_args, unused_args = parser.parse_args_into_dataclasses(
    ["--model_type", "t5-3b",
     "--tokenizer_name", "t5-3b",
     "--task_name", "cos_e", 
     "--output_dir", "./cos_e_output_t5_3b", 
     "--n_shots", "10",
     "--do_train", "True"], return_remaining_strings=True)
if unused_args != []:
    raise ValueError(f"Received unused arguments: {unused_args}")
# make sure only one dataset split pick if manually specifying evaluation file

if model_args.use_gpt3:
    assert training_args.do_train
    assert not training_args.do_eval
    assert data_args.generations_filepath is None
    if data_args.gpt3_max_eval_size is not None:
        assert data_args.gpt3_max_eval_size <= data_args.fewshot_eval_size
        assert data_args.gpt3_max_eval_size % 2 == 0
        assert data_args.gpt3_max_eval_size % 3 == 0

if data_args.generations_filepath is not None:
    training_args.do_train = False
    training_args.do_eval = False
    if "train" in data_args.generations_filepath:
        data_args.train_predict = True
        data_args.test_predict = False
        data_args.dev_predict = False
    elif "test" in data_args.generations_filepath:
        data_args.train_predict = False
        data_args.test_predict = True
        data_args.dev_predict = False
    elif "validation" in data_args.generations_filepath:
        data_args.train_predict = False
        data_args.test_predict = False
        data_args.dev_predict = True

if not training_args.do_train and data_args.generations_filepath is None:
    if not model_args.pretrained_model_file:
        raise Exception(
            "if not training a model from scratch, must specify a trained model to load for evaluation"
        )

if training_args.do_train:
    # create a save directory and a logfile
    training_args.output_dir = os.path.join(
        training_args.output_dir, datetime.now().strftime("%m%d%y_%H%M%S")
    )
    training_args.logging_dir = training_args.output_dir
    assert not os.path.exists(training_args.output_dir)
    os.makedirs(training_args.output_dir)

    if (
            os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir)
            and training_args.do_train
            and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )
    handlers = [
        logging.FileHandler(os.path.join(training_args.output_dir, "logger.log")),
        logging.StreamHandler(),
    ]
else:
    # don't overwrite existing logfile or create new directory
    training_args.output_dir = model_args.pretrained_model_file
    handlers = [logging.StreamHandler()]

# Setup logging
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    handlers=handlers,
)
logger.warning(
    "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
    training_args.local_rank,
    training_args.device,
    training_args.n_gpu,
    bool(training_args.local_rank != -1),
    training_args.fp16,
)
logger.info("Save path: %s" % training_args.output_dir)

# get git hash and branch where deployed
repo = git.Repo(search_parent_directories=True)
git_hash = repo.head.object.hexsha
git_branch = repo.active_branch.name
logger.info("Git branch: %s" % git_branch)
logger.info("Git hash: %s" % git_hash)

model_class = "t5"
assert data_args.task_name in {"cos_e", "esnli", "sbic", "sensemaking", "ecqa"}

if training_args.do_train:
    # write command and args to file
    with open(
            os.path.join(training_args.output_dir, "commandline_args.txt"), "w"
    ) as f:
        f.write("Git branch: " + git_branch + "\n")
        f.write("Git hash: " + git_hash + "\n")
        f.write("Command:\n")
        f.write("\n".join(sys.argv[1:]))

# Set seed
set_seed(training_args.seed)
set_other_seeds(training_args.seed)

# Load pretrained model and tokenizer
#
# Distributed training:
# The .from_pretrained methods guarantee that only one local process can concurrently
# download model & vocab.

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
11/20/2022 11:51:19 - INFO - __main__ -   Save path: ./cos_e_output_t5_3b/112022_115119
11/20/2022 11:51:19 - INFO - __main__ -   Git branch: dev
11/20/2022 11:51:19 - INFO - __main__ -   Git hash: 1cbb5c3b4e53baf31cbafc20d9655c63f091f901


In [5]:
training_args.device

device(type='cpu')

In [6]:
import logging
logger = logging.getLogger(__name__)
CONFIG_MAPPING = {"t5": T5Config}
MODEL_MAPPING = {"t5": T5ForConditionalGeneration}
TOKENIZER_MAPPING = {"t5": T5Tokenizer}
model_class = "t5"
tokenizer_name = TOKENIZER_MAPPING[model_class]
logger.info("Loading pretrained tokenizer...")
model_args.tokenizer_name='t5-3b'
tokenizer = tokenizer_name.from_pretrained(model_args.tokenizer_name)#, cache_dir=model_args.cache_dir)

model = T5ForConditionalGeneration.from_pretrained("t5-3b")

11/20/2022 11:51:19 - INFO - __main__ -   Loading pretrained tokenizer...
loading file https://huggingface.co/t5-3b/resolve/main/spiece.model from cache at /home/huangyongfeng/.cache/huggingface/transformers/529487bfb232bc6331b488e0e3f011af7d700beb874529a38613f0c162994f36.3b69006860e7b5d0a63ffdddc01ddcd6b7c318a6f4fd793596552c741734c62d
loading file https://huggingface.co/t5-3b/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/t5-3b/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/t5-3b/resolve/main/tokenizer_config.json from cache at None
loading file https://huggingface.co/t5-3b/resolve/main/tokenizer.json from cache at /home/huangyongfeng/.cache/huggingface/transformers/8cc0c6618e070737993bd96f1f5251e1cc850a347fa1ff28c378c89c66e66c80.8627f1bd5d270a9fd2e5a51c8bec3223896587cc3cfe13edeabb0992ab43c529
loading configuration file https://huggingface.co/t5-3b/resolve/main/config.json from cache at /home/huangyong

In [7]:
data_splits = {'train': None, 'validation': None, 'test': None}
original_data_splits = {'train': None, 'validation': None, 'test': None}  
data_args.io_format="t5_fewshot_infilling_without_choices_use_refined_expl"
data_args

DataTrainingArguments(task_name='cos_e', early_stopping_patience=10, overwrite_cache=False, train_predict=False, test_predict=False, dev_predict=False, version_name='v1.11', generations_filepath=None, n_shots=10, fewshot_eval_size=350, io_format='t5_fewshot_infilling_without_choices_use_refined_expl', explanation_sep='explanation', data_path=None, gpt3_max_eval_size=None)

In [8]:
dataset = datasets.load_dataset(data_args.task_name, data_args.version_name)
train_ids_list=[x['id'] for x in dataset["train"]]
dataset['train'][0].keys()

  0%|          | 0/2 [00:00<?, ?it/s]

dict_keys(['answer', 'question', 'choices', 'extractive_explanation', 'abstractive_explanation', 'id'])

In [9]:
#rationale generation labeled data construction 115
import pandas as pd
scr_csqa_labeled_path="/cognitive_comp/huangyongfeng/evaluate_LM_with_rationalization/few_shot_explanations/data/handwritten_cose_v1.11_examples.csv"
scr_csqa_label_df=pd.read_csv(scr_csqa_labeled_path)
scr_csqa_label_data=datasets.load_dataset('csv', data_files=scr_csqa_labeled_path)
scr_csqa_label_ids_list=[x['id'] for x in scr_csqa_label_data['train']]
scr_csqa_indexs_list=[train_ids_list.index(id_) for id_ in scr_csqa_label_ids_list]
scr_csqa_label_our_explanations_list=[x['our_explanation'] for x in scr_csqa_label_data['train']]
print(scr_csqa_indexs_list)
data_splits={}
data_splits['train']=dataset['train'].select(scr_csqa_indexs_list)

refine_train_data=[]
for kk, (ex,da) in enumerate(zip(scr_csqa_label_our_explanations_list, data_splits['train'])):
#     print(da)
    data_splits['train'][kk]['our_explanation']=ex
    #print(type(data_splits['train'][kk]),data_splits['train'][kk].keys())
    da['our_explanation']=ex
    refine_train_data.append(da)


refine_train_data[0],scr_csqa_label_data['train'][0],data_splits['train'][0]

  0%|          | 0/1 [00:00<?, ?it/s]

[6723, 3269, 12, 6020, 7815, 0, 13, 7, 3556, 4587, 3166, 2607, 1999, 332, 8571, 2, 3520, 6, 1425, 2361, 3764, 7728, 8352, 1273, 5191, 496, 11, 8143, 6929, 4552, 4652, 7764, 6736, 401, 147, 8339, 3128, 5556, 8769, 4417, 8778, 7145, 1279, 6297, 7149, 5884, 10, 5747, 6273, 8, 9504, 8057, 1307, 2183, 1699, 88, 3228, 7178, 2140, 4048, 1041, 1, 3, 5868, 111, 7889, 5370, 8886, 2137, 6980, 9117, 1891, 7827, 1484, 14, 3668, 7090, 5, 7817, 6213, 4656, 5074, 7514, 5896, 4, 9, 7991, 58, 1887, 1598, 3994, 2212, 2274, 6830, 8826, 1604, 9008, 2013, 6521, 4793, 4338, 3923, 8091, 4904, 3938, 3900, 8156, 5806, 8363, 1787, 1496, 5753, 6351, 1104, 1222]


({'id': 'cabdfc174953b4bdb8bdcc89d6592c74',
  'question': 'What is someone not legal to buy alcohol?',
  'choices': ['underage', 'banned', 'adult', 'rules', 'black market'],
  'answer': 'underage',
  'abstractive_explanation': '21 is the legal age',
  'extractive_explanation': 'not legal to buy alcohol',
  'our_explanation': '21 is the legal age to buy alcohol in the US'},
 {'answer': 'underage',
  'question': 'What is someone not legal to buy alcohol?',
  'choices': "['underage', 'banned', 'adult', 'rules', 'black market']",
  'type': 'evaluation',
  'orig_explanation': '21 is the legal age',
  'our_explanation': '21 is the legal age to buy alcohol in the US',
  'id': 'cabdfc174953b4bdb8bdcc89d6592c74'},
 {'answer': 'underage',
  'question': 'What is someone not legal to buy alcohol?',
  'choices': ['underage', 'banned', 'adult', 'rules', 'black market'],
  'extractive_explanation': 'not legal to buy alcohol',
  'abstractive_explanation': '21 is the legal age',
  'id': 'cabdfc174953b4

In [88]:
#rationale unlabeled data construction 991
import pdb
scr_csqa_unlabeled_test_file="/cognitive_comp/huangyongfeng/evaluate_LM_with_rationalization/few_shot_explanations/data/acceptability_annotations/commonsenseqa_test.csv"
fse_csqa_dev_dataset = datasets.load_dataset('csv', data_files=scr_csqa_unlabeled_test_file)
scr_csqa_unlabeled_test_df=pd.read_csv(scr_csqa_unlabeled_test_file)
fse_csqa_dev_data_dict={}
for kk, da in enumerate(fse_csqa_dev_dataset['train']):
    #pdb.set_trace()
    id_=da['Input.id']
    if da['Answer.acceptable']:
        answer_accept=set(da['Answer.acceptable'].split('|'))
    else:
        answer_accept=set()
    explanation_list=[da['Input.explanation_1'], 
                      da['Input.explanation_2'],
                      da['Input.explanation_3'],
                      da['Input.explanation_4'],
                      da['Input.explanation_5']]
    if id_ not in fse_csqa_dev_data_dict.keys():
        fse_csqa_dev_data_dict[id_]={"index":kk,"id":id_,
                                     "question":da["Input.question"],
                                     'answer':da['Input.gold_label'],
                                     "accept_set_list":[answer_accept],
                                     "explanation_list":explanation_list}
        #[[kk, id_, answer_accept, explanation_list]]
    else:
        fse_csqa_dev_data_dict[id_]["accept_set_list"].append(answer_accept)
    if len(fse_csqa_dev_data_dict[id_]["accept_set_list"])==3:
        fse_csqa_dev_data_dict[id_]["common_expl_list"]=[]
        common_accept_expl_sample=set.intersection(fse_csqa_dev_data_dict[id_]["accept_set_list"][0], 
                                                   fse_csqa_dev_data_dict[id_]["accept_set_list"][1], 
                                                   fse_csqa_dev_data_dict[id_]["accept_set_list"][2])
        for idx in list(common_accept_expl_sample):
            idx=int(idx)-1
            fse_csqa_dev_data_dict[id_]["common_expl_list"].append(fse_csqa_dev_data_dict[id_]["explanation_list"][idx])
        
#discriminate 3/3 id
id_accept_expl_list=[]
our_accept_expl_list=[]
id_unaccept_expl_list=[]
for k,v in fse_csqa_dev_data_dict.items():
    accept_set_list=v['accept_set_list']
    assert len(accept_set_list)==3
    #pdb.set_trace()
    common_accept_expl_sample=set.intersection(accept_set_list[0], accept_set_list[1], accept_set_list[2])
    #print(accept_set_list,common_accept_expl_sample)
    #pdb.set_trace()
    if common_accept_expl_sample:
        id_accept_expl_list.append(k)
        our_expl=""
        for idx in list(common_accept_expl_sample):
            idx=int(idx)-1
            if len(v['explanation_list'][idx]) > len(our_expl):
                our_expl = v['explanation_list'][idx]
                our_accept_expl_list.append(our_expl)
    else:
        id_unaccept_expl_list.append(k)

dev_ids_list=[x['id'] for x in dataset['validation']]
dev_accept_indexs_list=[dev_ids_list.index(id_) for id_ in id_accept_expl_list]
dev_unaccpet_indexs_list=[dev_ids_list.index(id_) for id_ in id_unaccept_expl_list]
dev_accept_data=dataset['validation'].select(dev_accept_indexs_list)
dev_unaccept_data=dataset['validation'].select(dev_unaccpet_indexs_list)



new_dev_accept_data=[]
new_dev_unaccept_data=[]
for oexp, da in zip(our_accept_expl_list,dev_accept_data):
    da["our_explanation"]=oexp
    new_dev_accept_data.append(da)
for da in dev_unaccept_data:
    da["our_explanation"]=oexp
    new_dev_unaccept_data.append(da)
        
new_dev_accept_data[0]
        

  0%|          | 0/1 [00:00<?, ?it/s]

{'id': '5b8a3081c3235d62bc77e2d15f3ad454',
 'question': 'A town between two mountains is located in a what?',
 'choices': ['valley', 'hospital', 'state', 'train station', 'michigan'],
 'answer': 'valley',
 'abstractive_explanation': 'valleys are always between two mountains',
 'extractive_explanation': 'A town between two mountains',
 'our_explanation': 'A town in between mountains presumably would be in a valley, in which case it is plausable that it would be surrounded by heights in every direction.'}

In [92]:
# for k,v in fse_csqa_dev_data_dict.items():
#     #print(v.keys())
#     print("******")
#     print(v['question'])
#     print(v['answer'])
#     print(v['common_expl_list'])
#     print("******")    
for k,v in fse_csqa_train_data_dict.items():
    #print(v.keys())
    print("******")
    print(v['question'])
    print(v['answer'])
    print(v['common_expl_list'])
    print("******")    

******
Where can you likely buy many poems?
book store
['A bookstore sells a variety of books, including poetry books; chains of bookstores sometimes specialize in categories such as poetry or literature.', 'A book store is a place where you can buy books, including poetry books.']
******
******
What could bringing suit do to a rivalry?
aggravation
['Bringing suit against a rival could aggravate the rivalry.']
******
******
A person who yawns and paces to help pass the time is likely feeling what?
boredom
['By yawning and pacing one shows signs of being bored.', 'Yawning and pacing is a sign of being bored, etc...', 'A bored person would pace and yawn to make time go by faster.']
******
******
I only like fiction books, why should I read a non-fiction book?
knowledge
['Non-fiction books contain knowledge.']
******
******
An American might hike a mountain north to get to what?
canada
['If an American hiked north, they would likely end up in Canada.']
******
******
Sam spent most of his 

******
******
A person who is constantly grooming is try to preserve their what?
beauty
['if a person is constantly grooming themselves, it may be for aesthetic reasons so as to look as pretty as they can.', 'People who are constantly grooming are likely trying to preserve their appearance, which is a sign of beauty.']
******
******
Where would you see footprints that happen due to the formation of an impression?
ground
['Foot impressions occur on the ground; they are caused by pressure being exerted on the ground.', 'Footprints happen when someone steps into a surface. If this ground were soft earth, footprints would be an impression in the ground, not a mark on top of the earth.', 'As humans walk, their feet make an impression in the ground, resulting in footprints.']
******
******
Where can someone level up?
video game
['In video games, it is possible to "level up" by gaining experience. Video games are played at a building.', 'Video games often allow players to earn more points or 

James was dedicated to contemplating things.  What did he seek?
enlightenment
[]
******
******
Sally made a mistake.  She came home drunk and endeared the wrong house.   What was the relationship between the house and her neighbor?
belong to
[]
******
******
When one thinks enough about it, what do they realize everything is made of?
everything
[]
******
******
The college freshmen was set on getting a fresh start, he was determined on making friends so did lots of what?
talking
['Lots of people set on getting a fresh start would probably do lots of talking, because talking is important for befriending other people.', 'Making friends often involves talking to people, and if someone is determined to make friends, they might talk to lots of people.', 'Talking to lots of people is a good way to get friends, and making friends is a sure sign of getting a fresh start.']
******
******
What is likely the result of praying leading to the desired outcome?
relief
[]
******
******
Attending meeti

******
The peanuts were for sale in during the 4th inning, where were they being sold?
ballpark
[]
******
******
Sometimes people seem like robots, but really we're all just what?
human
['Robots are synthetic human-like, not flesh and blood and therefore inhuman.', 'Even though some people seem to have lost compassion or feelings toward others, those people are still human. They are not robots.']
******
******
People were talking on the corner, who were they talking with?
with people
[]
******
******
What is something bad unlikely to be to anyone?
advantageous
['If something bad happens, it is not good; it is an advantage to no one.']
******
******
The prisoner would often escape but he was also just as often what?
being caught
['The person may have escaped from jail, but he was also caught and imprisoned at times. Prision escapes are usually wasted if one is only caught after the long escape.']
******
******
Bobby had a toothache so he went to the closet dentist office, which was in a

In [91]:
#rationale unlabeled data construction 991
import pdb
scr_csqa_unlabeled_train_file="/cognitive_comp/huangyongfeng/evaluate_LM_with_rationalization/few_shot_explanations/data/acceptability_annotations/commonsenseqa_train.csv"
fse_csqa_train_dataset = datasets.load_dataset('csv', data_files=scr_csqa_unlabeled_train_file)
scr_csqa_unlabeled_train_df=pd.read_csv(scr_csqa_unlabeled_train_file)
fse_csqa_train_data_dict={}
for kk, da in enumerate(fse_csqa_train_dataset['train']):
    #pdb.set_trace()
    id_=da['Input.id']
    if da['Answer.acceptable']:
        answer_accept=set(da['Answer.acceptable'].split('|'))
    else:
        answer_accept=set()
    explanation_list=[da['Input.explanation_1'], 
                      da['Input.explanation_2'],
                      da['Input.explanation_3'],
                      da['Input.explanation_4'],
                      da['Input.explanation_5']]
    if id_ not in fse_csqa_train_data_dict.keys():
        fse_csqa_train_data_dict[id_]={"index":kk,"id":id_,
                                     "question":da["Input.question"],
                                     'answer':da['Input.gold_label'],
                                     "accept_set_list":[answer_accept],
                                     "explanation_list":explanation_list}
        #[[kk, id_, answer_accept, explanation_list]]
    else:
        fse_csqa_train_data_dict[id_]["accept_set_list"].append(answer_accept)
        
    if len(fse_csqa_train_data_dict[id_]["accept_set_list"])==3:
        fse_csqa_train_data_dict[id_]["common_expl_list"]=[]
        common_accept_expl_sample=set.intersection(fse_csqa_train_data_dict[id_]["accept_set_list"][0], 
                                                   fse_csqa_train_data_dict[id_]["accept_set_list"][1], 
                                                   fse_csqa_train_data_dict[id_]["accept_set_list"][2])
        for idx in list(common_accept_expl_sample):
            idx=int(idx)-1
            fse_csqa_train_data_dict[id_]["common_expl_list"].append(fse_csqa_train_data_dict[id_]["explanation_list"][idx])

    
#discriminate 3/3 id
id_accept_expl_list=[]
our_accept_expl_list=[]
id_unaccept_expl_list=[]
for k,v in fse_csqa_train_data_dict.items():
    accept_set_list=v['accept_set_list']
    assert len(accept_set_list)==3
    #pdb.set_trace()
    common_accept_expl_sample=set.intersection(accept_set_list[0], accept_set_list[1], accept_set_list[2])
    #print(accept_set_list,common_accept_expl_sample)
    #pdb.set_trace()
    if common_accept_expl_sample:
        id_accept_expl_list.append(k)
        our_expl=""
        for idx in list(common_accept_expl_sample):
            idx=int(idx)-1
            if len(v['explanation_list'][idx]) > len(our_expl):
                our_expl = v['explanation_list'][idx]
                our_accept_expl_list.append(our_expl)
    else:
        id_unaccept_expl_list.append(k)

train_ids_list=[x['id'] for x in dataset['train']]
train_accept_indexs_list=[train_ids_list.index(id_) for id_ in id_accept_expl_list]
train_unaccpet_indexs_list=[train_ids_list.index(id_) for id_ in id_unaccept_expl_list]
train_accept_data=dataset['train'].select(train_accept_indexs_list)
train_unaccept_data=dataset['train'].select(train_unaccpet_indexs_list)



new_train_accept_data=[]
new_train_unaccept_data=[]
for oexp, da in zip(our_accept_expl_list,train_accept_data):
    da["our_explanation"]=oexp
    new_train_accept_data.append(da)
for da in train_unaccept_data:
    da["our_explanation"]=oexp
    new_train_unaccept_data.append(da)
        
new_train_accept_data[0]

  0%|          | 0/1 [00:00<?, ?it/s]

{'id': 'ed53cbea1f21072fab892031b31192d1',
 'question': 'Where can you likely buy many poems?',
 'choices': ['book of poetry',
  'literature book',
  'book store',
  'poetry book',
  'bookshelf'],
 'answer': 'book store',
 'abstractive_explanation': 'book store book',
 'extractive_explanation': 'buy many poems',
 'our_explanation': 'A bookstore sells a variety of books, including poetry books; chains of bookstores sometimes specialize in categories such as poetry or literature.'}

In [12]:
refine_train_data=refine_train_data
refine_dev_data=new_train_accept_data + new_dev_accept_data
refine_test_data=new_train_unaccept_data + new_dev_unaccept_data

refine_train_ids_list=[x['id'] for x in refine_train_data]
refine_dev_ids_list=[x['id'] for x in refine_dev_data]
refine_test_ids_list=[x['id'] for x in refine_test_data]

set(refine_train_ids_list).intersection(set(refine_dev_ids_list)),set(refine_train_ids_list).intersection(set(refine_test_ids_list))



(set(), set())

In [13]:
our_data_splits={}
our_data_splits['train']=refine_train_data
our_data_splits['dev']=refine_dev_data
our_data_splits['test']=refine_test_data
refine_train_data[0].keys()

dict_keys(['id', 'question', 'choices', 'answer', 'abstractive_explanation', 'extractive_explanation', 'our_explanation'])

In [14]:
def list2dict(refine_data):
    refine_data_dict={}
    for key in refine_data[0].keys():
        refine_data_dict[key]=[x[key] for x in refine_data]
    return refine_data_dict

refine_train_data_dict = list2dict(refine_train_data)
our_data_splits['train'] = datasets.Dataset.from_dict(refine_train_data_dict)

refine_dev_data_dict = list2dict(refine_dev_data)
our_data_splits['dev'] = datasets.Dataset.from_dict(refine_dev_data_dict)

refine_test_data_dict = list2dict(refine_test_data)
our_data_splits['test'] = datasets.Dataset.from_dict(refine_test_data_dict)



In [15]:
import datasets
class SequenceCollator:
    def __init__(self, pad_token):
        # self.pad_token_mapping = {
        #     "lm_labels": -100,
        #     "attention_mask": 0,
        #     "decoder_attention_mask": 0,
        #     "input_ids": pad_token,
        # }
        # self.columns = [
        #     "input_ids",
        #     "attention_mask",
        #     "lm_labels",
        #     "decoder_attention_mask",
        # ]
        self.pad_token_mapping = {
            "labels": -100,
            "attention_mask": 0,
            "decoder_attention_mask": 0,
            "input_ids": pad_token,
        }
        self.columns = [
            "input_ids",
            "attention_mask",
            "labels",
            "decoder_attention_mask",
        ]

    def collate_batch(self, examples):

        # batch inputs for training
        batch = {}
        for key in examples[0].keys():
            if key in self.columns:
                tmp_list = []
                for item in examples:
                    tmp_list.append(item[key])

                # pad lists to max length
                if isinstance(tmp_list[0], list):
                    max_length = max(map(len, tmp_list))
                    tmp_list = [
                        el + [self.pad_token_mapping[key]] * (max_length - len(el))
                        for el in tmp_list
                    ]

                batch[key] = torch.tensor(tmp_list, dtype=torch.long)
        return batch
    
    def __call__(self, examples: List[Dict[str, InputDataClass]]) -> Dict[str, torch.Tensor]:
        # re-format inputs for training
        batch = {}
        for key in examples[0].keys():
            if key in self.columns:
                tmp_list = []
                for item in examples:
                    tmp_list.append(item[key])

                # pad lists to max length
                if isinstance(tmp_list[0], list):
                    max_length = max(map(len, tmp_list))
                    tmp_list = [
                        el + [self.pad_token_mapping[key]] * (max_length - len(el))
                        for el in tmp_list
                    ]

                batch[key] = torch.tensor(tmp_list, dtype=torch.long)
        return batch
# dataset = datasets.load_dataset(data_args.task_name, data_args.version_name)

In [16]:
# seq_collector = SequenceCollator(0)
# train_ds = seq_collector.__call__(dataset['train'])
# train_ds
# dataset['train'][0].keys()
for split in ['train','dev','test']:
    our_data_splits[split] = our_data_splits[split].map(
            lambda x: format_instance(
                x,
                tokenizer,
                data_args.explanation_sep,
                datasource=data_args.task_name,
                io_format=data_args.io_format
            ),
            batched=False,
            load_from_cache_file=False,
        )

  0%|          | 0/115 [00:00<?, ?ex/s]

  0%|          | 0/986 [00:00<?, ?ex/s]

  0%|          | 0/255 [00:00<?, ?ex/s]

In [61]:
our_wrong_data_splits={'train':[]}
for x in our_data_splits['train']:
    for ii in range(5):
        new_x={'id':x['id'], 'real_answer':x['answer'], 
               'answer':x['choices'][ii], 'question':x['question'],
               'choices':x['choices'],
               'abstractive_explanation':x['abstractive_explanation'],
               'extractive_explanation':x['extractive_explanation'],
               'our_explanation':x['our_explanation']}
        our_wrong_data_splits['train'].append(new_x)
    #print(x)
    #pdb.set_trace()
# wandb.init()
our_wrong_train_data_dict = list2dict(our_wrong_data_splits['train'])
our_wrong_data_splits['train'] = datasets.Dataset.from_dict(our_wrong_train_data_dict)

In [71]:
our_wrong_data_splits['dev']=[]
for x in our_data_splits['dev']:
    for ii in range(5):
        new_x={'id':x['id'], 'real_answer':x['answer'], 
               'answer':x['choices'][ii], 'question':x['question'],
               'choices':x['choices'],
               'abstractive_explanation':x['abstractive_explanation'],
               'extractive_explanation':x['extractive_explanation'],
               'our_explanation':x['our_explanation']}
        our_wrong_data_splits['dev'].append(new_x)
    #print(x)
    #pdb.set_trace()
# wandb.init()
our_wrong_dev_data_dict = list2dict(our_wrong_data_splits['dev'])
our_wrong_data_splits['dev'] = datasets.Dataset.from_dict(our_wrong_dev_data_dict)

In [72]:
# our_wrong_data_splits['train'][0],our_wrong_data_splits['train'][0].keys()
our_wrong_data_splits['train'] = our_wrong_data_splits['train'].map(
            lambda x: format_instance(
                x,
                tokenizer,
                data_args.explanation_sep,
                datasource=data_args.task_name,
                io_format=data_args.io_format
            ),
            batched=False,
            load_from_cache_file=False,
        )

  0%|          | 0/575 [00:00<?, ?ex/s]

In [70]:
for da in our_wrong_data_splits['dev']:
    print("*******")
    print("question: {}".format(da['question']))
    print("real_answer: {}".format(da['real_answer']))
    print("answer: {}".format(da['answer']))
    print("choices: {}".format(da['choices']))
    print("our_explanation: {}".format(da['our_explanation']))
    inp_ids = torch.tensor(da["input_ids"], device=model.device).reshape(1, -1)
    out = model.generate(
                    inp_ids,
                    max_length=100,
                    pad_token_id=tokenizer.pad_token_id,
                    eos_token_id=tokenizer.eos_token_id,
                )
    skip_special_tokens = False if "infilling" in data_args.io_format else True
    words = tokenizer.decode(out[0].tolist(), skip_special_tokens=skip_special_tokens)
    print("generated explanation: {}".format(words))
    print("#######")

KeyError: 'dev'

In [73]:
# our_wrong_data_splits['train'][0],our_wrong_data_splits['train'][0].keys()
our_wrong_data_splits['dev'] = our_wrong_data_splits['dev'].map(
            lambda x: format_instance(
                x,
                tokenizer,
                data_args.explanation_sep,
                datasource=data_args.task_name,
                io_format=data_args.io_format
            ),
            batched=False,
            load_from_cache_file=False,
        )

  0%|          | 0/4930 [00:00<?, ?ex/s]

In [74]:
for da in our_wrong_data_splits['dev']:
    print("*******")
    print("question: {}".format(da['question']))
    print("real_answer: {}".format(da['real_answer']))
    print("answer: {}".format(da['answer']))
    print("choices: {}".format(da['choices']))
    print("our_explanation: {}".format(da['our_explanation']))
    inp_ids = torch.tensor(da["input_ids"], device=model.device).reshape(1, -1)
    out = model.generate(
                    inp_ids,
                    max_length=100,
                    pad_token_id=tokenizer.pad_token_id,
                    eos_token_id=tokenizer.eos_token_id,
                )
    skip_special_tokens = False if "infilling" in data_args.io_format else True
    words = tokenizer.decode(out[0].tolist(), skip_special_tokens=skip_special_tokens)
    print("generated explanation: {}".format(words))
    print("#######")

*******
question: Where can you likely buy many poems?
real_answer: book store
answer: book of poetry
choices: ['book of poetry', 'literature book', 'book store', 'poetry book', 'bookshelf']
our_explanation: A bookstore sells a variety of books, including poetry books; chains of bookstores sometimes specialize in categories such as poetry or literature.
generated explanation: <pad> <extra_id_0> Poems are written in books, and books of poetry can be purchased.<extra_id_1> </s>
#######
*******
question: Where can you likely buy many poems?
real_answer: book store
answer: literature book
choices: ['book of poetry', 'literature book', 'book store', 'poetry book', 'bookshelf']
our_explanation: A bookstore sells a variety of books, including poetry books; chains of bookstores sometimes specialize in categories such as poetry or literature.
generated explanation: <pad> <extra_id_0> Poems are often included in literature books, and you can buy poetry books at bookstores.<extra_id_1> </s>
#####

generated explanation: <pad> <extra_id_0> Non-fiction books often contain information that is not readily apparent from the cover, and this information may be important to the reader.<extra_id_1> </s>
#######
*******
question: I only like fiction books, why should I read a non-fiction book?
real_answer: knowledge
answer: for fun
choices: ['knowledge', 'been printed new', 'many words', 'for fun', 'cover']
our_explanation: A bored person would pace and yawn to make time go by faster.
generated explanation: <pad> <extra_id_0> Non-fiction books can be interesting, even if they are not well written. This is especially true if the book is well written.<extra_id_1> </s>
#######
*******
question: I only like fiction books, why should I read a non-fiction book?
real_answer: knowledge
answer: cover
choices: ['knowledge', 'been printed new', 'many words', 'for fun', 'cover']
our_explanation: A bored person would pace and yawn to make time go by faster.
generated explanation: <pad> <extra_id_0> No

generated explanation: <pad> <extra_id_0> Fallen leaves often accumulate around trees, and raking leaves from a tree can lead to the accumulation of leaves.<extra_id_1> </s>
#######
*******
question: James raked fallen leaves into a pile. Where might he leave the pile?
real_answer: ground
answer: ground
choices: ['forest', 'roof', 'garden', 'tree', 'ground']
our_explanation: Since one spends most of his time standing when working as a cashier, Sam probably works at a supermarket where groceries are sold. Because his workplace is full of activities and punten, he never gets any rest.
generated explanation: <pad> <extra_id_0> Leaves often fall to the ground after a storm, and it's common for rakes to collect leaves after a storm.<extra_id_1> </s>
#######
*******
question: What happens when we fail at something we attempt?
real_answer: leave
answer: leave
choices: ['leave', 'give up', 'trying', 'we get shunned', 'fail']
our_explanation: Leaves are often raked into piles on the ground.
gen

generated explanation: <pad> <extra_id_0> Highly elastic balls are made of materials that are elastic (elasticity > 0). When thrown against a wall, the ball will stretch.<extra_id_1> </s>
#######
*******
question: John lived in a small island surrounded by flowing liquid H2O   Where might he live?
real_answer: mississippi river
answer: japan
choices: ['japan', 'hawaii', 'mississippi river', 'pacific ocean', 'body of water']
our_explanation: Referees are specialized in sports, and so, they would be expected to know how to referee a sporting event.
generated explanation: <pad> <extra_id_0> Japan is an island country surrounded by water, and so living on an island would likely involve being near water.<extra_id_1> </s>
#######
*******
question: John lived in a small island surrounded by flowing liquid H2O   Where might he live?
real_answer: mississippi river
answer: hawaii
choices: ['japan', 'hawaii', 'mississippi river', 'pacific ocean', 'body of water']
our_explanation: Referees are spe

generated explanation: <pad> <extra_id_0> Small dogs are easy to carry, especially if you have good hands.<extra_id_1> </s>
#######
*******
question: What is a good way to carry a small dog?
real_answer: backpack
answer: backpack
choices: ['good hands', 'backpack', 'chinese food', 'grocery bag', 'basket']
our_explanation: A student has to solve equations to learn a given theory that is presented in class.
generated explanation: <pad> <extra_id_0> Small dogs are often carried in backpacks, which are designed to fit over the shoulders.<extra_id_1> </s>
#######
*******
question: What is a good way to carry a small dog?
real_answer: backpack
answer: chinese food
choices: ['good hands', 'backpack', 'chinese food', 'grocery bag', 'basket']
our_explanation: A student has to solve equations to learn a given theory that is presented in class.
generated explanation: <pad> <extra_id_0> Small dogs are often carried in chinese food carriers, which are made of lightweight materials like chinese food

generated explanation: <pad> <extra_id_0> As it gets colder outside, things (e.g., bodies of water) condense, making them warmer.<extra_id_1> </s>
#######
*******
question: As things get colder, temperature does what?
real_answer: drop to zero
answer: dropping quickly
choices: ['increase', 'drop to zero', 'heat up', 'dropping quickly', 'arctic']
our_explanation: Poison gas is a chemical known for its ability to kill (therefore, if both sides used it, people on both sides died from its use), and it has since gone on to be outlawed in many countries.
generated explanation: <pad> <extra_id_0> As it gets colder outside, the temperature drops quickly.<extra_id_1> </s>
#######
*******
question: As things get colder, temperature does what?
real_answer: drop to zero
answer: arctic
choices: ['increase', 'drop to zero', 'heat up', 'dropping quickly', 'arctic']
our_explanation: Poison gas is a chemical known for its ability to kill (therefore, if both sides used it, people on both sides died from

generated explanation: <pad> <extra_id_0> Cities often have gates to keep out vehicles, and this gatehouse would be at the entrance to the city.<extra_id_1> </s>
#######
*******
question: The roadway passed a sign and gate house, what was behind the gatehouse?
real_answer: subdivision
answer: narnia
choices: ['city', 'narnia', 'subdivision', 'neighborhood', 'countryside']
our_explanation: Soccer games are usually snacks-free affairs; snacks, including popcorn, tend to be present at more recreational venues, but less so in settings where food is unnecessary because there are translatable goods sellers, i.e., vendors that sell things like popcorn
generated explanation: <pad> <extra_id_0> Narnia is a fictional world, accessible only by road, which contains many magical creatures and fantastical creatures.<extra_id_1> </s>
#######
*******
question: The roadway passed a sign and gate house, what was behind the gatehouse?
real_answer: subdivision
answer: subdivision
choices: ['city', 'narnia

generated explanation: <pad> <extra_id_0> A medium is a type of communication device that can carry out telepathic communications.<extra_id_1> </s>
#######
*******
question: What can occur in fencing if you are not careful?
real_answer: severe injuries
answer: competition
choices: ['competition', 'severe injuries', 'fever', 'perspiration', 'small cuts']
our_explanation: A subdivision contains houses that formed around it; therefore, a subdivision has a gatehouse that controls who goes in and out.
generated explanation: <pad> <extra_id_0> Fencing is a contact sport, and accidents can occur during competitions.<extra_id_1> </s>
#######
*******
question: What can occur in fencing if you are not careful?
real_answer: severe injuries
answer: severe injuries
choices: ['competition', 'severe injuries', 'fever', 'perspiration', 'small cuts']
our_explanation: A subdivision contains houses that formed around it; therefore, a subdivision has a gatehouse that controls who goes in and out.


KeyboardInterrupt: 

In [69]:
for da in our_wrong_data_splits['train']:
    print("*******")
    print("question: {}".format(da['question']))
    print("real_answer: {}".format(da['real_answer']))
    print("answer: {}".format(da['answer']))
    print("choices: {}".format(da['choices']))
    print("our_explanation: {}".format(da['our_explanation']))
    inp_ids = torch.tensor(da["input_ids"], device=model.device).reshape(1, -1)
    out = model.generate(
                    inp_ids,
                    max_length=100,
                    pad_token_id=tokenizer.pad_token_id,
                    eos_token_id=tokenizer.eos_token_id,
                )
    skip_special_tokens = False if "infilling" in data_args.io_format else True
    words = tokenizer.decode(out[0].tolist(), skip_special_tokens=skip_special_tokens)
    print("generated explanation: {}".format(words))
    print("#######")

*******
question: What is someone not legal to buy alcohol?
real_answer: underage
answer: underage
choices: ['underage', 'banned', 'adult', 'rules', 'black market']
our_explanation: 21 is the legal age to buy alcohol in the US
generated explanation: <pad> <extra_id_0> 21 is the legal age to buy alcohol in the United States<extra_id_1> </s>
#######
*******
question: What is someone not legal to buy alcohol?
real_answer: underage
answer: banned
choices: ['underage', 'banned', 'adult', 'rules', 'black market']
our_explanation: 21 is the legal age to buy alcohol in the US
generated explanation: <pad> <extra_id_0> A person who is not legally allowed to buy alcohol is a "banned" person.<extra_id_1> </s>
#######
*******
question: What is someone not legal to buy alcohol?
real_answer: underage
answer: adult
choices: ['underage', 'banned', 'adult', 'rules', 'black market']
our_explanation: 21 is the legal age to buy alcohol in the US
generated explanation: <pad> <extra_id_0> A person who is not

generated explanation: <pad> <extra_id_0> A cabinet contains drinks, and you can get drinks out of it.<extra_id_1> </s>
#######
*******
question: Where can someone get a flute?
real_answer: musical instrument store
answer: orchestra
choices: ['orchestra', 'steal it', 'marching band', 'symphony', 'musical instrument store']
our_explanation: A flute is a musical instrument, and it's therefore plausable that one could buy one at a musical instrument store.
generated explanation: <pad> <extra_id_0> Flute players are members of orchestras, and they are often required to carry flutes.<extra_id_1> </s>
#######
*******
question: Where can someone get a flute?
real_answer: musical instrument store
answer: steal it
choices: ['orchestra', 'steal it', 'marching band', 'symphony', 'musical instrument store']
our_explanation: A flute is a musical instrument, and it's therefore plausable that one could buy one at a musical instrument store.
generated explanation: <pad> <extra_id_0> A flute can be sto

KeyboardInterrupt: 

In [17]:
# import wandb
# training_args.run_name=""
training_args.logging_steps=3
training_args.save_steps=5
training_args.evaluation_strategy="epoch"
training_args.num_train_epochs=10
training_args.do_eval=True

In [18]:
training_args

TrainingArguments(
_n_gpu=0,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_steps=None,
evaluation_strategy=epoch,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=1,
greater_is_better=None,
group_by_length=False,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=5e-05,
length_column_name=length,
load_best_model_at_end=False,
local_rank=-1,
log_level=-1,
log_level_replica=-1,
log_on_each_node=True,
logging_dir=./cos_e_output_t5_3b/112022_115119,
logging_first_step=False,
logging_steps=3,
logging_strategy=IntervalStrategy.STEPS,
lr_scheduler_type=SchedulerType.LINEAR,
max_grad_norm=1.0,
max_steps=-1,
metric_for_best_model=None,
mp_parameters

In [19]:
training_args.per_device_eval_batch_size=8
training_args.per_device_train_batch_size=8
training_args

TrainingArguments(
_n_gpu=0,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_steps=None,
evaluation_strategy=epoch,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=1,
greater_is_better=None,
group_by_length=False,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=5e-05,
length_column_name=length,
load_best_model_at_end=False,
local_rank=-1,
log_level=-1,
log_level_replica=-1,
log_on_each_node=True,
logging_dir=./cos_e_output_t5_3b/112022_115119,
logging_first_step=False,
logging_steps=3,
logging_strategy=IntervalStrategy.STEPS,
lr_scheduler_type=SchedulerType.LINEAR,
max_grad_norm=1.0,
max_steps=-1,
metric_for_best_model=None,
mp_parameters

In [20]:
# data_splits = {'train': None, 'validation': None, 'test': None}
# original_data_splits = {'train': None, 'validation': None, 'test': None}  
# data_args.io_format="t5_fewshot_infilling_with_choices"
# # Data loading from huggingface's datasets
# if data_args.task_name in {"cos_e", "esnli"}:
#     version_arg = None
#     if data_args.task_name == "cos_e":
#         assert data_args.version_name in {"v1.11", "v1.0"}
#         version_arg = data_args.version_name

#     load_train = True
#     if (not training_args.do_train
#         and not training_args.do_eval
#         and not data_args.train_predict
#     ):
#         # don't load training dataset
#         dataset = {}
#         dataset["train"] = None
#         dataset["validation"] = datasets.load_dataset(
#             data_args.task_name, version_arg, split="validation"
#         )
#         data_splits['validation'] = dataset["validation"]

#         if data_args.task_name == "esnli":
#             dataset["test"] = datasets.load_dataset(data_args.task_name, split="test")
#             data_splits['test'] = dataset["test"]
#         load_train = False
#     else:
#         dataset = datasets.load_dataset(data_args.task_name, version_arg)

#         if data_args.n_shots > 0: # Shots = number of training examples **per label** 
#             if data_args.task_name == 'esnli': # Construct a *balanced* random sample of the size `data_args.n_shots*len(labels)` (for train) or `data_args.fewshot_eval_size` (for eval)
#                 for split in ["train", "validation", "test"]:
#                     split_data = dataset[split]
#                     label_subsets = []
#                     labels = split_data.features['label'].names
#                     sample_size = data_args.n_shots if split == "train" else int(data_args.fewshot_eval_size/len(labels))
#                     if data_args.gpt3_max_eval_size is not None and split != 'train':
#                         assert len(labels) == 3
#                         sample_size = data_args.gpt3_max_eval_size // len(labels)
#                     for label in labels:
#                         # The following is a hack to only run on `neutral` labels of `esnli` to get data for human eval
#                         # if data_args.gpt3_max_eval_size is not None and split != 'train' and label != 'neutral':
#                         #     continue
#                         label_int = split_data.features['label'].str2int(label)
#                         label_set = split_data.filter(lambda example: example['label'] == label_int).shuffle() # all instances of labeled as `label`
#                         label_subset = label_set.select(range(sample_size)) #select `sample_size` random instances labeled as `label`
#                         label_subsets.append(label_subset)
#                     dataset[split] = datasets.concatenate_datasets(label_subsets) #merge all label-specific instances
#             elif data_args.task_name == 'cos_e': 
#                 for split in ["train", "validation"]: 
#                     split_data = dataset[split]
#                     sample_size = data_args.n_shots if split == "train" else int(data_args.fewshot_eval_size) #Shots for QA are not label-specific, i.e., `n_shots` is the training data size
#                     if data_args.gpt3_max_eval_size is not None and split != 'train':
#                         sample_size = data_args.gpt3_max_eval_size
#                     dataset[split] = split_data#.shuffle().select(range(sample_size)) # select `sample_size` random instances
#             else: 
#                 raise ValueError('Only cos_e and esnli are supported by Huggingface datasets.')
#     # Apply method, and format dataset to torch.Tensor outputs
# #     fse_csqa_train_file="/cognitive_comp/huangyongfeng/evaluate_LM_with_rationalization/few_shot_explanations/data/acceptability_annotations/commonsenseqa_train.csv"
# #     fse_csqa_dev_file="/cognitive_comp/huangyongfeng/evaluate_LM_with_rationalization/few_shot_explanations/data/acceptability_annotations/commonsenseqa_test.csv"
# #     fse_csqa_train_dataset = datasets.load_dataset('csv', data_files=fse_csqa_train_file)
# #     fse_csqa_dev_dataset = datasets.load_dataset('csv', data_files=fse_csqa_dev_file)
# #     train_ids_list=[x['id'] for x in data_splits["train"]]
# #     dev_ids_list=[x['id'] for x in data_splits["validation"]]
# #     fse_train_ids_list=[x['Input.id'] for x in fse_csqa_train_dataset['train']]
# #     fse_dev_ids_list=[x['Input.id'] for x in fse_csqa_dev_dataset['train']]
# #     fse_train_indexs_list=[train_ids_list.index(id_) for id_ in fse_train_ids_list]
# #     fse_dev_indexs_list=[dev_ids_list.index(id_) for id_ in fse_dev_ids_list]
# #     print(len(fse_train_indexs_list), len(fse_dev_indexs_list))
# #     # print(fse_train_indexs_list,fse_dev_indexs_list)
# #     fse_data_splits={}
# #     data_splits['train']=data_splits["train"].select(fse_train_indexs_list)
# #     data_splits['validation']=data_splits["validation"].select(fse_train_indexs_list)
#     for split in dataset.keys():
#         if dataset[split] is not None:
#             dataset[split] = dataset[split].map(
#                 lambda x: format_instance(
#                     x,
#                     tokenizer,
#                     data_args.explanation_sep,
#                     datasource=data_args.task_name,
#                     io_format=data_args.io_format
#                 ),
#                 batched=False,
#                 load_from_cache_file=False,
#             )
#     data_splits["train"] = deepcopy(dataset["train"])
#     data_splits["validation"] = deepcopy(dataset["validation"])
#     if data_args.task_name == "esnli":
#         data_splits["test"] = deepcopy(dataset["test"])

#     original_data_splits["train"] = deepcopy(dataset["train"])
#     original_data_splits["validation"] = deepcopy(dataset["validation"])
#     if data_args.task_name == "esnli":
#         original_data_splits["test"] = deepcopy(dataset["test"])

In [21]:
# import pandas as pd
# # new_data_splits={'train': None, 'validation': None}
# # new_data_splits['train']=deepcopy(dataset["train"])
# # new_data_splits['validation']=deepcopy(dataset["validation"])
# fse_csqa_train_file="/cognitive_comp/huangyongfeng/evaluate_LM_with_rationalization/few_shot_explanations/data/acceptability_annotations/commonsenseqa_train.csv"
# fse_csqa_dev_file="/cognitive_comp/huangyongfeng/evaluate_LM_with_rationalization/few_shot_explanations/data/acceptability_annotations/commonsenseqa_test.csv"
# # fse_csqa_train_dataset = datasets.load_dataset('csv', data_files=fse_csqa_train_file)
# # fse_csqa_dev_dataset = datasets.load_dataset('csv', data_files=fse_csqa_dev_file)

# train_df=pd.read_csv(fse_csqa_train_file)

# dev_df=pd.read_csv(fse_csqa_dev_file)

# dev_df


In [22]:
# len(fse_train_ids_list),len(list(set(fse_train_ids_list)))

In [23]:
# train_ids_list=[x['id'] for x in data_splits["train"]]
# dev_ids_list=[x['id'] for x in data_splits["validation"]]
# fse_train_ids_list=[x['Input.id'] for x in fse_csqa_train_dataset['train']]
# fse_dev_ids_list=[x['Input.id'] for x in fse_csqa_dev_dataset['train']]


In [24]:
# fse_train_indexs_list=[train_ids_list.index(id_) for id_ in fse_train_ids_list]
# fse_dev_indexs_list=[dev_ids_list.index(id_) for id_ in fse_dev_ids_list]
# print(len(fse_train_indexs_list), len(fse_dev_indexs_list))
# # print(fse_train_indexs_list,fse_dev_indexs_list)
# fse_data_splits={}
# fse_data_splits['train']=data_splits["train"].select(fse_train_indexs_list)
# fse_data_splits['validation']=data_splits["validation"].select(fse_train_indexs_list)

In [25]:
# fse_csqa_train_dataset['train'][0].keys()

In [26]:
class SequenceCollator:
    def __init__(self, model, pad_token):
        self.model = model
        self.pad_token_mapping = {
            "labels": -100,
            "attention_mask": 0,
            "decoder_attention_mask": 0,
            "input_ids": pad_token,
        }

        self.columns = [
            "input_ids",
            "attention_mask",
            "labels",
            "decoder_attention_mask",
        ]

    def __call__(self, examples: List[Dict[str, InputDataClass]]) -> Dict[str, torch.Tensor]:
        # re-format inputs for training
        batch = {}
        for key in examples[0].keys():
            if key in self.columns:
                tmp_list = []
                for item in examples:
                    tmp_list.append(item[key])

                # pad lists to max length
                if isinstance(tmp_list[0], list):
                    max_length = max(map(len, tmp_list))
                    tmp_list = [
                        el + [self.pad_token_mapping[key]] * (max_length - len(el))
                        for el in tmp_list
                    ]

                batch[key] = torch.tensor(tmp_list, dtype=torch.long)
        return batch

In [27]:
# os.environ["WANDB_DISABLED"] = "True"
if data_args.generations_filepath is None:
    callbacks = [TensorBoardCallback()]
    if data_args.early_stopping_patience > 0:
        callbacks.append(EarlyStoppingCallback(early_stopping_patience=data_args.early_stopping_patience))
        training_args.load_best_model_at_end = True
    else:
        training_args.load_best_model_at_end = False  # use the last model state
    training_args.metric_for_best_model = 'eval_loss'
    training_args.greater_is_better = False
    if training_args.eval_steps is None:
        training_args.evaluation_strategy = EvaluationStrategy.EPOCH
    else:
        training_args.evaluation_strategy = EvaluationStrategy.STEPS

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=our_data_splits['train'],
        eval_dataset=our_data_splits['dev'],
        data_collator=SequenceCollator(
            model=model_class, pad_token=tokenizer.pad_token_id
        ),
        callbacks=callbacks,
    )

# Training. Don't train if it is use_gpt3
if training_args.do_train and not model_args.use_gpt3:
    start_time = time.time()
    trainer.train()
    train_time = time.time() - start_time
    model = trainer.model
    wandb.finish()
else:
    start_time = time.time()
    train_time = time.time() - start_time

You are adding a <class 'transformers.integrations.TensorBoardCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
TensorBoardCallback
WandbCallback
The following columns in the training set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: answer, question, choices, extractive_explanation, abstractive_explanation, question_encoding, our_explanation, id.
***** Running training *****
  Num examples = 115
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 150
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss


Saving model checkpoint to ./cos_e_output_t5_3b/112022_115119/checkpoint-5
Configuration saved in ./cos_e_output_t5_3b/112022_115119/checkpoint-5/config.json
Model weights saved in ./cos_e_output_t5_3b/112022_115119/checkpoint-5/pytorch_model.bin
Saving model checkpoint to ./cos_e_output_t5_3b/112022_115119/checkpoint-10
Configuration saved in ./cos_e_output_t5_3b/112022_115119/checkpoint-10/config.json
Model weights saved in ./cos_e_output_t5_3b/112022_115119/checkpoint-10/pytorch_model.bin
Saving model checkpoint to ./cos_e_output_t5_3b/112022_115119/checkpoint-15
Configuration saved in ./cos_e_output_t5_3b/112022_115119/checkpoint-15/config.json
Model weights saved in ./cos_e_output_t5_3b/112022_115119/checkpoint-15/pytorch_model.bin
Saving model checkpoint to ./cos_e_output_t5_3b/112022_115119/checkpoint-20
Configuration saved in ./cos_e_output_t5_3b/112022_115119/checkpoint-20/config.json
Model weights saved in ./cos_e_output_t5_3b/112022_115119/checkpoint-20/pytorch_model.bin
Sav

0,1
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/learning_rate,████▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▁▁▁
train/loss,█▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/total_flos,▁
train/train_loss,▁
train/train_runtime,▁
train/train_samples_per_second,▁
train/train_steps_per_second,▁

0,1
train/epoch,10.0
train/global_step,150.0
train/learning_rate,0.0
train/loss,0.518
train/total_flos,778264850915328.0
train/train_loss,1.16284
train/train_runtime,8199.7546
train/train_samples_per_second,0.14
train/train_steps_per_second,0.018


In [52]:
print(our_data_splits['dev'][0]['our_explanation'])
inp_ids=torch.tensor(our_data_splits['dev'][0]["input_ids"], device=model.device).reshape(1, -1)
out = model.generate(
                    inp_ids,
                    max_length=100,
                    pad_token_id=tokenizer.pad_token_id,
                    eos_token_id=tokenizer.eos_token_id,
                )
skip_special_tokens = False if "infilling" in data_args.io_format else True
words = tokenizer.decode(out[0].tolist(), skip_special_tokens=skip_special_tokens)
print(words)

A bookstore sells a variety of books, including poetry books; chains of bookstores sometimes specialize in categories such as poetry or literature.
<pad> <extra_id_0> Poems are often published in books, and book stores often sell poetry.<extra_id_1> </s>


In [56]:
len(our_data_splits['dev']),len(our_data_splits['train'])

(986, 115)

In [None]:
our_data_splits['dev']

In [57]:
for da in our_data_splits['train']:
    print("*******")
    print("question: {}".format(da['question']))
    print("answer: {}".format(da['answer']))
    print("choices: {}".format(da['choices']))
    print("our_explanation: {}".format(da['our_explanation']))
    inp_ids = torch.tensor(da["input_ids"], device=model.device).reshape(1, -1)
    out = model.generate(
                    inp_ids,
                    max_length=100,
                    pad_token_id=tokenizer.pad_token_id,
                    eos_token_id=tokenizer.eos_token_id,
                )
    skip_special_tokens = False if "infilling" in data_args.io_format else True
    words = tokenizer.decode(out[0].tolist(), skip_special_tokens=skip_special_tokens)
    print("generated explanation: {}".format(words))
    print("#######")

*******
question: What is someone not legal to buy alcohol?
answer: underage
choices: ['underage', 'banned', 'adult', 'rules', 'black market']
our_explanation: 21 is the legal age to buy alcohol in the US
generated explanation: <pad> <extra_id_0> 21 is the legal age to buy alcohol in the United States<extra_id_1> </s>
#######
*******
question: When you are expressing yourself by yelling after getting a bruise, what are you feeling?
answer: pain
choices: ['self pity', 'communication', 'understood', 'pain', 'embarrassment']
our_explanation: A bruise is caused when a person is hit by an object forcefully --- such an action would be quite painful to the person, and people often yell when they are caused pain.
generated explanation: <pad> <extra_id_0> A bruise is caused when a person is hit by an object forcefully, e.g., by falling on a hard surface. This can cause pain, which can be expressed by yelling.<extra_id_1> </s>
#######
*******
question: A cat can't talk, but a cat can what?
answe

generated explanation: <pad> <extra_id_0> Cities often receive grants from federal or state governments to pay for infrastructure projects, and this money usually comes from grants.<extra_id_1> </s>
#######
*******
question: John runs a small corner shop.   If you were walking on a sidewalk near it, where would you look for it?
answer: street corner
choices: ['england', 'street corner', 'minnesota', 'arizona', 'iowa']
our_explanation: Corner shops often are run out of buildings on street corners.
generated explanation: <pad> <extra_id_0> Corner shops are often run out of buildings on street corners.<extra_id_1> </s>
#######
*******
question: The ladies at the salon had plenty of curiosity, anytime someone came in they wanted to what?
answer: hear news
choices: ['examine thing', 'go to market', 'attend school', 'hear news', 'bad news']
our_explanation: Curiosity is satisfied with information, and news provides information, and a newcomer usually carries with them some news.
generated ex

KeyboardInterrupt: 

In [54]:
for da in our_data_splits['dev']:
    print("*******")
    print("question: {}".format(da['question']))
    print("answer: {}".format(da['answer']))
    print("choices: {}".format(da['choices']))
    print("our_explanation: {}".format(da['our_explanation']))
    inp_ids = torch.tensor(da["input_ids"], device=model.device).reshape(1, -1)
    out = model.generate(
                    inp_ids,
                    max_length=100,
                    pad_token_id=tokenizer.pad_token_id,
                    eos_token_id=tokenizer.eos_token_id,
                )
    skip_special_tokens = False if "infilling" in data_args.io_format else True
    words = tokenizer.decode(out[0].tolist(), skip_special_tokens=skip_special_tokens)
    print("generated explanation: {}".format(words))
    print("#######")

*******
question: Where can you likely buy many poems?
answer: book store
our_explanation: A bookstore sells a variety of books, including poetry books; chains of bookstores sometimes specialize in categories such as poetry or literature.
generated explanation: <pad> <extra_id_0> Poems are often published in books, and book stores often sell poetry.<extra_id_1> </s>
#######
*******
question: What could bringing suit do to a rivalry?
answer: aggravation
our_explanation: Bringing suit against a rival could aggravate the rivalry.
generated explanation: <pad> <extra_id_0> A lawsuit can be brought against a competitor to enrage them; if the other party loses, it could lead to a bitter dispute between the two parties.<extra_id_1> </s>
#######
*******
question: A person who yawns and paces to help pass the time is likely feeling what?
answer: boredom
our_explanation: By yawning and pacing one shows signs of being bored.
generated explanation: <pad> <extra_id_0> Boredom is a common cause of ya

generated explanation: <pad> <extra_id_0> Subdivisions are made up of houses, and the gatehouses are used to control access to the subdivisions.<extra_id_1> </s>
#######
*******
question: The invasive A.I. was designed to scan the crowd, where most saw a mass of humanity the evil machine could pick out each what?
answer: individual
our_explanation: Files are likely to be hanging in a filing cabinet, since they are often organized by topic, and filing cabinets are often used to organize files.
generated explanation: <pad> <extra_id_0> A.I.s are designed to pick out individuals from a crowd, and because crowds are made up of many people, it is difficult to pick out individual people from a crowd.<extra_id_1> </s>
#######
*******
question: Is has been speculated that a aliens could be communicating by doing this?
answer: thinking
our_explanation: Vacations often require the purchase of travel services, such as airline or hotel tickets.
generated explanation: <pad> <extra_id_0> Aliens are 

generated explanation: <pad> <extra_id_0> Meeting new people with similar attitudes can result in new friendships, since people with similar attitudes are more likely to share common interests.<extra_id_1> </s>
#######
*******
question: Danny noticed a ceramic object on the table and picked a grape out of it.  What might have been on the table?
answer: bowl of fruit
our_explanation: When someone works in an office, even if it is part-time or freelance, they may be required to measure distances e.g., in departments or cubicles, and so, offices are a good place to find ruler.
generated explanation: <pad> <extra_id_0> Fruit is often served in bowls, and a bowl of fruit might contain grapes.<extra_id_1> </s>
#######
*******
question: The nose dipped and alarms went off inside the what?
answer: aircraft
our_explanation: Most of the things a person knows come from learning in the past and usually have some recollection involved.
generated explanation: <pad> <extra_id_0> An aircraft is a mach

generated explanation: <pad> <extra_id_0> People often get hungry in the middle of the day, because they are likely to be working.<extra_id_1> </s>
#######
*******
question: Where is garbage brought to by garbagemen?
answer: landfill
our_explanation: Sitting down makes one comfortable; sitting down with eyes closed makes one fall asleep.
generated explanation: <pad> <extra_id_0> Garbage is usually disposed of at landfills, and garbage men are often the ones who transport the garbage to these landfills.<extra_id_1> </s>
#######
*******
question: What are people doing when washing hands for a long time?
answer: thinking
our_explanation: When standing, the chances of falling asleep abruptly would be much less (assuming that one is awake).
generated explanation: <pad> <extra_id_0> People often wash their hands for a long time because they are thinking, and thinking requires concentration.<extra_id_1> </s>
#######
*******
question: What would someone say about a tube television?
answer: obs

generated explanation: <pad> <extra_id_0> Pool parties are often held in warm weather, and entertaining guests in a pool can be a way to cool off in the summer.<extra_id_1> </s>
#######
*******
question: If you wanted to confirm information that you had, you would do what to someone?
answer: question
our_explanation: When you eat hamburger too fast, it can quickly turn into the consistency of a block, and when you can't swallow it, it can cause you to choke.
generated explanation: <pad> <extra_id_0> Confirmation is the act of asking someone else for confirmation of information that you already know.<extra_id_1> </s>
#######
*******
question: Where is cheese likely to be dangerous for some creatures?
answer: mouse trap
our_explanation: Choking can occur when you stick stuff in your mouth which can't go down your throat properly, so that you can't breath. Common causes can be eating too fast, or swallowing items that can't go down your throat, like liquids or large
generated explanation:

generated explanation: <pad> <extra_id_0> Trains tend to arrive on time, whereas airplanes tend to arrive late.<extra_id_1> </s>
#######
*******
question: Where is glue well used?
answer: art room
our_explanation: Someone convicted of a petty crime is likely to spend time in jail.
generated explanation: <pad> <extra_id_0> Art is a creative activity, and glue is often used in art projects.<extra_id_1> </s>
#######
*******
question: Billy loves science, history, and art but isn't very creative.  When he was visiting the museum, he felt a great deal of what emotion?
answer: enjoyment
our_explanation: Someone convicted of a petty crime probably spends some time in jail, and not in prison.
generated explanation: <pad> <extra_id_0> Museums are filled with fascinating exhibits, and visiting them can be a great way to learn about a subject. However, if you aren't very creative, you might not feel much enjoyment from visiting them.<extra_id_1> </s>
#######
*******
question: Sarah did much hard 

generated explanation: <pad> <extra_id_0> When a couple is under a lot of stress, they might argue. This is because they are trying to do too much at once.<extra_id_1> </s>
#######
*******
question: Where can you learn about a ferret with special abilities far beyond normal ferrets?
answer: comic book
our_explanation: A state's fair is a public celebration that may include a horse show.
generated explanation: <pad> <extra_id_0> Comic books often feature super-powered ferrets, which are fictional characters with super-human abilities.<extra_id_1> </s>
#######
*******
question: When people have guests over, what do they often do for their guests?
answer: cook food
our_explanation: Horse shows are sometimes included in state fair celebrations, and these fairs frequently are held in public parks.
generated explanation: <pad> <extra_id_0> When people have guests over, they often prepare food for them.<extra_id_1> </s>
#######
*******
question: Talking to someone is pointless if they're doin

generated explanation: <pad> <extra_id_0> Hanging out is when teens are together without doing anything specific. They might go to a club, watch a movie, or play video games.<extra_id_1> </s>
#######
*******
question: The saw was not necessary for car repair, so what wasn't it included with during pucrhase?
answer: tool kit
our_explanation: Victory in a game requires one to make a maximum effort, and peoples' desire to win has to be rather competitive.
generated explanation: <pad> <extra_id_0> A tool kit contains a variety of tools that can be used to repair cars, but a saw is not included in a standard tool kit.<extra_id_1> </s>
#######
*******
question: Where will my kid put their coin in their room?
answer: piggy bank
our_explanation: Competitive athletes would want to win their games, whatever the cost; this desire to win leads to competitiveness and creates the desire to win over even long periods of time; it also often leads to brutal physical play --- such as by causing injuries

generated explanation: <pad> <extra_id_0> Airports are often used by travelers to wait out the long wait before their flights, and to relax after their flights.<extra_id_1> </s>
#######
*******
question: Where might a small ball be bounced from?
answer: child's hand
our_explanation: Stress is often related to family life. Being under stress, the wife might verbally lash out at her husband, which could potentially lead to arguments
generated explanation: <pad> <extra_id_0> A child's hand is a natural place to bounce a ball, and it's likely that a child would be holding the ball when it's bounced.<extra_id_1> </s>
#######
*******
question: Where do families begin?
answer: wedding
our_explanation: ferrets are lovable pets that occasionally pop up in comic or animated shows and movies. Occasionally a ferret is rodent, but more often they are the companion of a hero.
generated explanation: <pad> <extra_id_0> Families are formed at weddings, and so the term "family" comes from the word "wedd

generated explanation: <pad> <extra_id_0> Physical fitness is a measure of one's physical health, and exercising can improve one's physical health.<extra_id_1> </s>
#######
*******
question: Where could you see a sloth that is not real?
answer: picture book
our_explanation: Using a steel pen requires someone to hold it, and we associate writing with hands. Powder can be used with steel pens as a "contrast medium.
generated explanation: <pad> <extra_id_0> Sloths are often depicted in picture books, and in those books, the sloths are not real.<extra_id_1> </s>
#######
*******
question: Where are seats often plush?
answer: opera
our_explanation: A phone book is likely to have the phone numbers of nearby people.
generated explanation: <pad> <extra_id_0> Operas are often very crowded, and the seats are often very plush.<extra_id_1> </s>
#######
*******
question: The comforter was used constantly and treaded upon by everyone, where should it be placed?
answer: washing machine
our_explanation

generated explanation: <pad> <extra_id_0> Schools often have smoking areas, and one of those areas is the bathroom.<extra_id_1> </s>
#######
*******
question: What is a person using their whole body for work likely to complain of?
answer: sore muscles
our_explanation: Skin is something you peel from fruits like apples, peaches, oranges, kiwi, etc...
generated explanation: <pad> <extra_id_0> People who work sedentary jobs are likely to complain of sore muscles, since they are using their whole body.<extra_id_1> </s>
#######
*******
question: Dogs get hot and tired, then they like to sloppily do what?
answer: drink water
our_explanation: To carry knives on one's person, you need some small type of container to hold the knives. Perhaps the most likely place to carry a small container is a backpack.
generated explanation: <pad> <extra_id_0> When dogs get hot and tired, they like to drink water.<extra_id_1> </s>
#######
*******
question: Where is a horse likely to live?
answer: farm yard
ou

generated explanation: <pad> <extra_id_0> Procreation is a physically demanding activity, and a long session of procreation can be exhausting.<extra_id_1> </s>
#######
*******
question: Where would you throw some pennies?
answer: water fountain
our_explanation: His making a decision to change and ensuring the decision is made permanent indicates a desire for a stable, permanent life, which does not change.
generated explanation: <pad> <extra_id_0> Water fountains are places where people can throw coins to cool off, and pennies are small enough to fit into the fountain.<extra_id_1> </s>
#######
*******
question: Despite this name out front you will also find beer and wine where too?
answer: liquor store
our_explanation: In changing his life, he wants to make sure he remains in the new lifestyle choices, e.g., not become stagnant to bad choices and fall back into bad choices.
generated explanation: <pad> <extra_id_0> Liquor stores sell alcohol, and beer and wine are often sold together.<

generated explanation: <pad> <extra_id_0> When you do an exercise, you might find that you get tired after doing it for a while.<extra_id_1> </s>
#######
*******
question: Where is not known for apple trees?
answer: wisconsin
our_explanation: When you use your comforter (or any other high-traffic fabric) often, you ought to wash it often too. Therefore it should be kept in the washing machine, to enable quick washing.
generated explanation: <pad> <extra_id_0> Wisconsin is a state known for its many lakes, and therefore it is not known for its apple trees.<extra_id_1> </s>
#######
*******
question: Where would you keep paper files you're currently working on?
answer: desk
our_explanation: The ficus is an indoor plant, such as a plant one would have in one's family room. The family room is the ideal location for an indoor plant like a houseplant.
generated explanation: <pad> <extra_id_0> Desks are places where people usually work, and where they might keep files.<extra_id_1> </s>
#######

generated explanation: <pad> <extra_id_0> Standing erect is a sign of being authroitative, and it makes a person appear more confident.<extra_id_1> </s>
#######
*******
question: What floor do you run between rooms?
answer: hallway
our_explanation: A condominium is a type of housing that is often rented. Condominiums are often located in complexes, and Michigan is a state that is shaped like a mitten.
generated explanation: <pad> <extra_id_0> Hallways are areas of a building that connect rooms. They can be found on the first or second floor.<extra_id_1> </s>
#######
*******
question: Wood has been replaced by what in most people's dwellings?
answer: carpet
our_explanation: A family room is a room in many homes in which families can socialize, often through games. One such game is "connect four.
generated explanation: <pad> <extra_id_0> Wood floors were traditionally made of wood, and have been replaced by carpets.<extra_id_1> </s>
#######
*******
question: Where might a motionless hors

generated explanation: <pad> <extra_id_0> Pretending is a way of playing a role, and to play a role, you need to have imagination.<extra_id_1> </s>
#######
*******
question: The cleanup crew would always find a hub or two during highway clean up, they always came off of a what?
answer: car wheel
our_explanation: The host is the person who greets guests and shows them to their seats.
generated explanation: <pad> <extra_id_0> Hubs are the central part of a wheel, and they are often found during highway clean up.<extra_id_1> </s>
#######
*******
question: Where can a human relax?
answer: park
our_explanation: Business men often also travel frequently, hence they'll often put plane tickets and other travel documents in their briefcases.
generated explanation: <pad> <extra_id_0> Humans can relax in parks, which are open spaces where people can walk around and relax.<extra_id_1> </s>
#######
*******
question: To learn must have the right book, to work efficiently what must one have?
answer: 

generated explanation: <pad> <extra_id_0> Stoics are people who have strong, independent wills.<extra_id_1> </s>
#######
*******
question: When you are drinking liquid, what are you doing?
answer: swallow
our_explanation: Reading the morning paper often leads to aggravation, because the news stories often report about human suffering and misery.
generated explanation: <pad> <extra_id_0> Liquids are swallowed when they are drunk, and we drink liquids to slake our thirst.<extra_id_1> </s>
#######
*******
question: Where would a marmot feel most confined?
answer: petting zoo
our_explanation: T-shirts are thin and folded up, so they can be stored in a drawer, where clothes are typically kept.
generated explanation: <pad> <extra_id_0> Petting zoos are places where animals are kept in enclosures, and marmots are naturally nocturnal. Because they are nocturnal, they would feel most confined in a petting zoo.<extra_id_1> </s>
#######
*******
question: Where would you be unlikely to see footpri

generated explanation: <pad> <extra_id_0> Singing is a way of having fun, and glee is a way of having fun.<extra_id_1> </s>
#######
*******
question: James was an American horse. He thought his home was too hot.  Where might he have lived?
answer: arizona
our_explanation: Soccer is a team-sport that requires kicking a ball.  Thus, soccer games would be a natural place to watch people kick a ball.
generated explanation: <pad> <extra_id_0> Arizona is a very hot state in the United States, and horses are often kept in stables there because of the heat.<extra_id_1> </s>
#######
*******
question: What does exercising immediately lead to?
answer: exhaustion
our_explanation: Once you get tired, the exercise should be taken a break; if you continue doing the exercise, you'll become exhausted, and potentially sick if you do not replenish your energy stores by eating or sleeping.
generated explanation: <pad> <extra_id_0> Exercising immediately after a meal can lead to exhaustion.<extra_id_1> </s

generated explanation: <pad> <extra_id_0> Elephants are captured by humans and sold to zoos for use as pets.<extra_id_1> </s>
#######
*******
question: Danny needed a new mouse, his was jumping around  whenever he moved it.   Where might his mouse be?
answer: desktop
our_explanation: if a person is constantly grooming themselves, it may be for aesthetic reasons so as to look as pretty as they can.
generated explanation: <pad> <extra_id_0> A mouse is usually located on a desktop, and if it's jumping around, it's probably because it's on the desktop.<extra_id_1> </s>
#######
*******
question: Why would a camper start burning logs?
answer: make breakfast
our_explanation: Foot impressions occur on the ground; they are caused by pressure being exerted on the ground.
generated explanation: <pad> <extra_id_0> Campers often cook their breakfast over an open fire, which they start by burning logs.<extra_id_1> </s>
#######
*******
question: Joe rented a loft, which is a type of what?
answer: cit

generated explanation: <pad> <extra_id_0> Materials can be anything, e.g., rocks, atoms, molecules, etc.<extra_id_1> </s>
#######
*******
question: The lab results had been compiled, the scientist began analysing the data because he wanted what?
answer: better understanding
our_explanation: Luck plays an important role at serendipitous moments, but skill is more important across the course of a person's life; people get lucky here and there, but the great ones focus toward learning continuing getting better through dedication.
generated explanation: <pad> <extra_id_0> Scientists analyse data to better understand the world around them.<extra_id_1> </s>
#######
*******
question: Where  might someone find lots of homes with a balcony railing?
answer: apartment complex
our_explanation: Obviously, two people experiencing fear (at the same time) may have disagree on how to handle the situation, the disagreement might result in them separating from each other. But why would things separate?
g

generated explanation: <pad> <extra_id_0> Rural areas are often inundated by rivers and streams, which can lead to flooding of cottages.<extra_id_1> </s>
#######
*******
question: The roadway had fallen into disrepair, the citizens took their complaints to the where?
answer: city
our_explanation: A park provides a relaxing green escape from the hecticness of work and war-zones.
generated explanation: <pad> <extra_id_0> Cities are governed by laws that govern the daily activities of the citizens, and if the citizens were unhappy with the city government's handling of their complaints, they could take legal action against the city.<extra_id_1> </s>
#######
*******
question: After giving assistance to a person who's lost their wallet what is customarily given?
answer: reward
our_explanation: Parks and other places similar provide a relaxing environment most of the time with a variety of less than challenging activities available to do.
generated explanation: <pad> <extra_id_0> When someon

generated explanation: <pad> <extra_id_0> Companies often store their recyclable materials in warehouses, and hiring someone with special needs to collect the cardboard from these warehouses is a good way to hire someone with special needs.<extra_id_1> </s>
#######
*******
question: What is good way to break boredom for kids?
answer: play games
our_explanation: Poems are a form of writing, and nature is a topic that many poets write about.
generated explanation: <pad> <extra_id_0> Playing games is a good way to break boredom for kids.<extra_id_1> </s>
#######
*******
question: The teacher's pet loved answering questions, each correct answer brought her more what than the last?
answer: satisfaction
our_explanation: Prince edward island is part of Canada, and so are the people that live there. Prince edward island is known for potato production.
generated explanation: <pad> <extra_id_0> Correct answers to questions give people satisfaction, and answering questions correctly gives people 

generated explanation: <pad> <extra_id_0> Hard things can be very difficult to do, but if you can just manage to do them, they are called bearable.<extra_id_1> </s>
#######
*******
question: What has happened if you are agreeing with someone but do not keep the agreement?
answer: lie
our_explanation: Divorce has a serious effect on children: their family split and their daily routines, life expectations, and so on are upset.
generated explanation: <pad> <extra_id_0> If you do not keep an agreement, you have broken it. This is called a lie.<extra_id_1> </s>
#######
*******
question: What is best for a rosebush?
answer: temperate climate
our_explanation: If two spouses (generally a mother and a father) are getting divorced, their children often suffer: for example, they may undergo mid-transition moves, lose time with a parent, need to lower their expectations for what they receive financially and
generated explanation: <pad> <extra_id_0> Rosebushes grow best in temperate climates, e.g.,

generated explanation: <pad> <extra_id_0> Juice is a clear, sugary liquid that can be stored in large containers.<extra_id_1> </s>
#######
*******
question: Humans tend to have what round things that dart around?
answer: two eyes
our_explanation: People are tired after a day of activity, and so they go to bed to allow for rest and renewal.
generated explanation: <pad> <extra_id_0> Humans are characterized by having two eyes, which are like little cameras that dart around, capturing images of the world around them.<extra_id_1> </s>
#######
*******
question: If you live by the shore than you probably live in a what?
answer: seaside town
our_explanation: Birthday parties often use balloons to decorate, so balloons are something you might buy for a birthday party.
generated explanation: <pad> <extra_id_0> Seaside towns are located near the sea, and if you live by the sea, you probably live in a seaside town.<extra_id_1> </s>
#######
*******
question: What is someone likely to do when tryin

generated explanation: <pad> <extra_id_0> Punishing someone for making a mistake is often seen as a form of retribution. Retribution usually involves financial penalties, but it could also involve physical punishment.<extra_id_1> </s>
#######
*******
question: Why does grooming require careful consideration.?
answer: mistakes
our_explanation: To have a "better day," one might gain more energy and mental clarity by taking the time to have a good breakfast.
generated explanation: <pad> <extra_id_0> Mistakes can be made when grooming, and mistakes can be costly.<extra_id_1> </s>
#######
*******
question: If you tried really hard to go to sleep sooner than normal, you would probably be trying to also what?
answer: get up early
our_explanation: Captured elephants are kept in zoos for a variety of reasons, including peoples' enjoyment of them and preserving them.
generated explanation: <pad> <extra_id_0> If you try to go to sleep early, you are also probably trying to get up early.<extra_id_

generated explanation: <pad> <extra_id_0> Networking is a way of connecting computers so that they can share resources, such as files and printers, with each other.<extra_id_1> </s>
#######
*******
question: John plays brass instruments in a band that has a lot of diverse people in it and many different instruments.   They mostly play at clubs.  What sort of band might he be in.
answer: jazz band
our_explanation: She wanted to matter to him, and she wanted him to care about her.
generated explanation: <pad> <extra_id_0> Jazz bands are often composed of many different people who play different instruments. Jazz bands often play at clubs, and clubs are often very diverse.<extra_id_1> </s>
#######
*******
question: Where is a good place for a lawn chair?
answer: porch
our_explanation: Acoustic guitars can have their "sound" amplified through the use of headphones. People who like to play musical instruments often play in the recording studio, as the sound engineers there know what good ac

generated explanation: <pad> <extra_id_0> On top of a mountain, the air is thin, and it's therefore harder to breathe.<extra_id_1> </s>
#######
*******
question: What is someone who gets angry after getting drunk likely to participate in?
answer: violence
our_explanation: People use money to buy things, e.g., beer. Being "broke" refers to not having enough money in the bank. So, somebody might not be able to buy beer because they don't have enough cash to make the purchase.
generated explanation: <pad> <extra_id_0> When people are angry, they are likely to do things that cause harm to themselves or to others.<extra_id_1> </s>
#######
*******
question: Where would you buy wearable gold?
answer: jewelry store
our_explanation: When resources are limited for meeting endless demands, the people involved in the competition will experience stress.
generated explanation: <pad> <extra_id_0> Gold jewelry is made from pure gold, and therefore, is wearable. Therefore, gold jewelry can be purchased

generated explanation: <pad> <extra_id_0> A person who is accused of committing a murder is likely to face jail time.<extra_id_1> </s>
#######
*******
question: The person forgot to put a lid on the garbage, raccoons knocked all of it out of the what?
answer: container
our_explanation: Synagogues are places where people pray, and so floors in synagogues are likely to be kept clean for holy purposes.
generated explanation: <pad> <extra_id_0> Garbage is usually disposed of in containers that have lids. If a lid is not present, garbage can be thrown away without risk of raccoons gaining access to it.<extra_id_1> </s>
#######
*******
question: The power went out, so why did the family use a candle?
answer: emit light
our_explanation: to choose a number on the roulette wheel, you have to pick a color; numbers on the wheel are arranged in a circle.
generated explanation: <pad> <extra_id_0> Candles emit light, and if the power went out, the family might have been without power.<extra_id_1> </

generated explanation: <pad> <extra_id_0> If a child is sick, it's likely that they will stay home from school, rather than going to school.<extra_id_1> </s>
#######
*******
question: The graphic designer worked in tile, these digital tiles made up the maps of a what?
answer: computer game
our_explanation: Trees are rooted into the soil, and a garden wall limits how far the tree's roots go.
generated explanation: <pad> <extra_id_0> Computer games are played using maps, which are created digitally using a graphic design program.<extra_id_1> </s>
#######
*******
question: Where could you put some olives if you want to cook them?
answer: pizza
our_explanation: Sam articulated that he wanted answers. He asked his friends for answers for the answers. Answers to important things.
generated explanation: <pad> <extra_id_0> Olives are often added to pizzas, and pizzas are often baked.<extra_id_1> </s>
#######
*******
question: Even though she implored he reply, what did the recipient of the ema

generated explanation: <pad> <extra_id_0> The bald eagle is the national emblem of the United States of America.<extra_id_1> </s>
#######
*******
question: What events are typical, expected and not divine?
answer: humdrum
our_explanation: A rosebush requires a temperate climate to flourish.
generated explanation: <pad> <extra_id_0> Humdrum events are those that are typical, expected and not divine.<extra_id_1> </s>
#######
*******
question: If a person feels they are getting a job but they never get the offer, how might they feel?
answer: let down
our_explanation: A rosebush is likely to grow best in a temperate climate.
generated explanation: <pad> <extra_id_0> If a person is expecting to get a job offer, he or she might feel let down if they do not get the job.<extra_id_1> </s>
#######
*******
question: Sam came to despise his sister.  On the other hand, she felt what for him?
answer: admire
our_explanation: An adrenaline rush is a rush of hormones that are involved in responding to 

generated explanation: <pad> <extra_id_0> A parcel can weigh up to 3 pounds, and a return address label is usually attached to a parcel.<extra_id_1> </s>
#######
*******
question: What kind of water comes from a faucet?
answer: room temperature
our_explanation: Steel cables are often produced in factories.
generated explanation: <pad> <extra_id_0> Water that comes from a faucet is usually room temperature.<extra_id_1> </s>
#######
*******
question: Something that you need to have inside of you when opening a business is a lot of?
answer: determination
our_explanation: Steel cables are usually only finished in factories, by factory workers. ​
generated explanation: <pad> <extra_id_0> Opening a business requires a lot of determination.<extra_id_1> </s>
#######
*******
question: Where might someone drive through rows of apple tree?
answer: countryside
our_explanation: Juice is a thing that can be contained in a large container for keeping for some period of time.
generated explanation: <p

KeyboardInterrupt: 

In [37]:
train_output = trainer.evaluate(our_data_splits['train'].select([0,1,2,3,4]))
train_output

The following columns in the evaluation set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: answer, question, choices, extractive_explanation, abstractive_explanation, question_encoding, our_explanation, id.
***** Running Evaluation *****
  Num examples = 5
  Batch size = 8


{'eval_loss': 0.22494062781333923,
 'eval_runtime': 8.4715,
 'eval_samples_per_second': 0.59,
 'eval_steps_per_second': 0.118,
 'epoch': 10.0}

In [31]:
results = {}
if training_args.do_eval:
    start_time = time.time()
    logger.info("*** Evaluate on train set***")
    logger.info(len(data_splits['train']))
    train_output = trainer.evaluate(our_data_splits['train'])
    perplexity = math.exp(train_output["eval_loss"])
    results["perplexity_train"] = perplexity

    # repeat
    logger.info("*** Evaluate on dev set***")
    logger.info(len(data_splits['validation']))
    eval_output = trainer.evaluate(data_splits['validation'])
    perplexity = math.exp(eval_output["eval_loss"])
    results["perplexity_validation"] = perplexity

11/20/2022 15:31:31 - INFO - __main__ -   *** Evaluate on train set***
11/20/2022 15:31:31 - INFO - __main__ -   115
The following columns in the evaluation set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: answer, question, choices, extractive_explanation, abstractive_explanation, question_encoding, our_explanation, id.
***** Running Evaluation *****
  Num examples = 115
  Batch size = 8


Error: You must call wandb.init() before wandb.log()

In [38]:
data_args.generations_filepath

In [41]:
trainer.state.best_model_checkpoint

In [43]:
training_args.output_dir

'./cos_e_output_t5_3b/112022_115119'