In [1]:
from datasets import load_dataset
import json

DATASET_NAME = "arc_ir"
orders = ["train", "validation", "test"]
dataset = {}
dataset["easy"] = load_dataset("allenai/ai2_arc", "ARC-Easy", split=orders)
dataset["challenge"] = load_dataset("allenai/ai2_arc", "ARC-Challenge", split=orders)

In [2]:
from tqdm import tqdm

parent = "ARC-DA-v1.1"
data_ir = [[] for x in orders]
for idx, order in enumerate(tqdm(orders)):
    for a in tqdm(open(f"{parent}/{order}.jsonl")):
        data_ir[idx].append(json.loads(a))

1250it [00:00, 71916.82it/s]?, ?it/s]
338it [00:00, 78428.57it/s]
1397it [00:00, 74465.19it/s]
100%|██████████| 3/3 [00:00<00:00, 54.57it/s]


In [3]:
data_ir[0][0]

{'question_id': 'ARCEZ_Mercury_7221148',
 'tag': 'EASY-TRAIN',
 'question': "A baby kit fox grows to become an adult with a mass of over 3.5 kg. What factor will have the greatest influence on this kit fox's survival?",
 'answers': ['habitat',
  'amount of predators around',
  'how smart the fox is',
  'the population of predator in the area',
  "the conditions of the fox's habitat",
  'the availability of food',
  'larger predators prevalence',
  'food sources',
  'food availability',
  'availability of food']}

In [4]:
# ['easy' if 'ARCEZ' in ques_id else 'challenge']
clue_map = {}


def cleanit(x):
    if x[-1] == ".":
        return x[:-1]
    return x


for portion_idx in [0, 1, 2]:
    for clue in data_ir[portion_idx]:
        ques_id = clue["question_id"]
        clean = list(set([cleanit(x) for x in clue["answers"]]))
        clue_map[ques_id[ques_id.index("_") + 1 :]] = clean

# for x in clue_map:
#     print(x, len(clue_map[x]))

In [5]:
# portion_idx=0
# for kind in ['easy','challenge']:
#     print(f'for {kind}:')
#     for portion_idx in [0,1,2]:
#         count=0
#         for json_line in dataset[kind][portion_idx]:

#         print(f"portion {portion_idx}, len{len(dataset[kind][portion_idx])}, count{count}")

In [6]:
def get_clue(json_line):
    x = json_line["id"]
    if x in clue_map:
        choices = json_line["choices"]
        answer_key = json_line["answerKey"][0]
        answer_key_idx = ord(answer_key) - (
            ord("A") if answer_key in "ABCDE" else ord("1")
        )
        answer_text = cleanit(choices["text"][answer_key_idx])
        # if 'atom' in answer_text:
        #     print(f"Wrong-ness########### '{answer_text}', clue:'{clue_map[x]}'")
        if answer_text in clue_map[x]:
            return [k for k in clue_map[x] if k != answer_text]
        return clue_map[x]
    return None

In [7]:
def get_prompt(json_line, has_choice=False):
    # try:
    question = json_line["question"]
    choices = json_line["choices"]
    choice_texts = choices["text"]
    perms = list(range(len(choice_texts)))
    choice_texts = [cleanit(choice_texts[perms[i]]) for i in range(len(choice_texts))]
    candidates = " ".join(
        [
            f"({label if has_choice else ' '}) {text}"
            for text, label in zip(choice_texts, choices["label"])
        ]
    ).replace("\n", " ")
    answer_key = json_line["answerKey"][0]
    answer_key_idx = ord(answer_key) - (ord("A") if answer_key in "ABCDE" else ord("1"))
    answer_text = choice_texts[answer_key_idx]
    clue = get_clue(json_line)
    fact = f"Clue: {', '.join(clue)}. " if clue else ""
    prompt = f"{fact}Question: {question} \\n {candidates}"
    # if clue:
    #     print(prompt,'Answer: ', answer_text)
    return prompt, answer_text


# except:
#     print(answer_key)

In [8]:
from tqdm import tqdm
import pickle

container_train = []
for key, datas in dataset.items():
    container_train.extend([get_prompt(x, has_choice=False) for x in tqdm(datas[0])])
    container_train.extend([get_prompt(x, has_choice=False) for x in tqdm(datas[1])])
pickle.dump(container_train, open(f"{DATASET_NAME}_train.pkl", "wb"))
for key, datas in dataset.items():
    pickle.dump(
        [get_prompt(x, has_choice=False) for x in tqdm(datas[2])],
        open(f"{DATASET_NAME}_{key}_test.pkl", "wb"),
    )

100%|██████████| 2251/2251 [00:00<00:00, 11266.82it/s]
100%|██████████| 570/570 [00:00<00:00, 16492.50it/s]
100%|██████████| 1119/1119 [00:00<00:00, 16795.41it/s]
100%|██████████| 299/299 [00:00<00:00, 16109.14it/s]
100%|██████████| 2376/2376 [00:00<00:00, 11959.18it/s]
100%|██████████| 1172/1172 [00:00<00:00, 13758.51it/s]


In [9]:
[x for x in container_train[0] if "Clue" in x]

[]