In [16]:
from pathlib import Path
import tarfile
import json

import pandas as pd
import datasets

## Load the Compressed JSON Archive into a DataFrame

In [56]:
archive_path = Path("../../data/RACE.tar.gz")

# One dict per passage file: its split/level (taken from the member path)
# merged with the JSON payload stored in the file.
items = []

with tarfile.open(archive_path, "r:gz") as tar:
    for member in tar.getmembers():
        # Only regular .txt members are parsed; directories and other files are skipped.
        if not member.isfile() or not member.name.endswith(".txt"):
            continue

        # Member paths are expected to look like <root>/<split>/<level>/<file>.txt;
        # any other depth raises ValueError here instead of being silently mislabeled.
        _, split, lvl, _ = member.name.split("/")

        f = tar.extractfile(member)
        if f is None:
            continue

        # Close the extracted file handle as soon as its JSON has been parsed.
        with f:
            # Keys from the JSON payload win if they collide with "split"/"level".
            items.append({"split": split, "level": lvl, **json.load(f)})

df = pd.DataFrame(items)

# Raw string: "\d" in a plain literal is an invalid escape sequence.
# expand=False returns a Series (one capture group) rather than a one-column DataFrame.
df["passage_id"] = df["id"].str.extract(r"(\d+)", expand=False)

# Two rows/passages have no questions or options
empty_options = df["options"].map(len) == 0
df = df[~empty_options]

In [58]:
# Give every question its own row: the three parallel list columns are
# exploded together, the per-question columns get singular names, and the
# fresh 0..n-1 index is labeled "idx" before the table is written out.
df = (
    df.explode(["answers", "options", "questions"])
    .reset_index(drop=True)
    .rename(
        columns={
            "id": "filename",
            "answers": "answer",
            "questions": "question",
        }
    )
    .rename_axis("idx")
)
df.to_parquet("../../data/RACE.parquet")

In [59]:
# Sanity check: tally how many options each item carries (expected: always four)
df["options"].apply(len).value_counts()

options
4    97687
Name: count, dtype: int64

In [61]:
# Peek at one randomly chosen item (the row shown changes from run to run)
df.sample(n=1)

Unnamed: 0_level_0,split,level,answer,options,question,article,filename,passage_id
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
75449,train,middle,B,"[happy, angry, sad, sorry]",The teacher was very _ when he saw Li Lei's...,One day the students were having painting less...,middle5610.txt,5610


## Convert to Contrastive Pairs

For each item, create one row per answer option (four rows per item in RACE): the correct option is labeled `1` and each distractor is labeled `0`.

In [63]:
def construct_pairs(df):
    """Expand each multiple-choice item into one labeled row per answer option.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per item, with columns ``split``, ``level``, ``article``,
        ``question``, ``options`` (a sequence of option texts), ``answer``
        (the letter of the correct option: "A" for the first option, "B" for
        the second, and so on) and ``passage_id``. The frame's index value is
        used as the item identifier.

    Returns
    -------
    pandas.DataFrame
        One row per (item, option) with columns ``split``, ``level``,
        ``passage``, ``question``, ``answer`` (the option text), ``label``
        (1 if the option is the correct one, else 0), ``passage_id`` and
        ``item_id``. The index is a fresh 0..n-1 range named ``option_id``.
        An empty input yields an empty frame that still has these columns.
    """
    columns = [
        "split",
        "level",
        "passage",
        "question",
        "answer",
        "label",
        "passage_id",
        "item_id",
    ]
    records = []

    for row in df.itertuples():
        for position, option in enumerate(row.options):
            # Derive the option letter from its position instead of indexing a
            # fixed four-letter list, so items with more options do not fail.
            letter = chr(ord("A") + position)

            records.append({
                "split": row.split,
                "level": row.level,
                "passage": row.article,
                "question": row.question,
                "answer": option,
                "label": int(letter == row.answer),
                "passage_id": row.passage_id,
                "item_id": row.Index,
            })

    # Passing the column list keeps the schema stable even when there are no records.
    transformed_df = pd.DataFrame(records, columns=columns)
    transformed_df.index.name = "option_id"

    return transformed_df

# Build the option-level table (one row per item option) and preview it
contrastive_df = construct_pairs(df=df)
contrastive_df

Unnamed: 0_level_0,split,level,passage,question,answer,label,passage_id,item_id
option_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,test,high,The rain had continued for a week and the floo...,What did Nancy try to do before she fell over?,Measure the depth of the river,0,19432,0
1,test,high,The rain had continued for a week and the floo...,What did Nancy try to do before she fell over?,Look for a fallen tree trunk,0,19432,0
2,test,high,The rain had continued for a week and the floo...,What did Nancy try to do before she fell over?,Protect her cows from being drowned,1,19432,0
3,test,high,The rain had continued for a week and the floo...,What did Nancy try to do before she fell over?,Run away from the flooded farm,0,19432,0
4,test,high,The rain had continued for a week and the floo...,The following are true according to the passag...,It took Lizzie and Nancy about 20 minutes to g...,0,19432,1
...,...,...,...,...,...,...,...,...
390743,dev,middle,Everyone has got two personalities --- the one...,Tina hardly tells her secrets to her friends ....,on her side,0,7740,97685
390744,dev,middle,Everyone has got two personalities --- the one...,What does the passage tell us ?,Sleeping on you side is the best way of sleepi...,0,7740,97686
390745,dev,middle,Everyone has got two personalities --- the one...,What does the passage tell us ?,Changing positions will cause sleeping problems .,0,7740,97686
390746,dev,middle,Everyone has got two personalities --- the one...,What does the passage tell us ?,Sleeping positions show people's secret person...,1,7740,97686


In [64]:
# One Dataset per split, each built from the rows whose "split" column matches
dd = datasets.DatasetDict({
    split_name: datasets.Dataset.from_pandas(
        contrastive_df[contrastive_df["split"] == split_name]
    )
    for split_name in ("train", "dev", "test")
})
dd

DatasetDict({
    train: Dataset({
        features: ['split', 'level', 'passage', 'question', 'answer', 'label', 'passage_id', 'item_id', 'option_id'],
        num_rows: 351464
    })
    dev: Dataset({
        features: ['split', 'level', 'passage', 'question', 'answer', 'label', 'passage_id', 'item_id', 'option_id'],
        num_rows: 19548
    })
    test: Dataset({
        features: ['split', 'level', 'passage', 'question', 'answer', 'label', 'passage_id', 'item_id', 'option_id'],
        num_rows: 19736
    })
})

In [65]:
# Write the DatasetDict (all of its splits) to disk under the data directory
dd.save_to_disk("../../data/RACE_contrastive_pairs.hf")

Saving the dataset (0/2 shards):   0%|          | 0/351464 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/19548 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/19736 [00:00<?, ? examples/s]