In [91]:
import json
import tarfile
from pathlib import Path

import datasets
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit

## Load fresh MultiRC Dataset

I want to train with the passage string and question string, which are not contained in the old MultiRC dataset at `../../bin/multirc_dataset.hf`.

I can't find any records of how the old MultiRC dataset was constructed, and I can't reproduce its splits. It seems that its candidate answers may have been generated with an LLM. Most of its reference answers overlap with those found in the MultiRC dataset, so the old records could in principle be matched back to MultiRC by reference answer, but some questions have the same correct answer, so this approach is not reliable.

So I need to create a fresh version of the dataset. This dataset will not be suitable for testing previous models, as I cannot ensure that the test set has no overlap with the training data.

In [78]:
# Load the "multirc" configuration of "super_glue"; the per-split datasets are
# combined into a single DataFrame in the next cell.
# NOTE(review): trust_remote_code=True presumably lets the dataset's loading
# script run locally -- confirm the source is trusted before re-running.
multirc_dd = datasets.load_dataset("super_glue", "multirc", trust_remote_code=True)

In [87]:
# The test set has no labels, so only the train and validation splits are kept.
# Splits are selected by name rather than by position, so the result does not
# depend on the order in which the DatasetDict lists them.
# Each split keeps its own row labels (no ignore_index), so labels may repeat.
multirc = pd.concat(
    [multirc_dd[split_name].to_pandas() for split_name in ("train", "validation")]
)

# Expand the `idx` column (one dict per row) into separate id columns. Keys are
# looked up by name, so the mapping does not rely on dict ordering, and building
# one frame from the list of dicts avoids the slow row-wise `apply(pd.Series)`.
# NOTE(review): if the ids in `idx` restart at 0 in each split, paragraph_id and
# question_id are not unique across train/validation -- confirm before using
# them as global identifiers.
idx_parts = pd.DataFrame(multirc["idx"].tolist(), index=multirc.index)
for id_column, idx_key in (
    ("paragraph_id", "paragraph"),
    ("question_id", "question"),
    ("answer_id", "answer"),
):
    # Assign raw values to sidestep index alignment on repeated row labels.
    multirc[id_column] = idx_parts[idx_key].to_numpy()
multirc = multirc.drop(["idx"], axis=1)

In [88]:
# Some answer strings appear across multiple questions: count rows per distinct
# `answer` value and show the five most frequent.
# Note that no filter on `label` is applied, so every candidate answer row is
# counted, not only the correct (reference) ones.
multirc.groupby("answer").size().sort_values(ascending=False).head()

answer
Yes    303
No     276
3       58
2       50
4       43
dtype: int64

In [89]:
# Peek at one randomly chosen row to check the columns after expanding `idx`.
# No random_state is passed, so a different row is shown on each run.
multirc.sample(1)

Unnamed: 0,paragraph,question,answer,label,paragraph_id,question_id,answer_id
6854,Dubai's Crown Prince Sheikh Mohamed Bin Rashid...,Which two other things does Dubai already stan...,Being a new trading center,0,112,1326,6854


## Sample

In [93]:
def partition_by_paragraph(df, train_size=0.7, dev_size=0.15, test_size=0.15, random_state=42):
    """
    Split a DataFrame into train, dev, and test sets while ensuring that all rows
    with the same paragraph_id stay in the same partition.

    The sizes apply to the number of *unique paragraphs*, not to rows, so the
    row counts of the resulting partitions only approximate the requested
    proportions when paragraphs contribute different numbers of rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``paragraph_id`` column.
    train_size : float or int
        Passed to ``GroupShuffleSplit`` as ``train_size`` for the first split.
    dev_size, test_size : float
        Relative weights used to divide the paragraphs left over after the
        train split: dev receives ``dev_size / (dev_size + test_size)`` of
        them and test receives the remainder.
    random_state : int
        Seed passed to both splitters.

    Returns
    -------
    datasets.DatasetDict
        With keys ``"train"``, ``"dev"`` and ``"test"``, in that order.

    Raises
    ------
    ValueError
        If ``dev_size`` or ``test_size`` is not positive.
    """
    import warnings

    if dev_size <= 0 or test_size <= 0:
        raise ValueError(
            f"dev_size and test_size must be positive, got dev_size={dev_size}, test_size={test_size}"
        )

    # test_size only enters through the dev/test ratio below, so fractions that
    # do not add up to 1 would otherwise silently yield other proportions.
    if isinstance(train_size, float):
        total = train_size + dev_size + test_size
        if abs(total - 1.0) > 1e-9:
            warnings.warn(
                f"train_size + dev_size + test_size = {total}, not 1; "
                "dev and test will share all paragraphs outside the train split "
                "in the ratio dev_size:test_size.",
                stacklevel=2,
            )

    # One row per unique paragraph ID, so each group holds exactly one row.
    paragraph_df = pd.DataFrame({'paragraph_id': df['paragraph_id'].unique()})

    # First split: train vs (dev+test)
    splitter_train = GroupShuffleSplit(n_splits=1, train_size=train_size, random_state=random_state)
    train_idx, temp_idx = next(splitter_train.split(paragraph_df, groups=paragraph_df['paragraph_id']))

    train_paragraph_ids = paragraph_df.iloc[train_idx]['paragraph_id'].values
    temp_paragraph_ids = paragraph_df.iloc[temp_idx]['paragraph_id'].values

    # Second split: dev vs test, drawn from the paragraphs not used for train.
    temp_paragraph_df = pd.DataFrame({'paragraph_id': temp_paragraph_ids})

    # Rescale dev_size so it is expressed relative to the held-out paragraphs.
    dev_ratio = dev_size / (dev_size + test_size)

    splitter_dev = GroupShuffleSplit(n_splits=1, train_size=dev_ratio, random_state=random_state)
    dev_idx, test_idx = next(splitter_dev.split(temp_paragraph_df, groups=temp_paragraph_df['paragraph_id']))

    dev_paragraph_ids = temp_paragraph_df.iloc[dev_idx]['paragraph_id'].values
    test_paragraph_ids = temp_paragraph_df.iloc[test_idx]['paragraph_id'].values

    partition_ids = {
        "train": train_paragraph_ids,
        "dev": dev_paragraph_ids,
        "test": test_paragraph_ids,
    }

    # Select each partition's rows from the original frame; the pandas index is
    # dropped so it does not become an extra dataset column.
    return datasets.DatasetDict({
        name: datasets.Dataset.from_pandas(
            df[df['paragraph_id'].isin(ids)], preserve_index=False
        )
        for name, ids in partition_ids.items()
    })

# Partition MultiRC with the function's default sizes (0.7 / 0.15 / 0.15) and
# default random_state (42), then display the resulting DatasetDict.
dd = partition_by_paragraph(multirc)
dd

DatasetDict({
    train: Dataset({
        features: ['paragraph', 'question', 'answer', 'label', 'paragraph_id', 'question_id', 'answer_id'],
        num_rows: 21785
    })
    dev: Dataset({
        features: ['paragraph', 'question', 'answer', 'label', 'paragraph_id', 'question_id', 'answer_id'],
        num_rows: 4770
    })
    test: Dataset({
        features: ['paragraph', 'question', 'answer', 'label', 'paragraph_id', 'question_id', 'answer_id'],
        num_rows: 5536
    })
})

In [56]:
# RACE: read every passage file from the archive into one DataFrame
# (one row per passage at this stage).
archive_path = Path("../../data/RACE.tar.gz")

items = []

with tarfile.open(archive_path, "r:gz") as tar:
    for member in tar.getmembers():
        if not member.isfile() or not member.name.endswith('.txt'):
            continue

        # The split and level are the two directories directly above the file;
        # taking the trailing parts works for any leading prefix depth.
        *_, split, lvl, _filename = member.name.split("/")

        f = tar.extractfile(member)
        if f is None:
            continue

        # Close the extracted file object once its JSON has been parsed.
        with f:
            item_dict = {"split": split, "level": lvl}
            item_dict.update(json.load(f))
        items.append(item_dict)

df = pd.DataFrame(items)

# Raw string avoids the invalid "\d" escape; expand=False returns a Series
# for the single capture group instead of a one-column DataFrame.
df["passage_id"] = df["id"].str.extract(r"(\d+)", expand=False)

# Two rows/passages have no questions or options
empty_options = df["options"].map(len) == 0
df = df[~empty_options]

In [58]:
# Turn the per-passage lists into one row per question, give the columns
# singular names, label the fresh integer index "idx", and persist the result.
column_renames = {
    "id": "filename",
    "answers": "answer",
    "questions": "question",
}
df = (
    df.explode(["answers", "options", "questions"])
    .reset_index(drop=True)
    .rename(columns=column_renames)
    .rename_axis("idx")
)
df.to_parquet("../../data/RACE.parquet")

In [59]:
# Sanity check: tally rows by the length of their `options` list.
# All items have four options, so a single bucket (4) is expected.
df["options"].map(len).value_counts()

options
4    97687
Name: count, dtype: int64

In [61]:
# Peek at one randomly chosen question row of the exploded RACE frame.
# No random_state is passed, so a different row is shown on each run.
df.sample(1)

Unnamed: 0_level_0,split,level,answer,options,question,article,filename,passage_id
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
75449,train,middle,B,"[happy, angry, sad, sorry]",The teacher was very _ when he saw Li Lei's...,One day the students were having painting less...,middle5610.txt,5610


In [94]:
# Persist the MultiRC train/dev/test DatasetDict built by partition_by_paragraph.
dd.save_to_disk("../../data/multirc_contrastive_pairs.hf")

Saving the dataset (0/1 shards):   0%|          | 0/21785 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4770 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5536 [00:00<?, ? examples/s]