In [1]:
import pandas as pd
import numpy as np
import datasets
from sklearn.model_selection import GroupShuffleSplit

## Load fresh MultiRC Dataset

I want to train with the passage string and question string, which are not contained in the old MultiRC dataset at `../../bin/multirc_dataset.hf`.

I can't find any records of how the old MultiRC dataset was constructed, and I can't reproduce its splits. It seems that some candidate answers may have been generated with an LLM. Most of the reference answers overlap with those found in the MultiRC dataset, but some questions share the same correct answer, so matching the old records back to MultiRC by answer text is not reliable.

So I need to create a fresh version of the dataset. This dataset will not be suitable for testing previous models, as I cannot ensure that the test set has no overlap with the training data.

In [2]:
# Load every available split of the SuperGLUE MultiRC task into `multirc_dd`.
# `trust_remote_code=True` lets `datasets` run the dataset's loading script.
multirc_dd = datasets.load_dataset(
    path="super_glue",
    name="multirc",
    trust_remote_code=True,
)

In [7]:
# The test split has no labels, so keep only the labelled splits. Selecting them by
# name avoids converting the unused test split to pandas and does not depend on the
# order in which the DatasetDict happens to list its splits.
multirc_raw = pd.concat(
    [multirc_dd[split].to_pandas() for split in ("train", "validation")],
    ignore_index=True,  # avoid duplicate index labels across the two splits
)

# Expand the `idx` struct into flat ID columns. Building a single DataFrame from the
# list of dicts is far cheaper than a row-wise `.apply(pd.Series)`, and selecting by
# key does not rely on the order of the keys inside each dict.
idx_parts = pd.DataFrame(multirc_raw["idx"].tolist(), index=multirc_raw.index)

# NOTE(review): the `idx` values look like per-split counters, so the same passage_id
# may denote different passages in train vs. validation -- confirm before using these
# IDs as globally unique keys. Grouping by passage_id still keeps each passage whole.
multirc = (
    multirc_raw
    .assign(
        passage_id=idx_parts["paragraph"],
        question_id=idx_parts["question"],
        answer_id=idx_parts["answer"],
    )
    .rename(columns={"paragraph": "passage"})
    .drop(columns=["idx"])
)

In [8]:
# Count how many rows carry each answer string: the same answer text is reused
# across many different questions.
answer_counts = multirc.groupby("answer").size()
answer_counts.sort_values(ascending=False).head()

answer
Yes    303
No     276
3       58
2       50
4       43
dtype: int64

In [12]:
# Peek at one row; the fixed seed keeps the displayed row stable across re-runs.
multirc.sample(1, random_state=42)

Unnamed: 0,passage,question,answer,label,passage_id,question_id,answer_id
14748,Triumph and Disaster: The 20th century saw a s...,What resulted in the attack on the American fl...,The Japanese invasion of Britian,0,251,2795,14748


## Sample

In [16]:
def partition_by_passage(df, train_size=0.7, dev_size=0.15, test_size=0.15, random_state=42,
                         group_col="passage_id"):
    """
    Split a DataFrame into train, dev, and test sets while ensuring that all rows
    sharing the same value in `group_col` stay in the same partition.

    The split is drawn over the *unique group values* (passages), not over rows, so
    the row counts of the resulting partitions only approximate the requested
    fractions when groups have different sizes.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to partition. Must contain the column named by `group_col`.
    train_size : float
        Fraction of the unique groups assigned to the train partition.
    dev_size, test_size : float
        Relative weights used to divide the groups that were *not* assigned to
        train. Only their ratio matters: dev receives
        `dev_size / (dev_size + test_size)` of the remaining groups.
    random_state : int
        Seed used for both shuffles, which makes the partition reproducible.
    group_col : str
        Name of the column whose values must not be shared between partitions.

    Returns
    -------
    datasets.DatasetDict
        A Hugging Face DatasetDict with "train", "dev", and "test" splits. Rows keep
        their original relative order and the pandas index is not stored.

    Raises
    ------
    ValueError
        If `dev_size` or `test_size` is not strictly positive.
    """
    if dev_size <= 0 or test_size <= 0:
        raise ValueError(
            f"dev_size and test_size must both be positive, got {dev_size!r} and {test_size!r}"
        )

    def _split_ids(ids, first_fraction):
        # Shuffle-split an array of unique IDs into two disjoint ID arrays; each ID is
        # its own group, so `first_fraction` is a fraction of the IDs.
        splitter = GroupShuffleSplit(n_splits=1, train_size=first_fraction, random_state=random_state)
        first_idx, second_idx = next(splitter.split(np.arange(len(ids)), groups=ids))
        return ids[first_idx], ids[second_idx]

    group_ids = df[group_col].unique()

    # First split: train vs. everything held out for dev + test.
    train_ids, holdout_ids = _split_ids(group_ids, train_size)

    # Second split: divide the held-out groups between dev and test.
    dev_ids, test_ids = _split_ids(holdout_ids, dev_size / (dev_size + test_size))

    ids_by_split = {"train": train_ids, "dev": dev_ids, "test": test_ids}
    return datasets.DatasetDict({
        split_name: datasets.Dataset.from_pandas(
            df[df[group_col].isin(split_ids)], preserve_index=False
        )
        for split_name, split_ids in ids_by_split.items()
    })

dd = partition_by_passage(multirc)

# Sanity checks: no passage_id may appear in more than one split, and every row of
# `multirc` must land in exactly one split.
passages_by_split = {name: set(split["passage_id"]) for name, split in dd.items()}
assert passages_by_split["train"].isdisjoint(passages_by_split["dev"]), "train/dev share passages"
assert passages_by_split["train"].isdisjoint(passages_by_split["test"]), "train/test share passages"
assert passages_by_split["dev"].isdisjoint(passages_by_split["test"]), "dev/test share passages"
assert sum(split.num_rows for split in dd.values()) == len(multirc), "rows were lost or duplicated"

dd

DatasetDict({
    train: Dataset({
        features: ['passage', 'question', 'answer', 'label', 'passage_id', 'question_id', 'answer_id'],
        num_rows: 21785
    })
    dev: Dataset({
        features: ['passage', 'question', 'answer', 'label', 'passage_id', 'question_id', 'answer_id'],
        num_rows: 4770
    })
    test: Dataset({
        features: ['passage', 'question', 'answer', 'label', 'passage_id', 'question_id', 'answer_id'],
        num_rows: 5536
    })
})

In [19]:
# Display the feature schema (column names and value types) of the train split.
dd["train"].features

{'passage': Value(dtype='string', id=None),
 'question': Value(dtype='string', id=None),
 'answer': Value(dtype='string', id=None),
 'label': Value(dtype='int64', id=None),
 'passage_id': Value(dtype='int64', id=None),
 'question_id': Value(dtype='int64', id=None),
 'answer_id': Value(dtype='int64', id=None)}

In [17]:
# Write the partitioned DatasetDict to disk at a path relative to this notebook.
output_dir = "../../data/multirc_contrastive_pairs.hf"
dd.save_to_disk(output_dir)

Saving the dataset (0/1 shards):   0%|          | 0/21785 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4770 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5536 [00:00<?, ? examples/s]