In [1]:
import pandas as pd
import numpy as np

from datasets import load_dataset


  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Load the MS MARCO v1.1 splits used to build the training and evaluation triplets.
train_dataset = load_dataset(path='ms_marco', name='v1.1', split='train')
test_dataset = load_dataset(path='ms_marco', name='v1.1', split='test')

In [10]:
# view structure of the passages column in the dataset
# Index the row first and the column second: `test_dataset['passages'][0]` would
# decode the entire 'passages' column just to read a single element.
texts = test_dataset[0]['passages']
texts


{'is_selected': [0, 0, 1, 0, 0, 0, 0],
 'passage_text': ['We have been feeding our back yard squirrels for the fall and winter and we noticed that a few of them have missing fur. One has a patch missing down his back and under both arms. Also another has some missing on his whole chest. They are all eating and seem to have a good appetite.',
  'Critters cannot stand the smell of human hair, so sprinkling a barrier of hair clippings around your garden, or lightly working it into the soil when you plant bulbs, apparently does have some merit. The whole thing kind of makes me laugh. It never occurred to me that we are the ones that stink.',
  "Spread some human hair around your vegetable and flower gardens. This will scare the squirrels away because humans are predators of squirrels. It is better if the hair hasn't been washed so the squirrels will easily pick up the human scent.",
  '1 You can sprinkle blood meal around your garden as well. 2  Don’t trap and relocate squirrels. 3  This i

In [11]:
import random

def create_triplets(dataset, seed=None):
    """
    Create (query, positive_passage, negative_passage) triplets from the given dataset.

    The positive is the first passage of a row whose 'is_selected' label is 1.
    The negative is drawn uniformly at random from the passages of the whole
    dataset, rejecting any passage text that also appears in the current row.

    Args:
        dataset (iterable of dict-like rows): Each row must have 'query' and
            'passages' keys. 'passages' must contain the parallel lists
            'is_selected' and 'passage_text'. The dataset is iterated twice,
            so it must be re-iterable (e.g. a list or a datasets.Dataset).
        seed (int, optional): If given, sampling uses a private
            ``random.Random(seed)`` so the result is reproducible. If None
            (default), the module-level ``random`` state is used, so a prior
            ``random.seed(...)`` call still controls the outcome.

    Returns:
        list of tuples: Each tuple is (query, positive_passage, negative_passage).
        Rows without a positive passage are skipped, as are rows for which no
        passage outside the row exists to serve as a negative.
    """
    # Private generator when a seed is supplied; otherwise the shared global RNG.
    rng = random.Random(seed) if seed is not None else random

    # Pre-collect all passages for negative sampling
    all_passages = []
    for row in dataset:
        all_passages.extend(row['passages']['passage_text'])

    # Number of distinct passage texts in the pool, used for the feasibility
    # check below.
    n_unique_passages = len(set(all_passages))

    triplets = []

    for row in dataset:
        query = row['query']
        passages = row['passages']['passage_text']
        labels = row['passages']['is_selected']

        # Find the index of the positive passage
        if 1 not in labels:
            continue  # Skip if no positive passage
        positive = passages[labels.index(1)]

        # Set membership is O(1) per draw; the positive is itself one of the
        # row's passages, so excluding the row also excludes the positive.
        row_passages = set(passages)

        # Every passage of this row is part of the pool, so an eligible
        # negative exists only if the pool holds more distinct texts than the
        # row does. Without this guard the rejection loop below would never
        # terminate (e.g. for a single-row dataset).
        if len(row_passages) >= n_unique_passages:
            continue

        # Select a random negative passage (ensuring it's not from the same row)
        while True:
            negative = rng.choice(all_passages)
            if negative not in row_passages:
                break

        triplets.append((query, positive, negative))

    return triplets


In [12]:

# Negatives are sampled with the global `random` module; seed it first so the
# generated triplets are reproducible under Restart Kernel -> Run All.
SEED = 42
random.seed(SEED)

# Generate triplets
train_triplets = create_triplets(train_dataset)
test_triplets = create_triplets(test_dataset)

# Print a sample
for query, positive, negative in test_triplets[:3]:
    print(f"Query: {query}\nPositive: {positive}\nNegative: {negative}\n{'-'*40}")


Query: does human hair stop squirrels
Positive: Spread some human hair around your vegetable and flower gardens. This will scare the squirrels away because humans are predators of squirrels. It is better if the hair hasn't been washed so the squirrels will easily pick up the human scent.
Negative: Geography [edit]. Eastern Highlands Province is made up of rugged mountain terrain and broad valleys. It has low coastal areas in the Markham and Ramu valleys. The Province's two highest peaks, Mt Tabletop and Mt Michael are located on Kratke and Bismarck Range respectively. The province derives most of its revenue from the production of coffee. Eastern Highlands is the leading producer of coffee in the region. It produces large quantities of coffee annually for export.
----------------------------------------
Query: what are the benefits of fossil fuels
Positive: Benefits of fossil fuels. Fossil fuels are basically the remains of animals and plants and these are good energy resources. The th

In [13]:
from data_prep import save_triplets_to_json, load_triplets_from_json

# Save triplets
# Hand each split's (query, positive, negative) list to the project helper together
# with a relative output filename.
# NOTE(review): save_triplets_to_json lives in data_prep and is not shown here;
# presumably it serializes the tuples to JSON at the given path — confirm the
# on-disk format and whether existing files are overwritten.
save_triplets_to_json(train_triplets, "train_triplets.json")
save_triplets_to_json(test_triplets, "test_triplets.json")