In [1]:
from datasets import load_dataset
import pandas as pd
import numpy as np

import random

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset = load_dataset('ms_marco', 'v1.1')

In [3]:
dataset


DatasetDict({
    validation: Dataset({
        features: ['answers', 'passages', 'query', 'query_id', 'query_type', 'wellFormedAnswers'],
        num_rows: 10047
    })
    train: Dataset({
        features: ['answers', 'passages', 'query', 'query_id', 'query_type', 'wellFormedAnswers'],
        num_rows: 82326
    })
    test: Dataset({
        features: ['answers', 'passages', 'query', 'query_id', 'query_type', 'wellFormedAnswers'],
        num_rows: 9650
    })
})

In [4]:
train_data = dataset['train']
validation_data = dataset['validation']
test_data = dataset['test']

In [5]:
train_df = train_data.to_pandas()

In [6]:
train_df

Unnamed: 0,answers,passages,query,query_id,query_type,wellFormedAnswers
0,[Results-Based Accountability is a disciplined...,"{'is_selected': [0, 0, 0, 0, 0, 1, 0, 0, 0, 0]...",what is rba,19699,description,[]
1,[Yes],"{'is_selected': [0, 1, 0, 0, 0, 0, 0], 'passag...",was ronald reagan a democrat,19700,description,[]
2,[20-25 minutes],"{'is_selected': [0, 0, 0, 0, 1, 0, 0, 0, 0, 0]...",how long do you need for sydney and surroundin...,19701,numeric,[]
3,[$11 to $22 per square foot],"{'is_selected': [0, 0, 0, 0, 0, 0, 0, 0, 1], '...",price to install tile in shower,19702,numeric,[]
4,[Due to symptoms in the body],"{'is_selected': [0, 0, 1, 0, 0, 0, 0, 0], 'pas...",why conversion observed in body,19703,description,[]
...,...,...,...,...,...,...
82321,[The act or action of propagating as a increas...,"{'is_selected': [1, 0, 0], 'passage_text': ['d...",meaning of propagation,102124,description,[]
82322,[Yes],"{'is_selected': [0, 0, 1, 0, 0, 0, 0, 0, 0], '...",do you have to do a phd to be a clinical psych...,102125,description,[]
82323,[Chablis],"{'is_selected': [0, 1, 0, 0, 0, 0], 'passage_t...",what wine goes with oysters,102126,entity,[]
82324,[1 Lithium carbonate 150 mg capsules. Lithium ...,"{'is_selected': [0, 0, 0, 1, 0, 0, 0, 0, 0], '...",what strengths does lithium come in,102127,description,[]


In [7]:
t_df  = train_df[:10]
t_df

Unnamed: 0,answers,passages,query,query_id,query_type,wellFormedAnswers
0,[Results-Based Accountability is a disciplined...,"{'is_selected': [0, 0, 0, 0, 0, 1, 0, 0, 0, 0]...",what is rba,19699,description,[]
1,[Yes],"{'is_selected': [0, 1, 0, 0, 0, 0, 0], 'passag...",was ronald reagan a democrat,19700,description,[]
2,[20-25 minutes],"{'is_selected': [0, 0, 0, 0, 1, 0, 0, 0, 0, 0]...",how long do you need for sydney and surroundin...,19701,numeric,[]
3,[$11 to $22 per square foot],"{'is_selected': [0, 0, 0, 0, 0, 0, 0, 0, 1], '...",price to install tile in shower,19702,numeric,[]
4,[Due to symptoms in the body],"{'is_selected': [0, 0, 1, 0, 0, 0, 0, 0], 'pas...",why conversion observed in body,19703,description,[]
5,[Inside the rib cage.],"{'is_selected': [0, 0, 0, 0, 1, 0, 0, 0, 0, 0]...",where are the lungs located in the back,19704,location,[]
6,[The most expensive patents are international ...,"{'is_selected': [0, 0, 0, 0, 0, 1, 0, 0], 'pas...",cost to get a patent,19705,numeric,[]
7,[],"{'is_selected': [0, 0, 0, 0, 0, 0, 0, 0, 0], '...",what does a metabolic acidosis need to reverse...,19706,description,[]
8,"[Sophocles, Aeschylus and Euripides]","{'is_selected': [0, 0, 0, 1, 0, 0, 0, 0, 0], '...",best tragedies of ancient greece,19707,entity,[]
9,[A tree or shrub which produces distinctive co...,"{'is_selected': [0, 0, 0, 1, 0, 0, 0, 0, 0], '...",what is a conifer,19708,description,[]


In [8]:
t_df['passages'][1]

{'is_selected': array([0, 1, 0, 0, 0, 0, 0], dtype=int32),
 'passage_text': array(['In his younger years, Ronald Reagan was a member of the Democratic Party and campaigned for Democratic candidates; however, his views grew more conservative over time, and in the early 1960s he officially became a Republican. In November 1984, Ronald Reagan was reelected in a landslide, defeating Walter Mondale and his running mate Geraldine Ferraro (1935-), the first female vice-presidential candidate from a major U.S. political party.',
        "From Wikipedia, the free encyclopedia. A Reagan Democrat is a traditionally Democratic voter in the United States, especially a white working-class Northerner, who defected from their party to support Republican President Ronald Reagan in either or both the 1980 and 1984 elections. During the 1980 election a dramatic number of voters in the U.S., disillusioned with the economic 'malaise' of the 1970s and the presidency of Jimmy Carter (even more than, four yea

In [9]:
#  Initialize an empty list or dictionary to store combined passages
all_docs = []

# Iterate over each row in the DataFrame
for index, row in t_df.iterrows():
    passages = row['passages']['passage_text']
    
    # Step 3: Store passages and their corresponding indices
    for i, passage in enumerate(passages):
        all_docs.append({'passage_text': passage, 'index': len(all_docs)})

# Function to select irrelevant passages
def select_irrelevant_passages(relevant_passages):
    # Get the size of relevant passages
    size = len(relevant_passages) # 7
    
    # Filter out passages that are present in relevant_passages
    irrelevant_passages_indices = [i for i in range(len(all_docs)) if all_docs[i]['passage_text'] not in relevant_passages]
    
    # Select irrelevant passages randomly from filtered passages
    irrelevant_indices = np.random.choice(irrelevant_passages_indices, size, replace=False)
    irrelevant_passages = [all_docs[i]['passage_text'] for i in irrelevant_indices]
    
    return irrelevant_passages

print('length of all_docs',len(all_docs))


length of all_docs 89


In [10]:
all_docs[1]['passage_text']

"The Reserve Bank of Australia (RBA) came into being on 14 January 1960 as Australia 's central bank and banknote issuing authority, when the Reserve Bank Act 1959 removed the central banking functions from the Commonwealth Bank. The assets of the bank include the gold and foreign exchange reserves of Australia, which is estimated to have a net worth of A$101 billion. Nearly 94% of the RBA's employees work at its headquarters in Sydney, New South Wales and at the Business Resumption Site."

In [11]:
# Example usage:
relevant_indices = t_df['passages'][1]['passage_text'].tolist() # Example relevant passage indices
print(relevant_indices)

['In his younger years, Ronald Reagan was a member of the Democratic Party and campaigned for Democratic candidates; however, his views grew more conservative over time, and in the early 1960s he officially became a Republican. In November 1984, Ronald Reagan was reelected in a landslide, defeating Walter Mondale and his running mate Geraldine Ferraro (1935-), the first female vice-presidential candidate from a major U.S. political party.', "From Wikipedia, the free encyclopedia. A Reagan Democrat is a traditionally Democratic voter in the United States, especially a white working-class Northerner, who defected from their party to support Republican President Ronald Reagan in either or both the 1980 and 1984 elections. During the 1980 election a dramatic number of voters in the U.S., disillusioned with the economic 'malaise' of the 1970s and the presidency of Jimmy Carter (even more than, four years earlier, Liberal Republican Gerald Ford), supported former California governor (and for

In [12]:
irrelevant_passages = select_irrelevant_passages(relevant_indices)
print("\nIrrelevant Passages:")

# Print irrelevant passages formatted like relevant ones
for passage in irrelevant_passages:
    print(passage)


Irrelevant Passages:
Acidosis occurs when your kidneys and lungs can’t keep your body’s pH in balance. There are two types of acidosis--metabolic and respiratory. Metabolic acidosis occurs when your kidneys can’t get rid of acid buildup or when your body gets rid of too much base. Bases neutralize acids, and vice versa. Respiratory acidosis occurs when your lungs do not properly eliminate the carbon dioxide (CO2). 1 This base helps to keep the blood neutral. 2  Both diarrhea and vomiting can cause this type of acidosis. 3  Lactic acidosis occurs when there is too much lactic acid in your body. 4  Many things can cause a buildup of lactic acid.
your lungs are just inside your ribcage, from the chest in. there is your heart, stomach, spine, some arm muscles, and big arteries and veins between your lungs and your back … skin. there is just the ribcage and a couple big blood vessels between the lungs and the chest. 4 people found this useful. 
A rebuildable atomizer (RBA), often referred 

In [13]:
irrelevant_passages[1]['passage_text']

TypeError: string indices must be integers, not 'str'

# Making triplets

In [14]:
passages = []
n_passages = t_df['passages']
for i in range(len(n_passages)):
  passages.append(n_passages[i]['passage_text'])

In [15]:
def generate_triplets(queries, relevant_docs):
    triplets = []
    for i, query in enumerate(queries):
        relevant_doc = relevant_docs[i].tolist()
        negative_doc = select_irrelevant_passages(relevant_doc)
        triplets.append((query, relevant_doc, negative_doc))
    return triplets

triplets_train = generate_triplets(t_df['query'], passages)

In [16]:
triplets_train[1]

('was ronald reagan a democrat',
 ['In his younger years, Ronald Reagan was a member of the Democratic Party and campaigned for Democratic candidates; however, his views grew more conservative over time, and in the early 1960s he officially became a Republican. In November 1984, Ronald Reagan was reelected in a landslide, defeating Walter Mondale and his running mate Geraldine Ferraro (1935-), the first female vice-presidential candidate from a major U.S. political party.',
  "From Wikipedia, the free encyclopedia. A Reagan Democrat is a traditionally Democratic voter in the United States, especially a white working-class Northerner, who defected from their party to support Republican President Ronald Reagan in either or both the 1980 and 1984 elections. During the 1980 election a dramatic number of voters in the U.S., disillusioned with the economic 'malaise' of the 1970s and the presidency of Jimmy Carter (even more than, four years earlier, Liberal Republican Gerald Ford), supported

# Tokenize

In [17]:
import sentencepiece as spm

ModuleNotFoundError: No module named 'sentencepiece'

In [18]:
spm.SentencePieceTrainer.train('--input=wiki.txt --model_prefix=m --vocab_size=4771 --normalization_rule_name=nfkc_cf')
sp = spm.SentencePieceProcessor()
sp.load('m.model')

NameError: name 'spm' is not defined

In [None]:
sp.encode_as_pieces(triplets_train)

NameError: name 'triplets_train' is not defined

# prepare embeddings

In [None]:
# from sentence_transformers import SentenceTransformer
from gensim.models import Word2Vec

In [None]:
def prepare_embeddings(triplets):
    # Extract queries, relevant passages, and irrelevant passages from triplets
    queries = [triplet[0] for triplet in triplets]
    relevant_passages = [triplet[1] for triplet in triplets]
    irrelevant_passages = [triplet[2] for triplet in triplets]
    
    # Train Word2Vec model on combined passages
    combined_passages = relevant_passages + irrelevant_passages
    model = Word2Vec(combined_passages, min_count=1, workers=4)
    
    # Generate embeddings for queries, relevant passages, and irrelevant passages
    query_embeddings = [model.wv[query] for query in queries]
    relevant_embeddings = [model.wv[passage] for passage in relevant_passages]
    irrelevant_embeddings = [model.wv[passage] for passage in irrelevant_passages]
    
    return query_embeddings, relevant_embeddings, irrelevant_embeddings


In [None]:
prepare_embeddings(triplets=)