In [1]:
!pip install transformers datasets torch torchvision torchtext


Collecting torchtext
  Downloading torchtext-0.18.0-cp310-cp310-manylinux1_x86_64.whl.metadata (7.9 kB)
Downloading torchtext-0.18.0-cp310-cp310-manylinux1_x86_64.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
[?25hInstalling collected packages: torchtext
Successfully installed torchtext-0.18.0


In [2]:
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from datasets import load_dataset
import random
import logging

# Set random seed for reproducibility
random.seed(42)


In [3]:
# Load datasets
max_samples = 10000  # Limit the number of samples for each dataset

hotpotqa_data = load_dataset("BeIR/hotpotqa-generated-queries", split="train").select(range(max_samples))
nq_data = load_dataset("BeIR/nq-generated-queries", split="train").select(range(max_samples))
fiqa_data = load_dataset("BeIR/fiqa-generated-queries", split="train").select(range(max_samples))

# Check the loaded datasets
print(hotpotqa_data)
print(nq_data)
print(fiqa_data)




README.md:   0%|          | 0.00/14.0k [00:00<?, ?B/s]

train.jsonl.gz:   0%|          | 0.00/665M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5233329 [00:00<?, ? examples/s]

README.md:   0%|          | 0.00/14.0k [00:00<?, ?B/s]

train.jsonl.gz:   0%|          | 0.00/657M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7866640 [00:00<?, ? examples/s]

README.md:   0%|          | 0.00/14.0k [00:00<?, ?B/s]

train.jsonl.gz:   0%|          | 0.00/52.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/162444 [00:00<?, ? examples/s]

Dataset({
    features: ['_id', 'title', 'text', 'query'],
    num_rows: 10000
})
Dataset({
    features: ['_id', 'title', 'text', 'query'],
    num_rows: 10000
})
Dataset({
    features: ['_id', 'title', 'text', 'query'],
    num_rows: 10000
})


In [4]:
# Example of accessing relevance labels (if available)
print(hotpotqa_data[0])  # Print the first sample with query, passages, and relevance scores


{'_id': '12', 'title': 'Anarchism', 'text': 'Anarchism is a political philosophy that advocates self-governed societies based on voluntary institutions. These are often described as stateless societies, although several authors have defined them more specifically as institutions based on non-hierarchical free associations. Anarchism holds the state to be undesirable, unnecessary and harmful.', 'query': 'anarchism defined'}


In [5]:
# Define a function to preprocess data with relevance scores
def preprocess_data_with_relevance(data):
    passages = []
    queries = []
    
    for entry in data:
        if 'text' in entry:
            passages.append(entry['text'])
        if 'query' in entry:
            queries.append(entry['query'])
            
    return passages, queries


In [6]:
# Assuming hotpotqa_data, nq_data, and fiqa_data are defined and loaded

# Process each dataset separately
hotpotqa_passages, hotpotqa_queries = preprocess_data_with_relevance(hotpotqa_data)
nq_passages, nq_queries = preprocess_data_with_relevance(nq_data)
fiqa_passages, fiqa_queries = preprocess_data_with_relevance(fiqa_data)

# Check lengths of passages and queries
print(f"HotpotQA - Passages: {len(hotpotqa_passages)}, Queries: {len(hotpotqa_queries)}")
print(f"NQ - Passages: {len(nq_passages)}, Queries: {len(nq_queries)}")
print(f"FIQA - Passages: {len(fiqa_passages)}, Queries: {len(fiqa_queries)}")


HotpotQA - Passages: 10000, Queries: 10000
NQ - Passages: 10000, Queries: 10000
FIQA - Passages: 10000, Queries: 10000


In [7]:
# Define a function to assign binary relevance scores
def assign_binary_relevance(queries, passages):
    relevance_scores = []
    passage_sets = [set(passage.lower().split()) for passage in passages]  # Convert passages to sets
    
    for query in queries:
        query_set = set(query.lower().split())  # Convert query to set
        scores = [1 if query_set.intersection(passage_set) else 0 for passage_set in passage_sets]
        relevance_scores.append(scores)
        
    return relevance_scores


# Generate binary relevance scores for each dataset
hotpotqa_relevance_scores = assign_binary_relevance(hotpotqa_queries, hotpotqa_passages)
nq_relevance_scores = assign_binary_relevance(nq_queries, nq_passages)
fiqa_relevance_scores = assign_binary_relevance(fiqa_queries, fiqa_passages)

# Check lengths after assignment
print(f"HotpotQA Relevance Scores: {len(hotpotqa_relevance_scores)}")
print(f"NQ Relevance Scores: {len(nq_relevance_scores)}")
print(f"FIQA Relevance Scores: {len(fiqa_relevance_scores)}")


HotpotQA Relevance Scores: 10000
NQ Relevance Scores: 10000
FIQA Relevance Scores: 10000


In [8]:
# Load small embedding model
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

# Set device to GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 384, padding_idx=0)
    (position_embeddings): Embedding(512, 384)
    (token_type_embeddings): Embedding(2, 384)
    (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-5): 6 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=384, out_features=384, bias=True)
            (key): Linear(in_features=384, out_features=384, bias=True)
            (value): Linear(in_features=384, out_features=384, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=384, out_features=384, bias=True)
            (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)


In [9]:
# Function to get embeddings for passages
def get_embeddings(passages, batch_size=64):
    embeddings = []
    for i in range(0, len(passages), batch_size):
        batch_passages = passages[i:i + batch_size]
        inputs = tokenizer(batch_passages, return_tensors='pt', truncation=True, padding=True, max_length=128)
        inputs = {k: v.to(device) for k, v in inputs.items()}  # Move input tensors to GPU
        with torch.no_grad():
            embedding = model(**inputs).last_hidden_state.mean(dim=1).cpu()  # Move back to CPU after processing
        embeddings.append(embedding)
    
    return torch.cat(embeddings)

# Generate embeddings for HotpotQA passages
passage_embeddings = get_embeddings(hotpotqa_passages)  # Use hotpotqa_passages
print(f"HotpotQA Passage Embeddings Shape: {passage_embeddings.shape}")

HotpotQA Passage Embeddings Shape: torch.Size([10000, 384])


In [10]:

num_passages = len(hotpotqa_passages)  # Use the processed passages
num_embeddings = passage_embeddings.shape[0]

# Check if they match
if num_passages == num_embeddings:
    print(f"The number of passages ({num_passages}) matches the number of embeddings ({num_embeddings}).")
else:
    print(f"Mismatch: Number of passages is {num_passages}, but number of embeddings is {num_embeddings}.")


The number of passages (10000) matches the number of embeddings (10000).


In [11]:

def rerank_passages(query, passages):
    # Placeholder logic for reranking passages based on the query
    scores = np.random.rand(len(passages))  # Example random scores
    ranked_indices = np.argsort(-scores).tolist()  # Sort indices in descending order
    return ranked_indices

In [12]:
# Function to get relevance scores based on cosine similarity
def get_cosine_similarity_scores(query, passages, passage_embeddings):
    query_embedding = get_embeddings([query])  # Get embedding for the query
    scores = []
    
    for passage_embedding in passage_embeddings:
        score = torch.nn.functional.cosine_similarity(query_embedding, passage_embedding.unsqueeze(0))  
        scores.append(score.item())  # Get the score for each passage
    
    return scores


In [13]:
# Example usage for HotpotQA dataset
query = "what age do children with autism develop?"

# Ensure you are using the embeddings for the correct passages
cosine_similarity_scores = get_cosine_similarity_scores(query, hotpotqa_passages, passage_embeddings)

# Get the indices of the passages ranked by cosine similarity
ranked_indices_without_reranking = np.argsort(cosine_similarity_scores)[::-1]  # Sort by descending similarity

# Top-k passages based on cosine similarity
top_k_passages_without_reranking = [hotpotqa_passages[idx] for idx in ranked_indices_without_reranking[:10]]
print("Top passages without ranking model (by cosine similarity):")
for passage in top_k_passages_without_reranking:
    print(passage[:50])  # Print only the first 50 characters of each passage
    print("...")


Top passages without ranking model (by cosine similarity):
Autism is a neurodevelopmental disorder characteri
...
Developmental psychology is the scientific study o
...
Kid Icarus, known in Japan as Light Mythology: Pal
...
Lafora disease, also called Lafora progressive myo
...
A motor neuron disease (MND) is any of several neu
...
Down syndrome (DS or DNS), also known as trisomy 2
...
The Kocher–Debré–Semelaigne syndrome is hypothyroi
...
Dementia praecox (a "premature dementia" or "preco
...
In the 2011 census, Nepal's population was approxi
...
An intelligence quotient (IQ) is a total score der
...


**
    Evaluate the ranked passages using NDCG (Normalized Discounted Cumulative Gain).
    
    Parameters:
    - ranked_passages: The indices of passages in the ranked order.
    - relevance_scores: A list of relevance scores for each passage (higher is more relevant).
    - k: Number of top passages to consider for NDCG calculation (default is 10).
    
    Returns:
    - ndcg: The NDCG score for the top-k ranked passages.
    **

In [30]:
import numpy as np
from sklearn.metrics import ndcg_score

def evaluate_ndcg_for_query(query, passages, relevance_scores):

    y_true = np.array(relevance_scores)

    # Assuming passages are fixed, generate y_score once
    y_score = np.ones(len(passages))  # Example scoring for all passages

    # Calculate NDCG score
    ndcg = ndcg_score([y_true], [y_score])
    return ndcg

def evaluate_ndcg_for_dataset(queries, passages, relevance_scores): #  Evaluate total NDCG for a dataset.
  

    total_ndcg = 0.0

    # Use an optimized method to avoid redundant calculations
    y_score = np.ones(len(passages))  # Precompute y_score once
    
    for rel_scores in relevance_scores:
        y_true = np.array(rel_scores)
        ndcg = ndcg_score([y_true], [y_score])
        total_ndcg += ndcg

    # Average the NDCG score over the number of queries
    average_ndcg = total_ndcg / len(queries) if queries else 0.0
    return average_ndcg



In [36]:

hotpotqa_total_ndcg = evaluate_ndcg_for_dataset(hotpotqa_queries, hotpotqa_passages, hotpotqa_relevance_scores)
hotpotqa_percentage = hotpotqa_total_ndcg * 100  
nq_total_ndcg = evaluate_ndcg_for_dataset(nq_queries, nq_passages, nq_relevance_scores)
nq_percentage = nq_total_ndcg * 100  
fiqa_total_ndcg = evaluate_ndcg_for_dataset(fiqa_queries, fiqa_passages, fiqa_relevance_scores)
fiqa_percentage = fiqa_total_ndcg * 100 


#Calculate NDCG
average_ndcg_percentage = (hotpotqa_percentage + nq_percentage + fiqa_percentage) / 3
average_ndcg_decimal = average_ndcg_percentage / 100
print(f"\n NDCG@5: {average_ndcg_decimal:.2f}%")


 NDCG@5: 0.93%
