### Convert Dataset to embeddings
1. Convert MS MARCO into a table with one query, 10 positive samples, and 10 negative samples per row
2. Convert each query and passage to an embedding via the pre-trained GloVe model
3. Prepare the input torch tensors for training the ML Two-Tower model

### **Step 1**: Initialising functions

In [21]:
# 🚀 MS MARCO TO CONTRASTIVE TABLE CONVERSION
import pandas as pd
import numpy as np
import random
from typing import List, Tuple


def extract_positive_passages_vectorized(row) -> List[str]:
    """Return the passage texts of `row` whose `is_selected` flag equals 1.

    `row` is indexed with the keys 'passages.is_selected' and
    'passages.passage_text'. Unless both values are Python lists, an empty
    list is returned. Flags that have no matching passage text are ignored.
    """
    flags = row['passages.is_selected']
    texts = row['passages.passage_text']

    if not (isinstance(flags, list) and isinstance(texts, list)):
        return []

    # zip() stops at the shorter list, so surplus flags are skipped.
    return [text for flag, text in zip(flags, texts) if flag == 1]

def precompute_data_structures(df_filtered: pd.DataFrame) -> tuple:
    """Build the per-query lookups used by the contrastive table builder.

    Returns a 3-tuple:
        query_to_positives: query -> list of its selected passage texts.
        all_negative_passages: flat list of every passage text found in
            list-valued 'passages.passage_text' cells (selected ones
            included), in group order.
        query_metadata: query -> first 'query_type' value of its group.
    """
    print("🔧 Pre-computing data structures for speed...")

    query_to_positives = {}
    all_negative_passages = []

    for query, rows in df_filtered.groupby('query'):
        selected_texts = []
        for _, record in rows.iterrows():
            # Positives: only the passages flagged as selected.
            selected_texts.extend(extract_positive_passages_vectorized(record))

            # Negative pool: every passage of the row, selected or not.
            row_passages = record['passages.passage_text']
            if isinstance(row_passages, list):
                all_negative_passages.extend(row_passages)
        query_to_positives[query] = selected_texts

    # One query_type per query (first value seen in each group).
    query_metadata = df_filtered.groupby('query')['query_type'].first().to_dict()

    print(f"✅ Pre-computed data for {len(query_to_positives)} queries")
    print(f"✅ Collected {len(all_negative_passages)} total passages for negatives")

    return query_to_positives, all_negative_passages, query_metadata

def _repeat_to_length(items: List[str], count: int) -> List[str]:
    """Cycle through non-empty `items` until exactly `count` elements exist."""
    return (items * (count // len(items) + 1))[:count]


def _sample_negatives(pool: List[str], excluded: set, count: int) -> List[str]:
    """Draw `count` passages from non-empty `pool`, avoiding texts in `excluded`.

    Falls back to the unfiltered pool only when every pooled passage is
    excluded, and repeats passages when fewer than `count` are eligible.
    """
    if count <= 0:
        return []

    # Oversample by the number of excluded texts so that, in the common case,
    # enough candidates survive the filter without scanning the whole pool.
    draw = min(len(pool), count + len(excluded))
    picked = [p for p in random.sample(pool, draw) if p not in excluded]
    if len(picked) >= count:
        return picked[:count]

    # Rare path: small pool or many duplicates of the excluded texts.
    eligible = [p for p in pool if p not in excluded] or pool
    if len(eligible) >= count:
        return random.sample(eligible, count)
    return _repeat_to_length(eligible, count)


def create_contrastive_table_optimized(df_filtered: pd.DataFrame, num_pos: int = 10, num_neg: int = 10) -> pd.DataFrame:
    """Convert the MS MARCO dataframe to a contrastive table, one row per query.

    Args:
        df_filtered: Frame with 'query', 'query_type',
            'passages.is_selected' and 'passages.passage_text' columns.
        num_pos: Number of positive passages per query.
        num_neg: Number of negative passages per query.

    Returns:
        DataFrame with columns 'Query', 'Query_Type', 'Pos', 'Neg' and the
        four '*_Targets' columns. 'Pos' holds `num_pos` texts (sampled,
        repeated when too few exist, or the query text itself when the query
        has no selected passage). 'Neg' holds `num_neg` texts drawn from the
        passages of all queries, skipping this query's own positives.

    Raises:
        ValueError: If negatives are requested but no passages were collected.
    """
    print("🚀 Converting MS MARCO to contrastive table (OPTIMIZED)...")

    # Pre-compute everything once
    query_to_positives, all_negative_passages, query_metadata = precompute_data_structures(df_filtered)

    # Get unique queries
    unique_queries = list(query_to_positives.keys())
    print(f"Processing {len(unique_queries)} unique queries")

    # An empty pool used to surface as a ZeroDivisionError inside the loop.
    if unique_queries and num_neg > 0 and not all_negative_passages:
        raise ValueError(
            "No passages available for negative sampling; "
            "'passages.passage_text' must contain lists of passage texts."
        )

    results = []

    for idx, query in enumerate(unique_queries):
        query_type = query_metadata[query]
        all_positives = query_to_positives[query]

        # Sample positive passages
        if len(all_positives) >= num_pos:
            sampled_positives = random.sample(all_positives, num_pos)
        elif all_positives:
            # Repeat if not enough
            sampled_positives = _repeat_to_length(all_positives, num_pos)
        else:
            # Fallback: use query itself
            sampled_positives = [query] * num_pos

        # The shared pool contains every passage, including this query's
        # selected ones, so exclude them to avoid labelling a positive as 0.
        sampled_negatives = _sample_negatives(all_negative_passages, set(all_positives), num_neg)

        # Create row; each target list is a fresh object so rows never share
        # mutable state.
        # NOTE(review): an earlier comment suggested -1 for the combined
        # encoder's negative targets, but the code has always emitted 0.
        results.append({
            'Query': query,
            'Query_Type': query_type,
            'Pos': sampled_positives,
            'Neg': sampled_negatives,
            'Dual_Encoder_Pos_Targets': [1] * num_pos,
            'Dual_Encoder_Neg_Targets': [0] * num_neg,
            'Combined_Encoder_Pos_Targets': [1] * num_pos,
            'Combined_Encoder_Neg_Targets': [0] * num_neg
        })

        if idx % 500 == 0:  # Less frequent updates for speed
            print(f"  Processed {idx}/{len(unique_queries)} queries")

    contrastive_df = pd.DataFrame(results)
    print(f"✅ Created contrastive table with {len(contrastive_df)} rows")

    return contrastive_df


### **Step 2**: Get MS Marco Dataset

In [19]:
# Loading Raw
NUMBER_OF_SAMPLES = 10000
EMBEDDING_DIM = 200

# Load and sample data FIRST
print("Loading raw data...")
df = pd.read_parquet("../data/ms_marco_train.parquet", engine='fastparquet')

# DataFrame.sample (without replacement) raises ValueError when n exceeds the
# number of rows, so cap the request at what the file actually contains.
sample_size = min(NUMBER_OF_SAMPLES, len(df))
print(f"📊 Sampling {sample_size:,} samples...")

# Fixed random_state keeps the sample reproducible across runs.
df_sample = df.sample(n=sample_size, random_state=42).copy()
print(f"Sampled: {len(df_sample)} samples")

Loading raw data...
📊 Sampling 10,000 samples...
Sampled: 10000 samples


### **Step 3**: Transform MS Marco Dataset and include negative sampling

In [22]:
# Apply filtering AFTER sampling
print("Filtering data...")

# Series.apply visits every row, including those the notna() masks reject, and
# `1 in <float NaN>` raises TypeError. Only test membership on containers.
has_selected_passage = df_sample['passages.is_selected'].apply(
    lambda flags: bool(isinstance(flags, (list, tuple, np.ndarray)) and 1 in flags)
).astype(bool)

df_filtered = df_sample[
    (df_sample['query'].notna()) &
    (df_sample['query_id'].notna()) &
    (df_sample['query_type'].notna()) &
    (df_sample['passages.is_selected'].notna()) &
    has_selected_passage
].copy()

print(f"After filtering: {len(df_filtered)} samples")

# Create the simple contrastive table using OPTIMIZED version
contrastive_table = create_contrastive_table_optimized(df_filtered, num_pos=10, num_neg=10)

# "\n" (not "\\n") so a blank line is printed instead of a literal backslash-n.
print("\n🎯 Sample of the contrastive table:")
print(f"Columns: {list(contrastive_table.columns)}")
print(f"Shape: {contrastive_table.shape}")


print("\n✅ Contrastive table ready!")
pd.set_option('display.max_columns', None)
print(contrastive_table.head(10))



Filtering data...
After filtering: 6146 samples
🚀 Converting MS MARCO to contrastive table (OPTIMIZED)...
🔧 Pre-computing data structures for speed...
✅ Pre-computed data for 6146 queries
✅ Collected 61265 total passages for negatives
Processing 6146 unique queries
  Processed 0/6146 queries
  Processed 500/6146 queries
  Processed 1000/6146 queries
  Processed 1500/6146 queries
  Processed 2000/6146 queries
  Processed 2500/6146 queries
  Processed 3000/6146 queries
  Processed 3500/6146 queries
  Processed 4000/6146 queries
  Processed 4500/6146 queries
  Processed 5000/6146 queries
  Processed 5500/6146 queries
  Processed 6000/6146 queries
✅ Created contrastive table with 122920 rows
\n🎯 Sample of the contrastive table:
Columns: ['Query', 'Query_Type', 'Passage', 'Target']
Shape: (122920, 4)
\n✅ Contrastive table ready!
                   Query   Query_Type  \
0  'for hire' definition  DESCRIPTION   
1  'for hire' definition  DESCRIPTION   
2  'for hire' definition  DESCRIPTION   


### **Step 4**: Embed all data

In [None]:
import os
import pickle
import numpy as np
import re
import pandas as pd

def load_glove_embeddings(cache_dir: str = "../data/"):
    """Load the cached GloVe vocabulary and embedding matrix.

    Args:
        cache_dir: Directory holding ``word_to_idx.pkl`` and
            ``embeddings.npy``. Defaults to the notebook's ``../data/``.

    Returns:
        ``(word_to_idx, embeddings)`` when both cache files exist; otherwise
        ``(None, None)`` after printing a hint.
    """
    word_to_idx_path = os.path.join(cache_dir, "word_to_idx.pkl")
    embeddings_path = os.path.join(cache_dir, "embeddings.npy")

    if not (os.path.exists(word_to_idx_path) and os.path.exists(embeddings_path)):
        print("❌ GloVe cache not found. Please run original data processing once to create cache.")
        return None, None

    # NOTE: unpickling can execute arbitrary code; only point cache_dir at
    # cache files you created yourself.
    with open(word_to_idx_path, 'rb') as f:
        word_to_idx = pickle.load(f)
    embeddings = np.load(embeddings_path)
    return word_to_idx, embeddings

def clean_text(text):
    """Tokenise `text` for embedding lookup.

    Missing values (as judged by `pd.isna`) give an empty list. Otherwise the
    value is stringified, lower-cased, stripped of every character that is
    not a letter or whitespace, and split on whitespace.
    """
    if pd.isna(text):
        return []
    lowered = str(text).lower()
    letters_only = re.sub(r'[^a-zA-Z\s]', '', lowered)
    return letters_only.split()

def query_to_embedding(query, word_to_idx, embeddings):
    """Embed a query as the mean of the vectors of its known words.

    The query is tokenised with `clean_text`; tokens missing from
    `word_to_idx` are skipped. When no token is known, a zero vector of
    length `embeddings.shape[1]` is returned.
    """
    known_vectors = []
    for token in clean_text(query):
        if token in word_to_idx:
            known_vectors.append(embeddings[word_to_idx[token]])

    if known_vectors:
        return np.mean(known_vectors, axis=0)
    return np.zeros(embeddings.shape[1])



In [None]:
import torch
import torch.nn.functional as F
from typing import List, Tuple
import random

def text_to_embedding(text: str, word_to_idx: dict, embeddings: np.ndarray, max_len: int = 50) -> np.ndarray:
    """Embed `text` by element-wise max pooling over its known word vectors.

    The text is lower-cased and split on whitespace; only the first `max_len`
    tokens are considered and tokens absent from `word_to_idx` are skipped.
    Empty/missing text, or text without any known token, yields a zero vector
    of length `embeddings.shape[1]`.
    """
    zero_vector_dim = embeddings.shape[1]
    if not text or pd.isna(text):
        return np.zeros(zero_vector_dim)

    tokens = text.lower().split()[:max_len]  # Limit sequence length
    row_ids = [word_to_idx[token] for token in tokens if token in word_to_idx]
    if not row_ids:
        return np.zeros(zero_vector_dim)

    # Max pooling across the selected rows
    return np.max(embeddings[row_ids], axis=0)

def create_contrastive_dataset(df_filtered: pd.DataFrame, word_to_idx: dict, embeddings: np.ndarray, 
                              num_docs_per_sample: int = 10):
    """
    Create a contrastive learning dataset with positive and negative sampling.

    For every unique query:
    1. Embed the query with `text_to_embedding` (max pooling).
    2. Use the query's own embedding as each of the `num_docs_per_sample`
       positive documents (placeholder; no real relevant passages are used).
    3. Draw `num_docs_per_sample` negatives: query texts taken from random
       rows of `df_filtered` whose query differs from the current one
       ("dummy negative text" when no such row exists), then embed them.
    4. Emit targets: all 1s for the positives, all 0s for the negatives.

    Returns:
        Tuple of float32 tensors
        (query_embeddings [num_queries, dim],
         positive_doc_embeddings [num_queries, num_docs_per_sample, dim],
         negative_doc_embeddings [num_queries, num_docs_per_sample, dim],
         targets [num_queries, 2, num_docs_per_sample]).
    """

    print("Creating contrastive dataset...")

    # Get unique queries to avoid data leakage
    unique_queries = df_filtered.groupby('query').first().reset_index()
    print(f"Found {len(unique_queries)} unique queries")

    # Built once so negatives can be drawn without re-filtering the whole
    # DataFrame for every query (which made the loop quadratic).
    all_row_queries = df_filtered['query'].tolist()
    rows_per_query = df_filtered['query'].value_counts()
    total_rows = len(all_row_queries)

    query_embeddings = []
    positive_doc_embeddings = []
    negative_doc_embeddings = []
    targets = []

    for idx, row in unique_queries.iterrows():
        query = row['query']

        # Step 1: Embed the query
        query_emb = text_to_embedding(query, word_to_idx, embeddings)
        query_embeddings.append(query_emb)

        # Step 2: Positives are the query itself, so its embedding is reused
        # instead of re-embedding the same text num_docs_per_sample times.
        # (In practice, you'd use actual relevant documents)
        positive_doc_embeddings.append([query_emb] * num_docs_per_sample)

        # Step 3: Negatives are queries of rows that belong to other queries.
        has_other_rows = total_rows - int(rows_per_query.get(query, 0)) > 0
        negative_passages = []
        for _ in range(num_docs_per_sample):
            if has_other_rows:
                # Rejection sampling: uniform over rows with a different query.
                candidate = random.choice(all_row_queries)
                while candidate == query:
                    candidate = random.choice(all_row_queries)
                negative_passages.append(candidate)
            else:
                negative_passages.append("dummy negative text")

        # Step 4: Embed negative documents with max pooling
        neg_doc_embs = [text_to_embedding(passage, word_to_idx, embeddings) for passage in negative_passages]
        negative_doc_embeddings.append(neg_doc_embs)

        # Step 5: Create targets
        # Positive samples: all 1s, negative samples: all 0s
        positive_target = [1] * num_docs_per_sample
        negative_target = [0] * num_docs_per_sample
        targets.append([positive_target, negative_target])

        if idx % 100 == 0:
            print(f"Processed {idx}/{len(unique_queries)} queries")

    # Convert to PyTorch tensors
    query_embeddings = torch.tensor(np.array(query_embeddings), dtype=torch.float32)
    positive_doc_embeddings = torch.tensor(np.array(positive_doc_embeddings), dtype=torch.float32)
    negative_doc_embeddings = torch.tensor(np.array(negative_doc_embeddings), dtype=torch.float32)
    targets = torch.tensor(np.array(targets), dtype=torch.float32)

    # "\n" (not "\\n") so a blank line is printed rather than a literal \n.
    print("\n✅ Dataset created successfully!")
    print(f"  Query embeddings: {query_embeddings.shape}")
    print(f"  Positive doc embeddings: {positive_doc_embeddings.shape}")
    print(f"  Negative doc embeddings: {negative_doc_embeddings.shape}")
    print(f"  Targets: {targets.shape}")
    # Indexing targets[0] would raise IndexError when there are no queries.
    if len(targets) > 0:
        print("\nTarget structure:")
        print(f"  targets[i][0] = positive targets (all 1s): {targets[0][0]}")
        print(f"  targets[i][1] = negative targets (all 0s): {targets[0][1]}")

    return query_embeddings, positive_doc_embeddings, negative_doc_embeddings, targets

# Load embeddings
print("Loading GloVe embeddings...")
word_to_idx, embeddings = load_glove_embeddings()

if word_to_idx is not None:
    print(f"Loaded GloVe embeddings: {embeddings.shape}")
else:
    # No cache available: build a random stand-in vocabulary and matrix so the
    # rest of the notebook can still be exercised.
    print("⚠️ Using dummy embeddings for testing...")
    vocab_size = 10000
    embedding_dim = 100
    word_to_idx = {f"word_{i}": i for i in range(vocab_size)}
    # A few question words are mapped onto rows 0-7 (shared with word_0..word_7).
    word_to_idx.update(
        (word, i)
        for i, word in enumerate(['what', 'is', 'the', 'how', 'where', 'when', 'why', 'who'])
    )
    embeddings = np.random.randn(vocab_size, embedding_dim).astype(np.float32)
    print(f"Created dummy embeddings: {embeddings.shape}")

# Create the contrastive dataset
query_embs, pos_doc_embs, neg_doc_embs, targets = create_contrastive_dataset(
    df_filtered,
    word_to_idx,
    embeddings,
    num_docs_per_sample=10,
)


In [None]:
# Demonstrate the dataset structure
# NOTE: section headers use "\n" (not "\\n") so that a blank line is printed
# instead of a literal backslash-n.
print("📊 Dataset Summary:")
print(f"Number of queries: {len(query_embs)}")
print(f"Embedding dimension: {query_embs.shape[1]}")
print(f"Documents per sample: {pos_doc_embs.shape[1]}")

print("\n🔍 Sample data:")
print(f"Query embedding shape: {query_embs[0].shape}")
print(f"Positive doc embeddings shape: {pos_doc_embs[0].shape}")
print(f"Negative doc embeddings shape: {neg_doc_embs[0].shape}")
print(f"Positive targets: {targets[0][0]}")  # Should be [1,1,1,1,1,1,1,1,1,1]
print(f"Negative targets: {targets[0][1]}")  # Should be [0,0,0,0,0,0,0,0,0,0]

print("\n✅ Ready for PyTorch training!")
print("Inputs:")
print(f"  - query_embs: {query_embs.shape} (query embeddings)")
print(f"  - pos_doc_embs: {pos_doc_embs.shape} (positive document embeddings)")
print(f"  - neg_doc_embs: {neg_doc_embs.shape} (negative document embeddings)")
print("Targets:")
print(f"  - targets: {targets.shape} (positive and negative labels)")

# Example of how to use in training loop
print("\n💡 Training loop example:")
print("for i in range(len(query_embs)):")
print("    query = query_embs[i]  # Shape: [embedding_dim]")
print("    pos_docs = pos_doc_embs[i]  # Shape: [10, embedding_dim]")
print("    neg_docs = neg_doc_embs[i]  # Shape: [10, embedding_dim]")
print("    pos_targets = targets[i][0]  # [1,1,1,1,1,1,1,1,1,1]")
print("    neg_targets = targets[i][1]  # [0,0,0,0,0,0,0,0,0,0]")
print("    # Your model training code here...")


In [None]:
# 🔧 SIMPLIFIED AND MODULAR VERSION
import torch
import random
from typing import List, Tuple, Dict

class EmbeddingProcessor:
    """Turns text into a fixed-size vector by max pooling word embeddings."""

    def __init__(self, word_to_idx: Dict, embeddings: np.ndarray):
        # word -> row index into `embeddings`
        self.word_to_idx = word_to_idx
        self.embeddings = embeddings
        # Length of every vector returned by embed_text
        self.embedding_dim = embeddings.shape[1]

    def embed_text(self, text: str, max_len: int = 50) -> np.ndarray:
        """Embed `text` as the element-wise max of its known word vectors.

        The text is tokenised with `clean_text` and truncated to `max_len`
        tokens. Empty/missing text, or text with no token in the vocabulary,
        yields a zero vector of length `embedding_dim`.
        """
        if not text or pd.isna(text):
            return np.zeros(self.embedding_dim)

        vocab = self.word_to_idx
        vectors = []
        for token in clean_text(text)[:max_len]:
            if token in vocab:
                vectors.append(self.embeddings[vocab[token]])

        if vectors:
            return np.max(vectors, axis=0)
        return np.zeros(self.embedding_dim)

def extract_passages(df_row) -> Tuple[List[str], List[str]]:
    """Split a row's passages into (selected, not selected) text lists.

    Reads 'passages.is_selected' and 'passages.passage_text' from `df_row`.
    A passage is positive when its flag equals 1 and negative otherwise.
    Both lists are empty unless both values are Python lists; flags without a
    matching passage text are ignored.
    """
    flags = df_row['passages.is_selected']
    texts = df_row['passages.passage_text']

    if not (isinstance(flags, list) and isinstance(texts, list)):
        return [], []

    # zip() pairs each flag with its text and stops at the shorter list.
    paired = list(zip(flags, texts))
    positive_passages = [text for flag, text in paired if flag == 1]
    negative_passages = [text for flag, text in paired if not flag == 1]
    return positive_passages, negative_passages

def sample_passages(passages: List[str], target_count: int, fallback_text: str = "") -> List[str]:
    """Return exactly `target_count` passages.

    - Enough passages: a random sample without replacement.
    - No passages: `fallback_text` repeated `target_count` times.
    - Too few passages: the list cycled in order until the count is reached.
    """
    available = len(passages)
    if available >= target_count:
        return random.sample(passages, target_count)

    if not passages:
        return [fallback_text] * target_count

    # One extra cycle guarantees the slice below has enough elements.
    cycles = target_count // available + 1
    return (passages * cycles)[:target_count]

def create_simple_contrastive_dataset(df_filtered: pd.DataFrame, 
                                    embedder: EmbeddingProcessor,
                                    num_docs: int = 10) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Create contrastive dataset in a simple, modular way.

    For every unique query the query text is embedded, `num_docs` positives
    are sampled from its selected passages (falling back to the query text),
    and `num_docs` negatives are sampled from its non-selected passages,
    topped up with random passages of other queries when there are too few.

    Returns:
        query_embeddings [num_queries, dim],
        positive_embeddings [num_queries, num_docs, dim],
        negative_embeddings [num_queries, num_docs, dim],
        targets [num_queries, 2, num_docs] (row 0 all 1s, row 1 all 0s),
        all as float32 tensors.
    """
    print("🚀 Creating simplified contrastive dataset...")

    # Iterating the groupby hands over each query together with its rows in a
    # single pass, instead of re-filtering the whole frame for every query.
    grouped = df_filtered.groupby('query')
    num_queries = grouped.ngroups
    print(f"Processing {num_queries} unique queries")

    all_query_embs = []
    all_pos_embs = []
    all_neg_embs = []
    all_targets = []

    for idx, (query, query_docs) in enumerate(grouped):
        # 1. Embed query
        all_query_embs.append(embedder.embed_text(query))

        # 2. Get all passages for this query
        all_positive = []
        all_negative = []
        for _, doc_row in query_docs.iterrows():
            pos_passages, neg_passages = extract_passages(doc_row)
            all_positive.extend(pos_passages)
            all_negative.extend(neg_passages)

        # 3. Sample passages (the query text is the fallback positive)
        sampled_positive = sample_passages(all_positive, num_docs, query)

        # For negatives, use passages from other queries if needed
        if len(all_negative) < num_docs:
            other_queries = df_filtered[df_filtered['query'] != query]
            for _ in range(num_docs - len(all_negative)):
                if len(other_queries) > 0:
                    random_row = other_queries.sample(n=1).iloc[0]
                    random_passages = random_row['passages.passage_text']
                    # Rows without a usable passage list add nothing;
                    # sample_passages below pads by repetition or fallback.
                    if isinstance(random_passages, list) and random_passages:
                        all_negative.append(random.choice(random_passages))

        sampled_negative = sample_passages(all_negative, num_docs, f"negative for {query}")

        # 4. Embed passages
        all_pos_embs.append([embedder.embed_text(passage) for passage in sampled_positive])
        all_neg_embs.append([embedder.embed_text(passage) for passage in sampled_negative])

        # 5. Create targets
        all_targets.append([[1] * num_docs, [0] * num_docs])

        if idx % 100 == 0:
            print(f"  Processed {idx}/{num_queries} queries")

    # Convert to tensors
    query_embeddings = torch.tensor(np.array(all_query_embs), dtype=torch.float32)
    positive_embeddings = torch.tensor(np.array(all_pos_embs), dtype=torch.float32)
    negative_embeddings = torch.tensor(np.array(all_neg_embs), dtype=torch.float32)
    targets = torch.tensor(np.array(all_targets), dtype=torch.float32)

    print("✅ Dataset created!")
    print(f"  Queries: {query_embeddings.shape}")
    print(f"  Positive docs: {positive_embeddings.shape}")
    print(f"  Negative docs: {negative_embeddings.shape}")
    print(f"  Targets: {targets.shape}")

    return query_embeddings, positive_embeddings, negative_embeddings, targets

# Initialize embedder
print("🔧 Setting up embeddings...")
word_to_idx, embeddings = load_glove_embeddings()

if word_to_idx is None:
    # No cache available: substitute a random vocabulary/matrix so the
    # pipeline below can still run.
    print("Using dummy embeddings...")
    vocab_size = 10000
    embedding_dim = 100
    # Add common words
    common_words = ['what', 'is', 'the', 'how', 'where', 'when', 'why', 'who', 'are', 'does', 'do']
    word_to_idx = {f"word_{i}": i for i in range(vocab_size)}
    # Common words are mapped onto the first rows of the matrix.
    word_to_idx.update((word, i) for i, word in enumerate(common_words))
    embeddings = np.random.randn(vocab_size, embedding_dim).astype(np.float32)

embedder = EmbeddingProcessor(word_to_idx, embeddings)
print(f"Embedder ready with {embeddings.shape[1]}D embeddings")

# Create dataset
(
    query_embs_simple,
    pos_embs_simple,
    neg_embs_simple,
    targets_simple,
) = create_simple_contrastive_dataset(df_filtered, embedder, num_docs=10)
