### Convert Dataset to embeddings
1. Convert MS Marco to format with query, 10 positive samples, and 10 negative samples 
2. Convert each of those to embeddings via Glove pre-trained model
3. Prepare input torch tensors for training ML Two Towers model

### **Step 1**: Initialising functions

In [77]:
import pandas as pd
def flatten_data(df):
    rows = []
    for _, row in df.iterrows():
        for i, passage_text in enumerate(row['passages.passage_text']):
            rows.append({
                'query': row['query'],
                'query_id': row['query_id'], 
                'query_type': row['query_type'],
                'document': passage_text,
                'passage_sign_de': 1,
                'passage_sign_ce': row['passages.is_selected'][i],
            })
    return pd.DataFrame(rows)

def add_negative_samples_with_input_id(df):
    result_rows = []
    input_id = 0
    
    # Group by query_id 
    grouped = df.groupby('query_id')
    
    for query_id, group in grouped:
        # Take up to 10 positive samples from this query
        positive_samples = group.head(10).copy()
        
        # Create 10 negative samples
        negative_samples = []
        for _ in range(10):
            # Random query from different query_id
            other_queries = df[df['query_id'] != query_id]
            random_query = other_queries.sample(1).iloc[0]
            
            # Random passage from anywhere
            random_passage = df['document'].sample(1).iloc[0]
            
            neg_sample = {
                'query': random_query['query'],
                'query_id': random_query['query_id'], 
                'query_type': random_query['query_type'],
                'document': random_passage,
                'passage_sign_de': 0,
                'passage_sign_ce': None
            }
            negative_samples.append(neg_sample)
        
        # Add input_id to positive samples
        for _, row in positive_samples.iterrows():
            row_dict = row.to_dict()
            row_dict['input_id'] = input_id
            result_rows.append(row_dict)
            
        # Add input_id to negative samples  
        for neg_sample in negative_samples:
            neg_sample['input_id'] = input_id
            result_rows.append(neg_sample)
            
        input_id += 1
    
    return pd.DataFrame(result_rows)

def add_negative_samples_same_query(df):
    """
    Modified version where negative samples use the same query as positive samples,
    but with non-relevant passages from other queries.
    """
    result_rows = []
    input_id = 0
    
    # Group by query_id 
    grouped = df.groupby('query_id')
    
    for query_id, group in grouped:
        # Take up to 10 positive samples from this query
        positive_samples = group.head(10).copy()
        
        # Get the query info from the first positive sample
        first_sample = positive_samples.iloc[0]
        query_text = first_sample['query']
        query_type = first_sample['query_type']
        
        # Create 10 negative samples with SAME query but different passages
        negative_samples = []
        for _ in range(10):
            # Random passage from different query_id (non-relevant passage)
            other_queries = df[df['query_id'] != query_id]
            random_passage = other_queries['document'].sample(1).iloc[0]
            
            neg_sample = {
                'query': query_text,  # Same query as positive samples
                'query_id': query_id,  # Same query_id as positive samples
                'query_type': query_type,  # Same query_type as positive samples
                'document': random_passage,  # Random non-relevant passage
                'passage_sign_de': 0,
                'passage_sign_ce': None
            }
            negative_samples.append(neg_sample)
        
        # Add input_id to positive samples
        for _, row in positive_samples.iterrows():
            row_dict = row.to_dict()
            row_dict['input_id'] = input_id
            result_rows.append(row_dict)
            
        # Add input_id to negative samples  
        for neg_sample in negative_samples:
            neg_sample['input_id'] = input_id
            result_rows.append(neg_sample)
            
        input_id += 1
    
    return pd.DataFrame(result_rows)


### **Step 2**: Get MS Marco Dataset

In [43]:
# Loading Raw
NUMBER_OF_SAMPLES = 500
EMBEDDING_DIM = 200

# Load and sample data FIRST
print("Loading raw data...")
df = pd.read_parquet("../data/ms_marco_train.parquet", engine='fastparquet')

print(f"📊 Sampling {NUMBER_OF_SAMPLES:,} samples...")

df_sample = df.sample(n=NUMBER_OF_SAMPLES, random_state=42).copy()
print(f"Sampled: {len(df_sample)} samples")



Loading raw data...
📊 Sampling 500 samples...
Sampled: 500 samples


### **Step 3**: Transform MS Marco Dataset and include negative sampling

In [79]:
# Apply filtering AFTER sampling
print("Filtering data...")
df_filtered = df_sample[
         (df_sample['query'].notna()) &
         (df_sample['query_id'].notna()) &
         (df_sample['query_type'].notna()) &
         (df_sample['passages.is_selected'].notna()) &
         (df_sample['passages.is_selected'].apply(lambda x: 1 in x))
     ].copy()

print(f"After filtering: {len(df_filtered)} samples")

# Create the wide contrastive table using simplified approach
transformed_df = flatten_data(df_filtered)



# Use it like this:
df_with_negatives = add_negative_samples_with_input_id(transformed_df)
df_final = add_negative_samples_same_query(df_with_negatives)

pd.set_option('display.max_columns', None)
print(df_final.head(40))

Filtering data...
After filtering: 320 samples
                                                query  query_id   query_type  \
0   Determine the significance of plasmids for bac...      2718  DESCRIPTION   
1   Determine the significance of plasmids for bac...      2718  DESCRIPTION   
2   Determine the significance of plasmids for bac...      2718  DESCRIPTION   
3   Determine the significance of plasmids for bac...      2718  DESCRIPTION   
4   Determine the significance of plasmids for bac...      2718  DESCRIPTION   
5   Determine the significance of plasmids for bac...      2718  DESCRIPTION   
6   Determine the significance of plasmids for bac...      2718  DESCRIPTION   
7   Determine the significance of plasmids for bac...      2718  DESCRIPTION   
8   Determine the significance of plasmids for bac...      2718  DESCRIPTION   
9   Determine the significance of plasmids for bac...      2718  DESCRIPTION   
10  Determine the significance of plasmids for bac...      2718  DESCRIPT

### **Step 4**: Embed all data

In [82]:
import os
import pickle
import numpy as np
import re
import pandas as pd

def load_glove_embeddings():
    """Load cached GloVe embeddings."""
    cache_dir = os.path.dirname("../data/")
    word_to_idx_path = os.path.join(cache_dir, "word_to_idx.pkl")
    embeddings_path = os.path.join(cache_dir, "embeddings.npy")
    
    if os.path.exists(word_to_idx_path) and os.path.exists(embeddings_path):
        with open(word_to_idx_path, 'rb') as f:
            word_to_idx = pickle.load(f)
        embeddings = np.load(embeddings_path)
        return word_to_idx, embeddings
    
    print("❌ GloVe cache not found. Please run original data processing once to create cache.")
    return None, None


In [83]:
import torch

def clean_text(text):
    """Clean and tokenize text."""
    if not text or pd.isna(text):
        return []
    
    # Convert to lowercase and remove special characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', str(text).lower())
    return text.split()

def text_to_embedding(text, word_to_idx, embeddings, max_len=50):
    """Convert text to embedding using GloVe embeddings with max pooling."""
    if not text or pd.isna(text):
        return np.zeros(embeddings.shape[1])
    
    words = clean_text(text)[:max_len]  # Limit sequence length
    word_embeddings = []
    
    for word in words:
        if word in word_to_idx:
            word_embeddings.append(embeddings[word_to_idx[word]])
    
    if not word_embeddings:
        return np.zeros(embeddings.shape[1])
    
    # Max pooling across words
    return np.max(word_embeddings, axis=0)

def embed_dataframe(df, word_to_idx, embeddings):
    """Embed query and document columns in the dataframe."""
    print(f"🔄 Embedding {len(df)} rows...")
    
    # Embed queries
    print("Embedding queries...")
    query_embeddings = []
    for i, query in enumerate(df['query']):
        if i % 1000 == 0:
            print(f"  Query {i}/{len(df)}")
        query_emb = text_to_embedding(query, word_to_idx, embeddings)
        query_embeddings.append(query_emb)
    
    # Embed documents
    print("Embedding documents...")
    document_embeddings = []
    for i, document in enumerate(df['document']):
        if i % 1000 == 0:
            print(f"  Document {i}/{len(df)}")
        doc_emb = text_to_embedding(document, word_to_idx, embeddings)
        document_embeddings.append(doc_emb)
    
    # Add embeddings to dataframe
    df_embedded = df.copy()
    df_embedded['query_embedding'] = query_embeddings
    df_embedded['document_embedding'] = document_embeddings
    
    print("✅ Embedding complete!")
    return df_embedded


In [84]:
# Load GloVe embeddings
print("📥 Loading GloVe embeddings...")
word_to_idx, embeddings = load_glove_embeddings()

# If no cached embeddings found, create dummy ones for testing
if word_to_idx is None:
    print("⚠️ Creating dummy embeddings for testing...")
    vocab_size = 10000
    embedding_dim = 200  # Match your EMBEDDING_DIM
    
    # Create dummy vocabulary with common words
    common_words = ['what', 'is', 'the', 'how', 'where', 'when', 'why', 'who', 'are', 'does', 'do', 
                   'this', 'that', 'these', 'those', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at',
                   'for', 'with', 'by', 'from', 'to', 'of', 'about', 'blood', 'test', 'doctor', 'cell',
                   'bacteria', 'plasmid', 'medical', 'health', 'significance', 'determine']
    
    word_to_idx = {}
    for i, word in enumerate(common_words):
        word_to_idx[word] = i
    
    # Fill remaining slots with dummy words
    for i in range(len(common_words), vocab_size):
        word_to_idx[f"word_{i}"] = i
    
    # Create random embeddings
    embeddings = np.random.randn(vocab_size, embedding_dim).astype(np.float32)
    print(f"Created dummy embeddings: {embeddings.shape}")
else:
    print(f"✅ Successfully loaded GloVe embeddings: {embeddings.shape}")

# Embed the df_final dataframe
print("\\n🚀 Starting embedding process...")
df_embedded = embed_dataframe(df_final, word_to_idx, embeddings)

print(f"\\n📊 Results:")
print(f"Original dataframe shape: {df_final.shape}")
print(f"Embedded dataframe shape: {df_embedded.shape}")
print(f"Embedding dimension: {len(df_embedded['query_embedding'].iloc[0])}")

# Show sample embeddings
print("\\n🔍 Sample embeddings:")
print(f"First query: '{df_embedded['query'].iloc[0][:50]}...'")
print(f"Query embedding shape: {df_embedded['query_embedding'].iloc[0].shape}")
print(f"Document embedding shape: {df_embedded['document_embedding'].iloc[0].shape}")
print(f"Query embedding preview: {df_embedded['query_embedding'].iloc[0][:5]}")


📥 Loading GloVe embeddings...
✅ Successfully loaded GloVe embeddings: (400000, 200)
\n🚀 Starting embedding process...
🔄 Embedding 6400 rows...
Embedding queries...
  Query 0/6400
  Query 1000/6400
  Query 2000/6400
  Query 3000/6400
  Query 4000/6400
  Query 5000/6400
  Query 6000/6400
Embedding documents...
  Document 0/6400
  Document 1000/6400
  Document 2000/6400
  Document 3000/6400
  Document 4000/6400
  Document 5000/6400
  Document 6000/6400
✅ Embedding complete!
\n📊 Results:
Original dataframe shape: (6400, 7)
Embedded dataframe shape: (6400, 9)
Embedding dimension: 200
\n🔍 Sample embeddings:
First query: 'Determine the significance of plasmids for bacteri...'
Query embedding shape: (200,)
Document embedding shape: (200,)
Query embedding preview: [0.45524 0.86093 0.45565 0.59798 0.22503]


In [85]:
# Organize embedded data for Two Tower training
print("🔧 Organizing data for Two Tower model training...")

# Group by input_id to create query-document pairs
grouped_data = df_embedded.groupby('input_id')

query_embeddings_list = []
positive_doc_embeddings_list = []
negative_doc_embeddings_list = []
targets_list = []

for input_id, group in grouped_data:
    # Get positive samples (passage_sign_de = 1)
    positive_samples = group[group['passage_sign_de'] == 1]
    # Get negative samples (passage_sign_de = 0) 
    negative_samples = group[group['passage_sign_de'] == 0]
    
    if len(positive_samples) > 0 and len(negative_samples) > 0:
        # Use the query from the first positive sample
        query_emb = positive_samples['query_embedding'].iloc[0]
        query_embeddings_list.append(query_emb)
        
        # Get positive document embeddings (limit to 10)
        pos_doc_embs = positive_samples['document_embedding'].tolist()[:10]
        # Pad with zeros if less than 10
        while len(pos_doc_embs) < 10:
            pos_doc_embs.append(np.zeros_like(pos_doc_embs[0] if pos_doc_embs else query_emb))
        positive_doc_embeddings_list.append(pos_doc_embs[:10])
        
        # Get negative document embeddings (limit to 10)
        neg_doc_embs = negative_samples['document_embedding'].tolist()[:10]
        # Pad with zeros if less than 10
        while len(neg_doc_embs) < 10:
            neg_doc_embs.append(np.zeros_like(neg_doc_embs[0] if neg_doc_embs else query_emb))
        negative_doc_embeddings_list.append(neg_doc_embs[:10])
        
        # Create targets: [positive_targets, negative_targets]
        pos_targets = [1] * 10
        neg_targets = [0] * 10
        targets_list.append([pos_targets, neg_targets])

# Convert to PyTorch tensors
query_embeddings_tensor = torch.tensor(np.array(query_embeddings_list), dtype=torch.float32)
positive_embeddings_tensor = torch.tensor(np.array(positive_doc_embeddings_list), dtype=torch.float32)
negative_embeddings_tensor = torch.tensor(np.array(negative_doc_embeddings_list), dtype=torch.float32)
targets_tensor = torch.tensor(np.array(targets_list), dtype=torch.float32)

print("\\n✅ PyTorch tensors ready for Two Tower training!")
print(f"Query embeddings: {query_embeddings_tensor.shape}")
print(f"Positive document embeddings: {positive_embeddings_tensor.shape}")
print(f"Negative document embeddings: {negative_embeddings_tensor.shape}")
print(f"Targets: {targets_tensor.shape}")

print("\\n🎯 Training data structure:")
print(f"- {len(query_embeddings_tensor)} training samples")
print(f"- Each query has 10 positive + 10 negative documents")  
print(f"- Embedding dimension: {query_embeddings_tensor.shape[1]}")
print(f"- Positive targets: {targets_tensor[0][0]}")
print(f"- Negative targets: {targets_tensor[0][1]}")

print("\\n💾 Saving tensors...")
torch.save({
    'query_embeddings': query_embeddings_tensor,
    'positive_embeddings': positive_embeddings_tensor, 
    'negative_embeddings': negative_embeddings_tensor,
    'targets': targets_tensor,
    'embedding_dim': query_embeddings_tensor.shape[1]
}, '../data/embedded_training_data.pt')

print("✅ Saved to '../data/embedded_training_data.pt'")
print("\\n🚀 Ready for Two Tower model training!")


🔧 Organizing data for Two Tower model training...
\n✅ PyTorch tensors ready for Two Tower training!
Query embeddings: torch.Size([284, 200])
Positive document embeddings: torch.Size([284, 10, 200])
Negative document embeddings: torch.Size([284, 10, 200])
Targets: torch.Size([284, 2, 10])
\n🎯 Training data structure:
- 284 training samples
- Each query has 10 positive + 10 negative documents
- Embedding dimension: 200
- Positive targets: tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])
- Negative targets: tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
\n💾 Saving tensors...
✅ Saved to '../data/embedded_training_data.pt'
\n🚀 Ready for Two Tower model training!
