### **Step 1**: Import configs and hyperparameters

In [1]:
import pandas as pd
import json
import numpy as np
from backend.data_processing import flatten_data, add_negative_samples_fast, to_triplets, filter_valid_data, convert_to_training_format
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import random

# Read the JSON configuration (dataset paths and hyperparameters) from the backend directory
with open('../backend/config.json', 'r') as f:
    config = json.loads(f.read())

# Parquet path of every split; insertion order (train, validation, test) is the
# order in which the splits are iterated later on
datasets = dict(
    train=config['TRAIN_DATASET_PATH'],
    validation=config['VAL_DATASET_PATH'],
    test=config['TEST_DATASET_PATH'],
)

### **Step 2**: Process data to get triplets for train, validation and test datasets

In [2]:

# Sampling configuration - values come from config.json, modify them there as needed
total_samples = config['TOTAL_SAMPLES']
train_split = config['TRAIN_SPLIT']
test_split = config['TEST_SPLIT']
validation_split = 1 - train_split - test_split

# Number of raw samples to draw per split, computed once and reused below.
# round() is used instead of int(): a product such as 50_000 * (1 - 0.8 - 0.1)
# evaluates to 4999.999... in floating point and int() would silently drop a sample.
# Validation receives the remainder so the three splits always add up to the total.
samples_per_dataset = {
    'train': round(total_samples * train_split),
    'test': round(total_samples * test_split),
}
samples_per_dataset['validation'] = (
    total_samples - samples_per_dataset['train'] - samples_per_dataset['test']
)

print("📊 SAMPLING CONFIGURATION:")
print(f"  Total samples to process: {total_samples:,}")
print(f"  Train split: {train_split*100:.0f}% ({samples_per_dataset['train']:,} samples)")
print(f"  Test split: {test_split*100:.0f}% ({samples_per_dataset['test']:,} samples)")
print(f"  Validation split: {validation_split*100:.0f}% ({samples_per_dataset['validation']:,} samples)")

# Maps split name -> DataFrame of triplets produced for that split
results = {}

print("\nProcessing datasets to triplet format...")
print("="*50)

for name, input_path in datasets.items():
    target_samples = samples_per_dataset[name]
    print(f"\n📁 Processing {name.upper()} dataset (target: {target_samples:,} samples)...")
    print(f"Loading: {input_path}")

    # Step 1: Load data
    df = pd.read_parquet(input_path, engine='fastparquet')
    print(f"  Loaded: {len(df):,} samples")

    # Step 2: Early sampling - cut here to save processing time.
    # Sampling happens BEFORE filtering, so the number of rows that survive
    # filtering can be smaller than target_samples.
    if len(df) > target_samples:
        df = df.sample(n=target_samples, random_state=42).reset_index(drop=True)
        print(f"  ✂️ Sampled down to: {len(df):,} samples")

    # Step 3: Filter valid data
    df_filtered = filter_valid_data(df)
    print(f"  After filtering: {len(df_filtered):,} samples")

    # Step 4: Flatten data (nested passages to flat rows)
    print("  🔄 Flattening data...")
    flattened = flatten_data(df_filtered)
    print(f"  Flattened: {len(flattened):,} rows")

    # Step 5: Add negative samples
    print("  ➕ Adding negative samples...")
    with_negatives = add_negative_samples_fast(flattened)
    # Vectorized count of each label value (1 is reported as positive, 0 as negative)
    passage_sign = with_negatives['passage_sign_de']
    positive_count = int((passage_sign == 1).sum())
    negative_count = int((passage_sign == 0).sum())
    print(f"  With negatives: {len(with_negatives):,} rows")
    print(f"    - Positive: {positive_count:,}")
    print(f"    - Negative: {negative_count:,}")

    # Step 6: Convert to triplets
    print("  🔄 Converting to triplets...")
    triplets = to_triplets(with_negatives, triplets_per_query=10)
    print(f"  Final triplets: {len(triplets):,}")
    print(f"  Unique queries: {triplets['query'].nunique()}")

    # Store result
    results[name] = triplets
    print(f"  ✅ {name.upper()} dataset completed!")

print("\n" + "="*50)
print("FINAL SUMMARY")
print("="*50)
total_triplets = 0
for name, triplets_df in results.items():
    triplets_count = len(triplets_df)
    total_triplets += triplets_count
    print(f"{name.upper()}: {triplets_count:,} triplets, {triplets_df['query'].nunique():,} unique queries")

print(f"\n🎯 TOTAL TRIPLETS: {total_triplets:,}")

# Preview the first train triplet (each field truncated to 80 characters)
print("\n🎯 Sample triplet from train dataset:")
if 'train' in results and len(results['train']) > 0:
    sample = results['train'].iloc[0]
    print(f"Query: {sample['query'][:80]}...")
    print(f"Positive: {sample['positive_example'][:80]}...")
    print(f"Negative: {sample['negative_example'][:80]}...")

print("\n✅ All datasets processed! Results stored in 'results' dictionary.")
print("Access with: results['train'], results['validation'], results['test']")

📊 SAMPLING CONFIGURATION:
  Total samples to process: 50,000
  Train split: 70% (35,000 samples)
  Test split: 20% (10,000 samples)
  Validation split: 10% (5,000 samples)

Processing datasets to triplet format...

📁 Processing TRAIN dataset (target: 35,000 samples)...
Loading: ../data/ms_marco_train.parquet
  Loaded: 808,731 samples
  ✂️ Sampled down to: 35,000 samples
  After filtering: 21,651 samples
  🔄 Flattening data...
  Flattened: 216,088 rows
  ➕ Adding negative samples...
  With negatives: 432,483 rows
    - Positive: 215,973
    - Negative: 216,510
  🔄 Converting to triplets...
  Final triplets: 216,510
  Unique queries: 21651
  ✅ TRAIN completed!

📁 Processing VALIDATION dataset (target: 5,000 samples)...
Loading: ../data/ms_marco_validation.parquet
  Loaded: 101,093 samples
  ✂️ Sampled down to: 5,000 samples
  After filtering: 2,723 samples
  🔄 Flattening data...
  Flattened: 27,202 rows
  ➕ Adding negative samples...
  With negatives: 54,412 rows
    - Positive: 27,182
 

### **Step 3**: Convert the train, validation and test triplets to training format

In [12]:
from backend.data_processing import convert_to_training_format

# Turn each split's triplets into the training format.
# No further subsampling here - the splits were already sampled when they were processed.
train_data = convert_to_training_format(results['train'])
val_data = convert_to_training_format(results['validation'])
test_data = convert_to_training_format(results['test'])

# Format check: show the first training triplet, each field cut to 100 characters
sample_triplet = train_data[0]
print("Sample training triplet:")
for position, field_label in enumerate(("Query", "Positive", "Negative")):
    print(f"{field_label}: {sample_triplet[position][:100]}...")

# Report how many triplets each split holds
print("\nDataset sizes:")
for split_label, split_triplets in (("Training", train_data), ("Validation", val_data), ("Test", test_data)):
    print(f"  {split_label}: {len(split_triplets):,} triplets")

# The model below is trained on the training split
data = train_data

Sample training triplet:
Query:  phosphates as food ingredients ...
Positive: Call Us Toll-Free 1.800.578.6800. In many ways phosphates are unsung heroes of food ingredients and ...
Negative: When the engine is hot, the clutch fan runs nearly as fast as the engine. When the engine is cold, t...

Dataset sizes:
  Training: 216,510 triplets
  Validation: 27,230 triplets
  Test: 100,000 triplets


### **Step 4**: Define the tokenizer, dataset, dual-encoder model, and training loop

In [13]:
# --- Tokenizer and Vocab ---
import pickle
import numpy as np
from collections import defaultdict
from itertools import chain

class PretrainedTokenizer:
    """Whitespace tokenizer backed by a pickled word -> index vocabulary."""

    def __init__(self, word_to_idx_path):
        """Load the vocabulary mapping from the pickle file at ``word_to_idx_path``."""
        # pickle can run arbitrary code while loading - only use trusted vocabulary files
        with open(word_to_idx_path, 'rb') as vocab_file:
            vocab = pickle.load(vocab_file)
        self.word2idx = vocab

        print(f"Loaded vocabulary with {len(self.word2idx):,} tokens")

    def encode(self, sentence):
        """Return the vocabulary indices of the lowercased, whitespace-split words.

        Words that are not in the vocabulary are dropped, so the result may be
        shorter than the sentence and can be an empty list.
        """
        token_ids = []
        for raw_word in sentence.split():
            key = raw_word.lower()
            if key in self.word2idx:
                token_ids.append(self.word2idx[key])
        return token_ids

    def vocab_size(self):
        """Return the number of entries in the vocabulary."""
        return len(self.word2idx)


In [14]:
# Build the tokenizer from the pickled vocabulary whose path is set in the config
tokenizer = PretrainedTokenizer(word_to_idx_path=config['WORD_TO_IDX_PATH'])

Loaded vocabulary with 400,000 tokens


In [15]:
# --- Dataset Class ---
class TripletDataset(Dataset):
    """Dataset of (query, positive document, negative document) text triplets.

    Each item is tokenized on access and returned as three 1-D ``torch.long``
    tensors of token ids.

    Args:
        data: Indexable collection whose items unpack into
            ``(query, pos_doc, neg_doc)`` strings.
        tokenizer: Object exposing ``encode(text) -> list[int]``.
    """

    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of triplets."""
        return len(self.data)

    def _encode(self, text):
        """Tokenize ``text`` into a 1-D tensor of token ids."""
        # dtype must be explicit: torch.tensor([]) infers float32, which happens
        # whenever a text contains no in-vocabulary word and would later break the
        # embedding lookup (and mix dtypes inside pad_sequence).
        return torch.tensor(self.tokenizer.encode(text), dtype=torch.long)

    def __getitem__(self, idx):
        """Return the ``(query, positive, negative)`` token-id tensors of triplet ``idx``."""
        query, pos_doc, neg_doc = self.data[idx]
        return (self._encode(query),
                self._encode(pos_doc),
                self._encode(neg_doc))

In [16]:
# --- Collate Function ---
def collate_fn(batch):
    """Pad a list of (query, positive, negative) tensor triplets into three batch tensors.

    Each of the three groups is padded independently (with zeros, the
    ``pad_sequence`` default) to the length of its own longest sequence and
    returned batch-first.
    """
    def _pad(sequences):
        return pad_sequence(sequences, batch_first=True)

    # Regroup per-sample triplets into one tuple per role
    query_seqs, positive_seqs, negative_seqs = zip(*batch)
    return _pad(query_seqs), _pad(positive_seqs), _pad(negative_seqs)

In [17]:
# --- Dual RNN Encoder Model ---
class RNNEncoder(nn.Module):
    """Encode batches of token-id sequences into one vector per sequence with a GRU.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each embedding vector.
        hidden_dim: Size of the GRU hidden state (and of the returned vectors).
        pretrained_embeddings: Optional NumPy array copied into the embedding
            table; its shape must match ``(vocab_size, embed_dim)``.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, pretrained_embeddings=None):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)

        # Load pretrained embeddings if provided
        if pretrained_embeddings is not None:
            # NOTE(review): the copy also overwrites row 0 (the padding_idx row), so
            # padded positions embed to whatever vector the pretrained table stores
            # at index 0 rather than zeros - confirm index 0 is reserved for padding.
            self.embedding.weight.data.copy_(torch.from_numpy(pretrained_embeddings))
            # Keep embeddings trainable (they are by default)

        self.rnn = nn.GRU(embed_dim, hidden_dim, batch_first=True)

    def forward(self, x, lengths=None):
        """Return the final GRU hidden state for every sequence in the batch.

        Args:
            x: Batch-first tensor of token ids, shape ``(batch, seq_len)``.
            lengths: Optional true (unpadded) length of each sequence. When given,
                the batch is packed so the returned state is the one after the
                last real token. When omitted, the GRU also steps over the padded
                positions, so the result depends on how much padding the batch
                received. Every length must be at least 1.

        Returns:
            Tensor of shape ``(batch, hidden_dim)``.
        """
        x = self.embedding(x)
        if lengths is not None:
            from torch.nn.utils.rnn import pack_padded_sequence

            # pack_padded_sequence requires the lengths on the CPU
            lengths = torch.as_tensor(lengths, dtype=torch.long).cpu()
            # enforce_sorted=False lets callers pass batches in any order; the
            # returned h_n is restored to the original batch order
            x = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        _, h_n = self.rnn(x)
        return h_n.squeeze(0)  # shape: (batch, hidden_dim)

In [18]:
# --- Triplet Loss Function ---
def triplet_loss_function(triplet, distance_function, margin):
    """Mean hinge loss over a batch of (query, positive, negative) encodings.

    Args:
        triplet: Tuple ``(query, pos_doc, neg_doc)`` of encoded batches.
        distance_function: Callable ``(a, b) -> distances`` applied to the
            query/positive and query/negative pairs.
        margin: Amount by which the negative distance should exceed the positive one.

    Returns:
        Scalar tensor: mean of ``max(d_pos - d_neg + margin, 0)``.
    """
    anchor, positive, negative = triplet
    positive_distance = distance_function(anchor, positive)
    negative_distance = distance_function(anchor, negative)
    # A triplet contributes only while the positive is not closer by at least `margin`
    violation = positive_distance - negative_distance + margin
    return violation.clamp(min=0.0).mean()


In [19]:
# --- Training Setup ---
# Seed torch so weight initialisation and DataLoader shuffling are reproducible
# across kernel restarts (same value the sampling step passes as random_state)
SEED = 42
torch.manual_seed(SEED)

VOCAB_SIZE = tokenizer.vocab_size()

# Load pretrained embeddings
pretrained_embeddings = np.load(config['EMBEDDINGS_PATH'])
EMBED_DIM = pretrained_embeddings.shape[1]  # Get embedding dimension from loaded embeddings

print(f"Loaded pretrained embeddings: {pretrained_embeddings.shape}")
print(f"Vocabulary size: {VOCAB_SIZE}")
print(f"Embedding dimension: {EMBED_DIM}")

# Fail early with a readable message: the encoders copy the embedding matrix into a
# (VOCAB_SIZE, EMBED_DIM) table, which otherwise errors with an opaque shape mismatch
if pretrained_embeddings.shape[0] != VOCAB_SIZE:
    raise ValueError(
        f"Embedding matrix has {pretrained_embeddings.shape[0]} rows "
        f"but the tokenizer vocabulary has {VOCAB_SIZE} entries"
    )

# Initialize encoders with pretrained embeddings (each encoder gets its own copy)
query_encoder = RNNEncoder(VOCAB_SIZE, EMBED_DIM, config['HIDDEN_DIM'], pretrained_embeddings)
doc_encoder = RNNEncoder(VOCAB_SIZE, EMBED_DIM, config['HIDDEN_DIM'], pretrained_embeddings)

# A single optimizer updates the parameters of both encoders
optimizer = torch.optim.Adam(list(query_encoder.parameters()) + list(doc_encoder.parameters()), lr=config['LR'])
dataset = TripletDataset(data, tokenizer)
dataloader = DataLoader(dataset, batch_size=config['BATCH_SIZE'], shuffle=True, collate_fn=collate_fn)

Loaded pretrained embeddings: (400000, 200)
Vocabulary size: 400000
Embedding dimension: 200


In [None]:
# --- Training Loop ---
for epoch in range(config['EPOCHS']):
    total_loss = 0
    for query_batch, pos_batch, neg_batch in dataloader:
        # Drop gradients left over from the previous step
        optimizer.zero_grad()

        # Queries use one encoder; positive and negative documents share the other
        encoded_triplet = (
            query_encoder(query_batch),
            doc_encoder(pos_batch),
            doc_encoder(neg_batch),
        )
        loss = triplet_loss_function(encoded_triplet, F.pairwise_distance, config['MARGIN'])

        loss.backward()
        optimizer.step()

        # Accumulate the scalar batch loss for the epoch report
        total_loss += loss.item()
    # The reported value is the sum of the batch losses, not their mean
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

In [11]:
# --- Inference Function ---
def search(query_text, documents, tokenizer, query_encoder, doc_encoder):
    """Rank ``documents`` by cosine similarity to ``query_text``.

    Args:
        query_text: Query string.
        documents: Sequence of document strings.
        tokenizer: Object exposing ``encode(text) -> list[int]``.
        query_encoder: Callable mapping a ``(1, seq_len)`` id tensor to a ``(1, dim)`` vector.
        doc_encoder: Callable mapping an ``(n_docs, seq_len)`` id tensor to ``(n_docs, dim)`` vectors.

    Returns:
        List of ``(document, score)`` tuples sorted by descending score; an empty
        list when ``documents`` is empty.
    """
    # Nothing to rank - pad_sequence would raise on an empty list
    if len(documents) == 0:
        return []

    def _encode(text):
        # Explicit dtype: an empty token list would otherwise become a float32
        # tensor, and when it is the first sequence pad_sequence returns a float
        # batch that the embedding lookup rejects.
        return torch.tensor(tokenizer.encode(text), dtype=torch.long)

    with torch.no_grad():
        query_tensor = pad_sequence([_encode(query_text)], batch_first=True)
        query_vec = query_encoder(query_tensor)

        doc_tensors = pad_sequence([_encode(doc) for doc in documents], batch_first=True)
        doc_vecs = doc_encoder(doc_tensors)

        # NOTE(review): ranking uses cosine similarity while training optimises
        # F.pairwise_distance - confirm this mismatch is intended.
        scores = F.cosine_similarity(query_vec, doc_vecs)
        top_indices = torch.argsort(scores, descending=True).tolist()
        return [(documents[i], scores[i].item()) for i in top_indices]

In [None]:
# --- Example Usage ---
documents = ["deep learning applications", "banana smoothie", "introduction to ai", "machine learning in banking"]
# Stored under its own name: assigning to `results` would overwrite the dictionary of
# processed train/validation/test triplets and break re-running the earlier cells
search_results = search("what is ai", documents, tokenizer, query_encoder, doc_encoder)

print("\nSearch results:")
for doc, score in search_results:
    print(f"{doc} (score: {score:.4f})")
