# **PHASE # 1: TRAINING**

## **Required Libraries**

In [None]:
import time
import random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import pyarrow
import builtins
from tqdm import tqdm
import nltk
import math
import matplotlib.pyplot as plt
import os
import torch
from nltk.corpus import stopwords
from annoy import AnnoyIndex
import faiss
from datasets import Dataset
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

## **Loading the Datasets**

In [3]:
news_df = pd.read_csv('MINDsmall_train/news.tsv', sep='\t', header=None)

news_df.columns = [
    "news_id", "category", "subcategory", "title", "abstract",
    "url", "title_entities", "abstract_entities"
]

behaviors_df = pd.read_csv('MINDsmall_train/behaviors.tsv', sep='\t', header=None)

behaviors_df.columns = [
    "impression_id", "user_id", "timestamp", "history", "impressions"
]

news_df.head(5)

Unnamed: 0,news_id,category,subcategory,title,abstract,url,title_entities,abstract_entities
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI..."


In [4]:
behaviors_df.head(5)

Unnamed: 0,impression_id,user_id,timestamp,history,impressions
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...
3,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0
4,5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...


In [5]:
behaviors_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156965 entries, 0 to 156964
Data columns (total 5 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   impression_id  156965 non-null  int64 
 1   user_id        156965 non-null  object
 2   timestamp      156965 non-null  object
 3   history        153727 non-null  object
 4   impressions    156965 non-null  object
dtypes: int64(1), object(4)
memory usage: 6.0+ MB


In [6]:
news_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51282 entries, 0 to 51281
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   news_id            51282 non-null  object
 1   category           51282 non-null  object
 2   subcategory        51282 non-null  object
 3   title              51282 non-null  object
 4   abstract           48616 non-null  object
 5   url                51282 non-null  object
 6   title_entities     51279 non-null  object
 7   abstract_entities  51278 non-null  object
dtypes: object(8)
memory usage: 3.1+ MB


## **Preprocessing**

In [7]:
# Fill missing title/abstract with empty strings
news_df['title'] = news_df['title'].fillna('')
news_df['abstract'] = news_df['abstract'].fillna('')

In [8]:
# Combine title and abstract into a single 'text' column
news_df['text'] = news_df['title'] + ". " + news_df['abstract']

In [9]:
# Remove duplicate articles based on the combined text
news_df = news_df.drop_duplicates(subset='text')

In [10]:
# Remove rows where 'text' is empty or just whitespace
news_df = news_df[news_df['text'].str.strip() != '']

## **Splitting the Data into Train and Test Sets**

In [None]:
print("\n--- Limiting Data to First 10,000 Rows ---")

# Limit news_df to first 10,000 rows
# Using .copy() to ensure we're working on a separate DataFrame to avoid SettingWithCopyWarning
news_df_limited = news_df.head(10000).copy().reset_index(drop=True)
print(f"Limited News Data Shape: {news_df_limited.shape}")

# Limit behaviors_df to first 10,000 rows
behaviors_df_limited = behaviors_df.head(10000).copy().reset_index(drop=True)
print(f"Limited Behaviors Data Shape: {behaviors_df_limited.shape}")


print("\n--- Splitting Limited Datasets into 80/20 Train/Test ---")

# Split news_df_limited (articles)
# IMPORTANT: Removed 'stratify=news_df_limited['category']' to prevent ValueError
# as small subsets might have categories with only 1 member.
news_df_train, news_df_test = train_test_split(news_df_limited, test_size=0.2, random_state=42)
# Reset index to ensure continuous indexing for embedding lookup later
news_df_train = news_df_train.reset_index(drop=True)
news_df_test = news_df_test.reset_index(drop=True)

print(f"News Train Data Shape (from 10k): {news_df_train.shape}")
print(f"News Test Data Shape (from 10k): {news_df_test.shape}")

# Split behaviors_df_limited (user interactions) by user_id
# This ensures that users are distinct between train and test sets for evaluation.
unique_users_limited = behaviors_df_limited['user_id'].unique()
train_users, test_users = train_test_split(unique_users_limited, test_size=0.2, random_state=42)

behaviors_df_train = behaviors_df_limited[behaviors_df_limited['user_id'].isin(train_users)].reset_index(drop=True)
behaviors_df_test = behaviors_df_limited[behaviors_df_limited['user_id'].isin(test_users)].reset_index(drop=True)

print(f"Behaviors Train Data Shape (from 10k): {behaviors_df_train.shape}")
print(f"Behaviors Test Data Shape (from 10k): {behaviors_df_test.shape}")

# It's good practice to save the test sets for later use in Phase 2 (Inference/Evaluation)
# Using a new directory name for clarity, indicating it's a 10k subset.
try:
    os.makedirs('MINDsmall_test_10k_subset', exist_ok=True)
    news_df_test.to_parquet('MINDsmall_test_10k_subset/news_test_10k.parquet', index=False)
    behaviors_df_test.to_parquet('MINDsmall_test_10k_subset/behaviors_test_10k.parquet', index=False)
    print("✅ Limited Test datasets saved to 'MINDsmall_test_10k_subset/'")
except Exception as e:
    print(f"⚠️ Could not save limited test datasets: {e}. Please ensure directory exists.")

print("--- Dataset Splitting Completed ---")


--- Splitting Datasets into 80/20 Train/Test ---
News Train Data Shape: (40535, 9)
News Test Data Shape: (10134, 9)
Behaviors Train Data Shape: (125634, 5)
Behaviors Test Data Shape: (31331, 5)
✅ Test datasets saved to 'MINDsmall_test/'
--- Dataset Splitting Completed ---


## **Add Validation Split**

In [None]:
# New section for splitting data into actual train/validation sets
print("\n--- Creating Validation Split from Training Data (80/20 for actual train/val) ---")

# Split news_df_train for actual training and validation articles
# (Assuming news_df_train is already created from the 80/20 split of the 10k subset)
news_df_actual_train, news_df_val = train_test_split(news_df_train, test_size=0.2, random_state=42)
news_df_actual_train = news_df_actual_train.reset_index(drop=True)
news_df_val = news_df_val.reset_index(drop=True)

print(f"News Actual Train Data Shape: {news_df_actual_train.shape}")
print(f"News Validation Data Shape: {news_df_val.shape}")

# Split behaviors_df_train for actual training and validation user interactions
# Use user-based split for consistency and proper evaluation.
# (Assuming behaviors_df_train is already created from the 80/20 split of the 10k subset)
unique_users_train = behaviors_df_train['user_id'].unique()
actual_train_users, val_users = train_test_split(unique_users_train, test_size=0.2, random_state=42)

behaviors_df_actual_train = behaviors_df_train[behaviors_df_train['user_id'].isin(actual_train_users)].reset_index(drop=True)
behaviors_df_val = behaviors_df_train[behaviors_df_train['user_id'].isin(val_users)].reset_index(drop=True)

print(f"Behaviors Actual Train Data Shape: {behaviors_df_actual_train.shape}")
print(f"Behaviors Validation Data Shape: {behaviors_df_val.shape}")

print("--- Validation Split Completed ---")

## **Add Evaluation Metrics Functions**

In [None]:
import math # Ensure math is imported globally

def get_true_relevant_items_val(behaviors_df_val):
    true_relevant_items_dict = {}
    for _, row in behaviors_df_val.iterrows():
        user_id = row['user_id']
        if pd.isna(row['impressions']): continue
        relevant_items = [item.split('-')[0] for item in row['impressions'].split() if item.endswith('1')]
        if relevant_items:
            true_relevant_items_dict[user_id] = relevant_items
    return true_relevant_items_dict

def get_user_history_val(behaviors_df_val):
    user_history_dict = {}
    for _, row in behaviors_df_val.iterrows():
        user_id = row['user_id']
        if pd.isna(row['history']): continue
        history_items = row['history'].split()
        if history_items:
            user_history_dict[user_id] = history_items
    return user_history_dict

def calculate_precision_recall_ndcg_map(recommended_items_dict, true_relevant_items_dict, k=10):
    """
    Calculates Precision@k, Recall@k, NDCG@k, and MAP for a set of recommendations.

    Args:
        recommended_items_dict (dict): Dictionary mapping user_id to a list of recommended news_ids (ranked).
        true_relevant_items_dict (dict): Dictionary mapping user_id to a list of true relevant news_ids.
        k (int): The cutoff for top-k recommendations.

    Returns:
        dict: A dictionary containing average Precision@k, Recall@k, NDCG@k, and MAP.
    """
    precisions = []
    recalls = []
    ndcgs = []
    maps = []

    for user_id in true_relevant_items_dict.keys():
        if user_id not in recommended_items_dict or not recommended_items_dict[user_id]:
            continue

        recs = recommended_items_dict[user_id][:k]
        true_relevants = set(true_relevant_items_dict[user_id])

        # Precision@k
        num_relevant_in_k = len(set(recs) & true_relevants)
        precision_at_k = num_relevant_in_k / k if k > 0 else 0
        precisions.append(precision_at_k)

        # Recall@k
        recall_at_k = num_relevant_in_k / len(true_relevants) if len(true_relevants) > 0 else 0
        recalls.append(recall_at_k)

        # NDCG@k
        dcg = 0.0
        idcg = 0.0
        for i, item_id in enumerate(recs):
            if item_id in true_relevants:
                dcg += 1.0 / math.log2(i + 2) # i+1 is rank, so i+2 for log2

        ideal_relevances = sorted([1.0 for _ in true_relevants], reverse=True) # Assume relevance is 1 for relevant items
        for i, relevance in enumerate(ideal_relevances[:k]):
            idcg += relevance / math.log2(i + 2)

        ndcg_at_k = dcg / idcg if idcg > 0 else 0.0
        ndcgs.append(ndcg_at_k)

        # Average Precision (for MAP)
        current_precision_sum = 0.0
        num_hits = 0
        for i, item_id in enumerate(recs):
            if item_id in true_relevants:
                num_hits += 1
                current_precision_sum += num_hits / (i + 1)
        
        avg_precision = current_precision_sum / len(true_relevants) if len(true_relevants) > 0 else 0.0
        maps.append(avg_precision)


    avg_precision_at_k = np.mean(precisions) if precisions else 0
    avg_recall_at_k = np.mean(recalls) if recalls else 0
    avg_ndcg_at_k = np.mean(ndcgs) if ndcgs else 0
    avg_map = np.mean(maps) if maps else 0

    return {
        f"Precision@{k}": avg_precision_at_k,
        f"Recall@{k}": avg_recall_at_k,
        f"NDCG@{k}": avg_ndcgs,
        f"MAP@{k}": avg_map
    }

def simulate_recommendations_for_validation(user_history_dict, embedding_model, tokenizer_model=None, faiss_index=None, news_df_for_val=None, k=10):
    """
    Simulates recommendations for a set of users based on their history during validation.
    Handles both SentenceTransformer and CLM (DeepSeek) models for user embedding generation.

    Args:
        user_history_dict (dict): Dictionary mapping user_id to list of historical news_ids.
        embedding_model: The SentenceTransformer or DeepSeek model instance.
        tokenizer_model: The tokenizer for CLM models (needed for DeepSeek).
        faiss_index: The Faiss index of article embeddings (should be from news_df_val).
        news_df_for_val (pd.DataFrame): The news DataFrame used for validation embeddings.
        k (int): Number of top recommendations to retrieve.

    Returns:
        dict: A dictionary mapping user_id to a list of recommended news_ids.
    """
    if faiss_index is None or news_df_for_val is None:
        raise ValueError("faiss_index and news_df_for_val must be provided for validation simulation.")

    recommended_items = {}
    news_id_to_idx = {news_id: idx for idx, news_id in enumerate(news_df_for_val['news_id'].tolist())}
    idx_to_news_id = {idx: news_id for news_id, idx in news_id_to_idx.items()}

    # Ensure model is in evaluation mode
    embedding_model.eval()

    for user_id, history_news_ids in tqdm(user_history_dict.items(), desc="Simulating Val Recs"):
        if not history_news_ids:
            continue

        # Get embeddings for historical articles
        history_texts = [news_df_for_val[news_df_for_val['news_id'] == nid]['text'].iloc[0]
                         for nid in history_news_ids if nid in news_df_for_val['news_id'].values]

        if not history_texts:
            continue

        user_embedding = None
        with torch.no_grad(): # Ensure no gradient calculation for inference
            if isinstance(embedding_model, SentenceTransformer):
                history_embeddings = embedding_model.encode(history_texts, convert_to_numpy=True)
                user_embedding = np.mean(history_embeddings, axis=0, keepdims=True)
            elif isinstance(embedding_model, AutoModelForCausalLM) and tokenizer_model:
                # DeepSeek user embedding generation using its pooling logic
                all_history_embeddings = []
                for text in history_texts:
                    inputs = tokenizer_model(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(embedding_model.device)
                    outputs = embedding_model(**inputs, output_hidden_states=True)
                    # Corrected typo here: outputs.hidden_hidden_states -> outputs.hidden_states
                    last_hidden_states = outputs.hidden_states[-1]
                    attention_mask = inputs['attention_mask'].unsqueeze(-1).expand(last_hidden_states.size()).float()
                    masked_embeddings = last_hidden_states * attention_mask
                    sum_embeddings = torch.sum(masked_embeddings, 1)
                    sum_mask = torch.sum(attention_mask, 1)
                    mean_pooled_embedding = (sum_embeddings / torch.clamp(sum_mask, min=1e-9)).cpu().numpy()
                    all_history_embeddings.append(mean_pooled_embedding)
                
                if all_history_embeddings: # Check if any embeddings were generated
                    user_embedding = np.mean(np.vstack(all_history_embeddings), axis=0, keepdims=True)
            
            if user_embedding is None or (user_embedding.ndim == 1 and user_embedding.size == 0): # Handle empty user_embedding
                continue # Skip if no user embedding could be generated

            # Ensure user_embedding is a 2D array [1, dim] for faiss.normalize_L2
            if user_embedding.ndim == 1:
                user_embedding = np.expand_dims(user_embedding, axis=0)

            faiss.normalize_L2(user_embedding)

            # Search Faiss index for top-k
            # Retrieve more to filter out already seen items
            D, I = faiss_index.search(user_embedding, k + len(history_news_ids) + 5) # +5 buffer for safety
            
            # Filter out already seen items from recommendations
            seen_items = set(history_news_ids)
            # Ensure idx is valid before lookup and not seen
            top_k_indices = [idx for idx in I[0] if idx >= 0 and idx < len(idx_to_news_id) and idx_to_news_id.get(idx) and idx_to_news_id.get(idx) not in seen_items]
            
            recommended_news_ids = [idx_to_news_id[idx] for idx in top_k_indices[:k]]
            recommended_items[user_id] = recommended_news_ids

    return recommended_items

# --- Plotting helper function (can be placed here or after all training is done) ---
def plot_training_metrics(results, model_name, metric_k=10):
    epochs = range(1, len(results["train_losses"]) + 1)

    # Plot Training and Validation Loss Curves
    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(epochs, results["train_losses"], label='Training Loss')
    plt.title(f'{model_name} - Training Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.grid(True)
    plt.legend()

    # Plot Training Time per Epoch
    plt.subplot(1, 2, 2)
    plt.bar(epochs, results["epoch_times"])
    plt.title(f'{model_name} - Training Time per Epoch')
    plt.xlabel('Epoch')
    plt.ylabel('Time (seconds)')
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    # Plot Validation Accuracy Curves (Precision@k, Recall@k, NDCG@k, MAP@k)
    plt.figure(figsize=(10, 6))
    plt.plot(epochs, results["val_precisions"], label=f'Validation Precision@{metric_k}')
    plt.plot(epochs, results["val_recalls"], label=f'Validation Recall@{metric_k}')
    plt.plot(epochs, results["val_ndcgs"], label=f'Validation NDCG@{metric_k}')
    plt.plot(epochs, results["val_maps"], label=f'Validation MAP@{metric_k}')
    plt.title(f'{model_name} - Validation Metrics over Epochs')
    plt.xlabel('Epoch')
    plt.ylabel('Score')
    plt.grid(True)
    plt.legend()
    plt.show()

## **FINE TUNING**

### **Create Triplets for Fine Tuning**

In [11]:
def create_triplets(behaviors_df, news_dict, max_triplets=10000):
    triplets = []
    for _, row in behaviors_df.iterrows():
        if pd.isna(row["impressions"]): continue
        impressions = row["impressions"].split()
        pos = [i.split('-')[0] for i in impressions if i.endswith('1')]
        neg = [i.split('-')[0] for i in impressions if i.endswith('0')]
        if len(pos) < 2 or not neg: continue
        for i in range(len(pos)):
            a, p = pos[i], pos[(i+1)%len(pos)]
            for n in neg:
                triplets.append((news_dict.get(a), news_dict.get(p), news_dict.get(n)))
        if len(triplets) >= max_triplets:
            break
    return [InputExample(texts=[a, p, n]) for a, p, n in triplets if a and p and n]

### **Method to Fine Tune Models**

In [None]:
# Updated train_triplet_model
def train_triplet_model(model_name, triplets, model_output,
                        news_df_val, behaviors_df_val,
                        epochs=3, warmup_steps=100, eval_k=10): # Added parameters
    model = SentenceTransformer(model_name)

    if torch.cuda.is_available():
        model.to("cuda")
        print("✅ Model moved to GPU.")
    else:
        print("⚠️ CUDA not available. Training on CPU.")

    train_loader = DataLoader(triplets, shuffle=True, batch_size=8)
    loss_fn = losses.TripletLoss(model=model)

    train_losses = []
    val_precisions = []
    val_recalls = []
    val_ndcgs = []
    val_maps = []
    epoch_times = []

    true_relevant_items_val = get_true_relevant_items_val(behaviors_df_val)
    user_history_val = get_user_history_val(behaviors_df_val)

    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    num_training_steps = len(train_loader) * epochs
    warmup_steps_actual = int(num_training_steps * 0.1) if warmup_steps is None else warmup_steps

    print(f"\nStarting fine-tuning for {model_name}...")
    global_step = 0
    for epoch in range(epochs):
        start_time = time.time()
        model.train() # Set model to training mode
        epoch_train_losses = []

        for step, batch in tqdm(enumerate(train_loader), total=len(train_loader), desc=f"Epoch {epoch+1} Training"):
            features = model.tokenize(batch.texts)
            reps = model(features)
            loss = loss_fn(reps)

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            epoch_train_losses.append(loss.item())
            global_step += 1

            if global_step < warmup_steps_actual:
                for pg in optimizer.param_groups:
                    pg['lr'] = (global_step / warmup_steps_actual) * 2e-5

        avg_train_loss = np.mean(epoch_train_losses)
        train_losses.append(avg_train_loss)
        print(f"Epoch {epoch+1} - Avg Training Loss: {avg_train_loss:.4f}")

        # --- Validation Evaluation (Accuracy Curves) ---
        model.eval() # Set model to evaluation mode
        with torch.no_grad():
            # Generate embeddings for validation news_df_val articles using current model state
            val_news_texts_for_embedding = news_df_val['text'].tolist()
            current_val_embeddings = model.encode(val_news_texts_for_embedding, show_progress_bar=False, convert_to_numpy=True)
            # Create a temporary Faiss index for validation search
            val_faiss_index = faiss.IndexFlatIP(current_val_embeddings.shape[1])
            faiss.normalize_L2(current_val_embeddings)
            val_faiss_index.add(current_val_embeddings)

            # Simulate recommendations for validation users
            val_recommended_items = simulate_recommendations_for_validation(
                user_history_val,
                model, # Pass the current model instance
                None, # Tokenizer not needed for SentenceTransformer
                val_faiss_index,
                news_df_val, # Pass news_df_val for text lookup
                k=eval_k
            )

            # Calculate metrics
            metrics = calculate_precision_recall_ndcg_map(
                val_recommended_items,
                true_relevant_items_val,
                k=eval_k
            )
            val_precisions.append(metrics[f"Precision@{eval_k}"])
            val_recalls.append(metrics[f"Recall@{eval_k}"])
            val_ndcgs.append(metrics[f"NDCG@{eval_k}"])
            val_maps.append(metrics[f"MAP@{eval_k}"])
            print(f"Epoch {epoch+1} - Val Precision@{eval_k}: {metrics[f'Precision@{eval_k}']:.4f}, "
                  f"Val Recall@{eval_k}: {metrics[f'Recall@{eval_k}']:.4f}, "
                  f"Val NDCG@{eval_k}: {metrics[f'NDCG@{eval_k}']:.4f}, "
                  f"Val MAP@{eval_k}: {metrics[f'MAP@{eval_k}']:.4f}")

        end_time_epoch = time.time()
        epoch_times.append(end_time_epoch - start_time)
        print(f"Epoch {epoch+1} Time: {epoch_times[-1]:.2f} seconds")

    model.save(model_output)
    print(f"✅ Model Saved to '{model_output}'")

    return {
        "train_losses": train_losses,
        "val_precisions": val_precisions,
        "val_recalls": val_recalls,
        "val_ndcgs": val_ndcgs,
        "val_maps": val_maps,
        "epoch_times": epoch_times
    }

### **Models to Fine Tune - MiniLM and BERT**

In [None]:
# Updated Models to Fine Tune section
news_dict_actual_train = dict(zip(news_df_actual_train['news_id'], news_df_actual_train['text']))

triplets_initial = create_triplets(behaviors_df_actual_train, news_dict_actual_train)

# Capture training results for MiniLM
minilm_initial_results = train_triplet_model(
    "all-MiniLM-L6-v2",
    triplets_initial,
    "fine_tuned_minilm",
    news_df_val, # Pass validation data
    behaviors_df_val, # Pass validation data
    epochs=3 # Using 3 epochs as discussed
)

# Capture training results for BERT
bert_initial_results = train_triplet_model(
    "bert-base-nli-mean-tokens",
    triplets_initial,
    "fine_tuned_bert",
    news_df_val, # Pass validation data
    behaviors_df_val, # Pass validation data
    epochs=3 # Using 3 epochs as discussed
)

✅ Model moved to GPU.


                                                                     

Step,Training Loss
500,4.0578
1000,3.3859
1500,3.2865


✅ Model Saved to 'fine_tuned_minilm'
✅ Model moved to GPU.


                                                                     

Step,Training Loss
500,0.6928
1000,0.026
1500,0.0156


✅ Model Saved to 'fine_tuned_bert'


In [None]:
# Plotting initial fine-tuning results for MiniLM
plot_training_metrics(minilm_initial_results, "MiniLM Initial Fine-tuning")

# Plotting initial fine-tuning results for BERT
plot_training_metrics(bert_initial_results, "BERT Initial Fine-tuning")

### **Fine Tuning Distilled Deepseek**

In [None]:
from transformers import BitsAndBytesConfig, TrainerCallback # Ensure imported globally

# Load DeepSeek tokenizer and model (8-bit)
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Define the quantization config explicitly (using 4-bit as it's more memory efficient and common)
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True, # Changed from load_in_8bit=True for better memory
    bnb_4bit_quant_type="nf4", # Recommended 4-bit quantization type
    bnb_4bit_compute_dtype=torch.float16, # Recommended compute dtype for 4-bit
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config, # Pass the config object here
    device_map="auto",
    trust_remote_code=True
)

# Prepare model for training
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)

# Load and preprocess your dataset (triplet logic converted into prompt-style CLM)
# IMPORTANT: Use news_df_actual_train
news_dict_actual_train = dict(zip(news_df_actual_train['news_id'], news_df_actual_train['text']))

# Format triplets as CLM-style instruction prompts
def make_prompt(anchor, positive, negative):
    # Corrected \\n to \n for proper newlines
    return f"Anchor: {anchor}\nPositive: {positive}\nNegative: {negative}\nWhich is more similar to the Anchor?\nAnswer: Positive."

triplet_prompts = []
for _, row in behaviors_df_actual_train.iterrows(): # Use actual train behaviors_df
    if pd.isna(row['impressions']):
        continue
    impressions = row['impressions'].split()
    positives = [i.split('-')[0] for i in impressions if i.endswith('1')]
    negatives = [i.split('-')[0] for i in impressions if i.endswith('0')]
    if len(positives) < 2 or not negatives:
        continue
    for i in range(len(positives)):
        anchor_id = positives[i]
        pos_id = positives[(i + 1) % len(positives)]
        for neg_id in negatives:
            a, p, n = news_dict_actual_train.get(anchor_id), news_dict_actual_train.get(pos_id), news_dict_actual_train.get(neg_id) # Use actual train news_dict
            if a and p and n:
                triplet_prompts.append({"text": make_prompt(a, p, n)})
    # Removed or significantly increased this limit for better training
    if len(triplet_prompts) >= 20000: # Example: increased limit significantly
        break

# Convert prompts to HuggingFace Dataset
dataset = Dataset.from_list(triplet_prompts)
dataset = dataset.map(lambda x: tokenizer(x["text"], truncation=True, padding="max_length", max_length=512), batched=True)
dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

# Training arguments
training_args = TrainingArguments(
    output_dir="./deepseek_peft_triplet",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=3, # Using 3 epochs as discussed
    logging_steps=10,
    save_steps=50,
    fp16=True,
    save_total_limit=1,
    report_to="none",
    disable_tqdm=False,
    load_best_model_at_end=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
)

# Attach the custom callback for DeepSeek
# Pass news_df_val and behaviors_df_val
deepseek_initial_callback = CustomValidationCallback(news_df_val, behaviors_df_val, eval_k=10)
trainer.add_callback(deepseek_initial_callback)

# Train
trainer.train()

# Save
model.save_pretrained("./deepseek_peft_triplet")
tokenizer.save_pretrained("./deepseek_peft_triplet")

print("✅ Model fine-tuned and saved to './deepseek_peft_triplet'")

# Extract results from the callback after training
deepseek_initial_results = {
    "train_losses": deepseek_initial_callback.train_losses_per_epoch,
    "val_precisions": deepseek_initial_callback.val_precisions,
    "val_recalls": deepseek_initial_callback.val_recalls,
    "val_ndcgs": deepseek_initial_callback.val_ndcgs,
    "val_maps": deepseek_initial_callback.val_maps,
    "epoch_times": deepseek_initial_callback.epoch_times # This is validation time
}

# Plotting initial fine-tuning results for DeepSeek
plot_training_metrics(deepseek_initial_results, "DeepSeek Initial Fine-tuning")

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.
Map: 100%|██████████| 1014/1014 [00:00<00:00, 2801.07 examples/s]
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss
10,4.8004
20,4.7087
30,4.5372
40,4.3855
50,4.1393
60,4.0525
70,3.9179
80,3.8111
90,3.6494
100,3.5336


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


✅ Model fine-tuned and saved to './deepseek_peft_triplet'


## **Fine Tuned LLM - Based Embeddings Generation**

In [None]:
# Updated generate_article_embeddings function
def generate_article_embeddings(news_texts, model_path, model_type):
    """
    Generates embeddings for news articles using a specified LLM.

    Args:
        news_texts (list): A list of strings, where each string is the text of a news article.
        model_path (str): The path to the fine-tuned model.
        model_type (str): The type of model to load ("sentence_transformer" or "deepseek").

    Returns:
        tuple: A tuple containing:
            - np.ndarray: A NumPy array of embeddings.
            - int: The dimensionality of the embeddings.
    """
    print(f"\nGenerating embeddings using {model_type} model from '{model_path}'...")
    if model_type == "sentence_transformer":
        model = SentenceTransformer(model_path)
        if torch.cuda.is_available():
            model.to("cuda")
        embeddings = model.encode(news_texts, show_progress_bar=True, convert_to_numpy=True)
        embedding_dim = embeddings.shape[1]
        print(f"✅ Embeddings generated. Shape: {embeddings.shape}")
        return embeddings, embedding_dim

    elif model_type == "deepseek":
        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        
        # Using BitsAndBytesConfig explicitly as recommended
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
        )
        
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            quantization_config=quantization_config, # Pass the config object
            device_map="auto",
            trust_remote_code=True
        )
        if torch.cuda.is_available():
            model.to("cuda")
        model.eval()

        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
            model.config.pad_token_id = tokenizer.eos_token_id

        all_embeddings = []
        batch_size = 8

        for i in tqdm(range(0, len(news_texts), batch_size), desc="Generating DeepSeek Embeddings"):
            batch_texts = news_texts[i:i + batch_size]
            inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=512).to(model.device)
            with torch.no_grad():
                outputs = model(**inputs, output_hidden_states=True)
            # Corrected typo here: outputs.hidden_hidden_states -> outputs.hidden_states
            last_hidden_states = outputs.hidden_states[-1]
            attention_mask = inputs['attention_mask'].unsqueeze(-1).expand(last_hidden_states.size()).float()
            masked_embeddings = last_hidden_states * attention_mask
            sum_embeddings = torch.sum(masked_embeddings, 1)
            sum_mask = torch.sum(attention_mask, 1)
            mean_pooled_embeddings = sum_embeddings / torch.clamp(sum_mask, min=1e-9)
            all_embeddings.extend(mean_pooled_embeddings.cpu().numpy())

        embeddings = np.array(all_embeddings)
        embedding_dim = embeddings.shape[1]
        print(f"✅ Embeddings generated. Shape: {embeddings.shape}")
        return embeddings, embedding_dim

    else:
        raise ValueError(f"Unsupported model_type: {model_type}. Choose 'sentence_transformer' or 'deepseek'.")


def build_annoy_index(embeddings, output_path_prefix):
    """
    Builds and saves an Annoy index from given embeddings.

    Args:
        embeddings (np.ndarray): A NumPy array of embeddings.
        output_path_prefix (str): A prefix for saving the Annoy index file and associated parquet.
    Returns:
        int: The embedding dimension used.
    """
    embedding_dim = embeddings.shape[1]
    print(f"\nBuilding Annoy index with {embedding_dim} dimensions for '{output_path_prefix}'...")
    annoy_index = AnnoyIndex(embedding_dim, 'angular')
    for i, emb in tqdm(enumerate(embeddings), total=len(embeddings), desc="Adding items to Annoy index"):
        annoy_index.add_item(i, emb)
    n_trees = 50
    annoy_index.build(n_trees)
    annoy_index_file = f'{output_path_prefix}_articles.ann'
    annoy_index.save(annoy_index_file)
    print(f"✅ Annoy index built and saved to '{annoy_index_file}'")
    return embedding_dim


def build_faiss_index_modular(embeddings, output_path_prefix):
    """
    Builds and saves a Faiss index from given embeddings.

    Args:
        embeddings (np.ndarray): A NumPy array of embeddings.
        output_path_prefix (str): A prefix for saving the Faiss index file.
    """
    faiss.normalize_L2(embeddings)
    embedding_dim = embeddings.shape[1]
    index = faiss.IndexFlatIP(embedding_dim)
    index.add(embeddings)
    faiss_index_file = f'{output_path_prefix}_articles.faiss'
    faiss.write_index(index, faiss_index_file)
    print(f"✅ Faiss index built and saved to '{faiss_index_file}'")

# --- News texts prepared from Step 2, now from the training set ---
# IMPORTANT: This variable should correctly point to news_df_train
news_texts_for_embedding = news_df_train['text'].tolist()

### **MiniLM**

In [None]:
print("\n--- Processing for MiniLM Model ---\n")

# Define MiniLM model paths
# Ensure 'fine_tuned_minilm' is the correct path where your first-stage fine-tuned model is saved
minilm_model_path = "fine_tuned_minilm"
minilm_output_prefix = minilm_model_path.replace("./", "").replace("/", "_")

# Step 4: Generate Article Embeddings for MiniLM using the training data
minilm_embeddings, minilm_embedding_dim = generate_article_embeddings(
    news_texts_for_embedding, # This now correctly refers to news_df_train['text'].tolist()
    minilm_model_path,
    "sentence_transformer"
)
# Store embeddings in DataFrame for this model (optional, for persistence/inspection)
# IMPORTANT: Store with news_df_train
news_df_train[f'{minilm_output_prefix}_embedding'] = list(minilm_embeddings)


# Step 5: Build LSH Index (Annoy) for MiniLM using the training data embeddings
build_annoy_index(minilm_embeddings, minilm_output_prefix)
# Save updated news_df_train with MiniLM embeddings and Annoy IDs
# IMPORTANT: Use news_df_train
news_df_train['annoy_id'] = news_df_train.index # Ensure index is stored for Annoy lookup
news_df_train.to_parquet(f'{minilm_output_prefix}_articles_with_embeddings_train.parquet', engine='pyarrow', index=False)
print(f"✅ News DataFrame with {minilm_output_prefix} embeddings and Annoy IDs saved to '{minilm_output_prefix}_articles_with_embeddings_train.parquet'")


# Step 6: Build Faiss Index for MiniLM using the training data embeddings
build_faiss_index_modular(minilm_embeddings, minilm_output_prefix)

print("\n--- MiniLM Model Processing Completed ---\n")


--- Processing for MiniLM Model ---


Generating embeddings using sentence_transformer model from 'fine_tuned_minilm'...


Batches: 100%|██████████| 1267/1267 [07:13<00:00,  2.92it/s]


✅ Embeddings generated. Shape: (40535, 384)

Building Annoy index with 384 dimensions for 'fine_tuned_minilm'...


Adding items to Annoy index: 100%|██████████| 40535/40535 [00:00<00:00, 41858.90it/s]


✅ Annoy index built and saved to 'fine_tuned_minilm_articles.ann'
✅ News DataFrame with fine_tuned_minilm embeddings and Annoy IDs saved to 'fine_tuned_minilm_articles_with_embeddings_train.parquet'
✅ Faiss index built and saved to 'fine_tuned_minilm_articles.faiss'

--- MiniLM Model Processing Completed ---



### **BERT**

In [None]:
print("\n--- Processing for BERT Model ---\n")

# Define BERT model paths
# Ensure 'fine_tuned_bert' is the correct path where your first-stage fine-tuned model is saved
bert_model_path = "fine_tuned_bert"
bert_output_prefix = bert_model_path.replace("./", "").replace("/", "_")

# Step 4: Generate Article Embeddings for BERT using the training data
bert_embeddings, bert_embedding_dim = generate_article_embeddings(
    news_texts_for_embedding, # This correctly refers to news_df_train['text'].tolist()
    bert_model_path,
    "sentence_transformer"
)
# Store embeddings in DataFrame for this model (optional, for persistence/inspection)
# IMPORTANT: Store with news_df_train
news_df_train[f'{bert_output_prefix}_embedding'] = list(bert_embeddings)


# Step 5: Build LSH Index (Annoy) for BERT using the training data embeddings
build_annoy_index(bert_embeddings, bert_output_prefix)
# Save updated news_df_train with BERT embeddings and Annoy IDs
# IMPORTANT: Use news_df_train
news_df_train.to_parquet(f'{bert_output_prefix}_articles_with_embeddings_train.parquet', engine='pyarrow', index=False)
print(f"✅ News DataFrame with {bert_output_prefix} embeddings and Annoy IDs saved to '{bert_output_prefix}_articles_with_embeddings_train.parquet'")


# Step 6: Build Faiss Index for BERT using the training data embeddings
build_faiss_index_modular(bert_embeddings, bert_output_prefix)

print("\n--- BERT Model Processing Completed ---\n")


--- Processing for BERT Model ---


Generating embeddings using sentence_transformer model from 'fine_tuned_bert'...


Batches: 100%|██████████| 1267/1267 [3:06:50<00:00,  8.85s/it] 


✅ Embeddings generated. Shape: (40535, 768)

Building Annoy index with 768 dimensions for 'fine_tuned_bert'...


Adding items to Annoy index: 100%|██████████| 40535/40535 [00:01<00:00, 20590.85it/s]


✅ Annoy index built and saved to 'fine_tuned_bert_articles.ann'
✅ News DataFrame with fine_tuned_bert embeddings and Annoy IDs saved to 'fine_tuned_bert_articles_with_embeddings_train.parquet'
✅ Faiss index built and saved to 'fine_tuned_bert_articles.faiss'

--- BERT Model Processing Completed ---



### **Deepseek**

In [None]:
print("\n--- Processing for DeepSeek Model ---\n")

# Define DeepSeek model paths
# Ensure './deepseek_peft_triplet' is the correct path where your first-stage fine-tuned model is saved
deepseek_model_path = "./deepseek_peft_triplet"
deepseek_output_prefix = deepseek_model_path.replace("./", "").replace("/", "_") # Clean up path for filename

# Step 4: Generate Article Embeddings for DeepSeek using the training data
deepseek_embeddings, deepseek_embedding_dim = generate_article_embeddings(
    news_texts_for_embedding, # This correctly refers to news_df_train['text'].tolist()
    deepseek_model_path,
    "deepseek"
)
# Store embeddings in DataFrame for this model (optional, for persistence/inspection)
# IMPORTANT: Store with news_df_train
news_df_train[f'{deepseek_output_prefix}_embedding'] = list(deepseek_embeddings)


# Step 5: Build LSH Index (Annoy) for DeepSeek using the training data embeddings
build_annoy_index(deepseek_embeddings, deepseek_output_prefix)
# Save updated news_df_train with DeepSeek embeddings and Annoy IDs
# IMPORTANT: Use news_df_train
news_df_train.to_parquet(f'{deepseek_output_prefix}_articles_with_embeddings_train.parquet', engine='pyarrow', index=False)
print(f"✅ News DataFrame with {deepseek_output_prefix} embeddings and Annoy IDs saved to '{deepseek_output_prefix}_articles_with_embeddings_train.parquet'")


# Step 6: Build Faiss Index for DeepSeek using the training data embeddings
build_faiss_index_modular(deepseek_embeddings, deepseek_output_prefix)

print("\n--- DeepSeek Model Processing Completed ---\n")


--- Processing for DeepSeek Model ---


Generating embeddings using deepseek model from './deepseek_peft_triplet'...


Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.
Generating DeepSeek Embeddings: 100%|██████████| 5067/5067 [58:42<00:00,  1.44it/s]  


✅ Embeddings generated. Shape: (40535, 1536)

Building Annoy index with 1536 dimensions for 'deepseek_peft_triplet'...


Adding items to Annoy index: 100%|██████████| 40535/40535 [00:04<00:00, 9829.42it/s] 


✅ Annoy index built and saved to 'deepseek_peft_triplet_articles.ann'
✅ News DataFrame with deepseek_peft_triplet embeddings and Annoy IDs saved to 'deepseek_peft_triplet_articles_with_embeddings_train.parquet'
✅ Faiss index built and saved to 'deepseek_peft_triplet_articles.faiss'

--- DeepSeek Model Processing Completed ---



## **2.3**

In [None]:
# This is a helper function to load a Faiss index, assuming it was saved.
def load_faiss_index(index_path):
    print(f"Loading Faiss index from '{index_path}'...")
    index = faiss.read_index(index_path)
    print(f"✅ Faiss index loaded from '{index_path}'")
    return index

# Helper to load fine-tuned SentenceTransformer model
def load_sentence_transformer_model(model_path):
    model = SentenceTransformer(model_path)
    if torch.cuda.is_available():
        model.to("cuda")
    return model

# Helper to load fine-tuned DeepSeek model for embedding generation
def load_deepseek_model_for_embedding(model_path):
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    # Using BitsAndBytesConfig explicitly as recommended
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True, # Changed from load_in_8bit=True
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        quantization_config=quantization_config, # Pass the config object
        device_map="auto",
        trust_remote_code=True
    )
    if torch.cuda.is_available():
        model.to("cuda")
    model.eval()
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = tokenizer.eos_token_id
    return model, tokenizer

In [None]:
# Updated generate_hard_triplets
def generate_hard_triplets(
    faiss_index,
    embedding_model, # The fine-tuned S-Transformer or DeepSeek model instance
    tokenizer, # Only for DeepSeek
    news_texts_dict, # Dictionary mapping news_id to text (should be derived from news_df_train)
    behaviors_df_train, # IMPORTANT: Now explicitly taking the training behaviors_df
    num_hard_negatives_per_anchor=1,
    max_hard_triplets=5000 # Limit the number of hard triplets
):
    print(f"\nGenerating hard triplets (up to {max_hard_triplets})...")
    hard_triplets = []
    processed_users = set()

    # Create a mapping from news_id to its index in the embedding array
    # This must be based on the news_df_train used to build the Faiss index
    # Assuming news_df_train has news_id and that its current index directly maps to embedding array
    news_id_to_idx = {news_id: idx for idx, news_id in enumerate(news_df_train['news_id'].tolist())}
    idx_to_news_id = {idx: news_id for news_id, idx in news_id_to_idx.items()}


    for _, row in tqdm(behaviors_df_train.iterrows(), total=len(behaviors_df_train), desc="Mining Hard Negatives"): # Use behaviors_df_train
        user_id = row["user_id"]
        if user_id in processed_users:
            continue
        processed_users.add(user_id)

        if pd.isna(row["impressions"]): continue
        impressions = row["impressions"].split()
        pos_news_ids = [i.split('-')[0] for i in impressions if i.endswith('1')]\
        neg_news_ids = [i.split('-')[0] for i in impressions if i.endswith('0')]

        if len(pos_news_ids) < 1 or not neg_news_ids: continue

        # Use only articles that exist in our news_texts_dict (from news_df_train)
        valid_pos_news_ids = [nid for nid in pos_news_ids if nid in news_id_to_idx and news_texts_dict.get(nid)]
        valid_neg_news_ids = [nid for nid in neg_news_ids if nid in news_id_to_idx and news_texts_dict.get(nid)]

        if len(valid_pos_news_ids) < 1 or not valid_neg_news_ids: continue

        for anchor_id in valid_pos_news_ids:
            # Get anchor text and its index
            anchor_text = news_texts_dict.get(anchor_id)
            anchor_idx = news_id_to_idx.get(anchor_id)
            if not anchor_text or anchor_idx is None: continue

            # Generate embedding for the anchor
            if isinstance(embedding_model, SentenceTransformer):
                anchor_embedding = embedding_model.encode([anchor_text], convert_to_numpy=True)
            elif isinstance(embedding_model, AutoModelForCausalLM): # For DeepSeek
                inputs = tokenizer(anchor_text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(embedding_model.device)
                with torch.no_grad():
                    outputs = embedding_model(**inputs, output_hidden_states=True)
                # Corrected typo here: outputs.hidden_hidden_states -> outputs.hidden_states
                last_hidden_states = outputs.hidden_states[-1]
                attention_mask = inputs['attention_mask'].unsqueeze(-1).expand(last_hidden_states.size()).float()
                masked_embeddings = last_hidden_states * attention_mask
                sum_embeddings = torch.sum(masked_embeddings, 1)
                sum_mask = torch.sum(attention_mask, 1)
                anchor_embedding = (sum_embeddings / torch.clamp(sum_mask, min=1e-9)).cpu().numpy()
            else:
                continue

            # Normalize anchor embedding for IP search
            faiss.normalize_L2(anchor_embedding)

            # Search for nearest neighbors (both positive and negative) in the index
            D, I = faiss_index.search(anchor_embedding, k=min(faiss_index.ntotal, 200))

            # Filter for hard negatives: these are recommended but user didn't click
            hard_neg_candidates = []
            for j, neighbor_idx in enumerate(I[0]):
                neighbor_id = idx_to_news_id.get(neighbor_idx)
                if neighbor_id and neighbor_id in valid_neg_news_ids:
                    hard_neg_candidates.append(news_texts_dict.get(neighbor_id))

            # Create triplets with hard negatives
            if hard_neg_candidates:
                other_pos_ids = [nid for nid in valid_pos_news_ids if nid != anchor_id]
                if not other_pos_ids: continue

                positive_id = random.choice(other_pos_ids)
                positive_text = news_texts_dict.get(positive_id)
                if not positive_text: continue

                selected_hard_negatives = random.sample(hard_neg_candidates, min(len(hard_neg_candidates), num_hard_negatives_per_anchor))

                for hard_neg_text in selected_hard_negatives:
                    hard_triplets.append(InputExample(texts=[anchor_text, positive_text, hard_neg_text]))
                    if len(hard_triplets) >= max_hard_triplets:
                        print(f"✅ Reached maximum hard triplets: {max_hard_triplets}")
                        return hard_triplets

    print(f"✅ Generated {len(hard_triplets)} hard triplets.")
    return hard_triplets


In [None]:
# Updated train_second_stage_triplet_model
def train_second_stage_triplet_model(model_instance, hard_triplets, model_output_path_suffix,
                                     news_df_val, behaviors_df_val, epochs=1, warmup_steps=50, eval_k=10):
    """
    Continues fine-tuning a SentenceTransformer model using hard triplets.
    Includes logging of train/val metrics and epoch times.
    """
    model_output = f"{model_instance.cache_folder.split('/')[-1]}_{model_output_path_suffix}"
    if torch.cuda.is_available():
        model_instance.to("cuda")
        print("✅ Model moved to GPU for second stage training.")
    else:
        print("⚠️ CUDA not available. Second stage training on CPU.")

    train_loader = DataLoader(hard_triplets, shuffle=True, batch_size=8)
    loss_fn = losses.TripletLoss(model=model_instance)
    optimizer = torch.optim.AdamW(model_instance.parameters(), lr=1e-5) # Smaller LR for second stage

    train_losses = []
    val_precisions = []
    val_recalls = []
    val_ndcgs = []
    val_maps = []
    epoch_times = []

    true_relevant_items_val = get_true_relevant_items_val(behaviors_df_val)
    user_history_val = get_user_history_val(behaviors_df_val)

    print(f"\nStarting second-stage fine-tuning for {model_output}...")
    global_step = 0
    for epoch in range(epochs):
        start_time = time.time()
        model_instance.train()
        epoch_train_losses = []

        for step, batch in tqdm(enumerate(train_loader), total=len(train_loader), desc=f"Second Stage Epoch {epoch+1} Training"):
            features = model_instance.tokenize(batch.texts)
            reps = model_instance(features)
            loss = loss_fn(reps)

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            epoch_train_losses.append(loss.item())
            global_step += 1

            if global_step < warmup_steps:
                for pg in optimizer.param_groups:
                    pg['lr'] = (global_step / warmup_steps) * 1e-5

        avg_train_loss = np.mean(epoch_train_losses)
        train_losses.append(avg_train_loss)
        print(f"Second Stage Epoch {epoch+1} - Avg Training Loss: {avg_train_loss:.4f}")

        # --- Validation Evaluation ---
        model_instance.eval()
        with torch.no_grad():
            val_news_texts_for_embedding = news_df_val['text'].tolist()
            current_val_embeddings = model_instance.encode(val_news_texts_for_embedding, show_progress_bar=False, convert_to_numpy=True)
            val_faiss_index = faiss.IndexFlatIP(current_val_embeddings.shape[1])
            faiss.normalize_L2(current_val_embeddings)
            val_faiss_index.add(current_val_embeddings)
            
            val_recommended_items = simulate_recommendations_for_validation(
                user_history_val,
                model_instance,
                None, # Tokenizer not needed for SentenceTransformer
                val_faiss_index,
                news_df_val,
                k=eval_k
            )
            metrics = calculate_precision_recall_ndcg_map(val_recommended_items, true_relevant_items_val, k=eval_k)
            val_precisions.append(metrics[f"Precision@{eval_k}"])
            val_recalls.append(metrics[f"Recall@{eval_k}"])
            val_ndcgs.append(metrics[f"NDCG@{eval_k}"])
            val_maps.append(metrics[f"MAP@{eval_k}"])
            print(f"Second Stage Epoch {epoch+1} - Val Precision@{eval_k}: {metrics[f'Precision@{eval_k}']:.4f}, "
                  f"Val Recall@{eval_k}: {metrics[f'Recall@{eval_k}']:.4f}, "
                  f"Val NDCG@{eval_k}: {metrics[f'NDCG@{eval_k}']:.4f}, "
                  f"Val MAP@{eval_k}: {metrics[f'MAP@{eval_k}']:.4f}")

        end_time_epoch = time.time()
        epoch_times.append(end_time_epoch - start_time)
        print(f"Second Stage Epoch {epoch+1} Time: {epoch_times[-1]:.2f} seconds")

    final_output_path = model_output # Assume model_output is the full path
    model_instance.save(final_output_path)
    print(f"✅ Second-stage fine-tuned model saved to '{final_output_path}'")
    return {
        "train_losses": train_losses,
        "val_precisions": val_precisions,
        "val_recalls": val_recalls,
        "val_ndcgs": val_ndcgs,
        "val_maps": val_maps,
        "epoch_times": epoch_times
    }

In [None]:
# Updated train_second_stage_deepseek
class CustomValidationCallback(TrainerCallback):
    def __init__(self, news_df_val, behaviors_df_val, eval_k=10):
        self.news_df_val = news_df_val
        self.behaviors_df_val = behaviors_df_val
        self.eval_k = eval_k
        self.val_precisions = []
        self.val_recalls = []
        self.val_ndcgs = []
        self.val_maps = []
        self.epoch_times = [] # To store the time taken for the validation step
        self.train_losses_per_epoch = []
        self.true_relevant_items_val = get_true_relevant_items_val(behaviors_df_val)
        self.user_history_val = get_user_history_val(behaviors_df_val)
        self.news_id_to_idx_val = {news_id: idx for idx, news_id in enumerate(self.news_df_val['news_id'].tolist())}
        self.idx_to_news_id_val = {idx: news_id for news_id, idx in self.news_id_to_idx_val.items()}


    def on_epoch_end(self, args, state, control, **kwargs):
        # Calculate training loss for the epoch
        if state.log_history:
            epoch_logs = [log for log in state.log_history if 'epoch' in log and round(log['epoch']) == round(state.epoch)]
            if epoch_logs:
                epoch_loss_values = [log['loss'] for log in epoch_logs if 'loss' in log]
                if epoch_loss_values:
                    self.train_losses_per_epoch.append(np.mean(epoch_loss_values))

        start_time_val_epoch = time.time() # Start timing for this epoch's validation

        self.trainer.model.eval() # Set model to eval mode

        # Dynamically generate embeddings for news_df_val using the current model and tokenizer
        val_news_texts_for_embedding_current = self.news_df_val['text'].tolist()
        
        current_val_embeddings_list = []
        batch_size_val_inference = 8 # Batch size for validation inference
        
        # Access the model and tokenizer from the trainer
        current_deepseek_model = self.trainer.model
        current_deepseek_tokenizer = self.trainer.tokenizer

        for i in tqdm(range(0, len(val_news_texts_for_embedding_current), batch_size_val_inference), desc="Generating Val Embeddings (DeepSeek)"):
            batch_texts = val_news_texts_for_embedding_current[i:i + batch_size_val_inference]
            inputs = current_deepseek_tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=512).to(current_deepseek_model.device)
            with torch.no_grad():
                outputs = current_deepseek_model(**inputs, output_hidden_states=True)
            # Corrected typo: outputs.hidden_hidden_states -> outputs.hidden_states
            last_hidden_states = outputs.hidden_states[-1]
            attention_mask = inputs['attention_mask'].unsqueeze(-1).expand(last_hidden_states.size()).float()
            masked_embeddings = last_hidden_states * attention_mask
            sum_embeddings = torch.sum(masked_embeddings, 1)
            sum_mask = torch.sum(attention_mask, 1)
            mean_pooled_embeddings = sum_embeddings / torch.clamp(sum_mask, min=1e-9)
            current_val_embeddings_list.extend(mean_pooled_embeddings.cpu().numpy())
        current_val_embeddings_np = np.array(current_val_embeddings_list)


        val_faiss_index = faiss.IndexFlatIP(current_val_embeddings_np.shape[1])
        faiss.normalize_L2(current_val_embeddings_np)
        val_faiss_index.add(current_val_embeddings_np)
        
        val_recommended_items = simulate_recommendations_for_validation(
            self.user_history_val,
            current_deepseek_model, # Pass the current model instance
            current_deepseek_tokenizer, # Pass the tokenizer for DeepSeek
            val_faiss_index,
            self.news_df_val,
            k=self.eval_k
        )
        
        metrics = calculate_precision_recall_ndcg_map(
            val_recommended_items,
            self.true_relevant_items_val,
            k=self.eval_k
        )
        self.val_precisions.append(metrics[f"Precision@{self.eval_k}"])
        self.val_recalls.append(metrics[f"Recall@{self.eval_k}"])
        self.val_ndcgs.append(metrics[f"NDCG@{self.eval_k}"])
        self.val_maps.append(metrics[f"MAP@{self.eval_k}"])
        
        end_time_val_epoch = time.time()
        self.epoch_times.append(end_time_val_epoch - start_time_val_epoch)

        print(f"Epoch {int(state.epoch)} - Val Precision@{self.eval_k}: {metrics[f'Precision@{self.eval_k}']:.4f}, "
              f"Val Recall@{self.eval_k}: {metrics[f'Recall@{self.eval_k}']:.4f}, "
              f"Val NDCG@{self.eval_k}: {metrics[f'NDCG@{self.eval_k}']:.4f}, "
              f"Val MAP@{self.eval_k}: {metrics[f'MAP@{self.eval_k}']:.4f}, "
              f"Val Time: {self.epoch_times[-1]:.2f}s")
        
        control.should_save = True # Optional: force save checkpoint at epoch end

    def on_train_end(self, args, state, control, **kwargs):
        # This method is called once at the very end of training.
        # epoch_times and train_losses_per_epoch are already populated by on_epoch_end
        pass # No additional aggregation needed here if on_epoch_end is working as intended



# Updated train_second_stage_deepseek
def train_second_stage_deepseek(model_instance, tokenizer_instance, hard_triplets, model_output_path_suffix,
                                news_df_val, behaviors_df_val, epochs=1, eval_k=10):
    """
    Continues fine-tuning DeepSeek using hard triplets as prompts.
    Includes logging of train/val metrics and epoch times via a custom callback.
    """
    model_output_dir = f"./deepseek_peft_triplet_{model_output_path_suffix}"
    print(f"\nStarting second-stage fine-tuning for DeepSeek to '{model_output_dir}'...")

    def make_prompt(anchor, positive, negative):
        # Corrected \n for proper newlines
        return f"Anchor: {anchor}\nPositive: {positive}\nNegative: {negative}\nWhich is more similar to the Anchor?\nAnswer: Positive."

    triplet_prompts = []
    for example in hard_triplets:
        a, p, n = example.texts[0], example.texts[1], example.texts[2]
        triplet_prompts.append({"text": make_prompt(a, p, n)})

    dataset = Dataset.from_list(triplet_prompts)
    dataset = dataset.map(lambda x: tokenizer_instance(x["text"], truncation=True, padding="max_length", max_length=512), batched=True)
    dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

    training_args = TrainingArguments(
        output_dir=model_output_dir,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        num_train_epochs=epochs, # Use the epochs parameter
        logging_steps=10,
        save_steps=50,
        fp16=True,
        save_total_limit=1,
        report_to="none",
        disable_tqdm=False,
        load_best_model_at_end=False,
    )

    trainer = Trainer(
        model=model_instance,
        args=training_args,
        train_dataset=dataset,
        data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer_instance, mlm=False)
    )

    # Attach the custom callback for DeepSeek
    callback_instance = CustomValidationCallback(news_df_val, behaviors_df_val, eval_k=eval_k)
    trainer.add_callback(callback_instance)
    
    # Start training
    trainer.train()

    # Save final model
    model_instance.save_pretrained(model_output_dir)
    tokenizer_instance.save_pretrained(model_output_dir)
    print(f"✅ Second-stage fine-tuned DeepSeek model saved to '{model_output_dir}'")

    # Return results collected by the callback
    return {
        "train_losses": callback_instance.train_losses_per_epoch,
        "val_precisions": callback_instance.val_precisions,
        "val_recalls": callback_instance.val_recalls,
        "val_ndcgs": callback_instance.val_ndcgs,
        "val_maps": callback_instance.val_maps,
        "epoch_times": callback_instance.epoch_times # This is validation time per epoch
    }

In [None]:
# Updated MiniLM Second Stage Call
print("\n--- Starting R2.3 Step 2 for MiniLM (Hard Negative Mining) ---")

minilm_model_for_hnm = load_sentence_transformer_model("fine_tuned_minilm")
minilm_faiss_index = load_faiss_index("fine_tuned_minilm_articles.faiss")

news_dict_for_hnm = dict(zip(news_df_actual_train['news_id'], news_df_actual_train['text'])) # Use actual train news_df

minilm_hard_triplets = generate_hard_triplets(
    faiss_index=minilm_faiss_index,
    embedding_model=minilm_model_for_hnm,
    tokenizer=None,
    news_texts_dict=news_dict_for_hnm,
    behaviors_df_train=behaviors_df_actual_train, # Use actual train behaviors_df
    max_hard_triplets=5000
)

if minilm_hard_triplets:
    minilm_second_stage_results = train_second_stage_triplet_model( # Capture results
        model_instance=minilm_model_for_hnm,
        hard_triplets=minilm_hard_triplets,
        model_output_path_suffix="second_stage",
        news_df_val=news_df_val, # Pass validation data
        behaviors_df_val=behaviors_df_val, # Pass validation data
        epochs=1 # Using 1 epoch for second stage as a starting point
    )

    # Plotting second-stage results for MiniLM
    plot_training_metrics(minilm_second_stage_results, "MiniLM Second Stage Fine-tuning")

    # ... (rest of MiniLM second stage: generate embeddings and rebuild indices) ...
    second_stage_minilm_embeddings, _ = generate_article_embeddings(
        news_texts_for_embedding, # This uses news_df_train (which is news_df_actual_train + news_df_val)
        minilm_second_stage_model_path,
        "sentence_transformer"
    )
    # The `news_texts_for_embedding` here should be `news_df_actual_train['text'].tolist()`
    # if you want to generate embeddings for only the "actual_train" set for the index.
    # Otherwise, it's `news_df_train['text'].tolist()` if you want embeddings for the full initial training set.
    # For indices, usually you want embeddings for ALL articles that could potentially be recommended, so
    # news_df_train (which contains news_df_actual_train + news_df_val) is appropriate here.
    # Re-check the initial news_texts_for_embedding definition to ensure it uses news_df_train.
    # If the initial definition in `Fine Tuned LLM - Based Embeddings Generation` is `news_df_train['text'].tolist()`, it's correct.

    # Re-build Annoy and Faiss indices with the *second-stage* embeddings
    # Save these with a distinct prefix to indicate they are from the second stage
    build_annoy_index(second_stage_minilm_embeddings, "fine_tuned_minilm_second_stage")
    build_faiss_index_modular(second_stage_minilm_embeddings, "fine_tuned_minilm_second_stage")
    print("--- R2.3 Step 2 for MiniLM Completed ---")
else:
    print("No hard triplets found for MiniLM. Skipping second-stage fine-tuning.")


--- Starting R2.3 Step 2 for MiniLM (Hard Negative Mining) ---
Loading Faiss index from 'fine_tuned_minilm_articles.faiss'...
✅ Faiss index loade|d from 'fine_tuned_minilm_articles.faiss'

Generating hard triplets (up to 5000)...


Mining Hard Negatives:  58%|█████▊    | 72623/125634 [07:47<05:41, 155.39it/s]


✅ Reached maximum hard triplets: 5000
✅ Model moved to GPU for second stage training.

Starting second-stage fine-tuning for fine_tuned_minilm_second_stage...


                                                                     

Step,Training Loss
500,4.9014


✅ Second-stage fine-tuned model saved to 'fine_tuned_minilm_second_stage'

Generating embeddings from second-stage fine-tuned MiniLM model: fine_tuned_minilm_second_stage

Generating embeddings using sentence_transformer model from 'fine_tuned_minilm_second_stage'...


Batches: 100%|██████████| 1267/1267 [00:26<00:00, 47.77it/s] 


✅ Embeddings generated. Shape: (40535, 384)

Building Annoy index with 384 dimensions for 'fine_tuned_minilm_second_stage'...


Adding items to Annoy index: 100%|██████████| 40535/40535 [00:00<00:00, 41595.10it/s]


✅ Annoy index built and saved to 'fine_tuned_minilm_second_stage_articles.ann'
✅ Faiss index built and saved to 'fine_tuned_minilm_second_stage_articles.faiss'
--- R2.3 Step 2 for MiniLM Completed ---


In [None]:
# Updated BERT Second Stage Call
print("\n--- Starting R2.3 Step 2 for BERT (Hard Negative Mining) ---")

bert_model_for_hnm = load_sentence_transformer_model("fine_tuned_bert")
bert_faiss_index = load_faiss_index("fine_tuned_bert_articles.faiss")

news_dict_for_hnm = dict(zip(news_df_actual_train['news_id'], news_df_actual_train['text']))

bert_hard_triplets = generate_hard_triplets(
    faiss_index=bert_faiss_index,
    embedding_model=bert_model_for_hnm,
    tokenizer=None,
    news_texts_dict=news_dict_for_hnm,
    behaviors_df_train=behaviors_df_actual_train,
    max_hard_triplets=5000
)

if bert_hard_triplets:
    bert_second_stage_results = train_second_stage_triplet_model( # Capture results
        model_instance=bert_model_for_hnm,
        hard_triplets=bert_hard_triplets,
        model_output_path_suffix="second_stage",
        news_df_val=news_df_val,
        behaviors_df_val=behaviors_df_val,
        epochs=1
    )

    # Plotting second-stage results for BERT
    plot_training_metrics(bert_second_stage_results, "BERT Second Stage Fine-tuning")

    # ... (rest of BERT second stage: generate embeddings and rebuild indices) ...

    # After second stage, generate new embeddings from the *second-stage fine-tuned model*
    print(f"\nGenerating embeddings from second-stage fine-tuned BERT model: {bert_second_stage_model_path}")
    second_stage_bert_embeddings, _ = generate_article_embeddings(
        news_texts_for_embedding,
        bert_second_stage_model_path,
        "sentence_transformer"
    )

    # Re-build Annoy and Faiss indices with the *second-stage* embeddings
    build_annoy_index(second_stage_bert_embeddings, "fine_tuned_bert_second_stage")
    build_faiss_index_modular(second_stage_bert_embeddings, "fine_tuned_bert_second_stage")
    print("--- R2.3 Step 2 for BERT Completed ---")
else:
    print("No hard triplets found for BERT. Skipping second-stage fine-tuning.")


--- Starting R2.3 Step 2 for BERT (Hard Negative Mining) ---
Loading Faiss index from 'fine_tuned_bert_articles.faiss'...
✅ Faiss index loade|d from 'fine_tuned_bert_articles.faiss'

Generating hard triplets (up to 5000)...


Mining Hard Negatives:  45%|████▌     | 56891/125634 [10:50<13:05, 87.48it/s] 


✅ Reached maximum hard triplets: 5000
✅ Model moved to GPU for second stage training.

Starting second-stage fine-tuning for fine_tuned_minilm_second_stage...


                                                                     

Step,Training Loss
500,5.2747


✅ Second-stage fine-tuned model saved to 'fine_tuned_minilm_second_stage'

Generating embeddings from second-stage fine-tuned BERT model: fine_tuned_minilm_second_stage

Generating embeddings using sentence_transformer model from 'fine_tuned_minilm_second_stage'...


Batches: 100%|██████████| 1267/1267 [02:20<00:00,  9.01it/s]


✅ Embeddings generated. Shape: (40535, 768)

Building Annoy index with 768 dimensions for 'fine_tuned_bert_second_stage'...


Adding items to Annoy index: 100%|██████████| 40535/40535 [00:02<00:00, 18966.05it/s]


✅ Annoy index built and saved to 'fine_tuned_bert_second_stage_articles.ann'
✅ Faiss index built and saved to 'fine_tuned_bert_second_stage_articles.faiss'
--- R2.3 Step 2 for BERT Completed ---


In [None]:
# Updated DeepSeek Second Stage Call
print("\n--- Starting R2.3 Step 2 for DeepSeek (Hard Negative Mining) ---")

deepseek_model_for_hnm, deepseek_tokenizer_for_hnm = load_deepseek_model_for_embedding("./deepseek_peft_triplet")
deepseek_faiss_index = load_faiss_index("deepseek_peft_triplet_articles.faiss")

news_dict_for_hnm = dict(zip(news_df_actual_train['news_id'], news_df_actual_train['text']))

deepseek_hard_triplets_input_examples = generate_hard_triplets(
    faiss_index=deepseek_faiss_index,
    embedding_model=deepseek_model_for_hnm,
    tokenizer=deepseek_tokenizer_for_hnm,
    news_texts_dict=news_dict_for_hnm,
    behaviors_df_train=behaviors_df_actual_train,
    max_hard_triplets=2000
)

if deepseek_hard_triplets_input_examples:
    deepseek_second_stage_results = train_second_stage_deepseek( # Capture results
        model_instance=deepseek_model_for_hnm,
        tokenizer_instance=deepseek_tokenizer_for_hnm,
        hard_triplets=deepseek_hard_triplets_input_examples,
        model_output_path_suffix="second_stage",
        news_df_val=news_df_val, # Pass validation data
        behaviors_df_val=behaviors_df_val, # Pass validation data
        epochs=1 # Using 1 epoch for second stage as a starting point
    )

    # Plotting second-stage results for DeepSeek
    plot_training_metrics(deepseek_second_stage_results, "DeepSeek Second Stage Fine-tuning")

    # ... (rest of DeepSeek second stage: generate embeddings and rebuild indices) ...

    # After second stage, generate new embeddings from the *second-stage fine-tuned model*
    print(f"\nGenerating embeddings from second-stage fine-tuned DeepSeek model: {second_stage_deepseek_model_path}")
    second_stage_deepseek_embeddings, _ = generate_article_embeddings(
        news_texts_for_embedding, # This already refers to news_df_train text
        second_stage_deepseek_model_path, # Use the path to the second-stage model
        "deepseek"
    )

    # Re-build Annoy and Faiss indices with the *second-stage* embeddings
    build_annoy_index(second_stage_deepseek_embeddings, "deepseek_peft_triplet_second_stage")
    build_faiss_index_modular(second_stage_deepseek_embeddings, "deepseek_peft_triplet_second_stage")
    print("--- R2.3 Step 2 for DeepSeek Completed ---")
else:
    print("No hard triplets found for DeepSeek. Skipping second-stage fine-tuning.")


--- Starting R2.3 Step 2 for DeepSeek (Hard Negative Mining) ---


Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading Faiss index from 'deepseek_peft_triplet_articles.faiss'...
✅ Faiss index loade|d from 'deepseek_peft_triplet_articles.faiss'

Generating hard triplets (up to 2000)...


Mining Hard Negatives: 100%|██████████| 125634/125634 [00:04<00:00, 25214.62it/s]

✅ Generated 0 hard triplets.
No hard triplets found for DeepSeek. Skipping second-stage fine-tuning.





## **Add Embeddings Visualization**

In [None]:
from sklearn.manifold import TSNE # Ensure imported globally
from sklearn.decomposition import PCA # Ensure imported globally

print("\n--- Embeddings Visualization (t-SNE/PCA) ---")

# Choose one set of embeddings to visualize (e.g., from the final DeepSeek model)
# You might want to load the final DeepSeek embeddings if they were saved,
# or use the `second_stage_deepseek_embeddings` variable if it's still in scope.
# For demonstration, let's assume `second_stage_deepseek_embeddings` holds the final embeddings
# and `news_df_train` is the corresponding DataFrame.

# Ensure the embeddings are numpy arrays
if isinstance(second_stage_deepseek_embeddings, list):
    embeddings_to_visualize = np.array(second_stage_deepseek_embeddings)
else:
    embeddings_to_visualize = second_stage_deepseek_embeddings

# Subsample for faster visualization if dataset is large
n_samples = min(2000, embeddings_to_visualize.shape[0])
sample_indices = np.random.choice(embeddings_to_visualize.shape[0], n_samples, replace=False)
sampled_embeddings = embeddings_to_visualize[sample_indices]
sampled_news_df = news_df_train.iloc[sample_indices].reset_index(drop=True)

# 1. PCA for quick linear dimensionality reduction
pca = PCA(n_components=2, random_state=42)
pca_components = pca.fit_transform(sampled_embeddings)

plt.figure(figsize=(10, 8))
scatter = plt.scatter(pca_components[:, 0], pca_components[:, 1],
                      c=pd.factorize(sampled_news_df['category'])[0],
                      cmap='Spectral', s=10, alpha=0.6)
plt.colorbar(scatter, ticks=range(len(sampled_news_df['category'].unique())),
             label='Category', format=plt.FuncFormatter(lambda i, *args: sampled_news_df['category'].unique()[int(i)]))
plt.title('Article Embeddings Visualization (PCA)')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.grid(True)
plt.show()

# 2. t-SNE for non-linear dimensionality reduction (can be slow for many points)
# If you have many categories, it might be hard to distinguish in the plot
if n_samples > 50: # t-SNE works best on smaller sets
    print("\nApplying t-SNE (this might take a while)...")
    tsne = TSNE(n_components=2, random_state=42, perplexity=30)
    tsne_components = tsne.fit_transform(sampled_embeddings)

    plt.figure(figsize=(10, 8))
    scatter = plt.scatter(tsne_components[:, 0], tsne_components[:, 1],
                          c=pd.factorize(sampled_news_df['category'])[0],
                          cmap='Spectral', s=10, alpha=0.6)
    plt.colorbar(scatter, ticks=range(len(sampled_news_df['category'].unique())),
                 label='Category', format=plt.FuncFormatter(lambda i, *args: sampled_news_df['category'].unique()[int(i)]))
    plt.title('Article Embeddings Visualization (t-SNE)')
    plt.xlabel('t-SNE Component 1')
    plt.ylabel('t-SNE Component 2')
    plt.grid(True)
    plt.show()
else:
    print("Skipping t-SNE visualization due to small sample size.")

print("--- Embeddings Visualization Completed ---")

## **RAG Implementatoin**

In [11]:
def build_faiss_index(embeddings):
    faiss.normalize_L2(embeddings)
    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)
    return index


In [12]:
def retrieve_docs(query, model, index, corpus, top_k=5):
    query_embedding = model.encode([query], convert_to_numpy=True)
    faiss.normalize_L2(query_embedding)
    _, indices = index.search(query_embedding, top_k)
    return [corpus[i] for i in indices[0]]


In [13]:
def build_rag_prompt(query, docs):
    context = "\n\n".join(docs)
    return f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"


In [14]:
def generate_answer(prompt, model_path, max_tokens=150):
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True).to("cuda")
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    output = model.generate(**inputs, max_new_tokens=max_tokens, do_sample=False)
    return tokenizer.decode(output[0], skip_special_tokens=True)


In [15]:
# Build once
texts = news_df["text"].tolist()
embedding_model = SentenceTransformer("fine_tuned_mind_model", device="cuda")
embeddings = embedding_model.encode(texts, convert_to_numpy=True, show_progress_bar=True)
index = build_faiss_index(embeddings)

# Query example[
query = "What happened in the recent elections?"

# RAG-style call
docs = retrieve_docs(query, embedding_model, index, texts, top_k=5)
prompt = build_rag_prompt(query, docs)
answer = generate_answer(prompt, model_path="./deepseek_peft_triplet")
print(answer)


Batches:   2%|▏         | 33/1584 [00:46<36:06,  1.40s/it]


KeyboardInterrupt: 