In this notebook, we will train a custom neural network to generate item text embeddings using a contrastive learning framework. This approach will enable us to obtain custom item text embeddings of a specified dimension.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict
from sklearn.model_selection import train_test_split
from google.colab import drive
import os
import matplotlib.pyplot as plt

from transformers import AutoModel, AutoTokenizer
import random
import re
from tqdm import tqdm
import pickle

In [None]:
# --- Hyperparameters and configuration ---
EMB_DIM = 64  # output dimension of the custom item embeddings
BATCH_SIZE = 1024 #256
EPOCHS = 300  # upper bound; the training loop also applies early stopping
TEMPERATURE = 0.07  # NT-Xent loss temperature
LR = 1e-3 #1e-4
MODEL_NAME = "distilbert-base-uncased"  # Transformer model for encoding text

# Seed every random number generator used below (triplet sampling via `random`,
# weight initialisation and batch shuffling via torch) so that a
# Restart & Run All reproduces the same results.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

In [None]:
# Mount Google Drive (Colab only) so the project files under /content/drive are reachable
drive.mount('/content/drive')

In [None]:
# Root folder on the mounted Drive; all data paths and saved artifacts below are joined to it
project_dir = '/content/drive/MyDrive/ML/Reinforcement Learning/Final project/MIND'

In [None]:
# Column layout shared by both news.tsv files (they are read without a header row)
news_columns = ["news_id", "category", "subcategory", "title", "abstract", "url", "title_entities", "abstract_entities"]

news_train_path = os.path.join(project_dir, 'MINDsmall_train/news.tsv')
news_dev_path = os.path.join(project_dir, 'MINDsmall_dev/news.tsv')

news_train = pd.read_csv(news_train_path, sep='\t', header=None, names=news_columns)
news_dev = pd.read_csv(news_dev_path, sep='\t', header=None, names=news_columns)

In [None]:
# Load the pre-processed behaviour logs (one CSV for train, one for dev).
# Later cells read the 'user_id', 'history' and 'impressions' columns from them.
behaviors_train_path = os.path.join(project_dir, 'processed_data/merged_behaviors_train.csv')
behaviors_train = pd.read_csv(behaviors_train_path)

behaviors_dev_path = os.path.join(project_dir, 'processed_data/merged_behaviors_dev.csv')
behaviors_dev = pd.read_csv(behaviors_dev_path)

In [None]:
# Count the users that appear in both the train and the dev behaviour logs

train_users = set(behaviors_train['user_id'])
dev_users = set(behaviors_dev['user_id'])
len(train_users & dev_users)

In [None]:
# Concatenate the train and dev behaviour logs into a single dataframe,
# keep one row per user_id (the first occurrence), and
# filter out users whose history consists of fewer than 2 items.

behaviors_df = (
    pd.concat([behaviors_train, behaviors_dev], ignore_index=True, sort=False)
    .drop_duplicates(subset='user_id', ignore_index=True)
)

# A missing history (NaN, e.g. an empty field in the CSV) is treated as an empty
# history and filtered out, instead of raising on `.split()`.
history_lengths = behaviors_df['history'].fillna('').str.split().str.len()
behaviors_df = behaviors_df[history_lengths > 1].reset_index(drop=True)

In [None]:
def process_behaviors(row):
    """Move clicked impressions into the history and keep only non-clicked ones.

    ``row['impressions']`` is a whitespace-separated string of ``<item_id>-<flag>``
    tokens. Items flagged 1 are appended (space-separated) to ``row['history']``;
    items flagged 0 become the new ``row['impressions']``. The row is modified
    in place and returned.
    """
    clicked_ids = []
    skipped_ids = []

    for token in row['impressions'].split():
        parts = token.split('-')
        flag = int(parts[1])
        if flag == 1:
            clicked_ids.append(parts[0])
        elif flag == 0:
            skipped_ids.append(parts[0])

    # The separating space is added even when nothing was clicked
    row['history'] = row['history'] + " " + " ".join(clicked_ids)
    row['impressions'] = " ".join(skipped_ids)

    return row

In [None]:
# Fold clicked impressions into each user's history; keep only non-clicked impressions.
# NOTE: this cell is not idempotent. After it runs, 'impressions' holds bare item IDs
# without the "-<flag>" suffix that process_behaviors parses, so re-running it raises.
behaviors_df = behaviors_df.apply(process_behaviors, axis=1)

In [None]:
# Keep only users with at least one non-clicked impression (needed later as a negative sample)
behaviors_df = behaviors_df[behaviors_df['impressions'].apply(lambda x: len(x.split()))>0].reset_index(drop=True)

In [None]:
# Per-user lists of item IDs: positives (history plus clicked impressions) and
# negatives (non-clicked impressions). `.values` yields arrays of lists, which
# are indexed with train/validation index arrays further down.
interactions = behaviors_df['history'].apply(lambda x: x.split()).values
negative_interactions = behaviors_df['impressions'].apply(lambda x: x.split()).values

In [None]:
# Stack the train and dev news tables and keep a single row per news_id
# (the first occurrence wins)

news_df = (
    pd.concat([news_train, news_dev], ignore_index=True, sort=False)
    .drop_duplicates(subset='news_id', ignore_index=True)
)

In [None]:
def preprocess_subcategory(category, subcategory):
    """Clean a subcategory label before it is concatenated into the item text.

    - If the subcategory contains '-' or '_', those separators are replaced by
      spaces and the result is returned as is.
    - Otherwise, a subcategory that contains the category name
      (case-insensitive) is dropped (empty string); any other subcategory is
      returned unchanged.
    """
    has_separator = any(sep in subcategory for sep in ('-', '_'))
    if has_separator:
        return subcategory.replace('-', ' ').replace('_', ' ')

    repeats_category = category.lower() in subcategory.lower()
    return '' if repeats_category else subcategory

In [None]:
def preprocess_text(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing whitespace is collapsed too, but not stripped.
    """
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', text)

In [None]:
# Cleaned subcategory label for every news row
news_df['proc_subcategory'] = [
    preprocess_subcategory(category, subcategory)
    for category, subcategory in zip(news_df['category'], news_df['subcategory'])
]

In [None]:
# Item text = category, cleaned subcategory, title and abstract joined by single
# spaces; a missing abstract contributes an empty string.
news_df["text"] = (
    news_df["category"]
    + " " + news_df["proc_subcategory"]
    + " " + news_df["title"]
    + " " + news_df["abstract"].fillna('')
)

In [None]:
# Normalise whitespace in every item text
news_df["text"] = news_df["text"].map(preprocess_text)

In [None]:
# Lookup table: news_id -> item text
article_texts = dict(zip(news_df['news_id'], news_df['text']))

In [None]:
# Use the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pretrained tokenizer and encoder; the encoder is only used for inference here
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model_transformer = AutoModel.from_pretrained(MODEL_NAME).to(device)
model_transformer.eval()

# Filled by the next cell: news_id -> transformer embedding
precomputed_embeddings = {}

In [None]:
# Encode each article separately and keep the hidden state of the first token
# as its embedding.
for news_id, article_text in tqdm(article_texts.items(), total=len(article_texts)):
    encoded = tokenizer(article_text, return_tensors="pt", truncation=True, padding=True)
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        hidden_states = model_transformer(**encoded).last_hidden_state
        first_token_vector = hidden_states[:, 0, :].squeeze(0)

    # Keep the result on the CPU so the full table does not occupy GPU memory
    precomputed_embeddings[news_id] = first_token_vector.cpu()

In [None]:
# embeddings_path = os.path.join(project_dir, "precomputed_distilbert_embeddings.pkl")

# with open(embeddings_path, "wb") as f:
#     pickle.dump(precomputed_embeddings, f)

In [None]:
# embeddings_path = os.path.join(project_dir, "precomputed_distilbert_embeddings.pkl")

# with open(embeddings_path, "rb") as f:
#     precomputed_embeddings = pickle.load(f)

In [None]:
# Input dimension of the encoder, read off the first stored embedding
hidden_size = next(iter(precomputed_embeddings.values())).shape[0]

A custom dataset class samples triplets—anchor, positive, and negative examples—from user interactions. For each user, it randomly selects two items from the positive interactions (anchor and positive) and one item from the negative interactions. The triplets are drawn once, when the dataset is created, and are returned as precomputed embeddings.

In [None]:
class ContrastiveDataset(Dataset):
    """Dataset of (anchor, positive, negative) embedding triplets, one per user.

    The triplets are sampled once, at construction time: two distinct positions
    of the user's positive list give the anchor and the positive, and one entry
    of the user's negative list gives the negative.

    Args:
        interactions: iterable of per-user lists of positive item IDs
            (each list needs at least 2 entries).
        negative_interactions: iterable of per-user lists of negative item IDs
            (each list needs at least 1 entry), aligned with ``interactions``.
        precomputed_embeddings: mapping from item ID to its embedding.
        seed: optional seed for the triplet sampling. When given, a private
            ``random.Random(seed)`` is used so the same triplets are drawn on
            every run; when ``None`` (default) the module-level ``random``
            state is used, exactly as before.
    """

    def __init__(self, interactions, negative_interactions, precomputed_embeddings, seed=None):
        self.precomputed_embeddings = precomputed_embeddings
        self.samples = []

        # A private generator keeps seeded sampling independent of the global state
        rng = random.Random(seed) if seed is not None else random

        for pos_history, neg_history in zip(interactions, negative_interactions):
            anchor, positive = rng.sample(pos_history, 2)
            neg_article = rng.choice(neg_history)
            self.samples.append((anchor, positive, neg_article))

    def __len__(self):
        """Number of triplets (one per user)."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return the (anchor, positive, negative) embeddings of triplet ``idx``."""
        anchor, positive, negative = self.samples[idx]
        return (
            self.precomputed_embeddings[anchor],
            self.precomputed_embeddings[positive],
            self.precomputed_embeddings[negative]
        )

In [None]:
# 80/20 split over users, done on positions so positives and negatives stay aligned
indices = np.arange(len(interactions))
train_idx, val_idx = train_test_split(indices, random_state=42, test_size=0.2)

train_pos, train_neg = interactions[train_idx], negative_interactions[train_idx]
val_pos, val_neg = interactions[val_idx], negative_interactions[val_idx]

In [None]:
# Build the triplet datasets. Triplets are sampled inside the constructor with
# Python's `random` module, so they are fixed for the lifetime of each dataset.
train_dataset = ContrastiveDataset(train_pos, train_neg, precomputed_embeddings)
val_dataset = ContrastiveDataset(val_pos, val_neg, precomputed_embeddings)

In [None]:
# Training batches are reshuffled every epoch; validation batches keep a fixed order
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# Sanity check: print the first triplet of the first training batch, then stop
for batch_idx, (pos1, pos2, neg) in enumerate(train_loader):
    print(f"Batch {batch_idx + 1}")
    print("Positive article 1:", pos1[0])
    print("Positive article 2:", pos2[0])
    print("Negative article:", neg[0])
    break

A neural network (TextEncoder) transforms high-dimensional embeddings (e.g., from distilbert-base-uncased) into a lower-dimensional space. It uses two fully connected layers with ReLU activation and dropout to produce compact item embeddings.

In [None]:
class TextEncoder(nn.Module):
    """Two-layer MLP that projects an input embedding down to ``emb_dim``.

    Architecture: Linear(hidden_size -> hidden_size // 2) -> ReLU -> Dropout
    -> Linear(hidden_size // 2 -> emb_dim).

    Args:
        hidden_size: dimension of the input embeddings.
        emb_dim: dimension of the produced embeddings.
        dropout_prob: dropout probability applied after the ReLU.
    """

    def __init__(self, hidden_size, emb_dim=64, dropout_prob=0.5):
        super().__init__()
        bottleneck = hidden_size // 2
        self.fc1 = nn.Linear(hidden_size, bottleneck)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_prob)
        self.fc2 = nn.Linear(bottleneck, emb_dim)

    def forward(self, embedding_batch):
        """Map a batch of input embeddings to ``emb_dim``-dimensional embeddings."""
        hidden = self.dropout(self.relu(self.fc1(embedding_batch)))
        return self.fc2(hidden)

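The loss function (named `NTXentLoss` in the code) computes cosine similarities between the anchor-positive and anchor-negative pairs. It scales these similarities by a temperature parameter and applies binary cross-entropy loss (positive pairs labeled 1, negative pairs labeled 0) to encourage the model to pull similar items together while pushing dissimilar ones apart. Note that this is a pairwise simplification: the standard NT-Xent loss instead applies a softmax cross-entropy over all in-batch negatives.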

In [None]:
class NTXentLoss(nn.Module):
    """Pairwise contrastive loss on temperature-scaled cosine similarities.

    Despite the class name, the implementation is a binary cross-entropy over
    individual pairs: each anchor-positive similarity is a logit with target 1
    and each anchor-negative similarity is a logit with target 0.

    Args:
        temperature: divisor applied to the cosine similarities before the
            binary cross-entropy.
    """

    def __init__(self, temperature=TEMPERATURE):
        super().__init__()
        self.temperature = temperature
        self.cosine_similarity = nn.CosineSimilarity(dim=-1)

    def forward(self, pos1, pos2, neg):
        """Return the mean BCE loss for anchors ``pos1``, positives ``pos2`` and negatives ``neg``."""
        anchor_to_positive = self.cosine_similarity(pos1, pos2)
        anchor_to_negative = self.cosine_similarity(pos1, neg)

        # Positive pairs come first, negative pairs second; targets follow the same order
        logits = torch.cat((anchor_to_positive, anchor_to_negative), dim=0) / self.temperature
        targets = torch.cat(
            (torch.ones_like(anchor_to_positive), torch.zeros_like(anchor_to_negative)),
            dim=0,
        )

        return nn.functional.binary_cross_entropy_with_logits(logits, targets)

In [None]:
# Projection network, loss and optimizer (Adam with L2 weight decay)
model = TextEncoder(hidden_size, EMB_DIM)
loss_fn = NTXentLoss()
optimizer = optim.Adam(model.parameters(), lr=LR, weight_decay=1e-4)
# Same device selection as for the transformer above
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
loss_fn.to(device)

In [None]:
# Halve the learning rate when the monitored value (the validation loss passed to
# scheduler.step in the training loop) stops decreasing; patience is 3 epochs
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)

In [None]:
# Training with per-epoch validation, best-model checkpointing and early stopping.
best_val_loss = float('inf')
early_stopping_counter = 0  # consecutive epochs without a new best validation loss
EARLY_STOPPING_PATIENCE = 5

# Per-epoch average losses, used for the plot in the next cell
train_losses = []
val_losses = []

for epoch in range(EPOCHS):
    model.train()  # enables dropout
    total_loss = 0

    train_progress = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS} - Training", leave=False)

    for text1, text2, text_neg in train_progress:
        # text1 / text2 / text_neg are the anchor, positive and negative input embeddings

        optimizer.zero_grad()

        text1 = text1.to(device)
        text2 = text2.to(device)
        text_neg = text_neg.to(device)

        # The same encoder projects all three inputs
        emb1 = model(text1)
        emb2 = model(text2)
        emb_neg = model(text_neg)

        loss = loss_fn(emb1, emb2, emb_neg)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        train_progress.set_postfix(loss=loss.item())

    # Validation loop: dropout disabled, no gradients tracked
    model.eval()
    val_loss = 0
    with torch.no_grad():
        val_progress = tqdm(val_loader, desc=f"Epoch {epoch+1}/{EPOCHS} - Validation", leave=False)

        for text1, text2, text_neg in val_progress:

            text1 = text1.to(device)
            text2 = text2.to(device)
            text_neg = text_neg.to(device)

            emb1 = model(text1)
            emb2 = model(text2)
            emb_neg = model(text_neg)

            loss_val = loss_fn(emb1, emb2, emb_neg)
            val_loss += loss_val.item()
            val_progress.set_postfix(loss=loss_val.item())

    # Averages are taken over batches, so a smaller final batch weighs the same as a full one
    avg_train_loss = total_loss / len(train_loader)
    avg_val_loss = val_loss / len(val_loader)
    print(f"Epoch {epoch+1}/{EPOCHS}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

    train_losses.append(avg_train_loss)
    val_losses.append(avg_val_loss)

    # Check for improvement on validation loss and save the best model
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        early_stopping_counter = 0
        torch.save(model.state_dict(), os.path.join(project_dir, 'text_encoder_distilbest_model.pt'))
        print("New best model saved!")
    else:
        early_stopping_counter += 1
        if early_stopping_counter >= EARLY_STOPPING_PATIENCE:
            print(f"Early stopping triggered at epoch {epoch+1}")
            break

    # Let the scheduler see this epoch's validation loss (skipped on the early-stop epoch)
    scheduler.step(avg_val_loss)

In [None]:
# Plot the per-epoch training and validation losses on one set of axes
epochs = list(range(1, len(train_losses) + 1))

fig, ax = plt.subplots(figsize=(10, 6))

loss_curves = (
    (train_losses, 'blue', 'Train Loss'),
    (val_losses, 'red', 'Val Loss'),
)
for values, colour, curve_label in loss_curves:
    ax.plot(epochs, values, marker='o', linestyle='-', color=colour, label=curve_label)

ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Train and Val Losses Over Epochs')

ax.legend()
ax.grid(True)

fig.tight_layout()
plt.show()

In [None]:
# Rebuild the encoder with the same dimensions that were used for training
# (hidden_size / EMB_DIM) instead of hard-coded numbers, so the checkpoint still
# loads when MODEL_NAME or EMB_DIM is changed in the configuration.
best_model = TextEncoder(hidden_size, EMB_DIM)

# map_location lets a checkpoint saved on a GPU be loaded on a CPU-only runtime
best_state = torch.load(
    os.path.join(project_dir, 'text_encoder_distilbest_model.pt'),
    map_location=device,
)
best_model.load_state_dict(best_state)
best_model.to(device)
best_model.eval()  # disables dropout for inference

# news_id -> custom embedding as a NumPy array of length EMB_DIM
article_embeddings = {}

with torch.no_grad():
    for news_id, base_embedding in precomputed_embeddings.items():
        # Add a batch dimension for the encoder, then remove it again
        projected = best_model(base_embedding.unsqueeze(0).to(device))
        article_embeddings[news_id] = projected.squeeze(0).cpu().numpy()

In [None]:
# Value range over all components of all custom embeddings
all_values = np.concatenate(list(article_embeddings.values()))
print(f"Min value: {all_values.min()}")
print(f"Max value: {all_values.max()}")

# Might need normalization later

In [None]:
# embeddings_path = os.path.join(project_dir, "all_news_custom_embeddings.pkl")

# with open(embeddings_path, "wb") as f:
#     pickle.dump(article_embeddings, f)

In [None]:
embeddings_path = os.path.join(project_dir, "all_news_custom_embeddings.pkl")

# Only fall back to the pickled copy when the embeddings were not computed in
# this session. Loading unconditionally would silently replace freshly computed
# embeddings with a possibly stale file, and would fail on a fresh run in which
# the file has not been written yet.
# NOTE: pickle.load can execute arbitrary code - only load files you created yourself.
if 'article_embeddings' not in globals():
    with open(embeddings_path, "rb") as f:
        article_embeddings = pickle.load(f)

Now split the custom embeddings into train and dev and save them for later use.

In [None]:
# News IDs present in the train and dev news files (an ID may appear in both)
train_ids = set(news_train["news_id"])
dev_ids = set(news_dev["news_id"])

In [None]:
# Route every custom embedding to the train and/or dev dictionary, depending on
# which news file(s) its ID appears in. The dev split is stored under the name
# `test_custom_embeddings`.
train_custom_embeddings = {}
test_custom_embeddings = {}

for news_id, emb in article_embeddings.items():
    if news_id in train_ids:
        train_custom_embeddings[news_id] = emb
    if news_id in dev_ids:
        test_custom_embeddings[news_id] = emb

In [None]:
# train_embeddings_path = os.path.join(project_dir, 'embeddings/news_train_custom_embeddings.pkl')
# dev_embeddings_path = os.path.join(project_dir, 'embeddings/news_dev_custom_embeddings.pkl')

# with open(train_embeddings_path, "wb") as f_train:
#     pickle.dump(train_custom_embeddings, f_train)

# with open(dev_embeddings_path, "wb") as f_test:
#     pickle.dump(test_custom_embeddings, f_test)