In this notebook, we will generate item embeddings from text data that will later serve as representations of user states and actions in our reward simulator and environment.

First, we will explore the text data, preprocess it, and train a custom fastText model.

Second, we will explore pretrained Sentence-BERT embeddings.

Third, we will leverage combined title and abstract entity embeddings provided as part of the MIND dataset.

All embeddings will be saved as lookup tables for later use.

In [None]:
import os
import pandas as pd
import numpy as np
import random
import pickle
from google.colab import drive
from collections import Counter
import json

from sklearn.metrics import classification_report, log_loss, silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.preprocessing import LabelEncoder

from yellowbrick.cluster import KElbowVisualizer
from gensim.models import FastText

import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm

In [None]:
drive.mount('/content/drive')

In [None]:
project_dir = '/content/drive/MyDrive/ML/Reinforcement Learning/Final project/MIND'

In [None]:
behaviors_train_path = os.path.join(project_dir, 'MINDsmall_train/behaviors.tsv')

behaviors_train = pd.read_csv(behaviors_train_path, sep='\t', header=None, names=["impression_id", "user_id", "time", "history", "impressions"])

In [None]:
news_train_path = os.path.join(project_dir, 'MINDsmall_train/news.tsv')

news_train = pd.read_csv(news_train_path, sep='\t', header=None, names=["news_id", "category", "subcategory", "title", "abstract", "url", "title_entities", "abstract_entities"])

In [None]:
behaviors_train.head()

In [None]:
news_train.head()

In [None]:
num_articles = news_train['news_id'].nunique()
print(f"Number of unique articles: {num_articles}")

## Look at the text data

In [None]:
# Check for NaN values in the four columns that would comprise a single text field, from which embeddings would be generated

news_train[["category", "subcategory", "title", "abstract"]].isna().sum()

In [None]:
# Replace NaN values in the abstract column with empty strings

news_train["abstract"] = news_train["abstract"].fillna("")
(news_train[["abstract"]] == "").sum()

In [None]:
plt.figure(figsize=(16, 6))
sns.countplot(data=news_train, x="category", hue="category", order=news_train["category"].value_counts().index, palette="tab20")

plt.xlabel("Category", fontsize=12)
plt.ylabel("Count", fontsize=12)
plt.title("Distribution of News Categories", fontsize=14)
plt.xticks(rotation=45)
plt.grid(axis="y", linestyle="--", alpha=0.5)

plt.show()

In [None]:
category_palette = dict(zip(news_train["category"].unique(), sns.color_palette("tab20", n_colors=news_train["category"].nunique())))

plt.figure(figsize=(14, 77))
sns.countplot(
    data=news_train,
    y="subcategory",
    order=news_train["subcategory"].value_counts().index,
    hue="category",
    palette=category_palette,
    dodge=False,
    legend=False
)

plt.xlabel("Count", fontsize=12)
plt.ylabel("Subcategory", fontsize=12)
plt.title("Distribution of News Subcategories (Colored by Category)", fontsize=14)
plt.grid(axis="x", linestyle="--", alpha=0.5)

plt.show()

In [None]:
# Character counts per article; abstracts that were NaN were replaced with "" earlier, so they count as 0.
news_train['len_title'] = news_train['title'].map(len)
news_train['len_abstract'] = news_train['abstract'].map(len)

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

sns.histplot(news_train['len_title'], bins=50, kde=True, ax=axes[0])
axes[0].set_title("Title Length")
axes[0].set_xlabel("Num Char")

sns.histplot(news_train[news_train['len_abstract'] > 0]['len_abstract'], bins=50, kde=True, ax=axes[1]) # Filter out samples with no abstracts
axes[1].set_title("Abstract Length")
axes[1].set_xlabel("Num Char")

plt.tight_layout()
plt.show()

## Embedding text data

Given that our dataset contains many distinct categories and an extensive number of subcategories—both of which are highly imbalanced—we will merge the "category", "subcategory", "title", and "abstract" columns into a single text feature. Before merging, we clean up the subcategory column: `-` and `_` separators are replaced with spaces, and a subcategory that has no separator and contains its category name is replaced with an empty string.

In [None]:
def preprocess_subcategory(category, subcategory):
    """Turn a raw subcategory label into text suitable for the merged text field.

    Args:
        category: Category string of the article.
        subcategory: Subcategory string of the article.

    Returns:
        * ``subcategory`` with every ``-`` and ``_`` replaced by a space, if it
          contains either separator (no further check is applied in that case);
        * ``''`` if ``category`` occurs inside ``subcategory`` (case-insensitive);
        * ``subcategory`` unchanged otherwise.
    """
    has_separator = any(sep in subcategory for sep in '-_')
    if has_separator:
        return subcategory.translate(str.maketrans('-_', '  '))

    repeats_category = category.lower() in subcategory.lower()
    return '' if repeats_category else subcategory

In [None]:
news_train['proc_subcategory'] = news_train.apply(lambda row: preprocess_subcategory(row['category'], row['subcategory']), axis=1)

In [None]:
# Single space-separated text field: category, cleaned subcategory, title, abstract.
# A NaN in any of the four parts makes the whole row NaN (same as chaining `+`).
news_train["text"] = news_train["category"].str.cat(
    [news_train["proc_subcategory"], news_train["title"], news_train["abstract"]],
    sep=" ",
)

### Approach 1: Train a custom FastText model with embedding size = 64

In [None]:
stop_words = set(stopwords.words('english')) | set(ENGLISH_STOP_WORDS) | {"n't", "'ll", "'m", "'re", "'s", "'ve", "'d", "'t", "wo", "ca"}

In [None]:
len(stop_words)

In [None]:
def preprocess_text(text):
    """Normalise raw article text into a list of unique, order-preserving tokens.

    Steps: lowercase, collapse whitespace, drop standalone numbers, tokenize
    with NLTK, drop stop words, strip punctuation from the remaining tokens,
    and de-duplicate while keeping first-occurrence order.

    Relies on the notebook-level ``stop_words`` set.

    Args:
        text: Raw text of one article (str).

    Returns:
        List of cleaned string tokens without duplicates, in the order in
        which they first appear in ``text``.
    """
    text = text.lower()

    # Replace multiple whitespace characters with a single space.
    text = re.sub(r'\s+', ' ', text)

    # Remove standalone numbers from the text.
    text = re.sub(r'\b\d+\b', '', text)

    cleaned_tokens = []
    # Tokenize BEFORE stripping punctuation: word_tokenize splits contractions
    # into pieces such as "n't", "'s", "wo", "ca", which are exactly the extra
    # entries in `stop_words`. Stripping apostrophes first would turn "don't"
    # into "dont" and those entries could never match.
    for token in word_tokenize(text):
        if token in stop_words:
            continue

        # Remove punctuation from the token; pure-punctuation tokens become ''.
        token = re.sub(r'[^\w\s]', '', token)

        # Re-check stop words because stripping may expose one (e.g. "u.s" -> "us").
        if token and token not in stop_words:
            cleaned_tokens.append(token)

    # De-duplicate while preserving order. A set would scramble the word order
    # that the FastText context window depends on, and its iteration order
    # changes between kernels because of string hash randomisation.
    return list(dict.fromkeys(cleaned_tokens))

In [None]:
news_train['clean_text'] = news_train['text'].apply(preprocess_text)

In [None]:
# Take a look at a single example

print(f"Before processing: {news_train['text'].iloc[0]}")
print(f"After processing: {news_train['clean_text'].iloc[0]}")

In [None]:
news_train['len_tokens'] = news_train['clean_text'].apply(lambda x: len(x))

In [None]:
plt.figure(figsize=(10, 5))
sns.histplot(news_train['len_tokens'], bins=50, kde=True)
plt.xlabel("Num Tokens")
plt.title("Tokens Length after Text Cleaning")
plt.show()

In [None]:
ft_model = FastText(vector_size=64, window=3, min_count=3)
ft_model.build_vocab(corpus_iterable = news_train['clean_text'])

In [None]:
# Let's take a closer look at vocab

vocab = ft_model.wv
vocabulary_words = list(vocab.key_to_index.keys())

In [None]:
len(vocabulary_words)

In [None]:
vocabulary_words[:10]

In [None]:
# Train the model

ft_model.train(corpus_iterable=news_train['clean_text'], total_examples=len(news_train), epochs=10)

In [None]:
# Save and then load the model if you need it. But we will save the embeddings as look-up tables later on.

# ft_model.save(os.path.join(project_dir, 'fasttext_model.bin'))
# ft_model = FastText.load(os.path.join(project_dir, 'fasttext_model.bin'))

In [None]:
# Let's perform a couple of sanity checks. We expect the model to correctly compute similarity.

ft_model.wv.most_similar(positive=['politics'])

In [None]:
ft_model.wv.most_similar(positive=['football'])

In [None]:
ft_model.wv.most_similar(positive=['health'])

In [None]:
ft_model.wv.most_similar(positive=['life'])

In [None]:
ft_model.wv.most_similar(positive=['royal'])

In [None]:
ft_model.wv.most_similar(positive=['trump'])

In [None]:
# Define a function to compute the mean embedding for an article's tokens using the given model

def get_mean_embedding(tokens, model):
    """Average the word vectors of ``tokens`` into a single vector.

    Args:
        tokens: Iterable of string tokens for one article.
        model: Object exposing word vectors as ``model.wv[token]`` and their
            dimensionality as ``model.wv.vector_size``.

    Returns:
        1-D NumPy array: the element-wise mean of the token vectors, or a
        float32 zero vector of length ``model.wv.vector_size`` when ``tokens``
        is empty.
    """
    vectors = [model.wv[token] for token in tokens]

    # np.mean([]) yields a scalar NaN (plus a RuntimeWarning), which would
    # break the later np.vstack over all articles; fall back to zeros instead.
    if not vectors:
        return np.zeros(model.wv.vector_size, dtype=np.float32)

    return np.mean(vectors, axis=0)

In [None]:
news_train['fasttext_embedding'] = news_train['clean_text'].apply(lambda x: get_mean_embedding(x, ft_model))

In [None]:
news_train.head()

Let's estimate the quality of the resulting embeddings.

In [None]:
fasttext_embeddings = np.vstack(news_train['fasttext_embedding'].values)

In [None]:
model = KMeans(init = 'k-means++', n_init=10, random_state=97)
visualizer = KElbowVisualizer(model, k = (1, 20))
visualizer.fit(fasttext_embeddings)
visualizer.show()

In [None]:
n_clusters = 5

kmeans_model = KMeans(n_init=10, n_clusters=n_clusters, random_state=97)
kmeans_model.fit(fasttext_embeddings)

In [None]:
silhouette_sc = silhouette_score(fasttext_embeddings, kmeans_model.labels_)
davies_bouldin_sc = davies_bouldin_score(fasttext_embeddings, kmeans_model.labels_)
calinski_harabasz_sc = calinski_harabasz_score(fasttext_embeddings, kmeans_model.labels_)

In [None]:
print(f'Silhouette Score: {silhouette_sc}')
print(f'Davies-Bouldin Index: {davies_bouldin_sc}')
print(f'Calinski-Harabasz Index: {calinski_harabasz_sc}')

In [None]:
tsne = TSNE(n_components = 2, random_state = 88)
X_tsne = tsne.fit_transform(fasttext_embeddings)

In [None]:
df_tsne = pd.DataFrame(X_tsne, columns = ['Dim1', 'Dim2'])

plt.figure(figsize = (10, 8))
plt.scatter(df_tsne['Dim1'], df_tsne['Dim2'], c = 'grey', s = 10, alpha = 0.5)
plt.title('t-SNE visualization of fasttext embeddings')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.grid(True)
plt.show()

In [None]:
df_tsne = pd.DataFrame(X_tsne, columns = ['Dim1', 'Dim2'])
df_tsne['category'] = news_train['category'].values

label_encoder = LabelEncoder()
df_tsne['category_encoded'] = label_encoder.fit_transform(df_tsne['category'])

plt.figure(figsize = (10, 8))
plt.scatter(df_tsne['Dim1'], df_tsne['Dim2'], c=df_tsne['category_encoded'], cmap='tab20', s=10, alpha=0.5)
plt.title('t-SNE visualization of fasttext embeddings using category labels')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.grid(True)
plt.show()

We can see that the model is able to clearly distinguish some of the news categories and also identifies several well-defined clusters that could represent subcategories.

In [None]:
# Save to a pickle file

# embeddings_path = os.path.join(project_dir, 'news_train_fasttext_embeddings.pkl')

# with open(embeddings_path, 'wb') as f:
#     pickle.dump(news_train[['news_id', 'fasttext_embedding']], f)

In [None]:
# Load from pickle to check if it works
# with open(embeddings_path, 'rb') as f:
#     ft_emb = pickle.load(f)

Generate the same fastText embeddings for the news dev dataset.

In [None]:
news_dev_path = os.path.join(project_dir, 'MINDsmall_dev/news.tsv')

news_dev = pd.read_csv(news_dev_path, sep='\t', header=None, names=["news_id", "category", "subcategory", "title", "abstract", "url", "title_entities", "abstract_entities"])

In [None]:
news_dev["abstract"] = news_dev["abstract"].fillna("")
news_dev['proc_subcategory'] = news_dev.apply(lambda row: preprocess_subcategory(row['category'], row['subcategory']), axis=1)

news_dev["text"] = news_dev["category"] + " "\
+ news_dev["proc_subcategory"] + " "\
+ news_dev["title"] + " "\
+ news_dev["abstract"]

news_dev['clean_text'] = news_dev['text'].apply(preprocess_text)
news_dev['fasttext_embedding'] = news_dev['clean_text'].apply(lambda x: get_mean_embedding(x, ft_model))

In [None]:
# embeddings_path = os.path.join(project_dir, 'news_dev_fasttext_embeddings.pkl')

# with open(embeddings_path, 'wb') as f:
#     pickle.dump(news_dev[['news_id', 'fasttext_embedding']], f)

In [None]:
# Load from pickle to check if it works
# with open(embeddings_path, 'rb') as f:
#     ft_emb = pickle.load(f)

### Approach 2: Use pre-trained SentenceBERT embeddings to get rich text representations

In [None]:
# Load BERT model and tokenizer
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
model.eval()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

In [None]:
# Compute BERT embeddings for the input text
def get_bert_embedding(text, tokenizer, model):
    """Embed ``text`` with a Hugging Face encoder using masked mean pooling.

    Sentence-Transformers checkpoints such as all-MiniLM-L6-v2 are meant to be
    used with mean pooling over the token embeddings (weighted by the
    attention mask), not with the [CLS] vector alone.

    Args:
        text: Input string (or list of strings) to encode.
        tokenizer: Hugging Face tokenizer matching ``model``.
        model: Hugging Face encoder whose output exposes ``last_hidden_state``.

    Returns:
        NumPy array with the pooled embedding; for a single string this is a
        1-D vector. The output is NOT L2-normalised.
    """
    # Use the device the model's weights live on instead of a notebook global.
    target_device = next(model.parameters()).device

    with torch.no_grad():
        # Truncate to 256 tokens; pad only as far as the longest input
        # (padded positions are masked out of the average below).
        inputs = tokenizer(text, padding=True, truncation=True, max_length=256, return_tensors="pt")
        # Move tensors to the model's device
        inputs = {key: val.to(target_device) for key, val in inputs.items()}
        # Pass inputs through the model to obtain outputs
        outputs = model(**inputs)

        token_embeddings = outputs.last_hidden_state
        # Broadcast the attention mask over the hidden dimension.
        mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
        summed = (token_embeddings * mask).sum(dim=1)
        # Clamp to avoid division by zero for an all-masked row.
        token_counts = mask.sum(dim=1).clamp(min=1e-9)

        return (summed / token_counts).squeeze().cpu().numpy()

In [None]:
# One embedding per training article, in row order; NaN texts are encoded as "".
embeddings = [
    get_bert_embedding(article_text, tokenizer, model)
    for article_text in tqdm(news_train["text"].fillna(""), desc="Generating BERT embeddings")
]

In [None]:
news_train["bert_embedding"] = embeddings

# embeddings_path = os.path.join(project_dir, "news_train_bert_embeddings.pkl")

# with open(embeddings_path, "wb") as f:
#     pickle.dump(news_train[["news_id", "bert_embedding"]], f)

In [None]:
news_train.head()

In [None]:
np.max(embeddings[0]) # Normalization might be needed later.

Generate the same BERT embeddings for the news dev dataset.

In [None]:
news_dev_path = os.path.join(project_dir, 'MINDsmall_dev/news.tsv')

news_dev = pd.read_csv(news_dev_path, sep='\t', header=None, names=["news_id", "category", "subcategory", "title", "abstract", "url", "title_entities", "abstract_entities"])

In [None]:
news_dev["abstract"] = news_dev["abstract"].fillna("")
news_dev['proc_subcategory'] = news_dev.apply(lambda row: preprocess_subcategory(row['category'], row['subcategory']), axis=1)

news_dev["text"] = news_dev["category"] + " "\
+ news_dev["proc_subcategory"] + " "\
+ news_dev["title"] + " "\
+ news_dev["abstract"]

In [None]:
# One embedding per dev article, in row order; NaN texts are encoded as "".
embeddings = [
    get_bert_embedding(article_text, tokenizer, model)
    for article_text in tqdm(news_dev["text"].fillna(""), desc="Generating BERT embeddings")
]

In [None]:
news_dev["bert_embedding"] = embeddings

# embeddings_path = os.path.join(project_dir, "news_dev_bert_embeddings.pkl")

# with open(embeddings_path, "wb") as f:
#     pickle.dump(news_dev[["news_id", "bert_embedding"]], f)

### Approach 3: Use the pretrained entity embeddings from entity_embedding.vec

In [None]:
def extract_wikidata_ids(entity_list):
    """Collect the ``WikidataId`` values from a JSON-encoded list of entities.

    Args:
        entity_list: JSON string expected to decode to a list of dicts.

    Returns:
        List with the ``"WikidataId"`` value of every entity that has one, in
        input order. An empty list is returned when the input cannot be decoded
        or does not have the expected structure (``json.JSONDecodeError`` or
        ``TypeError``).
    """
    try:
        parsed = json.loads(entity_list)
        wikidata_ids = []
        for item in parsed:
            if "WikidataId" in item:
                wikidata_ids.append(item["WikidataId"])
    except (json.JSONDecodeError, TypeError):
        # Any failure discards partial results, matching an all-or-nothing parse.
        return []
    return wikidata_ids

In [None]:
news_train.isna().sum()

In [None]:
news_train["title_wikidata_ids"] = news_train["title_entities"].fillna("[]").apply(extract_wikidata_ids)
news_train["abstract_wikidata_ids"] = news_train["abstract_entities"].fillna("[]").apply(extract_wikidata_ids)

In [None]:
news_train[['title_wikidata_ids', 'abstract_wikidata_ids']].head()

In [None]:
news_train['title_wikidata_ids_len'] = news_train['title_wikidata_ids'].apply(lambda x: len(x))
news_train['abstract_wikidata_ids_len'] = news_train['abstract_wikidata_ids'].apply(lambda x: len(x))

In [None]:
plt.figure(figsize=(10, 5))
sns.histplot(news_train['title_wikidata_ids_len'], bins=50, kde=True)
plt.xlabel("Num Entities")
plt.title("Num of Title Entities")
plt.show()

In [None]:
plt.figure(figsize=(10, 5))
sns.histplot(news_train['abstract_wikidata_ids_len'], bins=50, kde=True)
plt.xlabel("Num Entities")
plt.title("Num of Abstract Entities")
plt.show()

In [None]:
def load_entity_embeddings(filepath):
    """Load a whitespace-separated embedding file into a lookup table.

    Each non-empty line is expected to hold an identifier followed by the
    components of its vector.

    Args:
        filepath: Path to the UTF-8 encoded ``.vec`` file.

    Returns:
        Dict mapping the first field of each line (the ID) to a float32 NumPy
        array built from the remaining fields.

    Raises:
        ValueError: If a vector component cannot be parsed as a float.
    """
    entity_embeddings = {}
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            # split() with no argument also discards the trailing newline and
            # any trailing separator.
            parts = line.split()
            # Skip blank lines (e.g. a trailing newline at end of file) instead
            # of failing with IndexError on parts[0].
            if not parts:
                continue
            entity_embeddings[parts[0]] = np.array(parts[1:], dtype=np.float32)
    return entity_embeddings

In [None]:
entity_emb_path = os.path.join(project_dir, 'MINDsmall_train/entity_embedding.vec')
relation_emb_path = os.path.join(project_dir, 'MINDsmall_train/relation_embedding.vec')

In [None]:
entity_embeddings = load_entity_embeddings(entity_emb_path)
relation_embeddings = load_entity_embeddings(relation_emb_path)

In [None]:
def get_entity_embedding(entity_ids, entity_embeddings, embedding_dim=100):
    """Retrieve entity embeddings and take the mean if multiple entities exist.

    Args:
        entity_ids: Iterable of entity IDs for one text field.
        entity_embeddings: Dict mapping entity ID to its embedding vector.
        embedding_dim: Length of the zero vector returned when none of the IDs
            is present in ``entity_embeddings``.

    Returns:
        The element-wise mean of the vectors of all IDs found in
        ``entity_embeddings``; IDs that are missing are ignored. If no ID is
        found, a float32 zero vector of length ``embedding_dim``.
    """
    valid_embeddings = [entity_embeddings[eid] for eid in entity_ids if eid in entity_embeddings]

    if valid_embeddings:
        return np.mean(valid_embeddings, axis=0)  # Average if multiple

    # Use float32 like the loaded vectors: a float64 fallback would silently
    # upcast the whole matrix when per-article vectors are stacked later.
    return np.zeros(embedding_dim, dtype=np.float32)

In [None]:
title_entity_vectors = []
abstract_entity_vectors = []

# Walk the two ID columns directly instead of using iterrows(), which builds a
# full Series for every row just to read two fields.
for title_ids, abstract_ids in tqdm(
    zip(news_train["title_wikidata_ids"], news_train["abstract_wikidata_ids"]),
    total=len(news_train),
    desc="Extracting entity embeddings",
):
    # Mean entity vector per field (zero vector when no known entity is present)
    title_entity_vectors.append(get_entity_embedding(title_ids, entity_embeddings))
    abstract_entity_vectors.append(get_entity_embedding(abstract_ids, entity_embeddings))

In [None]:
title_entity_vectors = np.array(title_entity_vectors)
abstract_entity_vectors = np.array(abstract_entity_vectors)

# Create a final 200-dimensional entity embedding per article
news_train["entity_embedding"] = list(np.hstack((title_entity_vectors, abstract_entity_vectors)))

In [None]:
news_train.head()

In [None]:
news_dev_path = os.path.join(project_dir, 'MINDsmall_dev/news.tsv')
news_dev = pd.read_csv(news_dev_path, sep='\t', header=None, names=["news_id", "category", "subcategory", "title", "abstract", "url", "title_entities", "abstract_entities"])

In [None]:
news_dev["title_wikidata_ids"] = news_dev["title_entities"].fillna("[]").apply(extract_wikidata_ids)
news_dev["abstract_wikidata_ids"] = news_dev["abstract_entities"].fillna("[]").apply(extract_wikidata_ids)

In [None]:
# NOTE: this rebinds `entity_embeddings` to the dev-split table; the train-split
# table loaded earlier is no longer reachable under this name.
entity_emb_path = os.path.join(project_dir, 'MINDsmall_dev/entity_embedding.vec')
entity_embeddings = load_entity_embeddings(entity_emb_path)

title_entity_vectors = []
abstract_entity_vectors = []

# Walk the two ID columns directly instead of using iterrows(), which builds a
# full Series for every row just to read two fields.
for title_ids, abstract_ids in tqdm(
    zip(news_dev["title_wikidata_ids"], news_dev["abstract_wikidata_ids"]),
    total=len(news_dev),
    desc="Extracting entity embeddings",
):
    # Mean entity vector per field (zero vector when no known entity is present)
    title_entity_vectors.append(get_entity_embedding(title_ids, entity_embeddings))
    abstract_entity_vectors.append(get_entity_embedding(abstract_ids, entity_embeddings))

title_entity_vectors = np.array(title_entity_vectors)
abstract_entity_vectors = np.array(abstract_entity_vectors)

# Final per-article embedding: title vector followed by abstract vector
# (200 dimensions with the default embedding_dim of 100).
news_dev["entity_embedding"] = list(np.hstack((title_entity_vectors, abstract_entity_vectors)))

In [None]:
news_dev.head()

In [None]:
# embeddings_path = os.path.join(project_dir, "news_train_entity_embeddings.pkl")

# with open(embeddings_path, "wb") as f:
#     pickle.dump(news_train[["news_id", "entity_embedding"]], f)

In [None]:
# embeddings_path = os.path.join(project_dir, "news_dev_entity_embeddings.pkl")

# with open(embeddings_path, "wb") as f:
#     pickle.dump(news_dev[["news_id", "entity_embedding"]], f)