**1. Dependencies**

In [None]:
from datetime import datetime
import pandas as pd
import json

from transformers import (
    AutoModel,
    AutoModelForMaskedLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    DataCollatorForWholeWordMask,
    Trainer,
    TrainingArguments,
)

import torch
from torch.nn.functional import cosine_similarity
from torch.utils.data import DataLoader


**2. Load Dataset**

In [None]:
# Mount Google Drive so that the '/content/drive/...' paths used below resolve.
# This cell only works inside Google Colab (google.colab is not available elsewhere).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Locations of the tab-separated behaviour and news files on the mounted drive.
train_users_pth = '/content/drive/MyDrive/CMSC472 FP/Data/MINDsmall_train/behaviors.tsv'
train_news_pth =  '/content/drive/MyDrive/CMSC472 FP/Data/MINDsmall_train/news.tsv'
dev_news_pth = '/content/drive/MyDrive/CMSC472 FP/Data/MINDsmall_dev/news.tsv'

# header=None: the files are read without a header row, so columns are addressed
# by integer position (0, 1, 2, ...) in the preprocessing functions below.
users = pd.read_csv(train_users_pth, delimiter='\t', header=None)
news = pd.read_csv(train_news_pth, delimiter='\t', header=None)
dev = pd.read_csv(dev_news_pth, delimiter='\t', header=None)

**Preprocess Dataset**

In [None]:
def process_users(users):
    """Turn the raw behaviours frame into one with parsed list columns.

    Column 2 is discarded (presumably the impression timestamp -- confirm
    against the dataset description). Column 3 is split on whitespace into a
    list of news ids, and column 4 is split on whitespace and then on '-' into
    ``[news_id, label]`` string pairs. Cells that are not strings (e.g. NaN)
    become empty lists.

    Args:
        users: DataFrame read with ``header=None``, i.e. integer column labels.

    Returns:
        A new DataFrame with columns ``impression_id``, ``user_id``,
        ``history`` and ``impressions``; the input frame is left untouched.
    """

    def _split_history(cell):
        return cell.split() if isinstance(cell, str) else []

    def _split_impressions(cell):
        if not isinstance(cell, str):
            return []
        return [pair.split('-') for pair in cell.split()]

    frame = users.drop(columns=[2])
    frame[3] = frame[3].apply(_split_history)
    frame[4] = frame[4].apply(_split_impressions)
    frame.columns = ["impression_id", "user_id", "history", "impressions"]
    return frame

In [None]:
def extract_entities(col):
    """Collapse each JSON-encoded entity list in ``col`` into a label string.

    Every string cell other than the literal ``'[]'`` is parsed with
    ``json.loads`` and the ``'Label'`` value of each entity is joined with
    single spaces. Non-string cells (e.g. NaN) and ``'[]'`` map to ``''``.

    Args:
        col: pandas Series of JSON strings.

    Returns:
        A Series of the same length holding the space-joined labels.
    """

    def _labels(raw):
        if not isinstance(raw, str) or raw == '[]':
            return ''
        return ' '.join([entity['Label'] for entity in json.loads(raw)])

    return col.apply(_labels)

def process_news(news):
    """Build one descriptive text field per news row.

    Column 5 is dropped up front. Columns 6 and 7 are reduced to entity labels
    via ``extract_entities``; missing values in columns 3 and 4 are replaced
    by empty strings. Everything is then concatenated into ``news_info`` in
    the fixed order category, sub-category, title labels, abstract labels,
    title, abstract.

    Args:
        news: DataFrame read with ``header=None``, i.e. integer column labels.

    Returns:
        A new DataFrame with columns ``news_id``, ``category``,
        ``sub_category`` and ``news_info``.
    """
    frame = news.drop(columns=[5])

    title_entities = extract_entities(frame[6])
    abstract_entities = extract_entities(frame[7])
    title_text = frame[3].fillna('')
    abstract_text = frame[4].fillna('')

    frame['news_info'] = (
        'Category: ' + frame[1]
        + ' SubCategory: ' + frame[2]
        + ' Label1: ' + title_entities
        + ' Label2: ' + abstract_entities
        + ' Title: ' + title_text
        + ' Abstract: ' + abstract_text
    )

    # The raw text/entity columns are now folded into news_info.
    frame = frame.drop(columns=[3, 4, 6, 7])
    frame.columns = ["news_id", "category", "sub_category", "news_info"]
    return frame

User history and impressions

In [None]:
# Parse the raw behaviours frame into list-valued history / impressions columns.
train_users = process_users(users)

Preprocess news titles and abstracts

In [None]:
# Build the combined `news_info` text column for both the train and dev news files.
train_news = process_news(news)
dev_news = process_news(dev)

In [None]:
# Plain Python lists of whitespace-stripped news_info strings; these feed the
# tokenised datasets built further down.
train_sentences = [str.strip(text) for text in train_news['news_info']]
dev_sentences = [str.strip(text) for text in dev_news['news_info']]
# Quick sanity check: id of the first dev news row.
print(dev_news['news_id'][0])

N55528


In [None]:
# Number of dev sentences (one per row of dev_news).
print(len(dev_sentences))


42416


In [None]:
# How many dev sentences are shorter than 512 *characters*.
# Note this measures raw string length, not the number of tokens.
count = sum(1 for sentence in dev_sentences if len(sentence) < 512)

print(count)

31331


**Model Configuration**

In [None]:
# Checkpoint name passed to AutoModelForMaskedLM / AutoTokenizer.from_pretrained below.
model_name = 'roberta-base'
# Per-device batch size handed to TrainingArguments.
per_device_train_batch_size = 16

save_steps = 1000  # Used for save_steps, eval_steps and logging_steps alike
num_train_epochs = 3  # Number of epochs
use_fp16 = False  # Set to True if your GPU supports FP16 operations
max_length = 512  # Max length (in tokens) passed to the tokenizer; longer inputs are truncated
do_whole_word_mask = True  # Selects DataCollatorForWholeWordMask instead of DataCollatorForLanguageModeling
mlm_prob = 0.15  # Passed as mlm_probability to the data collator (masking probability)

In [None]:
# Load the pretrained masked-LM and its tokenizer.
# Depends on `model_name` from the configuration cell: run that cell first,
# otherwise this raises NameError on a fresh kernel.
model = AutoModelForMaskedLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

NameError: name 'model_name' is not defined

**Save Directory**

In [None]:
# Directory that receives the tokenizer, the checkpoints and the final model.
# NOTE(review): the folder name says 'bert-base-uncased' while `model_name` in the
# configuration cell is 'roberta-base' -- confirm this pinned path is intentional.
output_dir = "/content/drive/MyDrive/Hybrid_Rec_Sys/bert-base-uncased-2024-11-29_05-49-59"
print("Save checkpoints to:", output_dir)

Save checkpoints to: /content/drive/MyDrive/Hybrid_Rec_Sys/bert-base-uncased-2024-11-29_05-49-59


**Training and Validation Dataset Class**

In [None]:
class TokenizedSentencesDataset:
    """Map-style dataset that tokenizes sentences on access.

    Each item is the result of calling ``tokenizer`` on one sentence with
    special tokens added, truncation to ``max_length`` and
    ``return_special_tokens_mask=True``.

    Args:
        sentences: Indexable, sized collection of strings.
        tokenizer: Callable used to encode a single sentence.
        max_length: Truncation length forwarded to the tokenizer.
        cache_tokenization: When True, each sentence is encoded at most once
            and the encoding is reused on later accesses.
    """

    def __init__(self, sentences, tokenizer, max_length, cache_tokenization=False):
        self.tokenizer = tokenizer
        # Caching replaces entries with their encodings. Do that on a private
        # shallow copy so the caller's list keeps holding the original strings.
        self.sentences = list(sentences) if cache_tokenization else sentences
        self.max_length = max_length
        self.cache_tokenization = cache_tokenization

    def _tokenize(self, text):
        """Encode one sentence with the dataset's fixed tokenizer settings."""
        return self.tokenizer(
            text,
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_length,
            return_special_tokens_mask=True,
        )

    def __getitem__(self, item):
        """Return the encoding of sentence ``item`` (cached if enabled)."""
        if not self.cache_tokenization:
            return self._tokenize(self.sentences[item])

        entry = self.sentences[item]
        if isinstance(entry, str):
            # First access: encode and remember the result in place of the text.
            entry = self._tokenize(entry)
            self.sentences[item] = entry
        return entry

    def __len__(self):
        """Number of sentences in the dataset."""
        return len(self.sentences)


In [None]:
# Training sentences are tokenised on every access; dev sentences are cached
# after their first tokenisation because they are re-used at each evaluation.
train_dataset = TokenizedSentencesDataset(train_sentences, tokenizer, max_length)
# dev_sentences = []
if len(dev_sentences) > 0:
    dev_dataset = TokenizedSentencesDataset(dev_sentences, tokenizer, max_length, cache_tokenization=True)
else:
    # No dev data: evaluation is switched off in the training arguments.
    dev_dataset = None

In [None]:
# Pick the masking collator according to the configuration flag; both are built
# with identical arguments.
# NOTE(review): whole-word masking in transformers is documented as relying on
# BERT-style '##' sub-word prefixes -- confirm it behaves as intended for the
# tokenizer loaded above.
collator_cls = DataCollatorForWholeWordMask if do_whole_word_mask else DataCollatorForLanguageModeling
data_collator = collator_cls(tokenizer=tokenizer, mlm=True, mlm_probability=mlm_prob)

**Trainer Args**

In [None]:
# One TrainingArguments object per learning rate; every other setting is shared.
# Evaluation, checkpointing and logging all happen every `save_steps` steps, and
# evaluation is disabled when there is no dev dataset.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version.
learning_rates = [7e-05, 3e-05]
training_args_list = [
    TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=num_train_epochs,
        evaluation_strategy="steps" if dev_dataset is not None else "no",
        per_device_train_batch_size=per_device_train_batch_size,
        eval_steps=save_steps,
        save_steps=save_steps,
        logging_steps=save_steps,
        save_total_limit=1,
        prediction_loss_only=True,
        fp16=use_fp16,
        learning_rate=learning_rate,
    )
    for learning_rate in learning_rates
]



In [None]:
# Build one Trainer per TrainingArguments entry.
# Note: every Trainer receives the same `model` object, so the runs are not
# independent -- a later run starts from whatever weights the earlier one left.
trainer_list = [
    Trainer(
        model=model,
        args=run_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=dev_dataset,
    )
    for run_args in training_args_list
]

Save Tokenizer

In [None]:
# Persist the tokenizer next to the model checkpoints so the directory can be
# reloaded later with AutoTokenizer.from_pretrained(output_dir).
print("Save tokenizer to:", output_dir)
tokenizer.save_pretrained(output_dir)

Save tokenizer to: /content/drive/MyDrive/Hybrid_Rec_Sys/roberta-base-2024-11-27_04-12-37


('/content/drive/MyDrive/Hybrid_Rec_Sys/roberta-base-2024-11-27_04-12-37/tokenizer_config.json',
 '/content/drive/MyDrive/Hybrid_Rec_Sys/roberta-base-2024-11-27_04-12-37/special_tokens_map.json',
 '/content/drive/MyDrive/Hybrid_Rec_Sys/roberta-base-2024-11-27_04-12-37/vocab.json',
 '/content/drive/MyDrive/Hybrid_Rec_Sys/roberta-base-2024-11-27_04-12-37/merges.txt',
 '/content/drive/MyDrive/Hybrid_Rec_Sys/roberta-base-2024-11-27_04-12-37/added_tokens.json',
 '/content/drive/MyDrive/Hybrid_Rec_Sys/roberta-base-2024-11-27_04-12-37/tokenizer.json')

**Training**

In [None]:
# Run every configured Trainer in sequence and save the model after each run.
# NOTE(review): all trainers wrap the same `model` and write to the same
# `output_dir`, so each run continues from the previous run's weights and its
# saved model overwrites the previous one -- confirm this is the intended schedule.
for trainer in trainer_list:
    trainer.train()
    print("Save model to:", output_dir)
    model.save_pretrained(output_dir)

    print("Training finished")

Step,Training Loss,Validation Loss
1000,1.068,0.925363
2000,1.0562,0.897019
3000,1.0211,0.864732
4000,0.9644,0.841201
5000,0.9453,0.828586
6000,0.9168,0.796897
7000,0.8714,0.77054
8000,0.8441,0.751579
9000,0.8235,0.736352




Save model to: /content/drive/MyDrive/Hybrid_Rec_Sys/roberta-base-2024-11-27_04-12-37
Training finished




Step,Training Loss,Validation Loss


**Calling trained model to generate embeddings**

In [None]:
# Rebinds `output_dir` to the directory of an already-trained model; the cells
# below load the tokenizer and encoder from here.
output_dir = "/content/drive/MyDrive/CMSC472 FP/trained_model/bert-base-uncased-2024-11-30_16-38-30"

In [None]:
# Reload the fine-tuned tokenizer and the bare encoder (AutoModel, no LM head).
# This rebinds the `tokenizer` and `model` names used during training.
tokenizer = AutoTokenizer.from_pretrained(output_dir)
model = AutoModel.from_pretrained(output_dir)

Some weights of BertModel were not initialized from the model checkpoint at /content/drive/MyDrive/CMSC472 FP/trained_model/bert-base-uncased-2024-11-30_16-38-30 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Define the functions
def preprocess_news_info(dev_news):
    """Keep only the rows whose ``news_info`` text is shorter than 512 characters.

    The comparison is strict, so rows with exactly 512 characters are dropped
    as well. Length is measured on the raw string, not in tokens.

    Args:
        dev_news: DataFrame with a string column ``news_info``.

    Returns:
        A filtered copy with a fresh 0..n-1 index; the input is not modified.
    """
    is_short = dev_news['news_info'].str.len() < 512
    return dev_news.loc[is_short].reset_index(drop=True)

def get_embeddings(texts, tokenizer, model, device="cuda"):
    """Encode a batch of texts and return the first-position hidden state of each.

    The texts are tokenized together (padded, truncated, PyTorch tensors), the
    encoded batch is moved to ``device`` and the model is run without gradient
    tracking.

    Args:
        texts: List of strings to embed.
        tokenizer: Tokenizer whose output supports ``.to(device)``.
        model: Model whose output exposes ``last_hidden_state``.
            Only the inputs are moved here -- the model is assumed to already
            live on ``device`` (TODO confirm at the call site).
        device: Target device for the encoded inputs.

    Returns:
        Tensor holding ``last_hidden_state[:, 0, :]`` (one row per input text).
    """
    encoded = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
    encoded = encoded.to(device)

    with torch.no_grad():
        model_out = model(**encoded)

    # Position 0 of every sequence (the CLS-style token) serves as the embedding.
    return model_out.last_hidden_state[:, 0, :]

def calculate_batch_similarity(text1, batch_texts, tokenizer, model, device="cuda"):
    """Cosine similarity between one anchor text and every text in a batch.

    Both sides are embedded with ``get_embeddings``; the single anchor
    embedding is compared against each batch embedding.

    Args:
        text1: Anchor text.
        batch_texts: List of texts to compare against the anchor.
        tokenizer: Forwarded to ``get_embeddings``.
        model: Forwarded to ``get_embeddings``.
        device: Forwarded to ``get_embeddings``.

    Returns:
        NumPy array of similarities, one per entry of ``batch_texts``.
    """
    anchor = get_embeddings([text1], tokenizer, model, device)
    candidates = get_embeddings(batch_texts, tokenizer, model, device)

    # The one-row anchor is compared against every candidate row.
    scores = torch.nn.functional.cosine_similarity(anchor, candidates)
    return scores.cpu().numpy()


In [None]:
# group_users_by_news: news id -> set of user ids that have it in their history.
# user_scores: user id -> 0; used as a scratch table by get_user_rec.
group_users_by_news = {}
user_scores = {}

for user_id, his_list in zip(train_users["user_id"], train_users["history"]):
    user_scores.setdefault(user_id, 0)

    for his in his_list:
        group_users_by_news.setdefault(his, set()).add(user_id)

In [None]:
from collections import defaultdict



def get_user_rec(user, tokenizer, model, group_users_by_news, user_scores):
    """Recommend news for ``user`` from co-reading overlap with other users.

    For every news id in ``user["history"]`` the users who also read it are
    marked in ``user_scores``. Each news item is then scored by how many of
    its readers are marked, and the five best-scoring items are kept. Scores
    are summed over all history items and the ten best totals are returned.

    Args:
        user: Mapping/row exposing ``"history"`` (iterable of news ids).
        tokenizer: Unused; kept so existing call sites keep working.
        model: Unused; kept so existing call sites keep working.
        group_users_by_news: Dict mapping news id -> set of reader user ids.
        user_scores: Dict mapping every user id -> 0. Used as scratch space
            and restored to 0 for every touched user before returning.

    Returns:
        Dict of at most ten ``news_id -> score`` entries, ordered from the
        highest to the lowest score (ties keep first-seen order).

    NOTE(review): items already present in the user's history are not
    filtered out and can appear in the result -- confirm whether that is
    wanted.
    """
    per_item_limit = 5
    overall_limit = 10

    def _co_read_scores(co_readers):
        """Score every news item by its number of readers in ``co_readers``."""
        for uid in co_readers:
            user_scores[uid] = 1
        try:
            totals = {
                news_id: sum(user_scores[uid] for uid in readers)
                for news_id, readers in group_users_by_news.items()
            }
        finally:
            # Always clear the marks so the shared scratch dict stays all-zero.
            for uid in co_readers:
                user_scores[uid] = 0

        # Keep the best-scoring items (previously the unsorted dict was sliced).
        ranked = sorted(totals.items(), key=lambda kv: -kv[1])
        return dict(ranked[:per_item_limit])

    aggregated = defaultdict(int)
    for read_id in user["history"]:
        co_readers = group_users_by_news.get(read_id)
        if co_readers is None:
            # History item nobody is indexed under: contributes nothing.
            continue
        for news_id, score in _co_read_scores(co_readers).items():
            aggregated[news_id] += score

    # Rank by accumulated score (previously sorted alphabetically by news id).
    ranked_total = sorted(aggregated.items(), key=lambda kv: -kv[1])
    return dict(ranked_total[:overall_limit])

In [None]:
# Plain-text dump of the processed train news frame (pandas abbreviates the rows).
print(train_news)

      news_id   category       sub_category  \
0      N55528  lifestyle    lifestyleroyals   
1      N19639     health         weightloss   
2      N61837       news          newsworld   
3      N53526     health             voices   
4      N38324     health            medical   
...       ...        ...                ...   
51277  N16909    weather  weathertopstories   
51278  N47585  lifestyle    lifestylefamily   
51279   N7482     sports        more_sports   
51280  N34418     sports         soccer_epl   
51281  N44276      autos        autossports   

                                               news_info  
0      Category: lifestyle SubCategory: lifestyleroya...  
1      Category: health SubCategory: weightloss Label...  
2      Category: news SubCategory: newsworld Label1: ...  
3      Category: health SubCategory: voices Label1:  ...  
4      Category: health SubCategory: medical Label1: ...  
...                                                  ...  
51277  Category: weath

In [None]:
# Recommend for the first user row and print the text of each recommended item.
sample = train_users.iloc[0]

rec = get_user_rec(sample, tokenizer, model, group_users_by_news, user_scores)

for news_id in rec.keys():
    title = train_news.loc[train_news['news_id'] == news_id, 'news_info'].values

    # Test the match count explicitly: the truth value of a NumPy array raises
    # for multi-element arrays and is deprecated (an error on recent NumPy) for
    # empty ones.
    if len(title) > 0:
        print(f"News ID: {news_id}, Title: {title[0]}")
    else:
        print(f"News ID: {news_id} not found in train_news")


News ID: N18445, Title: Category: sports SubCategory: football_ncaa Label1:  Label2:  Title: Michigan sends breakup tweet to Notre Dame as series goes on hold Abstract: Parting is such sweet sorrow, say the Wolverines.
News ID: N34694, Title: Category: tv SubCategory: tvnews Label1: Rosie O'Donnell Label2: Rosie O'Donnell Title: Rosie O'Donnell: Barbara Walters Isn't 'Up to Speaking to People' Right Now Abstract: Rosie O'Donnell: Barbara Walters Isn't 'Up to Speaking to People' Right Now
News ID: N42782, Title: Category: sports SubCategory: baseball_mlb Label1: New York Yankees Houston Astros Label2: New York Yankees Title: Three takeaways from Yankees' ALCS Game 5 victory over the Astros Abstract: The Yankees kept hope alive thanks to some impressive starting pitching and a pair of early home runs.
News ID: N45794, Title: Category: news SubCategory: newscrime Label1:  Label2: Miami International Airport American Airlines Title: Four flight attendants were arrested in Miami's airport a

In [None]:
# Print the text of every item in the sample user's reading history.
for news_id in sample['history']:
    title = train_news.loc[train_news['news_id'] == news_id, 'news_info'].values

    # Test the match count explicitly: the truth value of a NumPy array raises
    # for multi-element arrays and is deprecated (an error on recent NumPy) for
    # empty ones.
    if len(title) > 0:
        print(f"News ID: {news_id}, Title: {title[0]}")
    else:
        print(f"News ID: {news_id} not found in train_news")

News ID: N55189, Title: Category: tv SubCategory: tvnews Label1:  Label2: Pat Sajak Wheel of Fortune (American game show) Cardiff-by-the-Sea, Encinitas, California California Title: 'Wheel Of Fortune' Guest Delivers Hilarious, Off The Rails Introduction Abstract: We'd like to solve the puzzle, Pat: Blair Davis' loveless marriage? On Monday, "Wheel of Fortune" welcomed as a new contestant trucking business owner Blair Davis, who offered a biting introduction for himself. When host Pat Sajak asked the man from Cardiff, California, about his family, Davis plunged into one of the darkest personal summaries the show has likely ever heard. "I've been trapped in a loveless marriage for the last 12 years to an...
News ID: N42782, Title: Category: sports SubCategory: baseball_mlb Label1: New York Yankees Houston Astros Label2: New York Yankees Title: Three takeaways from Yankees' ALCS Game 5 victory over the Astros Abstract: The Yankees kept hope alive thanks to some impressive starting pitchin