<a href="https://colab.research.google.com/github/joelpolizzi/DSC210-group_proj/blob/main/Transformer_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Personalized Recommendations with Transformers

## Installing and importing packages

In [1]:
!python3 -m pip install --upgrade pip



In [2]:
!python3 -m pip install \
datasets \
implicit \
scikit-surprise



In [6]:
import gzip
import random
import scipy
import pandas as pd
# import tensorflow as tf

from collections import defaultdict
from datasets import load_dataset
from implicit import bpr
from surprise import SVD, Reader, Dataset
from surprise.model_selection import train_test_split

import time
import math
import os
from tempfile import TemporaryDirectory
from typing import Tuple

import torch
from torch import nn, Tensor
from torch.nn import TransformerEncoder, TransformerEncoderLayer
# from torch.utils.data import dataset
# from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence

from collections import Counter

from zipfile import ZipFile
from urllib.request import urlretrieve

import numpy as np

## Loading and processing data

In [75]:
dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_Musical_Instruments", trust_remote_code=True)
print(dataset["full"][0])
dataset_items = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_meta_Musical_Instruments", trust_remote_code=True)
print(dataset_items["full"][0])

#Print our splits
print(dataset.keys())
print(len(dataset["full"]))
print(dataset_items.keys())
print(len(dataset_items["full"]))

{'rating': 5.0, 'title': 'Five Stars', 'text': 'Great headphones, comfortable and sound is good. No complaints. I would buy again.', 'images': [], 'asin': 'B003LPTAYI', 'parent_asin': 'B003LPTAYI', 'user_id': 'AGKASBHYZPGTEPO6LWZPVJWB2BVA', 'timestamp': 1452650586000, 'helpful_vote': 0, 'verified_purchase': True}
{'main_category': 'Musical Instruments', 'title': 'Pearl Export Lacquer EXL725S/C249 5-Piece New Fusion Drum Set with Hardware, Honey Amber', 'average_rating': 4.2, 'rating_number': 22, 'features': ['Item may ship in more than one box and may arrive separately', '(22x18, 10x7, 12x8, 16x16, 14x5.5)', 'P930 Demonator Pedal', '830 Hardware Pack', 'Matching snare, REMO snare batter side head'], 'description': ["Introducing the best selling drum set of all time... Export Series returns and this time with a lacquer finish. EXL Export Lacquer Series incorporates Pearl's S.S.T. Superior Shell Technology, Opti-Loc tom mounts, all-new 830 Series Hardware with a P-930 Pedal, and a choice

In [76]:
from surprise import Dataset, Reader
df = pd.DataFrame(dataset['full'][:len(dataset['full']) // 10]).sample(frac=0.5)
print(df.head())

print(df.dtypes)
reader = Reader(rating_scale=(1, 5))

surprise_data = Dataset.load_from_df(df[['title', 'text', 'rating']], reader)

        rating                  title  \
101634     5.0           Good product   
22914      5.0            Music items   
106836     3.0  not bad, not great...   
254876     4.0  Quick, Easy & Works !   
170442     5.0                   Good   

                                                     text  \
101634  Very nice and smooth switch, it has some weigh...   
22914                                  My son loves it!!!   
106836  My problem with this is mostly physical...<br ...   
254876  [[VIDEOID:79d38c2c24de7d02b1a3a625b9dee531]] F...   
170442                          Works good with Bluetooth   

                                                   images        asin  \
101634                                                 []  B07K67HRDN   
22914                                                  []  B07GV76CVV   
106836                                                 []  B00FQRW550   
254876  [{'attachment_type': 'IMAGE', 'large_image_url...  B0BKL45WFZ   
170442               

In [77]:
df["user_id"] = df["user_id"].apply(lambda x: f"user_{x}")
df["item_id"] = df["asin"].apply(lambda x: f"item_{x}")

In [78]:
# Genarting a list of unique item ids
item_ids = df.item_id.unique()

# Counter is used to feed items to item_vocab
item_counter = Counter(item_ids)

# Generating vocabulary for items
# Start by creating a mapping for items (assign indices manually)
special_token = '<unk>'
item_vocab_stoi = {special_token: 0}  # Add special token at index 0
item_vocab_stoi.update({item: idx for idx, item in enumerate(item_ids, start=1)})

# Item to title mapping dictionary
item_title_dict = dict(zip(df.item_id, df.title))

# Similarly generating a vocabulary for user ids
user_ids = df['user_id'].unique()
user_counter = Counter(user_ids)

# Generate vocabulary for users
user_vocab_stoi = {special_token: 0}  # Add special token at index 0
user_vocab_stoi.update({user: idx for idx, user in enumerate(user_ids, start=1)})

# Reverse mapping (index to user) if needed
user_vocab_itos = {idx: user for user, idx in user_vocab_stoi.items()}

In [79]:
# Group ratings by user_id in order of increasing timestamp.
ratings_group = df.sort_values(by=["timestamp"]).groupby("user_id")

In [80]:
ratings_data = pd.DataFrame(
    data={
        "user_id": list(ratings_group.groups.keys()),
        "item_id": list(ratings_group.item_id.apply(list)),
        "timestamps": list(ratings_group.timestamp.apply(list)),
    }
)

In [81]:
# Sequence length, min history count and window slide size
sequence_length = 4
min_history = 1
step_size = 2

# Creating sequences from lists with sliding window
def create_sequences(values, window_size, step_size, min_history):
  sequences = []
  start_index = 0
  while len(values[start_index:]) > min_history:
    seq = values[start_index : start_index + window_size]
    sequences.append(seq)
    start_index += step_size
  return sequences

ratings_data.item_id = ratings_data.item_id.apply(
    lambda ids: create_sequences(ids, sequence_length, step_size, min_history)
)

del ratings_data["timestamps"]

# Sub-sequences are exploded.
# Since there might be more than one sequence for each user.
ratings_data_transformed = ratings_data[["user_id", "item_id"]].explode(
    "item_id", ignore_index=True
)

# Remove rows where 'item_id' contains NaN or empty sequences
ratings_data_transformed = ratings_data_transformed.dropna(subset=['item_id'])
ratings_data_transformed = ratings_data_transformed[ratings_data_transformed['item_id'].apply(lambda x: isinstance(x, list) and len(x) > 0)]

ratings_data_transformed.rename(
    columns={"item_id": "sequence_item_id"},
    inplace=True,
)



In [83]:
# Random indexing
random_selection = np.random.rand(len(ratings_data_transformed.index)) <= 0.85

# Split train data
df_train_data = ratings_data_transformed[random_selection]
train_data_raw = df_train_data[["user_id", "sequence_item_id"]].values

# Split test data
df_test_data = ratings_data_transformed[~random_selection]
test_data_raw = df_test_data[["user_id", "sequence_item_id"]].values

In [84]:
# Pytorch Dataset for user interactions
class ItemSeqDataset(Dataset):
    # Initialize dataset
    def __init__(self, data, item_vocab_stoi, user_vocab_stoi):
        self.data = data
        self.item_vocab_stoi = item_vocab_stoi
        self.user_vocab_stoi = user_vocab_stoi


    def __len__(self):
        return len(self.data)

    # Fetch data from the dataset
    def __getitem__(self, idx):
        user, item_sequence = self.data[idx]
        # Directly index into the vocabularies
        item_data = [self.item_vocab_stoi[item] for item in item_sequence]
        user_data = self.user_vocab_stoi[user]
        return torch.tensor(item_data), torch.tensor(user_data)


# Collate function and padding
def collate_batch(batch):
    item_list = [item[0] for item in batch]
    user_list = [item[1] for item in batch]
    return pad_sequence(item_list, padding_value=item_vocab_stoi['<unk>'], batch_first=True), torch.stack(user_list)


BATCH_SIZE = 256
# Create instances of your Dataset for each set
train_dataset = ItemSeqDataset(train_data_raw, item_vocab_stoi, user_vocab_stoi)
val_dataset = ItemSeqDataset(test_data_raw, item_vocab_stoi, user_vocab_stoi)
# Create DataLoaders
train_iter = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                        shuffle=True, collate_fn=collate_batch)
val_iter = DataLoader(val_dataset, batch_size=BATCH_SIZE,
                      shuffle=False, collate_fn=collate_batch)

## Model definition

In [85]:
class TransformerModel(nn.Module):
    def __init__(self, ntoken: int, nuser: int, d_model: int, nhead: int, d_hid: int,
                 nlayers: int, dropout: float = 0.5):
        super().__init__()
        self.model_type = 'Transformer'
        # positional encoder
        self.pos_encoder = PositionalEncoding(d_model, dropout)

        # Multihead attention mechanism.
        encoder_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)

        # Embedding layers
        self.movie_embedding = nn.Embedding(ntoken, d_model)
        self.user_embedding = nn.Embedding(nuser, d_model)

        # Defining the size of the input to the model.
        self.d_model = d_model

        # Linear layer to map the output tomovie vocabulary.
        self.linear = nn.Linear(2*d_model, ntoken)

        self.init_weights()

    def init_weights(self) -> None:
        # Initializing the weights of the embedding and linear layers.
        initrange = 0.1
        self.movie_embedding.weight.data.uniform_(-initrange, initrange)
        self.user_embedding.weight.data.uniform_(-initrange, initrange)
        self.linear.bias.data.zero_()
        self.linear.weight.data.uniform_(-initrange, initrange)

    def forward(self, src: Tensor, user: Tensor, src_mask: Tensor = None) -> Tensor:
        # Embedding movie ids and userid
        movie_embed = self.movie_embedding(src) * math.sqrt(self.d_model)
        user_embed = self.user_embedding(user) * math.sqrt(self.d_model)

        # positional encoding
        movie_embed = self.pos_encoder(movie_embed)

        # generating output with final layers
        output = self.transformer_encoder(movie_embed, src_mask)

        # Expand user_embed tensor along the sequence length dimension
        user_embed = user_embed.expand(-1, output.size(1), -1)

        # Concatenate user embeddings with transformer output
        output = torch.cat((output, user_embed), dim=-1)

        output = self.linear(output)
        return output

In [86]:
ntokens = len(item_vocab_stoi)  # size of vocabulary
nusers = len(user_vocab_stoi)
emsize = 128  # embedding dimension
d_hid = 128  # dimension of the feedforward network model
nlayers = 2  # number of ``nn.TransformerEncoderLayer``
nhead = 2  # number of heads in ``nn.MultiheadAttention``
dropout = 0.2  # dropout probability

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = TransformerModel(ntokens, nusers, emsize, nhead, d_hid, nlayers, dropout).to(device)

criterion = nn.CrossEntropyLoss()
lr = 1.0  # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)



In [87]:
def train(model: nn.Module, train_iter, epoch) -> None:
    # Switch to training mode
    model.train()
    total_loss = 0.
    log_interval = 200
    start_time = time.time()

    for i, (item_data, user_data) in enumerate(train_iter):
        # Load movie sequence and user id
        item_data, user_data = item_data.to(device), user_data.to(device)
        user_data = user_data.reshape(-1, 1)

        # Split movie sequence to inputs and targets
        inputs, targets = item_data[:, :-1], item_data[:, 1:]
        targets_flat = targets.reshape(-1)

        # Predict movies
        output = model(inputs, user_data)
        output_flat = output.reshape(-1, ntokens)

        # Backpropogation process
        loss = criterion(output_flat, targets_flat)
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        # Results
        if i % log_interval == 0 and i > 0:
            lr = scheduler.get_last_lr()[0]
            ms_per_batch = (time.time() - start_time) * 1000 / log_interval
            cur_loss = total_loss / log_interval
            ppl = math.exp(cur_loss)
            print(f'| epoch {epoch:3d} '
                  f'lr {lr:02.2f} | ms/batch {ms_per_batch:5.2f} | '
                  f'loss {cur_loss:5.2f} | ppl {ppl:8.2f}')
            total_loss = 0
            start_time = time.time()

In [88]:
def evaluate(model: nn.Module, eval_data: Tensor) -> float:
    # Switch the model to evaluation mode.
    # This is necessary for layers like dropout,
    model.eval()
    total_loss = 0.

    with torch.no_grad():
        for i, (item_data, user_data) in enumerate(eval_data):
            # Load item sequence and user id
            item_data, user_data = item_data.to(device), user_data.to(device)
            user_data = user_data.reshape(-1, 1)
            # Split item sequence to inputs and targets
            inputs, targets = item_data[:, :-1], item_data[:, 1:]
            targets_flat = targets.reshape(-1)
            # Predict item
            output = model(inputs, user_data)
            output_flat = output.reshape(-1, ntokens)
            # Calculate loss
            loss = criterion(output_flat, targets_flat)
            total_loss += loss.item()
    return total_loss / (len(eval_data) - 1)

In [89]:
best_val_loss = float('inf')
epochs = 10

with TemporaryDirectory() as tempdir:
    best_model_params_path = os.path.join(tempdir, "best_model_params.pt")

    for epoch in range(1, epochs + 1):
        epoch_start_time = time.time()

        # Training
        train(model, train_iter, epoch)

        # Evaluation
        val_loss = evaluate(model, val_iter)

        # Compute the perplexity of the validation loss
        val_ppl = math.exp(val_loss)
        elapsed = time.time() - epoch_start_time

        # Results
        print('-' * 89)
        print(f'| end of epoch {epoch:3d} | time: {elapsed:5.2f}s | '
            f'valid loss {val_loss:5.2f} | valid ppl {val_ppl:8.2f}')
        print('-' * 89)

        # Save best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), best_model_params_path)

        scheduler.step()
    model.load_state_dict(torch.load(best_model_params_path)) # load best model states

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Item Sequence for idx 1441: ['item_B07HDB12FM', 'item_B00BHQXM8S', 'item_B0160ACNAI', 'item_B07TXZGRNQ']
Item Sequence for idx 1442: ['item_B0002DV7ZM', 'item_B00P9QAH6W']
Item Sequence for idx 1443: ['item_B01E6T56EA', 'item_B07PNF5WJK']
Item Sequence for idx 1444: ['item_B0002H0H0E', 'item_B001LXMXU0', 'item_B0778PW65J']
Item Sequence for idx 1445: ['item_B001OQCC3W', 'item_B001OQCC50', 'item_B008YRW6CQ', 'item_B07993X771']
Item Sequence for idx 1446: ['item_B07CXN5Y7D', 'item_B082VSRLKB', 'item_B0921VB2D4', 'item_B09X785GLV']
Item Sequence for idx 1447: ['item_B07XCQFG8S', 'item_B07ZRYKNGS']
Item Sequence for idx 1448: ['item_B00PWXA1SG', 'item_B00302DN2C', 'item_B01DA8NFFA', 'item_B012S3KZWE']
Item Sequence for idx 1449: ['item_B00HWTA7DM', 'item_B016APL8RC']
Item Sequence for idx 1450: ['item_B00EDSUQWM', 'item_B00WZQSBBW', 'item_B001A62VAU', 'item_B01I47LBK8']
Item Sequence for idx 1451: ['item_B001A62VAU', 'item_B0

  model.load_state_dict(torch.load(best_model_params_path)) # load best model states


In [99]:
def get_popular_items(df_ratings):
  # Calculate the number of ratings for each item
  rating_counts = df['item_id'].value_counts().reset_index()
  rating_counts.columns = ['item_id', 'rating_count']

  # Get the most frequently rated items
  min_ratings_threshold = rating_counts['rating_count'].quantile(0.95)

  # Filter items based on the minimum number of ratings
  popular_items = df.merge(rating_counts, on='item_id')
  popular_items = popular_items[popular_items['rating_count'] >= min_ratings_threshold]

  # Calculate the average rating for each item
  average_ratings = popular_items.groupby('item_id')['rating'].mean().reset_index()

  # Get the top 10 rated items
  top_10_items = list(average_ratings.sort_values('rating', ascending=False).head(10).item_id.values)
  return top_10_items

In [None]:
# A placeholders to store results of recommendations
transformer_reco_results = list()
popular_reco_results = list()

# Get top 10 items
k = 10
# Iterate over the validation data
for i, (item_data, user_data) in enumerate(val_iter):
    # Feed the input and get the outputs
    item_data, user_data = item_data.to(device), user_data.to(device)
    user_data = user_data.reshape(-1, 1)
    inputs, targets = item_data[:, :-1], item_data[:, 1:]
    output = model(inputs, user_data)
    output_flat = output.reshape(-1, ntokens)
    targets_flat = targets.reshape(-1)

    # Reshape the output_flat to get top predictions
    outputs = output_flat.reshape(output_flat.shape[0] // inputs.shape[1],
                                  inputs.shape[1],
                                  output_flat.shape[1])[: , -1, :]
    # k + inputs.shape[1] = 13 items obtained
    # In order to prevent to recommend already watched items
    values, indices = outputs.topk(k + inputs.shape[1], dim=-1)

    for sub_sequence, sub_indice_org in zip(item_data, indices):
        sub_indice_org = sub_indice_org.cpu().detach().numpy()
        sub_sequence = sub_sequence.cpu().detach().numpy()

        # Generate mask array to eliminate already watched item
        mask = np.isin(sub_indice_org, sub_sequence[:-1], invert=True)

        # After masking get top k items
        sub_indice = sub_indice_org[mask][:k]

        # Generate results array
        transformer_reco_result = np.isin(sub_indice, sub_sequence[-1]).astype(int)

        # Decode items to search in popular items
        if sub_sequence[-1] in item_vocab_stoi:
            target_item_decoded = item_vocab_stoi[sub_sequence[-1]]
        else:
            # Handle missing item (skip, set to default, etc.)
            target_item_decoded = None  # or some default value

        top_10_items = get_popular_items(df)

        popular_reco_result = np.isin(top_10_items, target_item_decoded).astype(int)

        transformer_reco_results.append(transformer_reco_result)
        popular_reco_results.append(popular_reco_result)

Item Sequence for idx 0: ['item_B01EUG7ENA', 'item_B00XBQ8UGG', 'item_B0007L8BQW', 'item_B075HCLMYP']
Item Sequence for idx 1: ['item_B0051NSNFK', 'item_B07N1W86NK']
Item Sequence for idx 2: ['item_B004NFPZLM', 'item_B0002E3A8S', 'item_B000FZ1KF4', 'item_B0009R1TUY']
Item Sequence for idx 3: ['item_B000FZ1KF4', 'item_B0009R1TUY', 'item_B0050P09WY', 'item_B003CUJEKC']
Item Sequence for idx 4: ['item_B0043WF8PI', 'item_B00PC17212', 'item_B000T4F9WG', 'item_B004VDD4KK']
Item Sequence for idx 5: ['item_B00QXF3XTQ', 'item_B000EEJ6OI', 'item_B009A3B958', 'item_B003B0HLK8']
Item Sequence for idx 6: ['item_B01BOYPXXO', 'item_B0071UH32A', 'item_B000EENLZI', 'item_B0719KM5Y8']
Item Sequence for idx 7: ['item_B0013V1BYY', 'item_B00KCXMBES', 'item_B002QAUOKS', 'item_B00QCMEXIU']
Item Sequence for idx 8: ['item_B000EEJ91I', 'item_B000L6GD04', 'item_B0002D0LM2', 'item_B003986L1W']
Item Sequence for idx 9: ['item_B0851WNWCH', 'item_B07NR4NX5M']
Item Sequence for idx 10: ['item_B07C7RR75R', 'item_B004

In [1]:
from sklearn.metrics import ndcg_score

# Since we have already sorted our recommendations
# An array that represent our recommendation scores is used.
representative_array = [[i for i in range(k, 0, -1)]] * len(transformer_reco_results)

for k in [3, 5, 10]:
  transformer_result = ndcg_score(transformer_reco_results,
                                  representative_array, k=k)
  popular_result = ndcg_score(popular_reco_results,
                              representative_array, k=k)

  print(f"Transformer NDCG result at top {k}: {round(transformer_result, 4)}")
  print(f"Popular recommendation NDCG result at top {k}: {round(popular_result, 4)}\n\n")

NameError: name 'k' is not defined