# 1. Initialization

In [None]:
import polars as pl
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn import functional as F
from torch.utils.data import IterableDataset, DataLoader

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# 2. Import data

In [None]:
# Load the pre-encoded movie and activity tables from the working directory.
enc_movies = pl.read_parquet("encoded_movies_2.parquet")
enc_activities = pl.read_parquet("encoded_activities_2.parquet")

# Sort movies by id, and activities by user followed by the encoded
# date/time columns (ascending is polars' default direction).
activity_sort_keys = ["user_id", "enc_year", "enc_month", "enc_day", "enc_hour", "enc_minute"]
enc_movies = enc_movies.sort("id")
enc_activities = enc_activities.sort(activity_sort_keys)

# Preview the first 50 rows of each table without truncating rows or columns.
with pl.Config(tbl_cols=-1, tbl_rows=-1):
    for frame in (enc_movies, enc_activities):
        print(frame.head(50))

- We will split the user activities into training and validation datasets (the first 80% of user ids for training, the rest for validation).

In [None]:
# Split by user id: ids 1..round(0.8 * max_id) go to training and the
# remaining ids up to max_id go to validation.
# NOTE(review): this assumes user ids are integers starting at 1 — confirm.
user_num = enc_activities["user_id"].max()
train_user_count = round(user_num * 0.8)

train_user_ids = list(range(1, train_user_count + 1))
val_user_ids = list(range(train_user_count + 1, user_num + 1))

train_activities = enc_activities.filter(pl.col("user_id").is_in(train_user_ids))
val_activities = enc_activities.filter(pl.col("user_id").is_in(val_user_ids))

print(train_activities)
print(val_activities)

# 3. Create dataset for training

- First, we need a helper function that flattens a row into a tensor

In [None]:
def flatten(row):
    """Flatten one row into a 1-D float32 tensor.

    Elements that are ``list`` instances are spliced in item by item (one
    level deep); every other element contributes a single value.
    """
    values = []
    for item in row:
        values += item if isinstance(item, list) else [item]
    return torch.tensor(values, dtype=torch.float32)

- Now we will convert movies & user activities into tensors

In [None]:
# Turn each table into a 2-D float32 tensor (one flattened row per table row)
# and print the column order, shape and contents for inspection.
embedded_movies = torch.stack(list(map(flatten, enc_movies.iter_rows())))
print(f"Order: {enc_movies.columns}")
print(embedded_movies.shape)
print(embedded_movies)

# The train/validation frames are filtered views of enc_activities, so its
# column list describes both of them.
train_data = torch.stack(list(map(flatten, train_activities.iter_rows())))
print(f"Order: {enc_activities.columns}")
print(train_data.shape)
print(train_data)

val_data = torch.stack(list(map(flatten, val_activities.iter_rows())))
print(f"Order: {enc_activities.columns}")
print(val_data.shape)
print(val_data)

- After that, let's create an iterable dataset (to be wrapped in a DataLoader later) that samples training examples

In [None]:
class Dataset(IterableDataset):
    """Endless stream of (activity window, next liked movie) samples.

    Each sample is produced by drawing a random user id, taking a random
    window of ``block_size`` consecutive activity rows of that user, and
    pairing it with the first later activity whose rating column is at
    least 3/5.

    Column layout relied on by the indexing below (it is not validated):
    ``activities[:, 0]`` is the user id, ``activities[:, 1]`` the movie id and
    ``activities[:, 3]`` the rating; ``embedded_movies[:, 0]`` is the movie id.
    NOTE(review): ids are compared as tensor values, so they must be exactly
    representable in the tensors' dtype — confirm for large id ranges.

    Args:
        embedded_movies: 2-D tensor with one row per movie.
        activities: 2-D tensor with one row per user activity. The rows of a
            user are used in the order in which they appear.
        block_size: Number of consecutive activities in one input window.
        max_retries: Stored for interface compatibility. Sampling retries
            indefinitely until a usable window is found, so this value does
            not bound the number of attempts.
    """

    def __init__(self, embedded_movies, activities, block_size, max_retries=10):
        super().__init__()
        self.embedded_movies = embedded_movies
        self.activities = activities
        self.block_size = block_size
        self.max_retries = max_retries

    def _movie_row(self, movie_id):
        """Return the full row (id included) of the first movie with this id.

        Raises:
            IndexError: If no movie has the given id.
        """
        return self.embedded_movies[self.embedded_movies[:, 0] == movie_id][0]

    def __iter__(self):
        """Yield ``(window, target)`` pairs forever.

        ``window`` has one row per activity: the activity columns without the
        user id (so the movie id is column 0), followed by the movie's
        features. ``target`` is the full movie row, id included.
        """
        user_ids = self.activities[:, 0]
        # The id range does not change between samples, so compute it once.
        min_user_id = int(user_ids.min().item())
        max_user_id = int(user_ids.max().item())

        while True:
            # torch.randint excludes its upper bound; +1 keeps the highest
            # user id reachable (and makes a single-user table work).
            user_id = int(torch.randint(min_user_id, max_user_id + 1, (1,)).item())
            user_activities = self.activities[user_ids == user_id]

            # A window of block_size rows plus at least one row after it.
            if len(user_activities) <= self.block_size:
                continue

            # Random window start; the range leaves at least one later row.
            start = int(torch.randint(len(user_activities) - self.block_size, (1,)).item())
            end = start + self.block_size
            window = user_activities[start:end]

            # Candidate targets start right after the window; keep only the
            # ones with rating >= 3 (ratings are stored scaled, hence 3/5).
            later = user_activities[end:]
            later = later[later[:, 3] >= 3 / 5]
            if len(later) == 0:
                continue

            candidate_ids = later[:, 1]
            # Do not predict the movie the window already ends with.
            if candidate_ids[0].item() == window[-1, 1].item():
                candidate_ids = candidate_ids[1:]
                if len(candidate_ids) == 0:
                    continue

            target = self._movie_row(candidate_ids[0].item())

            # Append each movie's features (without its id) to its activity row.
            rows = [
                torch.cat((activity, self._movie_row(activity[1].item())[1:]))
                for activity in window
            ]
            sample = torch.stack(rows)

            # Drop the user id; the movie id stays in column 0 of both the
            # window and the target so callers can use it for validation.
            yield sample[:, 1:], target

# Smoke test: draw five samples and print the shape and contents of each
# input window and its target.
dataset = Dataset(embedded_movies, train_data, block_size=16)

it = iter(dataset)
for _ in range(5):
    x, y = next(it)
    for tensor in (x, y):
        print(tensor.shape)
        print(tensor)

# 4. Build the model

In [None]:
class Head(nn.Module):
    """A single head of non-causal scaled dot-product self-attention.

    Args:
        head_size: Width of each of the query, key and value projections.
        n_embd: Width of the incoming token features.
        dropout: Attention dropout probability, applied only in training mode.
    """

    def __init__(self, head_size, n_embd, dropout=0.2):
        super().__init__()
        # One fused projection; forward() splits it into q, k and v.
        self.qkv = nn.Linear(n_embd, 3 * head_size, bias=False)
        # Only the probability ``p`` of this module is read in forward().
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Attend over the second-to-last axis of ``x``; last axis becomes ``head_size``."""
        q, k, v = self.qkv(x).chunk(3, dim=-1)
        # scaled_dot_product_attention applies dropout_p unconditionally, so
        # it has to be switched off by hand when the module is in eval mode.
        dropout_p = self.dropout.p if self.training else 0.0
        return F.scaled_dot_product_attention(q, k, v, dropout_p=dropout_p, is_causal=False)

In [None]:
class MultiHeadAttention(nn.Module):
    """Run several attention heads on the same input and merge their outputs.

    The head outputs are concatenated along the last axis, projected back to
    ``n_embd`` features and passed through dropout.
    """

    def __init__(self, num_heads, head_size, n_embd, dropout=0.2):
        super().__init__()
        heads = []
        for _ in range(num_heads):
            heads.append(Head(head_size, n_embd, dropout))
        self.heads = nn.ModuleList(heads)
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        per_head = [head(x) for head in self.heads]
        merged = torch.cat(per_head, dim=-1)
        projected = self.proj(merged)
        return self.dropout(projected)

In [None]:
class FeedFoward(nn.Module):
    """Two-layer MLP: widen to ``4 * n_embd``, GELU, project back, dropout."""

    def __init__(self, n_embd, dropout=0.2):
        super().__init__()
        hidden = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden),
            nn.GELU(),
            nn.Linear(hidden, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        out = self.net(x)
        return out

In [None]:

class Block(nn.Module):
    """Transformer block: self-attention, then feed-forward.

    Both sub-layers see a layer-normalised input and add their result back
    onto the input (residual connections).
    """

    def __init__(self, n_embd, n_head, dropout=0.2):
        super().__init__()
        # Each head gets an equal (floored) share of the feature width.
        self.sa = MultiHeadAttention(n_head, n_embd // n_head, n_embd, dropout)
        self.ffwd = FeedFoward(n_embd, dropout)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        attended = self.sa(self.ln1(x))
        x = x + attended
        transformed = self.ffwd(self.ln2(x))
        return x + transformed

In [None]:
class FilmRecommender(nn.Module):
    """Transformer encoder that predicts one output vector per input sequence.

    A learned mask token is appended to the input sequence, the sequence is
    run through the transformer blocks, and the final position (the mask
    token) is projected to the output vector.

    Args:
        n_embd: Feature width of every input token.
        n_head: Number of attention heads per block.
        n_layer: Number of transformer blocks.
        dropout: Dropout probability passed to every block.
        out_dim: Width of the predicted vector. Defaults to ``n_embd - 392``,
            the size that used to be hard-coded.
            NOTE(review): 392 is presumably the number of non-movie
            (activity) features per token — confirm against the data.

    Raises:
        ValueError: If the resulting output width is not positive.
    """

    def __init__(self, n_embd, n_head, n_layer, dropout=0.2, out_dim=None):
        super().__init__()

        if out_dim is None:
            out_dim = n_embd - 392
        if out_dim <= 0:
            raise ValueError(
                f"output width must be positive, got {out_dim} (n_embd={n_embd})"
            )

        self.blocks = nn.Sequential(*[Block(n_embd, n_head, dropout) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        # Learned placeholder token whose final state is used for prediction.
        self.mask_embedding = nn.Parameter(torch.randn(n_embd))
        self.proj = nn.Sequential(
            nn.Linear(n_embd, n_embd),
            nn.ReLU(),
            nn.Linear(n_embd, out_dim),
        )

    def forward(self, inputs):
        """Map ``inputs`` of shape (batch, seq, n_embd) to (batch, out_dim)."""
        # One copy of the mask token per batch element, appended at the end.
        mask_token = self.mask_embedding.view(1, 1, -1).expand(inputs.size(0), -1, -1)
        inputs = torch.cat([inputs, mask_token], dim=1)

        x = self.blocks(inputs)
        x = self.ln_f(x)
        # Only the final position (the mask token) feeds the output head.
        logits = self.proj(x[:, -1, :])

        return logits

# 5. Training & evaluation

In [None]:
# NOTE: The input needs to have the id trimmed
def top_n_closest_movie(movie_embedding, n=10, movies=None):
    """Return the ``n`` movies most similar (by cosine similarity) to a query.

    Args:
        movie_embedding: Query feature vector(s) without the id column; either
            a 1-D tensor or a 2-D batch with one query per row.
        n: Number of nearest movies to return.
        movies: 2-D tensor of movie rows with the id in column 0 and the
            features after it. Defaults to the module-level ``embedded_movies``.

    Returns:
        A ``(similarities, movie_ids)`` pair sorted by descending similarity.
        For a batch of queries the ranking runs along the last axis.
    """
    if movies is None:
        movies = embedded_movies

    norm_movies = F.normalize(movies[:, 1:], dim=-1)
    norm_query = F.normalize(movie_embedding, dim=-1)

    # Dot products of unit vectors are cosine similarities.
    sims = torch.matmul(norm_query, norm_movies.T).squeeze()

    sorted_sims, order = torch.sort(sims, dim=-1, descending=True)
    # Slice along the ranking axis so batched queries keep one row each.
    top = order[..., :n]
    return sorted_sims[..., :n], movies[:, 0][top]

# Sanity check: look up the movie with id 801 and print the ids and
# similarities of its nearest neighbours (``idx`` holds movie ids here).
harry_potter_2 = embedded_movies[embedded_movies[:, 0] == 801][0, 1:]
sims, idx = top_n_closest_movie(harry_potter_2)
for values in (idx, sims):
    print(values.tolist())
     

In [None]:
# --- Training schedule ---
epochs = 20_000         # number of iterations of the training loop
eval_iters = 500        # a validation step runs every `eval_iters` iterations
batch_size = 8          # samples per training batch
learning_rate = 1e-5

# --- Model size ---
block_size = 64         # activities per input window
n_embd = 2316           # feature width of every token fed to the model
n_head = 12             # attention heads per block
n_layer = 12            # number of transformer blocks
dropout = 0.2

# --- Evaluation ---
n = 200                 # size of the nearest-movie list used for the rank check

In [None]:
# Build the model and report its size in billions of parameters.
model = FilmRecommender(n_embd, n_head, n_layer, dropout).to(device)
param_count = sum(p.numel() for p in model.parameters())
print(f"Number of parameters: {param_count / 1e9:.4f}B")

# The criterion compares the direction of the predicted and target vectors.
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=0.01)
criterion = nn.CosineEmbeddingLoss(margin=0)

# The datasets are endless random samplers, so one iterator per loader is
# created up front and advanced with next().
train_dataset = Dataset(embedded_movies, train_data, block_size=block_size)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
train_data_iterator = iter(train_loader)

val_dataset = Dataset(embedded_movies, val_data, block_size=block_size)
val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False)
val_data_iterator = iter(val_loader)

In [None]:
# Per-iteration training losses and periodic validation losses.
train_losses = []
eval_losses = []

for epoch in range(epochs):
    xb, yb = next(train_data_iterator)
    xb, yb = xb.to(device), yb.to(device)

    optimizer.zero_grad()
    # Remove movie id from input and target
    out = model(xb[:, :, 1:])
    # A target label of +1 asks the loss to pull prediction and target together.
    loss = criterion(out, yb[:, 1:], torch.ones(out.shape[0], device=device))
    loss.backward()
    optimizer.step()

    train_losses.append(loss.item())

    if epoch % 100 == 0:
        print(f"Epoch: {epoch} - Loss: {loss.item()}")

    # Validate every `eval_iters` iterations, skipping iteration 0 so the
    # k-th validation loss belongs to iteration k * eval_iters.
    if epoch % eval_iters == 0 and epoch != 0:
        # Disable dropout while evaluating; training mode is restored below.
        model.eval()
        with torch.no_grad():
            xv, yv = next(val_data_iterator)
            xv, yv = xv.to(device), yv.to(device)

            out = model(xv[:, :, 1:])
            eval_loss = criterion(out, yv[:, 1:], torch.ones(out.shape[0], device=device))
            eval_losses.append(eval_loss.item())

            # Pick one sample and count how many of the n movies closest to
            # the prediction are at least as similar to it as the true target.
            idx = int(torch.randint(out.shape[0], (1,)).item())
            rand_out, rand_target = out[idx], yv[:, 1:][idx]
            sim = F.cosine_similarity(rand_out, rand_target, dim=0)
            top_sims, top_movie_ids = top_n_closest_movie(rand_out.to("cpu"), n=n)
            rank = len(top_sims[top_sims >= sim.to("cpu")])

            print(f"Evaluation loss: {eval_loss.item()}")
            print(f"Rank: {rank} - {'Pass' if rank < n else 'Fail'}")
        model.train()

# Imported here because the plotting calls below need `plt`.
import matplotlib.pyplot as plt

# Persist the trained weights.
torch.save(model.state_dict(), "film_recommender.pt")
print("Model saved to film_recommender.pt")

# --- Plot losses ---
plt.figure(figsize=(8, 5))
plt.plot(train_losses, label="Training Loss", linewidth=1)
if eval_losses:
    # The k-th validation loss is drawn at iteration k * eval_iters.
    eval_x = [i * eval_iters for i in range(1, len(eval_losses) + 1)]
    plt.plot(eval_x, eval_losses, label="Evaluation Loss", linewidth=2)
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.title("Training and Evaluation Loss")
plt.legend()
plt.grid(True, linestyle="--", alpha=0.6)
plt.tight_layout()
# Save before show(): some backends clear the figure once it is displayed.
plt.savefig("loss.png")
plt.show()