# 1. Initialization

In [None]:
import polars as pl
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn import functional as F
from torch.utils.data import IterableDataset, DataLoader
import random

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 2. Import data

In [None]:
# Load the encoded movies ordered by ascending id, plus the raw activity log.
enc_movies = pl.read_parquet("encoded_movies_2.parquet").sort("id")
enc_activities = pl.read_parquet("encoded_activities_2.parquet")

# tbl_cols=-1 lifts the column limit so every column is shown.
with pl.Config(tbl_cols=-1):
    print(enc_movies, enc_activities, sep="\n")

# 3. Create dataset for training

In [None]:
def flatten(row):
    """Turn one row into a 1-D float32 tensor on ``device``.

    Values that are lists are spliced in element by element (one level
    deep); every other value is appended as-is.
    """
    values = []
    for item in row:
        values += item if isinstance(item, list) else [item]

    return torch.tensor(values, dtype=torch.float32, device=device)

In [None]:
# One flattened feature row per movie, in enc_movies row order (id/name excluded).
embd_matrix = torch.stack(list(map(flatten, enc_movies.drop(["id", "name"]).iter_rows())))
embd_matrix.shape

In [None]:
class UserActivitiyDataset(IterableDataset):
    """Endless random stream of next-movie prediction samples.

    Each sample is a tuple ``(x, y, watched)``:

    * ``x`` - float tensor with one row per activity in a window of
      ``block_size`` consecutive activities of a single user (ordered by
      ``enc_time``); a row is the flattened activity + movie columns left
      after dropping ``user_id``, ``movie_id`` and ``name``.
    * ``y`` - one-element tensor with the row position in ``enc_movies`` of
      the movie the user watched immediately after the window.
    * ``watched`` - list with the ``enc_movies`` row positions of the movies
      inside the window.

    Args:
        enc_movies: polars frame of movies; the columns ``id`` and ``name``
            are used here, the remaining ones become features.
        enc_activities: polars frame of activities; the columns ``user_id``,
            ``movie_id`` and ``enc_time`` are used here.
        block_size: Number of consecutive activities in every input window.
        max_retries: Number of draw attempts per pass of the inner loop. The
            outer loop restarts it, so iteration never terminates on its own.

    Raises:
        ValueError: On iteration, when no user has at least
            ``block_size + 1`` activities.
    """

    def __init__(self, enc_movies, enc_activities, block_size, max_retries=10):
        self.enc_movies = enc_movies
        self.enc_activities = enc_activities
        self.block_size = block_size
        self.max_retries = max_retries

    def __iter__(self):
        # Invariants are computed once per iterator instead of once per sample.
        id_to_index = self.enc_movies.with_row_index("index").select(["id", "index"])
        min_history = self.block_size + 1  # the window plus the target right after it

        # Only sample users that actually exist and have enough history;
        # drawing an id from the raw min..max range crashed on gaps and on
        # users with too few activities. Sorted so a seeded run is repeatable.
        eligible_users = (
            self.enc_activities.group_by("user_id")
            .agg(pl.len().alias("n_activities"))
            .filter(pl.col("n_activities") >= min_history)
            .sort("user_id")["user_id"]
            .to_list()
        )
        if not eligible_users:
            raise ValueError(
                f"no user has at least {min_history} activities; "
                f"cannot build windows of block_size={self.block_size}"
            )

        while True:
            for _ in range(self.max_retries):
                user_id = random.choice(eligible_users)
                user_activities = self.enc_activities.filter(
                    pl.col("user_id") == user_id
                ).sort(pl.col("enc_time"), descending=False)

                start = random.randint(0, len(user_activities) - min_history)
                end = start + self.block_size
                input_user_activities = user_activities[start:end]

                x = input_user_activities.join(
                    self.enc_movies, left_on="movie_id", right_on="id", how="inner"
                ).sort(pl.col("enc_time"), descending=False)

                # The inner join drops activities whose movie is unknown; such
                # windows are skipped so every sample has exactly block_size
                # rows and the default collate function can batch them.
                if len(x) != self.block_size:
                    continue

                # The target is the activity directly after the window
                # (index ``end``), not the one after that.
                movie_id = user_activities.row(end, named=True)["movie_id"]

                # The label is the row position of the movie whose id matches,
                # the same id -> index mapping used for ``watched`` below.
                y = id_to_index.filter(pl.col("id") == movie_id)
                if y.is_empty():
                    continue

                watched_movies = x.join(
                    id_to_index, left_on="movie_id", right_on="id", how="inner"
                )

                features = torch.stack(
                    [flatten(row) for row in x.drop(["user_id", "movie_id", "name"]).iter_rows()]
                )
                label = y[0]["index"].to_torch().to(device=device)

                yield features, label, watched_movies["index"].to_list()


# Users 1-5 feed training and user 6 is held out for validation;
# both datasets use windows of 16 activities.
train_data = UserActivitiyDataset(
    enc_movies,
    enc_activities.filter(pl.col("user_id").is_in([1, 2, 3, 4, 5])),
    16,
)
val_data = UserActivitiyDataset(
    enc_movies,
    enc_activities.filter(pl.col("user_id").is_in([6])),
    16,
)

# Smoke test: draw one validation sample and show its pieces.
data_iter = iter(val_data)
x, y, l = next(data_iter)
print(x.shape, y, l, sep="\n")

del x, y, l

In [None]:

# Pull the first batch of four samples and show the collated tensor shapes.
batch_id, (xb, yb, _) = next(enumerate(DataLoader(train_data, batch_size=4, shuffle=False)))
print(xb.shape, yb.shape, sep="\n")

# 4. Build the model

- First, let's specify the components of a transformer

In [None]:
class Head(nn.Module):
    """Single causal self-attention head.

    Args:
        head_size: Width of the query, key and value projections.
        n_embd: Width of the input features.
        dropout: Attention dropout probability; applied in training mode only.
    """

    def __init__(self, head_size, n_embd, dropout=0.2):
        super().__init__()
        # One fused, bias-free projection yields q, k and v; forward() splits it.
        self.qkv = nn.Linear(n_embd, 3 * head_size, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return the causally masked attention output (last dim ``head_size``)."""
        q, k, v = self.qkv(x).chunk(3, dim=-1)
        # The functional attention call applies ``dropout_p`` unconditionally,
        # so it has to be zeroed explicitly outside training mode.
        attn_dropout = self.dropout.p if self.training else 0.0
        return F.scaled_dot_product_attention(
            q, k, v, dropout_p=attn_dropout, is_causal=True
        )

In [None]:
class MultiHeadAttention(nn.Module):
    """Run several attention heads side by side and project the result.

    Args:
        num_heads: How many ``Head`` modules to run in parallel.
        head_size: Output width of each head.
        n_embd: Width of the input and of the projected output.
        dropout: Dropout probability for the heads and for the projection.
    """

    def __init__(self, num_heads, head_size, n_embd, dropout=0.2):
        super().__init__()
        head_modules = [Head(head_size, n_embd, dropout) for _ in range(num_heads)]
        self.heads = nn.ModuleList(head_modules)
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Concatenate the head outputs on the last axis, project, apply dropout."""
        per_head = [head(x) for head in self.heads]
        merged = torch.cat(per_head, dim=-1)
        projected = self.proj(merged)
        return self.dropout(projected)

In [None]:
class FeedFoward(nn.Module):
    """Position-wise MLP: expand to ``4 * n_embd``, GELU, project back, dropout.

    Args:
        n_embd: Width of the input and output features.
        dropout: Dropout probability applied after the second linear layer.
    """

    def __init__(self, n_embd, dropout=0.2):
        super().__init__()
        hidden = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden),
            nn.GELU(),
            nn.Linear(hidden, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to ``x``."""
        out = self.net(x)
        return out

In [None]:
class Block(nn.Module):
    """Pre-norm transformer block: attention and MLP, each with a residual add.

    Args:
        n_embd: Feature width of the block's input and output.
        n_head: Number of attention heads; each gets ``n_embd // n_head`` features.
        dropout: Dropout probability passed to both sub-layers.
    """

    def __init__(self, n_embd, n_head, dropout=0.2):
        super().__init__()
        self.sa = MultiHeadAttention(n_head, n_embd // n_head, n_embd, dropout)
        self.ffwd = FeedFoward(n_embd, dropout)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Apply attention then the MLP, normalising before each sub-layer."""
        attended = self.sa(self.ln1(x))
        x = x + attended
        transformed = self.ffwd(self.ln2(x))
        return x + transformed


- Now we will assemble our transformer model

In [None]:
class FilmRecommender(nn.Module):
    """Stack of transformer blocks followed by a layer norm and a linear head.

    Args:
        n_embd: Feature width of every input position.
        n_head: Number of attention heads per block.
        n_layer: Number of stacked blocks.
        dropout: Dropout probability used inside the blocks.
        n_out: Width of the output vector per position. Defaults to
            ``n_embd - 6``, the value that used to be hard-coded. The
            training code multiplies the output by ``embd_matrix.T``, so it
            has to equal the number of columns of ``embd_matrix``.
    """

    def __init__(self, n_embd, n_head, n_layer, dropout=0.2, n_out=None):
        super().__init__()

        if n_out is None:
            n_out = n_embd - 6

        self.blocks = nn.Sequential(*[Block(n_embd, n_head, dropout) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, n_out)

    def forward(self, inputs):
        """Return one ``n_out``-wide vector for every input position."""
        x = self.blocks(inputs)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        return logits

# 5. Train and evaluate the model

In [None]:
# Model hyperparameters. They have to be bound before the model is built;
# the training cell below assigns the same names again, so keep both in sync.
n_embd = 610
n_head = 10
n_layer = 6
dropout = 0.2

model = FilmRecommender(n_embd, n_head, n_layer, dropout).to(device)

In [None]:
# Training / evaluation settings. ``epochs`` counts optimisation steps (one
# batch each), ``eval_iters`` counts validation samples.
epochs = 4000
eval_iters = 200
batch_size = 128
n_embd = 610
n_head = 10
n_layer = 6
dropout = 0.2
learning_rate = 1e-4

optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()
# The datasets are endless iterables, so one iterator per loader is kept and
# batches are pulled from it with next().
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=False)
train_data_iterator = iter(train_loader)
val_loader = DataLoader(val_data, batch_size=1, shuffle=False)
val_data_iterator = iter(val_loader)

model.train()
for epoch in range(epochs):
    xb, yb, _ = next(train_data_iterator)

    optimizer.zero_grad()
    out = model(xb)
    # Only the output at the last position is used to predict the next movie.
    out = out[:, -1, :]
    # Score every movie by the dot product with its row in embd_matrix.
    logits = out @ embd_matrix.T
    yb = yb.squeeze(1)
    loss = criterion(logits, yb)
    loss.backward()
    optimizer.step()

    if epoch % 500 == 0:
        print(f"Epoch: {epoch} - Loss: {loss.item()}")

right = 0
wrong = 0

# Row position in enc_movies -> movie name, built once instead of re-indexing
# the frame for every lookup.
movie_names = enc_movies["name"].to_list()

# Switch dropout off for validation.
model.eval()
with torch.no_grad():
    for step in range(eval_iters):
        xv, yv, movies_list = next(val_data_iterator)
        out = model(xv)
        out = out[:, -1, :]
        logits = out @ embd_matrix.T
        yv = yv.squeeze(1)
        loss = criterion(logits, yv)

        # The default collate function turns the per-sample list of indices
        # into a list of one-element tensors; convert back to plain ints.
        watched_indices = sorted({int(idx) for idx in movies_list})
        # argmax over the logits equals argmax over their softmax.
        pred_index = torch.argmax(logits[0]).item()
        target_index = yv[0].item()

        watched_movies = [movie_names[idx] for idx in watched_indices]
        movie_pred = movie_names[pred_index]
        movie_target = movie_names[target_index]

        print(pred_index)
        print(yv)

        print(f"Movies watched: {watched_movies}")
        print(f"predicted next movie: {movie_pred}")
        print(f"actual next movie: {movie_target}")

        if movie_pred == movie_target:
            right = right + 1
        else:
            wrong = wrong + 1

        if step % 10 == 0:
            print(f"Step {step} - Validation loss: {loss.item()}")

    print(f"Accuracy: {right}/{right+wrong}")