# 1. Initialization

In [None]:
import polars as pl
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn import functional as F
from torch.utils.data import IterableDataset, DataLoader
import random

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Length of the sentence-embedding vectors stored in the encoded tables; also
# used to size the NaN placeholder vectors for missing embeddings.
sentence_embed_dim = 384

# 2. Import data

In [None]:
# Load the pre-encoded movie table and order it by ascending movie id.
enc_movies = pl.read_parquet(
    "/home/leminhohoho/repos/movie-lens/ml/data/encoded_movies_2.parquet"
).sort("id")

# Load the pre-encoded user-activity table.
enc_activities = pl.read_parquet(
    "/home/leminhohoho/repos/movie-lens/ml/data/encoded_activities_2.parquet"
)

# tbl_cols=-1 lifts the column-count limit so every column is printed.
with pl.Config(tbl_cols=-1):
    for frame in (enc_movies, enc_activities):
        print(frame)

In [None]:
# Replace nulls with NaN placeholders: a NaN scalar for numeric columns and a
# NaN-filled vector of length `sentence_embed_dim` for embedding columns.
nan = float('nan')
nan_embedding = torch.full((sentence_embed_dim, ), nan).tolist()

scalar_movie_cols = ("duration", "enc_year", "enc_month", "enc_day")
embedding_movie_cols = ("enc_desc", "enc_genres", "enc_languages")

enc_movies_2 = enc_movies.with_columns(
    [pl.col(name).fill_null(nan) for name in scalar_movie_cols]
    + [pl.col(name).fill_null(nan_embedding) for name in embedding_movie_cols]
)

# Activities get the same treatment and are then ordered by ascending user id.
enc_activities_2 = (
    enc_activities
    .with_columns(
        pl.col("rating").fill_null(nan),
        pl.col("enc_review").fill_null(nan_embedding),
    )
    .sort("user_id")
)

with pl.Config(tbl_cols=-1):
    for frame in (enc_movies_2, enc_activities_2):
        print(frame)

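- We will split the user activities into training and validation datasets by user id: the first 80% of user ids are used for training and the remaining ids for validation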

In [None]:
# Largest user id present in the activity table (not a row count).
user_num = enc_activities_2["user_id"].max()

# Ids 1..split_at go to training, ids split_at+1..user_num go to validation.
split_at = round(user_num*0.8)
train_user_ids = list(range(1, split_at + 1))
val_user_ids = list(range(split_at + 1, user_num + 1))

train_activities = enc_activities_2.filter(pl.col("user_id").is_in(train_user_ids))
val_activities = enc_activities_2.filter(pl.col("user_id").is_in(val_user_ids))

for split in (train_activities, val_activities):
    print(split)

# 3. Create dataset for training

- First, we need a helper function that flattens a row into a tensor

In [None]:
def flatten(row):
    """Flatten one table row into a 1-D float32 tensor on `device`.

    Cells that are lists are spliced in element by element (one level deep);
    every other cell contributes a single value.
    """
    values = [
        item
        for cell in row
        for item in (cell if isinstance(cell, list) else (cell,))
    ]
    return torch.tensor(values, dtype=torch.float32, device=device)

- Now we will convert movies & user activities into tensors

In [None]:
def _rows_to_tensor(frame):
    """Stack the flattened rows of *frame* into a single 2-D tensor."""
    return torch.stack([flatten(row) for row in frame.iter_rows()])


def _show(columns, tensor):
    """Print the column order, then the tensor's shape and contents."""
    print(f"Order: {columns}")
    print(tensor.shape)
    print(tensor)


embedded_movies = _rows_to_tensor(enc_movies_2)
_show(enc_movies_2.columns, embedded_movies)

train_data = _rows_to_tensor(train_activities)
_show(enc_activities_2.columns, train_data)

val_data = _rows_to_tensor(val_activities)
_show(enc_activities_2.columns, val_data)

- After that, let's create an iterable Dataset for our data (it will be wrapped in a DataLoader for training)

In [None]:
class Dataset(IterableDataset):
    """Endless stream of (activity window, next liked movie) training pairs.

    Each sample is built from one randomly chosen user:

    * ``x``: ``block_size`` consecutive activity rows of that user, each
      concatenated with the feature row of the movie it refers to, with the
      two leading id columns (user id, movie id) removed.
    * ``y``: the feature row (without the id column) of the first later
      activity whose rating column is ``>= 3/5``.

    Args:
        embedded_movies: 2-D tensor, one row per movie, column 0 is the movie id.
        activities: 2-D tensor, one row per activity; column 0 is the user id,
            column 1 the movie id and column 3 the rating.
        block_size: number of consecutive activities in every input window.
        max_retries: kept for interface compatibility. Draws that produce no
            valid target are always re-sampled, so this value does not bound
            the sampling loop.

    Raises:
        ValueError: on iteration, when no user has more than ``block_size``
            activities (no window with a following activity can be formed).
    """

    def __init__(self, embedded_movies, activities, block_size, max_retries=10):
        super().__init__()
        self.embedded_movies = embedded_movies
        self.activities = activities
        self.block_size = block_size
        self.max_retries = max_retries

    def _movie_row(self, movie_id):
        """Return the full feature row (id included) of the movie with *movie_id*."""
        return self.embedded_movies[self.embedded_movies[:, 0] == movie_id][0]

    def _build_sample(self, user_activities):
        """Draw one window from *user_activities*; return ``(x, y)`` or ``None``.

        ``None`` means the drawn window has no usable target and the caller
        should draw again.
        """
        # Callers guarantee len(user_activities) > block_size, so the upper
        # bound passed to randint is always positive.
        idx = int(torch.randint(len(user_activities) - self.block_size, (1,)).item())
        window = user_activities[idx:idx + self.block_size]

        # Candidate targets: activities after the window.
        # NOTE(review): the "+ 1" skips the activity immediately following the
        # window; kept as in the original -- confirm this is intentional.
        next_activities = user_activities[idx + self.block_size + 1:]

        # Keep only well-rated activities (rating >= 3/5). NaN ratings compare
        # False and are therefore dropped as well.
        next_activities = next_activities[next_activities[:, 3] >= 3/5]
        if len(next_activities) == 0:
            return None

        next_movie_ids = next_activities[:, 1]
        if next_movie_ids[0].item() == window[-1, 1].item():
            # The first candidate is the movie the window ends on; use the
            # following candidate instead, if there is one.
            if len(next_movie_ids) == 1:
                return None
            next_movie_ids = next_movie_ids[1:]

        # Append the movie features to every activity row of the window.
        enriched = torch.stack([
            torch.cat((activity, self._movie_row(activity[1].item())[1:]))
            for activity in window
        ])
        target = self._movie_row(next_movie_ids[0].item())

        return enriched[:, 2:], target[1:]

    def __iter__(self):
        # Sample only ids that actually occur and have enough history. This
        # includes the largest id (torch.randint's upper bound is exclusive)
        # and avoids the randint error on users with too few activities.
        user_ids, counts = torch.unique(self.activities[:, 0], return_counts=True)
        eligible_users = user_ids[counts > self.block_size]
        if len(eligible_users) == 0:
            raise ValueError(
                f"no user has more than block_size={self.block_size} activities"
            )

        while True:
            pick = int(torch.randint(len(eligible_users), (1,)).item())
            # A Python scalar compares against the id column on any device.
            user_id = eligible_users[pick].item()
            user_activities = self.activities[self.activities[:, 0] == user_id]

            sample = self._build_sample(user_activities)
            if sample is not None:
                yield sample

# Smoke test: draw five samples and print each input/target pair.
dataset = Dataset(embedded_movies, train_data, block_size=16)

it = iter(dataset)
for _, (x, y) in zip(range(5), it):
    for shown in (x.shape, x, y.shape, y):
        print(shown)

# 4. Build the model

In [None]:
class Head(nn.Module):
    """One causal self-attention head.

    A single bias-free linear layer produces queries, keys and values, each of
    width ``head_size``; attention is computed with a causal mask so a
    position only attends to itself and earlier positions.

    Args:
        head_size: width of the query/key/value vectors and of the output.
        n_embd: width of the input vectors.
        dropout: attention dropout probability used during training.
    """

    def __init__(self, head_size, n_embd, dropout=0.2):
        super().__init__()
        self.qkv = nn.Linear(n_embd, 3 * head_size, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        q, k, v = self.qkv(x).chunk(3, dim=-1)
        # scaled_dot_product_attention applies dropout_p unconditionally; it
        # does not look at the module's train/eval mode, so gate it here.
        dropout_p = self.dropout.p if self.training else 0.0
        return F.scaled_dot_product_attention(q, k, v, dropout_p=dropout_p, is_causal=True)

In [None]:
class MultiHeadAttention(nn.Module):
    """Run several `Head` modules on the same input and mix their outputs.

    The head outputs are concatenated along the last dimension, projected to
    width ``n_embd`` by a linear layer, and passed through dropout.
    """

    def __init__(self, num_heads, head_size, n_embd, dropout=0.2):
        super().__init__()
        heads = [Head(head_size, n_embd, dropout) for _ in range(num_heads)]
        self.heads = nn.ModuleList(heads)
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        per_head = [head(x) for head in self.heads]
        merged = torch.cat(per_head, dim=-1)
        projected = self.proj(merged)
        return self.dropout(projected)

In [None]:
class FeedFoward(nn.Module):
    """Two-layer MLP applied to the last dimension.

    Expands from ``n_embd`` to ``4 * n_embd``, applies GELU, projects back to
    ``n_embd`` and finishes with dropout.
    """

    def __init__(self, n_embd, dropout=0.2):
        super().__init__()
        hidden = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden),
            nn.GELU(),
            nn.Linear(hidden, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        out = self.net(x)
        return out

In [None]:

class Block(nn.Module):
    """Pre-norm transformer block: attention, then feed-forward.

    Both sub-layers are applied to a layer-normalised copy of the input and
    added back to it as a residual.
    """

    def __init__(self, n_embd, n_head, dropout=0.2):
        super().__init__()
        # Each head gets an equal share of the embedding width.
        per_head = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, per_head, n_embd, dropout)
        self.ffwd = FeedFoward(n_embd, dropout)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        attended = self.sa(self.ln1(x))
        x = x + attended
        transformed = self.ffwd(self.ln2(x))
        return x + transformed

In [None]:
class FilmRecommender(nn.Module):
    """Causal transformer that maps a window of activities to a movie vector.

    Every input position is a 1930-wide feature vector laid out as::

        [0]          is_watch
        [1]          rating
        [2]          is_loved
        [3:6]        activity year / month / day
        [6:390]      review embedding
        [390]        movie duration
        [391:394]    movie release year / month / day
        [394:778]    movie name embedding
        [778:1162]   movie description embedding
        [1162:1546]  movie genres embedding
        [1546:1930]  movie languages embedding

    Fields that were missing in the data arrive as NaN; they are replaced by
    learned "missing value" parameters before the transformer runs. The
    output head projects ``n_embd`` down to ``n_embd - 390``, i.e. it drops
    the width of the activity part of the layout.

    Args:
        n_embd: width of each input position.
        n_head: number of attention heads per block.
        n_layer: number of transformer blocks.
        dropout: dropout probability used inside the blocks.
    """

    # Input width that `fill_inputs` knows how to impute.
    _INPUT_WIDTH = 1930

    # (start, stop, parameter name) of every field that may hold NaN.
    _MISSING_FIELDS = (
        (1, 2, "missing_rating"),
        (6, 390, "missing_review_embedding"),
        (390, 391, "missing_duration"),
        (391, 392, "missing_release_year"),
        (392, 393, "missing_release_month"),
        (393, 394, "missing_release_day"),
        (394, 778, "missing_name_embedding"),
        (778, 1162, "missing_desc_embedding"),
        (1162, 1546, "missing_genres_embedding"),
        (1546, 1930, "missing_languages_embedding"),
    )

    def __init__(self, n_embd, n_head, n_layer, dropout=0.2):
        super().__init__()

        self.blocks = nn.Sequential(*[Block(n_embd, n_head, dropout) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, n_embd-390) # Project from user activity to movie embedding

        # Trainable stand-ins for missing values in movies
        self.missing_name_embedding = nn.Parameter(torch.randn(384))
        self.missing_desc_embedding = nn.Parameter(torch.randn(384))
        self.missing_genres_embedding = nn.Parameter(torch.randn(384))
        self.missing_languages_embedding = nn.Parameter(torch.randn(384))
        self.missing_duration = nn.Parameter(torch.randn(1))
        self.missing_release_year = nn.Parameter(torch.randn(1))
        self.missing_release_month = nn.Parameter(torch.randn(1))
        self.missing_release_day = nn.Parameter(torch.randn(1))

        # Trainable stand-ins for missing values in activities
        self.missing_rating = nn.Parameter(torch.randn(1))
        self.missing_review_embedding = nn.Parameter(torch.randn(384))

    def fill_inputs(self, inputs):
        """Replace NaN fields of *inputs* with the learned missing-value parameters.

        A field is replaced as a whole as soon as any of its entries is NaN.
        The parameters are written as tensors (not Python floats), so they
        stay in the autograd graph and receive gradients.

        Args:
            inputs: tensor whose last dimension has width 1930 and follows the
                layout described on the class. It is modified in place.

        Returns:
            The same tensor object, with the NaN fields filled.

        Raises:
            ValueError: if the last dimension is not 1930 wide.
        """
        if inputs.shape[-1] != self._INPUT_WIDTH:
            raise ValueError(
                f"expected inputs with last dimension {self._INPUT_WIDTH}, "
                f"got {inputs.shape[-1]}"
            )

        for start, stop, param_name in self._MISSING_FIELDS:
            field = inputs[..., start:stop]
            # One flag per position: is anything in this field missing?
            missing = torch.isnan(field).any(dim=-1, keepdim=True)
            replacement = getattr(self, param_name).to(dtype=inputs.dtype)
            inputs[..., start:stop] = torch.where(missing, replacement, field)

        return inputs

    def forward(self, inputs):
        """Return one predicted movie vector per input position.

        NaN placeholders are imputed on a copy, so the caller's tensor is left
        untouched.
        """
        if torch.isnan(inputs).any():
            inputs = self.fill_inputs(inputs.clone())

        x = self.blocks(inputs)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        return logits

# 5. Training & evaluation

In [None]:
# Hyperparameters
epochs = 4000          # number of optimisation steps; each step consumes one batch
eval_iters = 200       # validation is run every `eval_iters` steps
val_batches = 50       # number of validation batches averaged per validation run
block_size = 32        # activities per input window
batch_size = 128
n_embd = 1930          # width of one input position (activity + movie features)
n_head = 10
n_layer = 10
dropout = 0.2
learning_rate = 1e-5

# The model has to exist before the optimizer can collect its parameters; the
# data tensors were created on `device`, so the model is moved there as well.
model = FilmRecommender(n_embd, n_head, n_layer, dropout).to(device)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.MSELoss()
train_loader = DataLoader(Dataset(embedded_movies, train_data, block_size=block_size), batch_size=batch_size, shuffle=False)
train_data_iterator = iter(train_loader)
val_loader = DataLoader(Dataset(embedded_movies, val_data, block_size=block_size), batch_size=1, shuffle=False)
val_data_iterator = iter(val_loader)

# NOTE(review): xb and yb can contain the NaN placeholders introduced during
# preprocessing. Confirm the model imputes them (FilmRecommender.fill_inputs)
# and decide how NaN targets should be handled; otherwise the loss is NaN.
for epoch in range(epochs):
    xb, yb = next(train_data_iterator)

    optimizer.zero_grad()
    out = model(xb)
    # Only the prediction at the last position is compared with the target.
    logits = out[:, -1, :]
    loss = criterion(logits, yb)
    loss.backward()
    optimizer.step()

    if epoch % 500 == 0:
        print(f"Epoch: {epoch} - Loss: {loss.item()}")

    if epoch % eval_iters == 0 and epoch > 0:
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            # The validation stream is endless and has no length, so a fixed
            # number of batches is drawn and averaged.
            for _ in range(val_batches):
                xb_val, yb_val = next(val_data_iterator)
                out_val = model(xb_val)
                logits_val = out_val[:, -1, :]
                val_loss += criterion(logits_val, yb_val).item()
        val_loss /= val_batches
        print(f"Validation Loss: {val_loss:.6f}")
        model.train()