In [1]:
import random

import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from torch.utils.data import DataLoader

from recommender.models import Recommender
from recommender.data_processing import get_context, pad_list, map_column, MASK, PAD


In [2]:
data_csv_path = "../data/ml-25m/ratings.csv"
movies_path = "../data/ml-25m/movies.csv"

model_path = "recommender_models/recommender_40epochs.ckpt"

This will be for the default Bert4Rec model, we use the Movie Lens Dataset to train it with 40 epochs. Each epochs requires 20 mins with 16GB Vram in GPU

In [44]:
data = pd.read_csv(data_csv_path)
movies = pd.read_csv(movies_path)


In [45]:
data.sort_values(by="timestamp", inplace=True)

In [46]:
data, mapping, inverse_mapping = map_column(data, col_name="movieId")
grp_by_train = data.groupby(by="userId")

In [47]:
random.sample(list(grp_by_train.groups), k=10)

[46905, 126716, 69895, 154829, 65154, 65695, 143793, 3025, 37186, 29427]

In [7]:
model = Recommender(
        vocab_size=len(mapping) + 2,
        lr=1e-4,
        dropout=0.3,
    )
model.eval()
model.load_state_dict(torch.load(model_path)["state_dict"])

<All keys matched successfully>

In [48]:
movie_to_idx = {a: mapping[b] for a, b in zip(movies.title.tolist(), movies.movieId.tolist()) if b in mapping}
idx_to_movie = {v: k for k, v in movie_to_idx.items()}

In [49]:
def predict(list_movies, model, movie_to_idx, idx_to_movie):
    
    ids = [PAD] * (120 - len(list_movies) - 1) + [movie_to_idx[a] for a in list_movies] + [MASK]
    
    src = torch.tensor(ids, dtype=torch.long).unsqueeze(0)
    
    with torch.no_grad():
        prediction = model(src)
    
    masked_pred = prediction[0, -1].numpy()
    
    sorted_predicted_ids = np.argsort(masked_pred).tolist()[::-1]
    
    sorted_predicted_ids = [a for a in sorted_predicted_ids if a not in ids]
    
    return [idx_to_movie[a] for a in sorted_predicted_ids[:30] if a in idx_to_movie]


### Senario 1: Adventure/Fantasy 

In [10]:
list_movies = ["Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)",
               "Harry Potter and the Chamber of Secrets (2002)",
               "Harry Potter and the Prisoner of Azkaban (2004)",
               "Harry Potter and the Goblet of Fire (2005)"]

top_movie = predict(list_movies, model, movie_to_idx, idx_to_movie)
top_movie

['Shrek 2 (2004)',
 'Ratatouille (2007)',
 'Ice Age (2002)',
 "Pirates of the Caribbean: Dead Man's Chest (2006)",
 'Harry Potter and the Order of the Phoenix (2007)',
 'Harry Potter and the Deathly Hallows: Part 1 (2010)',
 'Harry Potter and the Half-Blood Prince (2009)',
 'Up (2009)',
 'Spider-Man 2 (2004)',
 'Star Wars: Episode III - Revenge of the Sith (2005)',
 'Iron Man (2008)',
 'X2: X-Men United (2003)',
 'Avatar (2009)',
 '300 (2007)',
 'WALL·E (2008)',
 'Incredibles, The (2004)',
 'I, Robot (2004)',
 'Matrix Revolutions, The (2003)',
 'Avengers, The (2012)',
 'Toy Story 3 (2010)',
 'Star Wars: Episode II - Attack of the Clones (2002)',
 'Juno (2007)',
 'Bourne Supremacy, The (2004)',
 'Chicken Run (2000)',
 'Chronicles of Narnia: The Lion, the Witch and the Wardrobe, The (2005)',
 'Men in Black II (a.k.a. MIIB) (a.k.a. MIB 2) (2002)',
 'Harry Potter and the Deathly Hallows: Part 2 (2011)',
 'Bruce Almighty (2003)',
 'Spirited Away (Sen to Chihiro no kamikakushi) (2001)',
 'La

### Senario 2:  Action/Adventure

In [11]:
list_movies = ["Black Panther (2017)",
               "Avengers, The (2012)",
               "Avengers: Infinity War - Part I (2018)",
               "Logan (2017)",
               "Spider-Man (2002)",
               "Spider-Man 3 (2007)",
               "Spider-Man: Far from Home (2019)"]

top_movie = predict(list_movies, model, movie_to_idx, idx_to_movie)
top_movie

['Iron Man (2008)',
 'Guardians of the Galaxy (2014)',
 'Iron Man 2 (2010)',
 'Interstellar (2014)',
 'Guardians of the Galaxy 2 (2017)',
 'Deadpool (2016)',
 'The Martian (2015)',
 'Harry Potter and the Prisoner of Azkaban (2004)',
 'Thor: Ragnarok (2017)',
 'Avatar (2009)',
 'I, Robot (2004)',
 'Rogue One: A Star Wars Story (2016)',
 'X2: X-Men United (2003)',
 "Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)",
 'Ant-Man (2015)',
 'Kingsman: The Secret Service (2015)',
 'Harry Potter and the Chamber of Secrets (2002)',
 'Avengers: Age of Ultron (2015)',
 'Dark Knight, The (2008)',
 'Star Wars: Episode III - Revenge of the Sith (2005)',
 'Star Wars: Episode VII - The Force Awakens (2015)',
 'Captain America: Civil War (2016)',
 "Pirates of the Caribbean: Dead Man's Chest (2006)",
 'Captain America: The Winter Soldier (2014)',
 'Edge of Tomorrow (2014)',
 'Matrix Revolutions, The (2003)',
 'Inception (2010)',
 'Harry Potter and the Order o

### Senario 3: Comedy

In [12]:
list_movies = ["Zootopia (2016)",
               "Toy Story 3 (2010)",
               "Toy Story 4 (2019)",
               "Finding Nemo (2003)",
               "Ratatouille (2007)",
               "The Lego Movie (2014)",
               "Ghostbusters (a.k.a. Ghost Busters) (1984)",
               "Ace Ventura: When Nature Calls (1995)"]
top_movie = predict(list_movies, model, movie_to_idx, idx_to_movie)
top_movie

["Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)",
 'Harry Potter and the Prisoner of Azkaban (2004)',
 'Harry Potter and the Goblet of Fire (2005)',
 'Harry Potter and the Deathly Hallows: Part 1 (2010)',
 'Harry Potter and the Half-Blood Prince (2009)',
 'Frozen (2013)',
 'Monsters, Inc. (2001)',
 'Harry Potter and the Chamber of Secrets (2002)',
 'Toy Story (1995)',
 'The Hunger Games (2012)',
 'Big Hero 6 (2014)',
 'Shrek (2001)',
 'Ice Age (2002)',
 'Harry Potter and the Order of the Phoenix (2007)',
 'Incredibles, The (2004)',
 'Toy Story 2 (1999)',
 'Kung Fu Panda (2008)',
 'Inside Out (2015)',
 'Shrek 2 (2004)',
 'Brave (2012)',
 'Wreck-It Ralph (2012)',
 'How to Train Your Dragon 2 (2014)',
 'Harry Potter and the Deathly Hallows: Part 2 (2011)',
 'Megamind (2010)',
 'Lion King, The (1994)',
 'Tangled (2010)',
 'Avatar (2009)',
 'Pirates of the Caribbean: The Curse of the Black Pearl (2003)',
 'Beauty and the Beast (1991)']

This extends beyond the concepts discussed in the blog. We have developed a custom model, which we've named CustomBert. The inspiration for this model stemmed from the attention mechanism. We take the user's prompt as our input, and through the use of BERT's self-attention mechanism, along with a sophisticated tokenizer, we are able to effectively map the user input words to specific genres. We added a dropout layer with 0.3 dropout rate and a linear layer to make decisions for classification. We utilize the overviews in our dataset to match inputs to genres. After training for 10 epochs, we achieved a remarkable accuracy of 89.89%. This demonstrates the efficacy of the BERT model, which requires only 10 epochs to attain excellent performance in predicting the top 3 genres.

In [10]:
from transformers import BertTokenizer
import torch
import torch.nn as nn
import numpy as np
from transformers import BertModel
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from torch.utils.data import DataLoader, random_split, TensorDataset
from tqdm import tqdm
import json
from recommender.train_utils import MovieClassifier, encode_text, load_data, evaluate_model

import argparse
import os
import glob
csv_file_path = "../data/kaggle_movie/movies_metadata.csv"

In [12]:
device = torch.device("cpu")

In [13]:

print(f"Using device: {device}")

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
input_ids, attention_masks, ids = load_data(csv_file=csv_file_path, tokenizer=tokenizer)

# Encode labels using LabelEncoder and determine number of classes
# label_encoder = LabelEncoder()
# encoded_labels = label_encoder.fit_transform(ids.numpy()) 
print(f"Type of input_ids: {type(input_ids)}, shape: {input_ids.shape}")
print(f"Type of attention_masks: {type(attention_masks)}, shape: {attention_masks.shape}")
# print(f"Type of encoded_labels: {type(encoded_labels)}, shape: {torch.tensor(encoded_labels, dtype=torch.long).shape}")
# if not isinstance(encoded_labels, torch.Tensor):
#     encoded_labels = torch.tensor(encoded_labels, dtype=torch.long)
# else:
#     encoded_labels = encoded_labels.type(torch.long)

# print(f"Type of encoded_labels after conversion/check: {type(encoded_labels)}")
dataset = TensorDataset(input_ids, attention_masks, ids)


train_size = int(0.8 * len(dataset))
eval_size = len(dataset) - train_size

train_dataset, eval_dataset = random_split(dataset, [train_size, eval_size])


eval_loader = DataLoader(eval_dataset, batch_size=16, shuffle=False)


Using device: cpu




Manually Decoded Genres Dictionary:  {'Action': 0, 'Adventure': 1, 'Animation': 2, 'Aniplex': 3, 'BROSTA TV': 4, 'Carousel Productions': 5, 'Comedy': 6, 'Crime': 7, 'Documentary': 8, 'Drama': 9, 'Family': 10, 'Fantasy': 11, 'Foreign': 12, 'GoHands': 13, 'History': 14, 'Horror': 15, 'Mardock Scramble Production Committee': 16, 'Music': 17, 'Mystery': 18, 'Odyssey Media': 19, 'Pulser Productions': 20, 'Rogue State': 21, 'Romance': 22, 'Science Fiction': 23, 'Sentai Filmworks': 24, 'TV Movie': 25, 'Telescene Film Group Productions': 26, 'The Cartel': 27, 'Thriller': 28, 'Vision View Entertainment': 29, 'War': 30, 'Western': 31}
Type of input_ids: <class 'torch.Tensor'>, shape: torch.Size([44512, 512])
Type of attention_masks: <class 'torch.Tensor'>, shape: torch.Size([44512, 512])


In [12]:
model_files = 'recommender/customBert_89.pt'
if model_files:
    custom_model = MovieClassifier(num_movies=32).to(device)
    custom_model_dic = torch.load(model_files)
    custom_model.load_state_dict(custom_model_dic)
    print(f"Loaded best model from {model_files}")
custom_model.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Loaded best model from recommender/customBert_89.pt


MovieClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [8]:
valid_acc = evaluate_model(custom_model, eval_loader, device=device)
print(f"Validation Accuracy = {valid_acc}%, for top k = 3")

                                                             

Validation Accuracy = 93.04728743120296%, for top k = 3




In [32]:
valid_acc = evaluate_model(custom_model, eval_loader, device=device)
print(f"Validation Accuracy = {valid_acc}%, for top k = 2")

                                                             

Validation Accuracy = 93.21576996518027%, for top k = 2




In [33]:
valid_acc = evaluate_model(custom_model, eval_loader, device=device)
print(f"Validation Accuracy = {valid_acc}%, for top k = 1")

                                                             

Validation Accuracy = 93.31685948556667%, for top k = 1




We can observe that our model obtain a really good score in Validation. Now we will map the genre to movie with our cosine similarity function!

In [60]:
def genres_to_movies(genres_, model_, movies_, embeddings, k=5):
    if not genres_:
        print('Enter at least ONE genre!')
        return
    
    k = 5 if k <= 0 else k
    
    all_genres = ','.join(genres_)
    all_genres_emb = model_.encode(all_genres)
    all_genres_emb = np.array(all_genres_emb)
    
    cos_simi = cosine_similarity([all_genres_emb], embeddings)[0]
    
    top_indices = np.argsort(-cos_simi)[:k]
    recommend_movies = movies_.iloc[top_indices]
    
    print(f'Top {k} recommendations:')
    for index, movie in recommend_movies.iterrows():
        print('{:<5} {:<35} {}'.format(f'{index}.', f'{movie["title"]}', f'Genres: {movie["AllGenres"]}'))
        
    return recommend_movies['title'].tolist()

In [61]:
import random
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

import torch
from torch.utils.data import Dataset, DataLoader, random_split
from sentence_transformers import SentenceTransformer, InputExample, losses, models, datasets, evaluation, util
sentence_model = SentenceTransformer('distilbert-base-nli-mean-tokens').to(device)

In [62]:
movies_all_path = '../data/kaggle_movie/movies_with_keywords.csv'
movies = pd.read_csv(movies_all_path, keep_default_na=False, dtype=str)
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45429 entries, 0 to 45428
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           45429 non-null  object
 1   title        45429 non-null  object
 2   AllGenres    45429 non-null  object
 3   AllKeywords  45429 non-null  object
dtypes: object(4)
memory usage: 1.4+ MB


In [63]:
names_genres_keywords = movies['title'] + ',' + movies['AllGenres'] + ',' + movies['AllKeywords']
names_genres_keywords = names_genres_keywords.tolist()
names_genres_keywords[:5]

['Toy Story,Animation,Comedy,Family,friendship,new toy,rivalry,boy',
 'Jumanji,Adventure,Fantasy,Family,',
 'Grumpier Old Men,Romance,Comedy,fishing,best friend,duringcreditsstinger,old men',
 'Waiting to Exhale,Comedy,Drama,Romance,interracial relationship,based on novel,single mother,chick flick',
 'Father of the Bride Part II,Comedy,baby,aging,mother daughter relationship,gynecologist']

In [64]:
embedding = sentence_model.encode(names_genres_keywords, show_progress_bar=True)
embedding = np.array(embedding)

Batches:   0%|          | 0/1420 [00:00<?, ?it/s]

Now it is time to combine our model together!
The basic idea is use the trained Custom model to map description or called user input to genres and from genres to Movies by Cosine Similarity and encoding sentence model.

In [93]:
#This is for Description to genres mapping!
genre_map = {'Action': 0, 'Adventure': 1, 'Animation': 2, 'Aniplex': 3, 'BROSTA TV': 4, 'Carousel Productions': 5, 'Comedy': 6, 'Crime': 7, 'Documentary': 8, 'Drama': 9, 'Family': 10, 'Fantasy': 11, 'Foreign': 12, 'GoHands': 13, 'History': 14, 'Horror': 15, 'Mardock Scramble Production Committee': 16, 'Music': 17, 'Mystery': 18, 'Odyssey Media': 19, 'Pulser Productions': 20, 'Rogue State': 21, 'Romance': 22, 'Science Fiction': 23, 'Sentai Filmworks': 24, 'TV Movie': 25, 'Telescene Film Group Productions': 26, 'The Cartel': 27, 'Thriller': 28, 'Vision View Entertainment': 29, 'War': 30, 'Western': 31}
index_to_genre = {index: genre for genre, index in genre_map.items()}

my_desciption = "In a high-stakes chase across the globe, a retired spy races against time to thwart a shadowy villain’s plan, navigating explosive encounters and dramatic betrayals with breathtaking stunts and precision."
input_ids, attention_masks = encode_text([my_desciption], tokenizer)
input_ids = input_ids.to(device)
attention_masks = attention_masks.to(device)
output = custom_model.forward(input_ids, attention_masks)


In [94]:
probs = torch.softmax(output, dim=1)
top_k_probs, top_k_preds = torch.topk(probs, 3, dim=1)
top_3_preds = []
tolst = top_k_preds.tolist()

for x in tolst[0]:
    top_3_preds.append(index_to_genre[x])
print(top_3_preds)


['Carousel Productions', 'Drama', 'Action']


In [95]:
#genres to movies mapping!
rec_movies = genres_to_movies(top_3_preds, model_=sentence_model, movies_=movies, embeddings=embedding, k=3)

Top 3 recommendations:
45374. Swing                               Genres: Romance,Drama,Action
16002. Provocateur                         Genres: Action,Drama
20246. Milarepa                            Genres: Action,Adventure,Drama
