In [1]:
import numpy as np # linear algebra
import os # accessing directory structure
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from transformers import BertTokenizer, BertModel
import faiss 
import torch

### Data is downloaded from: https://grouplens.org/datasets/movielens/

In [2]:
# Row cap applied to every CSV read below; None means each file is read in full.
nRowsRead = None

# Load the three MovieLens "small" tables (comma-separated files).
movies = pd.read_csv(
    '../data/movie-lens-small/movies.csv',
    sep=',',
    nrows=nRowsRead,
)
ratings = pd.read_csv(
    '../data/movie-lens-small/ratings.csv',
    sep=',',
    nrows=nRowsRead,
)
tags = pd.read_csv(
    '../data/movie-lens-small/tags.csv',
    sep=',',
    nrows=nRowsRead,
)

In [3]:
# Dimensions of the movies table as (rows, columns).
movies.shape

(9742, 3)

Let's take a quick look at what the data looks like:

In [4]:
# Remove pandas' cap on the number of displayed columns (applies to all later cells too).
pd.set_option('display.max_columns', None)

# Preview the first five rows of the movies table; the ratings/tags previews are disabled.
display(movies.head(5))
# display(ratings.head(5))
# display(tags.head(5))


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
# Load the pretrained 'bert-base-uncased' tokenizer and encoder; both are used as
# module-level globals by encode_titles_batch below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
encoder = BertModel.from_pretrained('bert-base-uncased')

def encode_titles_batch(titles, batch_size=32):
    """Embed a list of titles with the module-level BERT ``tokenizer`` and ``encoder``.

    Each title is represented by the mean of its token vectors from
    ``last_hidden_state``. The mean is weighted by the attention mask so that
    padding tokens do not contribute; a title's embedding therefore no longer
    depends on which other titles share its batch.

    Args:
        titles: Sequence of strings to embed (sliced into batches).
        batch_size: Number of titles sent through the encoder at once; must be >= 1.

    Returns:
        A 2-D numpy array with one row per title, in input order. For empty
        input an array of shape (0, encoder.config.hidden_size) is returned.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    n_titles = len(titles)
    if n_titles == 0:
        # np.vstack([]) raises on an empty list, so return an empty matrix of the
        # right width instead (float32 assumed to match the encoder's default output).
        return np.empty((0, encoder.config.hidden_size), dtype=np.float32)

    # Ceiling division: exact multiples of batch_size must not report an extra batch.
    n_batches = -(-n_titles // batch_size)
    all_embeddings = []

    for batch_no, start in enumerate(range(0, n_titles, batch_size), start=1):
        batch = titles[start:start + batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt", max_length=128)

        # Inference only: skip autograd bookkeeping to save memory and time.
        with torch.no_grad():
            outputs = encoder(**inputs)

        hidden = outputs.last_hidden_state
        # Zero out padded positions, then divide by the count of real tokens.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
        all_embeddings.append(pooled.cpu().numpy())

        print(f"Processed batch {batch_no}/{n_batches}")

    # Concatenate all batches
    return np.vstack(all_embeddings)

In [6]:
# `movies` is expected to be the DataFrame loaded earlier, with a 'title' column.
# Titles per encoder pass; lower this value if memory is tight.
batch_size = 512

# Embed every title in DataFrame row order, so embedding row i belongs to movie row i.
movie_titles = movies['title'].tolist()
embeddings = encode_titles_batch(movie_titles, batch_size=batch_size)

Processed batch 1/20
Processed batch 2/20
Processed batch 3/20
Processed batch 4/20
Processed batch 5/20
Processed batch 6/20
Processed batch 7/20
Processed batch 8/20
Processed batch 9/20
Processed batch 10/20
Processed batch 11/20
Processed batch 12/20
Processed batch 13/20
Processed batch 14/20
Processed batch 15/20
Processed batch 16/20
Processed batch 17/20
Processed batch 18/20
Processed batch 19/20
Processed batch 20/20


In [7]:
# Dimensions of the embedding matrix as (number of titles, embedding width).
embeddings.shape

(9742, 768)

In [8]:
def create_faiss_index(embeddings, m=32):
    """Build an in-memory FAISS HNSW index (inner-product metric) over ``embeddings``.

    Args:
        embeddings: 2-D array-like of shape (n_records, dim). It is converted to a
            C-contiguous float32 array before being added to the index.
        m: HNSW connectivity parameter handed to ``faiss.IndexHNSWFlat``
            (defaults to 32, the value that was previously hard-coded).

    Returns:
        The populated ``faiss.IndexHNSWFlat``; row i of ``embeddings`` gets id i.

    Raises:
        ValueError: If ``embeddings`` is not two-dimensional.
    """
    # Coerce up front so float64 or non-contiguous inputs are accepted as well.
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
    if embeddings.ndim != 2:
        raise ValueError(
            f"embeddings must be 2-D (n_records, dim), got shape {embeddings.shape}"
        )

    n, d = embeddings.shape  # number of records, dimension of embeddings
    print("Initiatize index")
    print(f'number of records to index: {n}')
    # NOTE(review): vectors are not L2-normalised here, so the inner-product scores
    # are raw dot products rather than cosine similarities - confirm this is intended.
    index = faiss.IndexHNSWFlat(d, m, faiss.METRIC_INNER_PRODUCT)

    print("Adding embeddings to the index...")
    index.add(embeddings)
    print("Embeddings added to the index.")

    return index
# Build the FAISS index over the title embeddings computed earlier in the notebook
faiss_index = create_faiss_index(embeddings)

Initiatize index
number of records to index: 9742
Adding embeddings to the index...
Embeddings added to the index.


In [9]:
# Map each FAISS row id to its (movieId, title) pair.
# FAISS assigns ids by insertion position (0..n-1), so the keys must be row
# positions rather than DataFrame index labels; enumerate() guarantees that even
# if `movies` is ever filtered or re-indexed, and avoids the slow iterrows() path.
index_to_id = {
    position: (movie_id, title)
    for position, (movie_id, title) in enumerate(zip(movies['movieId'], movies['title']))
}

In [10]:
# def search_index(query, k):
#     query_embedding = encode_titles_batch([query])    
#     distances, indices = faiss_index.search(query_embedding, k)
    
#     # Retrieve movie IDs for the indices
#     return [(index_to_id[idx], distances[0][i]) for i, idx in enumerate(indices[0])]
def search_index(query, k, faiss_index, index_to_id, tokenizer, encoder):
    """Return up to ``k`` index entries that best match ``query``.

    Args:
        query: Text to search for.
        k: Number of neighbours requested from the index.
        faiss_index: Index object exposing ``search(embeddings, k)``.
        index_to_id: Mapping from index row id to the stored entry.
        tokenizer: Tokenizer forwarded to ``encode_query``.
        encoder: Encoder forwarded to ``encode_query``.

    Returns:
        A list of ``(index_to_id[row_id], score)`` tuples in the order returned by
        the index. It can be shorter than ``k`` when the index finds fewer matches.
    """
    query_embedding = encode_query(query, tokenizer, encoder)
    print(f"DEBUG: calling faiss_index with {k}...")
    print(f"DEBUG: query_embedding dimension...")
    print(query_embedding.shape)
    distances, indices = faiss_index.search(query_embedding, k)
    print(f"DEBUG: done calling faiss_index with {k}!")

    # FAISS fills unused result slots with the label -1 when it finds fewer than k
    # neighbours; looking those up would raise KeyError, so they are skipped.
    return [
        (index_to_id[int(idx)], distances[0][i])
        for i, idx in enumerate(indices[0])
        if idx >= 0
    ]

def encode_query(query, tokenizer, encoder):
    """Embed ``query`` by mean-pooling the encoder's last hidden state.

    The mean is weighted by the attention mask, so padded positions (present
    when ``query`` is a list of strings of different lengths) are ignored.

    Args:
        query: A string, or a list of strings, to embed.
        tokenizer: Callable returning a mapping with an ``attention_mask`` tensor
            plus the model inputs when called with ``return_tensors="pt"``.
        encoder: Callable model whose output exposes ``last_hidden_state``.

    Returns:
        A 2-D numpy array with one row per query.
    """
    print(f"DEBUG: encoding query: {query}...")
    inputs = tokenizer(query, padding=True, truncation=True, return_tensors="pt", max_length=128)

    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        outputs = encoder(**inputs)

    hidden = outputs.last_hidden_state
    # Zero out padded positions, then divide by the number of real tokens.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
    embeddings = pooled.detach().cpu().numpy()
    print(f"DEBUG: done encoding query: {query}")

    return embeddings

In [None]:
# search_index('toy story', 5, faiss_index, index_to_id, tokenizer, encoder)

DEBUG: encoding query: toy story...


# Save model artifacts, then load and test

In [None]:
# Relative directory under which the mapping, FAISS index and BERT files are written.
path_prefix = "../model-artifacts"

In [None]:
import pickle

# Create the artifact directory if needed; open(..., 'wb') does not create
# missing parent directories and would fail with FileNotFoundError.
os.makedirs(path_prefix, exist_ok=True)

# Persist the FAISS-row-id -> (movieId, title) mapping next to the other artifacts.
with open(f"{path_prefix}/index_to_id.pickle", 'wb') as handle:
    pickle.dump(index_to_id, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Persist the FAISS index to a file inside the artifact directory.
faiss.write_index(faiss_index, f'{path_prefix}/faiss_index.idx')

In [None]:
# Store the encoder and the tokenizer side by side in one directory so that both
# can later be reloaded from the same path.
bert_dir = f"{path_prefix}/bert_model"
encoder.save_pretrained(bert_dir)
tokenizer.save_pretrained(bert_dir)

In [None]:
import os # accessing directory structure
from transformers import BertTokenizer, BertModel
import faiss 
import torch
import numpy as np

In [None]:
# Relative directory from which the saved artifacts are read back.
path_prefix = "../model-artifacts"

In [None]:
import pickle

# Reload the persisted FAISS index, then the tokenizer and encoder, which share
# a single directory under the artifact path.
bert_dir = f"{path_prefix}/bert_model"
faiss_index = faiss.read_index(f"{path_prefix}/faiss_index.idx")
tokenizer = BertTokenizer.from_pretrained(bert_dir)
encoder = BertModel.from_pretrained(bert_dir)

In [None]:
# Read the row-id -> entry mapping from the artifact directory; the file is written
# to f"{path_prefix}/index_to_id.pickle", not to the current working directory.
# NOTE: pickle.load can execute arbitrary code - only load files you produced yourself.
with open(f"{path_prefix}/index_to_id.pickle", 'rb') as handle:
    index_to_id = pickle.load(handle)

In [None]:
def search_index(query, k):
    """Return up to ``k`` index entries that best match ``query``.

    Uses the module-level ``faiss_index`` and ``index_to_id`` and embeds the
    query with ``encode_titles_batch``.

    Args:
        query: Text to search for.
        k: Number of neighbours requested from the index.

    Returns:
        A list of ``(index_to_id[row_id], score)`` tuples in the order returned by
        the index. It can be shorter than ``k`` when the index finds fewer matches.
    """
    query_embedding = encode_titles_batch([query])
    distances, indices = faiss_index.search(query_embedding, k)

    # FAISS fills unused result slots with the label -1 when it finds fewer than k
    # neighbours; looking those up would raise KeyError, so they are skipped.
    return [
        (index_to_id[int(idx)], distances[0][i])
        for i, idx in enumerate(indices[0])
        if idx >= 0
    ]

def encode_titles_batch(titles, batch_size=32):
    """Embed a list of titles with the module-level BERT ``tokenizer`` and ``encoder``.

    Each title is represented by the mean of its token vectors from
    ``last_hidden_state``. The mean is weighted by the attention mask so that
    padding tokens do not contribute; a title's embedding therefore no longer
    depends on which other titles share its batch.

    Args:
        titles: Sequence of strings to embed (sliced into batches).
        batch_size: Number of titles sent through the encoder at once; must be >= 1.

    Returns:
        A 2-D numpy array with one row per title, in input order. For empty
        input an array of shape (0, encoder.config.hidden_size) is returned.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    n_titles = len(titles)
    if n_titles == 0:
        # np.vstack([]) raises on an empty list, so return an empty matrix of the
        # right width instead (float32 assumed to match the encoder's default output).
        return np.empty((0, encoder.config.hidden_size), dtype=np.float32)

    # Ceiling division: exact multiples of batch_size must not report an extra batch.
    n_batches = -(-n_titles // batch_size)
    all_embeddings = []

    for batch_no, start in enumerate(range(0, n_titles, batch_size), start=1):
        batch = titles[start:start + batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt", max_length=128)

        # Inference only: skip autograd bookkeeping to save memory and time.
        with torch.no_grad():
            outputs = encoder(**inputs)

        hidden = outputs.last_hidden_state
        # Zero out padded positions, then divide by the count of real tokens.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
        all_embeddings.append(pooled.cpu().numpy())

        print(f"Processed batch {batch_no}/{n_batches}")

    # Concatenate all batches
    return np.vstack(all_embeddings)

In [None]:
# Smoke test: request the top 5 index matches for the query 'toy'.
search_index('toy', 5)