In [None]:
# import psutil

# def monitor_resource_usage():
#     cpu_percent = psutil.cpu_percent()
#     virtual_memory = psutil.virtual_memory()
    
#     print(f"CPU Usage: {cpu_percent}%")
#     print(f"Memory Usage: {virtual_memory.used / (1024 ** 3):.2f} GB used out of {virtual_memory.total / (1024 ** 3):.2f} GB ({virtual_memory.percent}%)")

# monitor_resource_usage()

In [None]:
import psutil

def monitor_resource_usage(cpu_interval=0.1):
    """Print the current system-wide CPU and physical-memory utilisation.

    Args:
        cpu_interval: Number of seconds psutil samples CPU activity for.
            A positive value is needed because a non-blocking call
            (interval of ``None``/``0``) reports a meaningless ``0.0`` the
            first time it is used.

    Returns:
        None. The two figures are written to stdout.
    """
    cpu_percent = psutil.cpu_percent(interval=cpu_interval)
    virtual_memory = psutil.virtual_memory()

    # `available` is psutil's cross-platform estimate of memory that can be
    # handed to processes without swapping, so `total - available` is the
    # memory genuinely in use. Subtracting `cached` from `used` is wrong on
    # Linux, where `used` already excludes cache (the result could even be
    # negative).
    total = virtual_memory.total
    in_use = total - virtual_memory.available
    physical_memory_percent = in_use / total * 100 if total else 0.0

    print(f"CPU Usage: {cpu_percent}%")
    print(f"Physical Memory Usage: {physical_memory_percent:.2f}%")

monitor_resource_usage()


In [None]:
import numpy as np # linear algebra
import os # accessing directory structure
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from transformers import BertTokenizer, BertModel
import torch
import faiss 

### Data is downloaded from: https://grouplens.org/datasets/movielens/

In [None]:
monitor_resource_usage()

In [None]:
nRowsRead = 500 # specify 'None' if want to read whole file
# Load the MovieLens (small) CSVs. Only the first `nRowsRead` rows of EACH file are read,
# so everything built from these frames below covers a subset of the dataset.
movies = pd.read_csv('../data/movie-lens-small/movies.csv', delimiter=',', nrows = nRowsRead)
ratings = pd.read_csv('../data/movie-lens-small/ratings.csv', delimiter=',', nrows = nRowsRead)
tags = pd.read_csv('../data/movie-lens-small/tags.csv', delimiter=',', nrows = nRowsRead)

In [None]:
movies.shape

Let's take a quick look at what the data looks like:

In [None]:
pd.set_option('display.max_columns', None)

display(movies.head(5))
# display(ratings.head(5))
# display(tags.head(5))


In [None]:
monitor_resource_usage()

In [None]:
# Module-level BERT tokenizer and encoder for the 'bert-base-uncased' checkpoint.
# encode_titles_batch() below reads both of these globals directly.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
encoder = BertModel.from_pretrained('bert-base-uncased')

def encode_titles_batch(titles, batch_size=32):
    """Embed titles in batches with the module-level ``tokenizer``/``encoder``.

    Each title is represented by the mean of its token hidden states. The
    mean is weighted by the attention mask so that padding tokens (added to
    equalise lengths within a batch) do not leak into the result; without
    that, a title's embedding would depend on which other titles happen to
    share its batch.

    Args:
        titles: Sequence of title strings.
        batch_size: Number of titles encoded per forward pass (must be > 0).

    Returns:
        ``numpy.ndarray`` of shape ``(len(titles), hidden_size)``. For an
        empty ``titles`` an array with zero rows is returned.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")
    if len(titles) == 0:
        # np.vstack([]) raises, so return an explicitly empty matrix instead.
        return np.empty((0, encoder.config.hidden_size), dtype=np.float32)

    # Ceiling division: an exact multiple of batch_size must not report an
    # extra, non-existent batch.
    total_batches = (len(titles) + batch_size - 1) // batch_size
    all_embeddings = []

    for batch_number, start in enumerate(range(0, len(titles), batch_size), start=1):
        batch = titles[start:start + batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt", max_length=128)

        # Inference only: skip building the autograd graph to save memory.
        with torch.no_grad():
            outputs = encoder(**inputs)
            hidden = outputs.last_hidden_state
            mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            token_counts = mask.sum(dim=1).clamp(min=1)  # guard against /0
            pooled = (hidden * mask).sum(dim=1) / token_counts

        all_embeddings.append(pooled.numpy())
        print(f"Processed batch {batch_number}/{total_batches}")

    # Concatenate all batches into one (n_titles, hidden_size) matrix
    return np.vstack(all_embeddings)

In [None]:
monitor_resource_usage()

In [None]:
# Assuming movies is a DataFrame with 'title' and 'movieId'
batch_size = 128  # Adjust batch size as needed

# Process titles in batches and create embeddings
embeddings = encode_titles_batch(movies['title'].tolist(), batch_size=batch_size)


In [None]:
embeddings.shape

In [None]:
def create_faiss_index(embeddings, nlist=10):
    """Build, train and populate an inner-product IVF-Flat FAISS index.

    Args:
        embeddings: 2-D array-like of shape ``(n, d)``; converted to a
            C-contiguous float32 array before being handed to FAISS.
        nlist: Requested number of inverted lists (clusters). It is capped
            at ``n`` because the index cannot be trained with fewer vectors
            than clusters.

    Returns:
        The trained ``faiss.IndexIVFFlat`` holding all ``n`` vectors; vector
        ids are their row positions in ``embeddings``.

    Raises:
        ValueError: If ``embeddings`` is not 2-D or has no rows.
    """
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
    if embeddings.ndim != 2 or embeddings.shape[0] == 0:
        raise ValueError(
            f"embeddings must be a non-empty 2-D array, got shape {embeddings.shape}"
        )

    n, d = embeddings.shape  # number of records, dimension of embeddings
    print("Initiatize index")
    print(f'number of records to index: {n}')

    # Small datasets (n < nlist) would otherwise make training fail.
    nlist = max(1, min(nlist, n))

    # Flat inner-product quantizer assigns vectors to the IVF clusters.
    quantizer = faiss.IndexFlatIP(d)
    index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_INNER_PRODUCT)

    print("Training the index...")
    index.train(embeddings)  # Train the index before adding data
    print("Training completed.")

    print("Adding embeddings to the index...")
    index.add(embeddings)
    print("Embeddings added to the index.")

    return index
# Create FAISS index with these embeddings
faiss_index = create_faiss_index(embeddings)

In [None]:
monitor_resource_usage()

In [None]:
# Store a mapping of FAISS row id -> (movieId, title).
# FAISS numbers vectors by insertion position (0..n-1), so key on the row
# position rather than on the DataFrame index label: the two only coincide
# while `movies` keeps its default RangeIndex (e.g. not after filtering).
index_to_id = {
    position: (movie_id, title)
    for position, (movie_id, title) in enumerate(zip(movies['movieId'], movies['title']))
}

def encode_query(query, tokenizer, encoder):
    """Embed a query (string or list of strings) as mean-pooled hidden states.

    The mean is weighted by the attention mask so padding tokens are ignored.
    For a single string there is no padding, so this equals a plain mean over
    tokens.

    Args:
        query: Text, or list of texts, to embed.
        tokenizer: Callable producing ``input_ids``/``attention_mask`` tensors.
        encoder: Model whose output exposes ``last_hidden_state``.

    Returns:
        ``numpy.ndarray`` of shape ``(n_queries, hidden_size)``.
    """
    print(f"DEBUG: encoding query: {query}...")
    inputs = tokenizer(query, padding=True, truncation=True, return_tensors="pt", max_length=128)

    # Inference only: avoid building an autograd graph for the forward pass.
    with torch.no_grad():
        outputs = encoder(**inputs)
        hidden = outputs.last_hidden_state
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)  # guard against /0
        embeddings = ((hidden * mask).sum(dim=1) / token_counts).numpy()

    print(f"DEBUG: done encoding query: {query}")

    return embeddings
    
def search_index(query, k, faiss_index, index_to_id, tokenizer, encoder):
    """Return up to ``k`` nearest indexed entries for ``query``.

    Args:
        query: Query text to embed via ``encode_query``.
        k: Number of neighbours requested from the index.
        faiss_index: Object exposing ``search(vectors, k)`` that returns
            ``(distances, indices)``.
        index_to_id: Mapping from index row id to the stored payload.
        tokenizer: Passed through to ``encode_query``.
        encoder: Passed through to ``encode_query``.

    Returns:
        List of ``(index_to_id[row_id], distance)`` tuples in the order the
        index returned them. Fewer than ``k`` items are returned when the
        index cannot find ``k`` neighbours.
    """
    query_embedding = encode_query(query, tokenizer, encoder)
    print(f"DEBUG: calling faiss_index with {k}...")
    print(f"DEBUG: query_embedding dimension...")
    print(query_embedding.shape)
    distances, indices = faiss_index.search(query_embedding, k)
    print(f"DEBUG: done calling faiss_index with {k}!")

    # FAISS pads missing neighbours with id -1 (e.g. when the probed IVF
    # cells hold fewer than k vectors); looking those up would raise KeyError.
    return [
        (index_to_id[int(idx)], distances[0][rank])
        for rank, idx in enumerate(indices[0])
        if idx >= 0
    ]



In [None]:
monitor_resource_usage()


In [None]:
search_index('toy story', 5, faiss_index, index_to_id, tokenizer, encoder)

In [None]:
path_prefix = "../model-artifacts"

In [None]:
import pickle

# Assuming index_to_id is your dictionary
with open(f"{path_prefix}/index_to_id.pickle", 'wb') as handle:
    pickle.dump(index_to_id, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
faiss.write_index(faiss_index, f'{path_prefix}/faiss_index.idx')

In [None]:
# Save the tokenizer and model
encoder.save_pretrained(f"{path_prefix}/bert_model")
tokenizer.save_pretrained(f"{path_prefix}/bert_model")

## load model and test

In [None]:
import os # accessing directory structure
from transformers import BertTokenizer, BertModel
import faiss 
import torch
import numpy as np

In [None]:
path_prefix = "../model-artifacts"

In [None]:
monitor_resource_usage()

In [None]:
import pickle

faiss_index = faiss.read_index(f"{path_prefix}/faiss_index.idx")
tokenizer = BertTokenizer.from_pretrained(f"{path_prefix}/bert_model")
encoder = BertModel.from_pretrained(f"{path_prefix}/bert_model")

In [None]:
# Restore the index-id -> payload mapping from the pickle under `path_prefix`.
# NOTE(review): pickle.load can execute arbitrary code during deserialization;
# only load this file when it comes from a trusted source.
with open(f"{path_prefix}/index_to_id.pickle", 'rb') as handle:
    index_to_id = pickle.load(handle)

In [None]:
monitor_resource_usage()

In [None]:
def encode_query(query, tokenizer, encoder):
    """Embed a query (string or list of strings) as mean-pooled hidden states.

    The mean is weighted by the attention mask so padding tokens are ignored.
    For a single string there is no padding, so this equals a plain mean over
    tokens.

    Args:
        query: Text, or list of texts, to embed.
        tokenizer: Callable producing ``input_ids``/``attention_mask`` tensors.
        encoder: Model whose output exposes ``last_hidden_state``.

    Returns:
        ``numpy.ndarray`` of shape ``(n_queries, hidden_size)``.
    """
    print(f"DEBUG: encoding query: {query}...")
    inputs = tokenizer(query, padding=True, truncation=True, return_tensors="pt", max_length=128)

    # Inference only: avoid building an autograd graph for the forward pass.
    with torch.no_grad():
        outputs = encoder(**inputs)
        hidden = outputs.last_hidden_state
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)  # guard against /0
        embeddings = ((hidden * mask).sum(dim=1) / token_counts).numpy()

    print(f"DEBUG: done encoding query: {query}")

    return embeddings
    
def search_index(query, k, faiss_index, index_to_id, tokenizer, encoder):
    """Return up to ``k`` nearest indexed entries for ``query``.

    Args:
        query: Query text to embed via ``encode_query``.
        k: Number of neighbours requested from the index.
        faiss_index: Object exposing ``search(vectors, k)`` that returns
            ``(distances, indices)``.
        index_to_id: Mapping from index row id to the stored payload.
        tokenizer: Passed through to ``encode_query``.
        encoder: Passed through to ``encode_query``.

    Returns:
        List of ``(index_to_id[row_id], distance)`` tuples in the order the
        index returned them. Fewer than ``k`` items are returned when the
        index cannot find ``k`` neighbours.
    """
    query_embedding = encode_query(query, tokenizer, encoder)
    print(f"DEBUG: calling faiss_index with {k}...")
    print(f"DEBUG: query_embedding dimension...")
    print(query_embedding.shape)
    distances, indices = faiss_index.search(query_embedding, k)
    print(f"DEBUG: done calling faiss_index with {k}!")

    # FAISS pads missing neighbours with id -1 (e.g. when the probed IVF
    # cells hold fewer than k vectors); looking those up would raise KeyError.
    return [
        (index_to_id[int(idx)], distances[0][rank])
        for rank, idx in enumerate(indices[0])
        if idx >= 0
    ]

In [None]:
search_index('toy', 5, faiss_index, index_to_id, tokenizer, encoder)

In [None]:
monitor_resource_usage()