In [None]:
# Load the dotenv IPython extension and run it so that variables defined in a
# local .env file end up in the process environment.
# NOTE(review): presumably this is where OPENSEARCH_INITIAL_ADMIN_PASSWORD (read
# further down via os.getenv) comes from — confirm the .env file defines it.
%load_ext dotenv
%dotenv

In [None]:
import os
from opensearchpy import Field, Boolean, Float, Integer, Document, Keyword, Text, DenseVector, Nested, Date, Object
from opensearchpy import OpenSearch
import numpy as np
import pandas as pd
import datetime

from tqdm import tqdm

In [None]:
# Column names of a ratings table; not referenced by the cells visible here.
header = ['userId', 'movieId', 'rating', 'timestamp']

# u.item: movie id | title | release date | video release date | URL | 19 genre flags.
# The 19 genre flags get the integer column labels 0..18.
df_movies = pd.read_csv(
    'data/ml-100k/u.item',
    sep='|',
    names=['id', 'name', 'fecha', 'x', 'url'] + list(range(19)),
    encoding='latin-1',
)

# u.user has FIVE fields: user id | age | gender | occupation | zip code.
# Every field must be named: when fewer names than fields are given, pandas
# silently turns the leading column (the real user id) into the index and
# shifts the remaining ones, so 'id' would contain the age instead.
df_users = pd.read_csv(
    'data/ml-100k/u.user',
    sep='|',
    names=['id', 'age', 'gender', 'ocupation', 'x'],
    encoding='latin-1',
)

Run the `keras_recommentadion_system` notebook first: it generates the embedding matrices and the id-to-index mappings under `data/vectors/` that the next cell loads.

In [None]:
# Embedding matrices saved as plain .npy arrays.
movie_embeddings_matrix, user_embeddings_matrix = (
    np.load(npy_path)
    for npy_path in (
        'data/vectors/movie_embeddings_matrix.npy',
        'data/vectors/user_embeddings_matrix.npy',
    )
)

# Lookup objects stored as pickled 0-d object arrays; .item() unwraps the
# single Python object held by each array.
# NOTE(review): allow_pickle=True executes pickle on load — only use it with
# files produced by a trusted source.
user2Idx, movie2Idx = (
    np.load(npy_path, allow_pickle=True).item()
    for npy_path in (
        'data/vectors/user2Idx.npy',
        'data/vectors/movie2Idx.npy',
    )
)

In [None]:
# Translate each id into the index stored for it in the corresponding lookup.
# The lookup is a plain subscript (no default), so an id that is absent from
# the mapping raises instead of yielding a missing value.
df_users['userIdx'] = df_users['id'].apply(user2Idx.__getitem__)
df_movies['movieIdx'] = df_movies['id'].apply(movie2Idx.__getitem__)

In [None]:
# Movies without a url get an empty string instead of NaN.
df_movies['url'] = df_movies['url'].fillna('')

In [None]:
# Where the OpenSearch node is reachable.
host, port = 'localhost', 9200

# Basic-auth credentials: fixed user name, password taken from the environment
# (os.getenv returns None when the variable is not set).
admin_password = os.getenv("OPENSEARCH_INITIAL_ADMIN_PASSWORD")
auth = ('admin', admin_password)

In [None]:
# Options for the OpenSearch client: TLS is used, but certificate verification
# and the related warning are both switched off.
# NOTE(review): verify_certs=False accepts any certificate — acceptable for a
# local development node only.
connection_options = dict(
    hosts=[{'host': host, 'port': port}],
    http_auth=auth,
    use_ssl=True,
    verify_certs=False,
    ssl_show_warn=False,
)
client = OpenSearch(**connection_options)

# Last expression of the cell: the cluster health response is shown as output.
client.cluster.health()

In [None]:
# Define the KNNVector field where the vectors are going to be stored
class KNNVector(Field):
    """Custom mapping field whose type name is ``knn_vector``.

    Args:
        dimension: forwarded to ``Field`` as the ``dimension`` parameter.
        method: forwarded to ``Field`` as the ``method`` parameter.
        **kwargs: any further parameters, passed through to ``Field`` unchanged.
    """

    name = "knn_vector"

    def __init__(self, dimension, method, **kwargs):
        super().__init__(dimension=dimension, method=method, **kwargs)

# Method configuration handed to the KNNVector field: algorithm name,
# space type and engine.
method = dict(
    name="hnsw",
    space_type="cosinesimil",
    engine="nmslib",
)

In [None]:
# Size of the second axis of the movie embedding matrix; the same expression is
# passed as the `dimension` of the KNNVector field in the Movie mapping.
movie_embeddings_matrix.shape[1]

In [None]:
# Define the Movie class
index_name = 'movie'


class Movie(Document):
    """Mapping of one movie document stored in the ``index_name`` index.

    Besides a few scalar attributes, every document carries the movie's
    embedding in ``vector``. ``save`` is overridden so that the document id
    is always the movie id.
    """

    # Scalar attributes
    movie_id = Keyword()
    url = Keyword()
    name = Text()
    created_at = Date()
    terror = Boolean()

    # Embedding; its dimension is the second axis of the movie embedding matrix
    vector = KNNVector(movie_embeddings_matrix.shape[1], method)

    class Index:
        name = index_name
        # Index-level setting that turns k-NN on for this index
        settings = {'index': {'knn': True}}

    def save(self, **kwargs):
        """Persist the document using ``movie_id`` as the document id.

        Using the movie id instead of a generated id means saving the same
        movie again overwrites the existing document rather than creating a
        duplicate. All keyword arguments are forwarded to ``Document.save``
        and its return value is returned unchanged.
        """
        self.meta.id = self.movie_id
        return super().save(**kwargs)

In [None]:
# Set up the Movie index (mapping and settings declared on the class) in the
# cluster reached through `client`.
Movie.init(using=client)

In [None]:
# Sanity check: is the 'movie' index present?
client.indices.exists(index='movie')

In [None]:
# Show the stored definition of the 'movie' index as the cell output.
client.indices.get(index='movie')

In [None]:
# Load vectors to the db

# The genre flags of u.item are the integer-labelled columns 0..18, in the
# order given by the MovieLens 100K README (0 = unknown, ..., 11 = Horror).
# The `terror` field is the horror flag, so it must read column 11, not 0.
HORROR_GENRE_COLUMN = 11

for i, row in tqdm(df_movies.iterrows(), total=df_movies.shape[0]):
    mv = Movie(
        movie_id=row.id,
        url=row.url,
        # row['name'] on purpose: attribute access `row.name` returns the
        # Series name (the row label), not the value of the 'name' column.
        name=row['name'],
        # .loc makes the integer a column label, never a position.
        terror=bool(row.loc[HORROR_GENRE_COLUMN]),
        # .tolist() yields plain Python floats instead of numpy scalars.
        vector=movie_embeddings_matrix[row.movieIdx].tolist(),
        # Must match the `created_at` field declared on Movie; a misspelt
        # keyword would be stored as a separate, unmapped field.
        created_at=datetime.datetime.now(),
    )
    # Movie.save uses movie_id as document id, so re-running this cell
    # overwrites the documents instead of duplicating them.
    mv.save(using=client)

# Make the documents just written visible to search before counting them;
# without a refresh the count can lag behind the writes.
client.indices.refresh(index='movie')

# Get the total number of loaded registries
print(f"Total movies in db: {Movie.search(using=client).count()}")

In [None]:
# Peek at row 5 of the movie embedding matrix — the same row index the query
# cell below uses as `movie_idx_to_search`.
movie_embeddings_matrix[5]

In [None]:
# Execute a query
movie_idx_to_search = 5

# Show which movie is used as the query. A bare expression in the middle of a
# cell is evaluated and thrown away, so it has to be displayed explicitly.
display(df_movies[df_movies['movieIdx'] == movie_idx_to_search])

# Plain list of floats for the request body, instead of relying on the
# client's serializer to convert a numpy array.
query_vector = movie_embeddings_matrix[movie_idx_to_search].tolist()

# k-NN search on the `vector` field: ask for the 20 nearest neighbours of the
# query vector and return the first 5 hits.
query = {
    "size": 5,
    "query": {
        "knn": {
            "vector": {
                "vector": query_vector,
                "k": 20
            }
        }
    }
}

response = client.search(index='movie', body=query)

# One line per hit (raw hit dictionaries, including score and source).
for h in response['hits']['hits']:
    print(h)