In [1]:
import os
import pandas as pd
from qdrant_client import models,QdrantClient
from qdrant_client.http.models import PointStruct, SparseVector
from collections import defaultdict
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
# Load CSV files
ratings_df = pd.read_csv('data/ratings_small.csv')
movies_df = pd.read_csv('data/movies_metadata.csv', low_memory=False)

# Convert movieId in ratings_df to string
ratings_df['movieId'] = ratings_df['movieId'].astype(str)

# Convert id in movies_df to string and handle missing values
movies_df['id'] = movies_df['id'].astype(str)
movies_df = movies_df.dropna(subset=['id'])
movies_df['title'] = movies_df['title'].fillna('')

# Normalize ratings
ratings_df['rating'] = (ratings_df['rating'] - ratings_df['rating'].mean()) / ratings_df['rating'].std()

# Merge ratings with movie metadata to get movie titles
ratings_df = ratings_df.merge(movies_df[['id', 'title']], left_on='movieId', right_on='id', how='left')

# Aggregate ratings to handle duplicate (userId, title) pairs
ratings_df = ratings_df.groupby(['userId', 'movieId']).rating.mean().reset_index()

In [3]:
qdrant_client = QdrantClient(
    os.getenv("QDRANT_HOST"),
    api_key=os.getenv("QDRANT_API_KEY")
)

# Create a new Qdrant collection
collection_name = "movies"
qdrant_client.create_collection(
    collection_name=collection_name,
    vectors_config={},
    sparse_vectors_config={
        "ratings": models.SparseVectorParams()
    }
)

True

In [4]:
# Convert ratings to sparse vectors
user_sparse_vectors = defaultdict(lambda: {"values": [], "indices": []})
for row in ratings_df.itertuples():
    user_sparse_vectors[row.userId]["values"].append(row.rating)
    user_sparse_vectors[row.userId]["indices"].append(int(row.movieId))

In [5]:
# Define a data generator
def data_generator():
    for user_id, sparse_vector in user_sparse_vectors.items():
        yield PointStruct(
            id=user_id,
            vector={"ratings": SparseVector(
                indices=sparse_vector["indices"],
                values=sparse_vector["values"]
            )},
            payload={"user_id": user_id}
        )

# Upload points using the data generator
qdrant_client.upload_points(
    collection_name=collection_name,
    points=data_generator()
)

# Search 

In [6]:
my_ratings = {
    603: 1,     # Matrix
    13475: 1,   # Star Trek
    11: 1,      # Star Wars
    1091: -1,   # The Thing
    862: 1,     # Toy Story
    597: -1,    # Titanic
    680: -1,    # Pulp Fiction
    13: 1,      # Forrest Gump
    120: 1,     # Lord of the Rings
    87: -1,     # Indiana Jones
    562: -1     # Die Hard
}

In [7]:
indices = list(my_ratings.keys())
values = list(my_ratings.values())
custom_sparse_vector = {"indices": indices, "values": values}

In [8]:
from qdrant_client import QdrantClient
from qdrant_client.http.models import NamedSparseVector, SparseVector

# Create the NamedSparseVector for the query
named_query_vector = NamedSparseVector(
    name="ratings",
    vector=SparseVector(
        indices=custom_sparse_vector["indices"],
        values=custom_sparse_vector["values"]
    )
)

# Perform the search
results = qdrant_client.search(
    collection_name=collection_name,
    query_vector=named_query_vector,
    limit=10
)

# Print search results with movie titles
for result in results:
    movie_id = result.id
    movie_title_row = movies_df.loc[movies_df['id'] == str(movie_id), 'title']
    if not movie_title_row.empty:
        movie_title = movie_title_row.values[0]
        print(f"Title: {movie_title}, Score: {result.score}, Payload: {result.payload}")
    else:
        print(f"Movie ID {movie_id} not found in movies_df.")

Movie ID 518 not found in movies_df.
Movie ID 57 not found in movies_df.
Title: Pirates of the Caribbean: At World's End, Score: 2.404021, Payload: {'user_id': 285}
Title: Magnetic Rose, Score: 2.404021, Payload: {'user_id': 30}
Title: The Horse Whisperer, Score: 2.404021, Payload: {'user_id': 547}
Movie ID 373 not found in movies_df.
Title: Citizen Kane, Score: 1.972675, Payload: {'user_id': 15}
Title: Bang, Boom, Bang, Score: 1.972675, Payload: {'user_id': 344}
Movie ID 353 not found in movies_df.
Movie ID 282 not found in movies_df.
