In [1]:
import html
import os
from collections import defaultdict

import pandas as pd
import requests
from dotenv import load_dotenv
from IPython.display import display, HTML
from qdrant_client import models,QdrantClient
from qdrant_client.http.models import PointStruct, SparseVector, NamedSparseVector
# Load variables from a local .env file into the environment; the Qdrant and
# OMDb settings are read later through os.getenv.
load_dotenv()

True

In [2]:
# Load the raw ratings and the movie metadata
ratings_df = pd.read_csv('data/ratings_small.csv')
movies_df = pd.read_csv('data/movies_metadata.csv', low_memory=False)

# NOTE(review): the merge below assumes ratings.movieId and metadata.id share
# the same id space -- confirm against the dataset documentation.

# Ratings: string movieId (so both merge keys have the same dtype) and
# z-score normalised rating (zero mean, unit standard deviation overall).
ratings_df = ratings_df.assign(
    movieId=lambda df: df['movieId'].astype(str),
    rating=lambda df: (df['rating'] - df['rating'].mean()) / df['rating'].std(),
)

# Movies: drop rows without an id *before* casting to string. astype(str)
# turns NaN into the literal 'nan', after which dropna() can no longer detect
# the missing value. imdb_id keeps only the 'tt<digits>' token; rows without
# one get '' rather than being dropped.
movies_df = (
    movies_df
    .dropna(subset=['id'])
    .assign(
        id=lambda df: df['id'].astype(str),
        # expand=False -> a Series (the default returns a one-column DataFrame)
        imdb_id=lambda df: (
            df['imdb_id'].astype(str).str.extract(r'(tt\d+)', expand=False).fillna('')
        ),
    )
)

# Merge ratings with movie metadata to get movie titles and imdb_id
merged_df = ratings_df.merge(
    movies_df[['id', 'title', 'imdb_id']],
    left_on='movieId',
    right_on='id',
    how='inner',
)

# Average the rating per (userId, movieId) so each pair appears exactly once
# (duplicate metadata ids would otherwise duplicate rows in the merge).
ratings_agg_df = merged_df.groupby(['userId', 'movieId'])['rating'].mean().reset_index()

In [3]:
# Connect to Qdrant; host and API key come from the environment
qdrant_client = QdrantClient(
    os.getenv("QDRANT_HOST"),
    api_key=os.getenv("QDRANT_API_KEY")
)

# Create the collection only when it is missing, so that re-running the
# notebook (Restart & Run All) does not fail on an already existing collection.
# The collection has no dense vectors, only the named sparse vector "ratings".
collection_name = "movies"
if not qdrant_client.collection_exists(collection_name):
    qdrant_client.create_collection(
        collection_name=collection_name,
        vectors_config={},
        sparse_vectors_config={
            "ratings": models.SparseVectorParams()
        }
    )

True

In [4]:
# Build one sparse vector per user: "indices" holds the integer movie ids and
# "values" the matching ratings, appended in the row order of ratings_agg_df.
user_sparse_vectors = defaultdict(lambda: {"values": [], "indices": []})
for uid, mid, score in zip(
    ratings_agg_df['userId'], ratings_agg_df['movieId'], ratings_agg_df['rating']
):
    entry = user_sparse_vectors[uid]
    entry["values"].append(score)
    entry["indices"].append(int(mid))

In [5]:
# Define a data generator
def data_generator():
    """Yield one PointStruct per entry of ``user_sparse_vectors``.

    Each point uses the user id both as its point id and as the ``user_id``
    payload field, and carries that user's indices/values as the named sparse
    vector ``"ratings"``.
    """
    for uid, entry in user_sparse_vectors.items():
        ratings_vector = SparseVector(
            indices=entry["indices"],
            values=entry["values"],
        )
        yield PointStruct(
            id=uid,
            vector={"ratings": ratings_vector},
            payload={"user_id": uid},
        )

# Stream the generated points into the collection
qdrant_client.upload_points(
    points=data_generator(),
    collection_name=collection_name,
)

# Search 

In [6]:
# Query ratings keyed by integer movie id; every value is either +1 or -1.
POSITIVE, NEGATIVE = 1, -1

my_ratings = {
    603: POSITIVE,    # Matrix
    13475: POSITIVE,  # Star Trek
    11: POSITIVE,     # Star Wars
    1091: NEGATIVE,   # The Thing
    862: POSITIVE,    # Toy Story
    597: NEGATIVE,    # Titanic
    680: NEGATIVE,    # Pulp Fiction
    13: POSITIVE,     # Forrest Gump
    120: POSITIVE,    # Lord of the Rings
    87: NEGATIVE,     # Indiana Jones
    562: NEGATIVE,    # Die Hard
}

In [7]:
# Create sparse vector from my_ratings
def to_vector(ratings):
    """Convert a ``{movie_id: rating}`` mapping into a SparseVector.

    The mapping's keys become the vector's ``indices`` and its values the
    vector's ``values``, both in the mapping's iteration order.
    """
    sparse = SparseVector(values=[], indices=[])
    sparse.values.extend(ratings.values())
    sparse.indices.extend(ratings.keys())
    return sparse

In [8]:
# Function to get movie poster using OMDB API
def get_movie_poster(imdb_id, api_key, timeout=10):
    """Look up a movie on OMDb and return its poster URL and metadata.

    Args:
        imdb_id: IMDb identifier, sent as the ``i`` query parameter.
        api_key: OMDb API key, sent as the ``apikey`` query parameter.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A ``(poster, data)`` tuple on every path, so callers can always
        unpack two values. ``poster`` is the response's ``Poster`` field, or
        ``'No Poster Found'`` when the field is absent or the lookup failed.
        ``data`` is the decoded JSON object, or ``{}`` when the lookup failed.
    """
    no_poster = 'No Poster Found'
    try:
        # Let requests build the query string so the values are URL-encoded.
        response = requests.get(
            "https://www.omdbapi.com/",
            params={"i": imdb_id, "apikey": api_key},
            timeout=timeout,
        )
    except requests.RequestException:
        # Network failure / timeout: degrade the same way as a non-200 reply.
        return no_poster, {}

    if response.status_code != 200:
        return no_poster, {}

    try:
        data = response.json()
    except ValueError:
        # Body was not valid JSON.
        return no_poster, {}

    if not isinstance(data, dict):
        return no_poster, {}
    return data.get('Poster', no_poster), data

In [9]:
# Wrap my ratings as a query against the named sparse vector "ratings",
# then fetch the top 20 hits from the collection
ratings_query = NamedSparseVector(name="ratings", vector=to_vector(my_ratings))
results = qdrant_client.search(
    collection_name=collection_name,
    query_vector=ratings_query,
    limit=20,
)

# OMDB API Key (read from the environment)
omdb_api_key = os.getenv("OMDB_API_KEY")

In [10]:
# Create HTML to display results
#
# The collection stores one point per *user* (point id == userId), so every
# hit in `results` is a user with similar taste, not a movie. Recommendations
# are therefore derived from those users: the ratings they gave are summed per
# movie, skipping movies already present in `my_ratings`.
MAX_RECOMMENDATIONS = 20  # number of movie cards to render

movie_scores = defaultdict(float)
for result in results:
    # .get() avoids inserting an empty entry into the defaultdict
    neighbour = user_sparse_vectors.get(result.id)
    if neighbour is None:
        continue  # hit does not match a locally known user
    for movie_id, rating in zip(neighbour["indices"], neighbour["values"]):
        if movie_id in my_ratings:
            continue  # do not recommend what was already rated
        movie_scores[movie_id] += rating

# Best-scoring movies first
top_movies = sorted(
    movie_scores.items(), key=lambda item: item[1], reverse=True
)[:MAX_RECOMMENDATIONS]

# One row per movie id, indexed for direct title / imdb_id lookups
movie_lookup = movies_df.drop_duplicates(subset='id').set_index('id')[['title', 'imdb_id']]

cards = []
for movie_id, score in top_movies:
    movie_key = str(movie_id)  # metadata ids are stored as strings
    if movie_key not in movie_lookup.index:
        continue  # Skip movies that have no metadata row

    movie_title = movie_lookup.at[movie_key, 'title']
    imdb_id = movie_lookup.at[movie_key, 'imdb_id']

    # Query OMDb at most once per movie, and only when there is an IMDb id.
    poster_url, movie_info = 'No Poster Found', {}
    if imdb_id:
        lookup = get_movie_poster(imdb_id, omdb_api_key)
        # Be defensive: accept a (poster, info) tuple or a bare poster string.
        if isinstance(lookup, tuple):
            poster_url, movie_info = lookup
        else:
            poster_url = lookup

    if pd.isna(movie_title):
        movie_title = movie_info.get('Title', 'Unknown Title')

    # Titles and URLs come from external data, so escape them before embedding.
    cards.append(f"""
    <div class='movie-card'>
        <img src="{html.escape(str(poster_url), quote=True)}" alt="Poster" class="movie-poster">
        <div class="movie-title">{html.escape(str(movie_title))}</div>
        <div class="movie-score">Score: {score:.3f}</div>
    </div>
    """)

html_content = "<div class='movies-container'>" + "".join(cards) + "</div>"

display(HTML(html_content))