In [None]:
from pathlib import Path

import faiss
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the cleaned TMDB movie table from the project-relative data/ directory.
# Later cells read its 'title' and 'description' columns.
movies_cleaned = pd.read_csv('data/TMDB_movie_dataset_v11_cleaned.csv')

In [None]:
# Preview the first five rows to sanity-check the load.
movies_cleaned.head(5)

In [None]:
# (rows, columns) of the loaded table.
movies_cleaned.shape

In [None]:
# TF-IDF baseline over the movie descriptions.
# TfidfVectorizer is imported in the notebook's top import cell so that every
# dependency is declared in one place.

# Initialize TF-IDF vectorizer (English stop words are dropped)
tfidf = TfidfVectorizer(stop_words='english')

# Compute the sparse document-term TF-IDF matrix: one row per description
print("Computing TF-IDF matrix")
tfidf_matrix = tfidf.fit_transform(movies_cleaned['description'])
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")

The TF-IDF matrix has shape (1253612, 488236). With 1,253,612 documents, a full pairwise cosine-similarity matrix would contain roughly 1.57 × 10^12 entries, so computing the similarity between all pairs of documents is not practical.


In [None]:
# # Compute cosine similarity matrix
# from sklearn.metrics.pairwise import linear_kernel
#
# cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [None]:
# def get_recommendations(movie_title: str, cosine_sim: np.ndarray, top_n: int = 5) -> pd.Series:
#     """
#     Get top-N movie recommendations based on cosine similarity for a given movie title.
#
#     Args:
#         movie_title (str): Title of the movie to find recommendations for.
#         cosine_sim (np.ndarray): Cosine similarity matrix.
#         top_n (int): Number of recommendations to return (default: 5).
#
#     Returns:
#         pd.Series: Titles of the top-N recommended movies.
#     """
#     # Get movie index
#     idx = indices_series[movie_title]
#
#     # Get similarity scores for the movie
#     sim_scores = list(enumerate(cosine_sim[idx]))
#
#     # Sort by similarity score in descending order
#     sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
#
#     # Select top-N similar movies (excluding the input movie)
#     sim_scores = sim_scores[1:top_n + 1]
#     print(f"Top {top_n} recommendations for '{movie_title}': {sim_scores}")
#
#     # Get movie indices
#     movie_indices = [i[0] for i in sim_scores]
#     print(f"Movie indices for recommendations: {movie_indices}")
#
#     # Return recommended movie titles
#     return movies_cleaned['title'].iloc[movie_indices]

In [None]:
# Create index mapping for movie titles: title -> row label of movies_cleaned.
# NOTE(review): if titles are not unique, indices_series[title] yields several
# entries rather than a scalar — confirm before using it for single-row lookups.
indices_series = pd.Series(movies_cleaned.index, index=movies_cleaned['title'])
print(f"Indices mapping: {indices_series.head(20)}")

In [None]:
# Environment setup notes (run from a shell, not from this cell):
#   pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
#   pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 --force-reinstall --user   (forces the CUDA build)
#   pip install ipywidgets

# Report the PyTorch build and whether a GPU is visible, one fact per line.
print(
    f"PyTorch version: {torch.__version__}",
    f"CUDA available: {torch.cuda.is_available()}",
    f"CUDA version: {torch.version.cuda}",
    f"GPU device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'No GPU'}",
    sep="\n",
)

In [None]:
# Prefer the GPU whenever PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

In [None]:
# import warnings
# warnings.filterwarnings("ignore", category=UserWarning)

In [None]:
# Load the 'all-MiniLM-L6-v2' sentence-transformer onto the selected device.
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

In [None]:
# Build — or reload — the description embeddings and their FAISS index.
# Encoding every description is the slowest step in this notebook, so the
# artifacts written by a previous run are reused when they still line up with
# the loaded dataset. Delete the two files below to force a rebuild (e.g. after
# changing the model or the descriptions).
index_path = Path("data/movie_faiss.index")
embeddings_path = Path("data/movie_embeddings.npy")

if index_path.exists() and embeddings_path.exists():
    index = faiss.read_index(str(index_path))
    movie_description_embeddings = np.load(embeddings_path)
    # Trust the cache only if it holds exactly one vector per dataset row.
    cache_is_valid = (
        index.ntotal == len(movie_description_embeddings) == len(movies_cleaned)
    )
else:
    cache_is_valid = False

if cache_is_valid:
    print(f"Loaded {index.ntotal} cached vectors from {index_path}")
else:
    # Encode descriptions with Sentence-BERT
    movie_description_embeddings = model.encode(
        movies_cleaned['description'].tolist(),
        batch_size=64,
        show_progress_bar=True,
        convert_to_numpy=True
    )

    # Ensure embeddings are float32 and C-contiguous in a single step; no copy
    # is made when the array already satisfies both.
    movie_description_embeddings = np.ascontiguousarray(
        movie_description_embeddings, dtype=np.float32
    )

    # Normalize vectors in place so that inner product equals cosine similarity
    faiss.normalize_L2(movie_description_embeddings)

    # FAISS index (exact inner-product search)
    index = faiss.IndexFlatIP(movie_description_embeddings.shape[1])
    index.add(movie_description_embeddings)
    print(f"Added {index.ntotal} vectors to FAISS index")

    # Save index and embeddings
    faiss.write_index(index, str(index_path))
    np.save(embeddings_path, movie_description_embeddings)

# Defined on both paths so later cells can rely on it.
embedding_dim = movie_description_embeddings.shape[1]

In [None]:
# Reload the FAISS index from disk (the file written by faiss.write_index in the cell above).
index = faiss.read_index("data/movie_faiss.index")

In [None]:
def get_recommendations(movie_title, top_n=5):
    """Return titles of the movies whose descriptions are closest to a given movie's.

    The query movie's description is embedded with the notebook-level ``model``,
    L2-normalised, and searched against the notebook-level FAISS ``index``.
    ``index`` is expected to hold one vector per row of ``movies_cleaned``, in
    row order, so that a FAISS id is a row position.

    Args:
        movie_title (str): Exact title to look up in ``movies_cleaned['title']``.
            If several rows share the title, the first one is used as the query.
        top_n (int): Maximum number of recommendations to return (default: 5).

    Returns:
        list[str]: Up to ``top_n`` titles, best match first. The query row itself
        is never included. Fewer than ``top_n`` titles are returned when the
        index does not contain enough other vectors.

    Raises:
        ValueError: If the title is not in the dataset, its description is not a
            non-blank string, or ``top_n`` is negative.
    """
    # Locate the query by row *position* so it can be compared with FAISS ids.
    title_hits = np.flatnonzero((movies_cleaned['title'] == movie_title).to_numpy())
    if title_hits.size == 0:
        raise ValueError(f"Movie '{movie_title}' not found in dataset.")
    query_pos = int(title_hits[0])

    query_desc = movies_cleaned['description'].iloc[query_pos]
    if not isinstance(query_desc, str) or not query_desc.strip():
        raise ValueError(f"Invalid description for '{movie_title}'.")

    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}.")

    if torch.cuda.is_available():
        torch.cuda.empty_cache()  # Optional: release cached GPU memory before encoding

    query_vec = model.encode([query_desc], convert_to_numpy=True)
    # FAISS expects a 2-D, float32, C-contiguous query matrix.
    query_embedding = np.ascontiguousarray(np.atleast_2d(query_vec), dtype=np.float32)
    faiss.normalize_L2(query_embedding)

    # Ask for one extra hit because the query movie normally matches itself.
    _, neighbor_ids = index.search(query_embedding, top_n + 1)

    # Drop the query row explicitly instead of assuming it is ranked first: with
    # tied scores (e.g. identical descriptions) another row can take rank 0.
    # Negative ids are FAISS padding for "no result" and must not reach .iloc,
    # where -1 would silently select the last row.
    result_positions = [
        int(pos) for pos in neighbor_ids[0]
        if pos >= 0 and pos != query_pos
    ][:top_n]

    return movies_cleaned['title'].iloc[result_positions].tolist()


In [None]:
# Example query: ask for the five closest matches to one title and print them.
recommended_movies = get_recommendations(movie_title='The Avengers', top_n=5)
print("Recommended Movies:", recommended_movies)