### Columns
1. id - Unique identifier for each movie. (type: int)
2. title - Title of the movie. (type: str)
3. vote_average - Average vote or rating given by viewers. (type: float)
4. vote_count - Total count of votes received for the movie. (type: int)
5. status - The status of the movie (e.g., Released, Rumored, Post Production, etc.). (type: str)
6. release_date - Date when the movie was released. (type: str)
7. revenue - Total revenue generated by the movie. (type: int)
8. runtime - Duration of the movie in minutes. (type: int)
9. adult - Indicates if the movie is suitable only for adult audiences. (type: bool)
10. backdrop_path - URL of the backdrop image for the movie. (type: str)
11. budget - Budget allocated for the movie. (type: int)
12. homepage - Official homepage URL of the movie. (type: str)
13. imdb_id - IMDb ID of the movie. (type: str)
14. original_language - Original language in which the movie was produced. (type: str)
15. original_title - Original title of the movie. (type: str)
16. overview - Brief description or summary of the movie. (type: str)
17. popularity - Popularity score of the movie. (type: float)
18. poster_path - Relative path of the movie poster image (e.g. `/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg`), not a full URL. (type: str)
19. tagline - Catchphrase or memorable line associated with the movie. (type: str)
20. genres - List of genres the movie belongs to. (type: str)
21. production_companies - List of production companies involved in the movie. (type: str)
22. production_countries - List of countries involved in the movie production. (type: str)
23. spoken_languages - List of languages spoken in the movie. (type: str)
24. keywords - Keywords associated with the movie. Do `.split(", ")` to convert to a list. (type: str)


In [10]:
import pandas as pd

In [11]:
# Load the raw TMDB export into memory with every column.
movies = pd.read_csv('data/TMDB_movie_dataset_v11.csv')
# Preview the first rows to sanity-check the parse.
movies.head(5)

KeyboardInterrupt: 

In [14]:
# (rows, columns) of the raw dataset before any cleaning.
movies.shape

(1262991, 24)

In [15]:
# Keep only the fields the recommender works with: identifiers, rating
# signals, and the text/metadata columns later merged into a description.
selected_columns = [
    'id',
    'title',
    'vote_average',
    'vote_count',
    'status',
    'overview',
    'popularity',
    'poster_path',
    'genres',
    'production_companies',
    'production_countries',
    'tagline',
    'keywords',
]
movies = movies[selected_columns]

In [16]:
# Missing-value count for each selected column.
movies.isnull().sum()

id                            0
title                        13
vote_average                  0
vote_count                    0
status                        0
overview                 274050
popularity                    0
poster_path              422511
genres                   531177
production_companies     708057
production_countries     586118
tagline                 1086202
keywords                 935898
dtype: int64

In [17]:
# Recommendations are looked up by title, so rows without a title are dropped.
movies = movies.dropna(subset=['title'])
# Equivalent in-place form: movies.dropna(subset=['title'], inplace=True)

In [18]:
# Collect the rows that exactly repeat an earlier row across every column.
duplicate_rows = movies.loc[movies.duplicated()]
print(f"Row duplicated: {duplicate_rows.shape[0]}")

Row duplicated: 379


In [19]:
# Remove fully identical rows, keeping one occurrence of each.
movies = movies.drop_duplicates()

# Repeat the duplicate scan to confirm that none are left.
duplicate_rows = movies.loc[movies.duplicated()]
print(f"Row duplicated after remove: {duplicate_rows.shape[0]}")

Row duplicated after remove: 0


In [20]:
# Show how many movies fall under each value of the status column.
print("\nUnique values status and appearances:")
print(movies['status'].value_counts())


Unique values status and appearances:
status
Released           1226515
In Production        15362
Post Production      11076
Planned               8816
Rumored                505
Canceled               325
Name: count, dtype: int64


In [21]:
# Drop movies whose status is 'Rumored' or 'Canceled'.
movies = movies[~movies['status'].isin(['Rumored', 'Canceled'])]

In [22]:
# Re-check the status distribution after removing 'Rumored' and 'Canceled' rows.
print("\nUnique values status and appearances:")
print(movies['status'].value_counts())


Unique values status and appearances:
status
Released           1226515
In Production        15362
Post Production      11076
Planned               8816
Name: count, dtype: int64


In [23]:
# Count the movies whose status is anything other than "Released".
non_released_movies = movies[~movies['status'].isin(['Released'])]
print(f"\nTotal rows has status is not Released: {len(non_released_movies)}")

# Mark rows that are both unreleased and missing an overview.
drop_condition = (~movies['status'].isin(['Released'])) & (movies['overview'].isnull())
print(f"Rows have status is not Released and overview is empty: {drop_condition.sum()}")

# .copy() detaches the filtered frame from its parent so that the column
# assignments in later cells do not trigger pandas' SettingWithCopyWarning.
movies = movies[~drop_condition].copy()
print(f'Length after drop: {len(movies)}')



Total rows has status is not Released: 35254
Rows have status is not Released and overview is empty: 8157
Length after drop: 1253612


In [24]:
# Replace missing poster_path values with a default placeholder image URL.
# NOTE(review): the existing poster_path values look like relative paths
# (e.g. '/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg'), while this default is an absolute
# Google Drive "view" link. Confirm that consumers handle both forms.
default_poster_url = 'https://drive.google.com/file/d/1PEm5a-91wkP4EybCkQRghJqDiTR1P5TC/view?usp=sharing'
movies['poster_path'] = movies['poster_path'].fillna(default_poster_url)

In [25]:
# Re-count missing values after the row drops and the poster_path fill.
movies.isnull().sum()

id                            0
title                         0
vote_average                  0
vote_count                    0
status                        0
overview                 265483
popularity                    0
poster_path                   0
genres                   528000
production_companies     703509
production_countries     582873
tagline                 1077107
keywords                 928190
dtype: int64

In [26]:
# Turn every missing text field into an empty string so that the string
# concatenation below never yields NaN.
movies[['overview', 'tagline', 'keywords', 'genres', 'production_companies', 'production_countries']] = (
    movies[['overview', 'tagline', 'keywords', 'genres', 'production_companies', 'production_countries']].fillna('')
)

# Merge the labelled text fields into one description string per movie.
movies['description'] = (
    'Overview: ' + movies['overview']
    + ' Tagline: ' + movies['tagline']
    + ' Keywords: ' + movies['keywords']
    + ' Genres: ' + movies['genres']
    + ' Production Companies: ' + movies['production_companies']
    + ' Production Countries: ' + movies['production_countries']
)
print(movies[['description']].head())

KeyboardInterrupt: 

In [16]:
# Trim the ends, then collapse every internal run of whitespace to one space.
movies['description'] = movies['description'].str.strip().str.replace(r'\s+', ' ', regex=True)
print(movies[['description']].head())

                                         description
0  Overview: Cobb, a skilled thief who commits co...
1  Overview: The adventures of a group of explore...
2  Overview: Batman raises the stakes in his war ...
3  Overview: In the 22nd century, a paraplegic Ma...
4  Overview: When an unexpected enemy emerges and...


In [17]:
# The description always carries the field labels ("Overview:", "Tagline:", ...),
# so it can never equal ''. Count instead the rows whose source text fields are
# all empty, i.e. descriptions that consist of labels only.
print(
    movies[['overview', 'tagline', 'keywords', 'genres', 'production_companies', 'production_countries']]
    .fillna('')
    .eq('')
    .all(axis=1)
    .sum()
)

0


In [18]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download required NLTK data
nltk.download('punkt', quiet=True)
# Newer NLTK releases load word_tokenize's models from 'punkt_tab' instead of
# 'punkt'; fetching both keeps the tokenizer working across NLTK versions.
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)


True

In [19]:
# Shared lemmatizer instance and English stop-word set used by preprocess_description.
lemmatizer = WordNetLemmatizer()
# Stored as a set so the per-token membership test is cheap.
stop_words = set(stopwords.words('english'))

# Preprocess a single description
def preprocess_description(text):
    """Normalize one movie description for the downstream text models.

    The text is lower-cased, tokenized with NLTK's ``word_tokenize``, stripped
    of English stop words, and every remaining token is lemmatized.

    Args:
        text: Raw description string. Any non-string value (for example the
            float ``NaN`` of a missing cell) is treated as an empty description.

    Returns:
        str: The surviving lemmatized tokens joined by single spaces, or ``''``
        for non-string input.
    """
    # Guard against missing values so Series.apply does not fail with an
    # AttributeError on a float NaN.
    if not isinstance(text, str):
        return ''

    tokens = word_tokenize(text.lower())

    # Remove stop words and lemmatize. Punctuation tokens are kept; a stricter
    # variant would also require word.isalpha().
    lemmatized = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    return ' '.join(lemmatized)

# Run the text normalization over every description, one element at a time.
movies['description'] = movies['description'].map(preprocess_description)


In [20]:
# Persist the cleaned frame (without the index column) for the modelling cells.
movies.to_csv('data/TMDB_movie_dataset_v11_cleaned.csv', index=False)

In [9]:
import pandas as pd
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
import torch

# Reload the cleaned dataset written by the preprocessing cells.
# NOTE(review): read_csv parses empty strings as NaN by default, so text columns
# that were filled with '' before saving may come back as NaN. Confirm before
# relying on them being strings.
movies_cleaned = pd.read_csv('data/TMDB_movie_dataset_v11_cleaned.csv')

In [2]:
# Preview the reloaded data, including the preprocessed description column.
movies_cleaned.head(5)

Unnamed: 0,id,title,vote_average,vote_count,status,overview,popularity,poster_path,genres,production_companies,production_countries,tagline,keywords,description
0,27205,Inception,8.364,34495,Released,"Cobb, a skilled thief who commits corporate es...",83.952,/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pict...","United Kingdom, United States of America",Your mind is the scene of the crime.,"rescue, mission, dream, airplane, paris, franc...","overview : cobb , skilled thief commits corpor..."
1,157336,Interstellar,8.417,32571,Released,The adventures of a group of explorers who mak...,140.241,/gEU2QniE6E77NI6lCU6MxlNBvIx.jpg,"Adventure, Drama, Science Fiction","Legendary Pictures, Syncopy, Lynda Obst Produc...","United Kingdom, United States of America",Mankind was born on Earth. It was never meant ...,"rescue, future, spacecraft, race against time,...",overview : adventure group explorer make use n...
2,155,The Dark Knight,8.512,30619,Released,Batman raises the stakes in his war on crime. ...,130.643,/qJ2tW6WMUDux911r6m7haRef0WH.jpg,"Drama, Action, Crime, Thriller","DC Comics, Legendary Pictures, Syncopy, Isobel...","United Kingdom, United States of America",Welcome to a world without rules.,"joker, sadism, chaos, secret identity, crime f...",overview : batman raise stake war crime . help...
3,19995,Avatar,7.573,29815,Released,"In the 22nd century, a paraplegic Marine is di...",79.932,/kyeqWdyUXW608qlYkRqosgbbJyK.jpg,"Action, Adventure, Fantasy, Science Fiction","Dune Entertainment, Lightstorm Entertainment, ...","United States of America, United Kingdom",Enter the world of Pandora.,"future, society, culture clash, space travel, ...","overview : 22nd century , paraplegic marine di..."
4,24428,The Avengers,7.71,29166,Released,When an unexpected enemy emerges and threatens...,98.082,/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg,"Science Fiction, Action, Adventure",Marvel Studios,United States of America,Some assembly required.,"new york city, superhero, shield, based on com...",overview : unexpected enemy emerges threatens ...


In [3]:
# (rows, columns) of the cleaned dataset.
movies_cleaned.shape

(1253612, 14)

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF model on the preprocessed descriptions, using scikit-learn's
# built-in English stop-word list, and report the size of the sparse matrix.
print("Computing TF-IDF matrix")
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies_cleaned['description'])
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")

Computing TF-IDF matrix
TF-IDF matrix shape: (1253612, 488236)


The TF-IDF matrix has shape (1253612, 488236). A full pairwise cosine-similarity matrix over all documents would have 1253612 × 1253612 entries, which is far too large to compute or hold in memory, so it is not calculated here.


In [25]:
# # Compute cosine similarity matrix (disabled: the full pairwise matrix is too large to compute)
# from sklearn.metrics.pairwise import linear_kernel
#
# cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [26]:
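# NOTE: disabled TF-IDF variant, kept for reference only. It needs the `cosine_sim`
# matrix (never computed above) and an `indices` title -> row mapping (the mapping
# built in this notebook is named `indices_series`).
# def get_recommendations(movie_title: str, cosine_sim: np.ndarray, top_n: int = 5) -> pd.Series: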
#     """
#     Get top-N movie recommendations based on cosine similarity for a given movie title.
#
#     Args:
#         movie_title (str): Title of the movie to find recommendations for.
#         cosine_sim (np.ndarray): Cosine similarity matrix.
#         top_n (int): Number of recommendations to return (default: 5).
#
#     Returns:
#         pd.Series: Titles of the top-N recommended movies.
#     """
#     # Get movie index
#     idx = indices[movie_title]
#
#     # Get similarity scores for the movie
#     sim_scores = list(enumerate(cosine_sim[idx]))
#
#     # Sort by similarity score in descending order
#     sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
#
#     # Select top-N similar movies (excluding the input movie)
#     sim_scores = sim_scores[1:top_n + 1]
#     print(f"Top {top_n} recommendations for '{movie_title}': {sim_scores}")
#
#     # Get movie indices
#     movie_indices = [i[0] for i in sim_scores]
#     print(f"Movie indices for recommendations: {movie_indices}")
#
#     # Return recommended movie titles
#     return movies_cleaned['title'].iloc[movie_indices]

In [4]:
# Create index mapping for movie titles (title -> row index of movies_cleaned).
# NOTE(review): nothing here enforces unique titles, so a lookup by title may
# return several rows. Verify before using this mapping for single-row access.
indices_series = pd.Series(movies_cleaned.index, index=movies_cleaned['title'])
print(f"Indices mapping: {indices_series.head(20)}")

Indices mapping: title
Inception                                             0
Interstellar                                          1
The Dark Knight                                       2
Avatar                                                3
The Avengers                                          4
Deadpool                                              5
Avengers: Infinity War                                6
Fight Club                                            7
Guardians of the Galaxy                               8
Pulp Fiction                                          9
Forrest Gump                                         10
Harry Potter and the Philosopher's Stone             11
Iron Man                                             12
Django Unchained                                     13
The Shawshank Redemption                             14
Avengers: Endgame                                    15
The Matrix                                           16
Titanic                  

In [5]:
# pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
# pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 --force-reinstall --user => Cuda
# pip install ipywidgets

# Report the PyTorch build and whether a CUDA GPU is visible to it.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")
# Only query the device name when CUDA is available.
print(f"GPU device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'No GPU'}")

PyTorch version: 2.7.1+cu118
CUDA available: True
CUDA version: 11.8
GPU device: NVIDIA GeForce MX450


In [11]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

Using device: cuda


In [None]:
# import warnings
# warnings.filterwarnings("ignore", category=UserWarning)

In [12]:
# Load the pretrained 'all-MiniLM-L6-v2' sentence-embedding model onto the selected device.
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

In [7]:
# Encode every description with Sentence-BERT.
movie_description_embeddings = model.encode(
    movies_cleaned['description'].tolist(),
    batch_size=64,
    show_progress_bar=True,
    convert_to_numpy=True
)

# FAISS needs float32, C-contiguous input. ascontiguousarray converts only when
# necessary, which avoids the unconditional full copy of the embedding matrix
# that np.array(..., dtype=np.float32) would make.
movie_description_embeddings = np.ascontiguousarray(movie_description_embeddings, dtype=np.float32)

# L2-normalize in place so that the inner product equals cosine similarity.
faiss.normalize_L2(movie_description_embeddings)

# Flat inner-product index; vectors are added in DataFrame row order, so a
# FAISS id is the positional row number in movies_cleaned.
embedding_dim = movie_description_embeddings.shape[1]
index = faiss.IndexFlatIP(embedding_dim)
index.add(movie_description_embeddings)
print(f"Added {index.ntotal} vectors to FAISS index")

# Save index and (normalized) embeddings
faiss.write_index(index, "data/movie_faiss.index")
np.save("data/movie_embeddings.npy", movie_description_embeddings)



Batches:   0%|          | 0/19588 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Added 1253612 vectors to FAISS index


In [6]:
# Reload the persisted FAISS index from disk.
index = faiss.read_index("data/movie_faiss.index")

In [7]:
def get_recommendations(movie_title, top_n=5):
    """Return the titles of the movies most similar to ``movie_title``.

    The description of the first row whose title equals ``movie_title`` is
    encoded with the sentence-transformer ``model``, L2-normalized, and searched
    against the FAISS inner-product ``index``.

    Args:
        movie_title (str): Exact title to look up in ``movies_cleaned``.
        top_n (int): Maximum number of recommendations to return (default: 5).

    Returns:
        list[str]: Up to ``top_n`` titles ordered by decreasing similarity,
        never including the query row itself. Empty when ``top_n`` is not
        positive.

    Raises:
        ValueError: If the title is not in the dataset, or if its description
            is missing or blank.
    """
    # Positional row numbers of every movie with this exact title. FAISS ids are
    # positional too, because the index was filled in DataFrame row order.
    title_positions = np.flatnonzero((movies_cleaned['title'] == movie_title).to_numpy())
    if title_positions.size == 0:
        raise ValueError(f"Movie '{movie_title}' not found in dataset.")
    query_position = int(title_positions[0])

    query_desc = movies_cleaned['description'].iloc[query_position]
    if not isinstance(query_desc, str) or not query_desc.strip():
        raise ValueError(f"Invalid description for '{movie_title}'.")

    if top_n <= 0:
        return []

    torch.cuda.empty_cache()  # Optional

    query_vec = model.encode([query_desc], convert_to_numpy=True)
    # FAISS expects a float32, C-contiguous array of shape (1, dim).
    query_embedding = np.ascontiguousarray(query_vec, dtype=np.float32).reshape(1, -1)

    faiss.normalize_L2(query_embedding)
    # Ask for one extra neighbour so that top_n remain once the query row is removed.
    _, neighbour_ids = index.search(query_embedding, top_n + 1)

    # Exclude the query row by position instead of blindly dropping the first
    # hit: rows with identical descriptions can tie with or outrank the query.
    # FAISS pads missing results with -1, which must be skipped as well.
    result_positions = [
        int(position)
        for position in neighbour_ids[0]
        if position != -1 and position != query_position
    ][:top_n]

    return movies_cleaned['title'].iloc[result_positions].tolist()


In [9]:
# Try the recommender on 'The Avengers'.
recommended_movies = get_recommendations(movie_title='The Avengers', top_n=5)
print("Recommended Movies:", recommended_movies)

  return forward_call(*args, **kwargs)


Recommended Movies: ['Captain America: The Winter Soldier', 'Marvel Rising: Secret Warriors', 'Avengers: Age of Ultron', 'Avengers: Infinity War', 'The Last Avenger']


In [13]:
# Second check with a title from a different genre.
recommended_movies_2 = get_recommendations(movie_title='Titanic', top_n=5)
print("Recommended Movies:", recommended_movies_2)

  return forward_call(*args, **kwargs)


Recommended Movies: ['Titanica', 'Britannic', 'Titanic: The Captain of the Titanic', "Titanic: A Tale of Two Journeys'", 'A Night to Remember']
