In [39]:
# pip install -U sentence-transformers

In [40]:
import numpy as np
import pandas as pd
import os
import kagglehub
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import euclidean_distances

In [41]:
# --- 1. LOAD DATA ---
# dataset_download returns a location that is used below as the directory
# holding the dataset's CSV files.
path = kagglehub.dataset_download("aayushsoni4/tmdb-6000-movie-dataset-with-ratings")

# Build file paths with os.path.join instead of string concatenation so the
# separator is handled by the standard library.
movies_data = pd.read_csv(os.path.join(path, "tmdb_6000_movie_dataset.csv"))
ratings_data = pd.read_csv(os.path.join(path, "tmdb_6000_movie_ratings.csv"))

Using Colab cache for faster access to the 'tmdb-6000-movie-dataset-with-ratings' dataset.


In [42]:
# Preview the id column used for joins plus the two text columns (title, overview).
movies_data[['tmdbId', 'title', 'overview']].head(3)

Unnamed: 0,tmdbId,title,overview
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...


In [43]:
# Inspect the first rows of the full movie metadata table (all columns).
movies_data.head()

Unnamed: 0.1,Unnamed: 0,budget,genres,homepage,tmdbId,keywords,original_language,original_title,overview,popularity,...,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{'id': 1463, 'name': 'culture clash'}, {'id':...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,"[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{'id': 270, 'name': 'ocean'}, {'id': 726, 'na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,"[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{'id': 470, 'name': 'spy'}, {'id': 818, 'name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,"[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{'id': 849, 'name': 'dc comics'}, {'id': 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,...,"[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{'id': 818, 'name': 'based on novel'}, {'id':...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,...,"[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [44]:
# Inspect the first rows of the ratings table (tmdbId, userId, rating, timestamp).
ratings_data.head()

Unnamed: 0,tmdbId,userId,rating,timestamp
0,19995,10.0,3.5,2015-05-03 15:28:22
1,19995,14.0,3.5,2011-07-24 18:47:52
2,19995,22.0,2.5,2019-10-14 07:22:58
3,19995,29.0,4.0,2019-02-05 20:38:13
4,19995,40.0,3.0,2016-01-06 00:50:54


In [45]:
# --- 2. PREPARE USER PROFILES ---
# A rating of 4 or higher is treated as a "like".
high_ratings = ratings_data.loc[ratings_data['rating'] >= 4]

# Attach the movie title to every liked rating (inner join on tmdbId).
merged_df = pd.merge(high_ratings, movies_data[['tmdbId', 'title']], on='tmdbId')

# Collapse to one row per user, collecting the liked ids and titles into lists.
# Named aggregation gives the output columns their final names directly.
user_profiles = (
    merged_df
    .groupby('userId')
    .agg(
        liked_movie_ids=('tmdbId', list),
        liked_movie_titles=('title', list),
    )
    .reset_index()
)

# userId is read as a float in the ratings file; store it as an integer.
user_profiles['userId'] = user_profiles['userId'].astype(int)

In [46]:
# Display the ratings that passed the `rating >= 4` filter.
high_ratings

Unnamed: 0,tmdbId,userId,rating,timestamp
3,19995,29.0,4.0,2019-02-05 20:38:13
8,19995,69.0,4.5,2010-02-15 16:09:23
9,19995,72.0,4.0,2010-03-16 15:12:50
10,19995,86.0,4.0,2017-04-17 20:49:16
12,19995,112.0,4.0,2016-10-26 06:52:37
...,...,...,...,...
24537607,626332,217563.0,4.0,2023-07-15 23:10:39
24537608,626332,227275.0,4.0,2023-06-10 14:15:04
24537610,626332,296414.0,4.0,2023-06-23 15:22:55
24537616,842544,122552.0,4.0,2023-02-14 15:36:52


In [47]:
# Display the per-user profiles: userId with lists of liked movie ids and titles.
user_profiles

Unnamed: 0,userId,liked_movie_ids,liked_movie_titles
0,1,"[1858, 671, 22, 585, 120, 98, 122, 121, 9806, ...","[Transformers, Harry Potter and the Philosophe..."
1,2,"[280, 36955, 9268, 1572, 8587, 197, 568, 9350,...","[Terminator 2: Judgment Day, True Lies, Eraser..."
2,3,"[767, 10681, 14160, 150540, 177572, 10191, 157...","[Harry Potter and the Half-Blood Prince, WALL·..."
3,4,"[767, 14160, 150540, 157336, 27205, 674, 2062,...","[Harry Potter and the Half-Blood Prince, Up, I..."
4,5,"[857, 603, 550, 745, 334, 10219, 9366, 807, 42...","[Saving Private Ryan, The Matrix, Fight Club, ..."
...,...,...,...
320279,330971,"[120, 122, 121, 9806, 568, 1900, 423, 1892, 28...",[The Lord of the Rings: The Fellowship of the ...
320280,330972,"[6520, 568, 9087, 710, 13, 8839, 10731, 11858,...","[First Knight, Apollo 13, The American Preside..."
320281,330973,"[155, 27205, 6964, 550, 8488, 14635, 10878, 77]","[The Dark Knight, Inception, Something's Gotta..."
320282,330974,"[38757, 58, 155, 1726, 1771, 1895, 120, 98, 18...","[Tangled, Pirates of the Caribbean: Dead Man's..."


In [48]:
# --- 3. CREATE VECTORS (THE "MODEL") ---
model = SentenceTransformer('all-MiniLM-L6-v2')

# Text to embed: title followed by overview. Both columns are NaN-filled so a
# missing title (or overview) cannot turn the whole concatenation into NaN,
# which would pass a float instead of a string to the encoder.
movies_data['soup'] = (
    movies_data['title'].fillna('') + " " + movies_data['overview'].fillna('')
)

print("Generating movie vectors...")
# normalize_embeddings=True requests length-normalised output vectors.
movie_embeddings = model.encode(
    movies_data['soup'].tolist(),
    show_progress_bar=True,
    normalize_embeddings=True,
)

# tmdbId -> embedding lookup. If a tmdbId appears on more than one row, the
# dict keeps the vector of the last such row.
movie_id_to_vector = dict(zip(movies_data['tmdbId'], movie_embeddings))

Generating movie vectors...


Batches:   0%|          | 0/182 [00:00<?, ?it/s]

In [49]:
# Spot-check the concatenated "title overview" text that is fed to the encoder.
movies_data['soup'].head()

Unnamed: 0,soup
0,"Avatar In the 22nd century, a paraplegic Marin..."
1,Pirates of the Caribbean: At World's End Capta...
2,Spectre A cryptic message from Bond’s past sen...
3,The Dark Knight Rises Following the death of D...
4,"John Carter John Carter is a war-weary, former..."


In [50]:
# --- 4. DEFINE RECOMMENDATION ENGINE ---
def get_recommendations(user_id, user_profiles_df, movies_df, vector_map, top_n=10):
    """Recommend movies whose vectors are closest to a user's average liked vector.

    The user's taste vector is the mean of the vectors of the movies listed in
    their profile. Every movie in ``vector_map`` is ranked by Euclidean
    distance to that vector; movies the user already liked are removed and
    the ``top_n`` nearest remaining movies are returned.

    Parameters
    ----------
    user_id
        Value matched against the ``userId`` column of ``user_profiles_df``.
    user_profiles_df : pandas.DataFrame
        Must contain ``userId`` and ``liked_movie_ids`` (a list of movie ids
        per row).
    movies_df : pandas.DataFrame
        Must contain ``tmdbId``, ``title`` and ``overview``.
    vector_map : dict
        Mapping of movie id to its embedding vector.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame or str
        A frame with columns ``tmdbId``, ``distance``, ``title``, ``overview``
        ordered from nearest to farthest. On failure a message string is
        returned instead: ``"User not found."`` when ``user_id`` has no
        profile row, ``"No vectors found."`` when none of the user's liked
        ids are present in ``vector_map``. Callers should check for a string
        before indexing the result.
    """
    user_row = user_profiles_df[user_profiles_df['userId'] == user_id]
    if user_row.empty:
        return "User not found."

    liked_ids = user_row['liked_movie_ids'].iloc[0]
    liked_vectors = [vector_map[mid] for mid in liked_ids if mid in vector_map]
    if not liked_vectors:
        return "No vectors found."

    # Mean of the liked vectors, shaped (1, dim) as a single query row.
    user_vector = np.mean(liked_vectors, axis=0).reshape(1, -1)

    # keys() and values() iterate in the same order, so ids and rows line up.
    all_ids = list(vector_map.keys())
    all_vectors = np.array(list(vector_map.values()))

    dists = euclidean_distances(user_vector, all_vectors).flatten()

    results = pd.DataFrame({'tmdbId': all_ids, 'distance': dists})
    results = results.sort_values('distance')

    # Drop movies the user already liked, then keep the nearest top_n.
    recommendations = results[~results['tmdbId'].isin(liked_ids)].head(top_n)

    # One metadata row per movie: a repeated tmdbId in movies_df would
    # otherwise duplicate recommendations in the merge and exceed top_n rows.
    movie_info = movies_df[['tmdbId', 'title', 'overview']].drop_duplicates(subset='tmdbId')

    return recommendations.merge(movie_info, on='tmdbId')

In [51]:
# # --- 5. TEST THE SYSTEM ---
# target_user = 33009
# recs = get_recommendations(target_user, user_profiles, movies_data, movie_id_to_vector)

# print(f"--- Recommendations for User {target_user} ---")
# print(recs[['title', 'distance']])

In [52]:
target_user = 9933

# 1. Retrieve and Inspect User's History (The "Input")
user_row = user_profiles[user_profiles['userId'] == target_user]
if user_row.empty:
    # Fail with a clear message instead of an IndexError on the lookup below.
    raise ValueError(f"User {target_user} not found in user_profiles.")
liked_ids = user_row['liked_movie_ids'].iloc[0]

# Filter the original movies_data to get details about what they liked
liked_movies_info = movies_data[movies_data['tmdbId'].isin(liked_ids)]

print(f"\n=== TRACE: User {target_user} Profile ===")
print(f"User has liked {len(liked_ids)} movies.")
print("--- Top 5 Liked Movies (Basis for User Vector) ---")
# Rows appear in movies_data order (first five matches), not ranked by rating.
# The overview is shown because the embedded text is the title plus the overview.
print(liked_movies_info[['title', 'overview']].head(5).to_string(index=False))

# 2. Generate and Inspect Recommendations (The "Output")
recs = get_recommendations(target_user, user_profiles, movies_data, movie_id_to_vector)

print("\n=== TRACE: Recommendations ===")
print("--- Top 5 Recommended Movies ---")
if isinstance(recs, str):
    # get_recommendations reports failure as a message string; show it
    # rather than trying to index it like a DataFrame.
    print(recs)
else:
    print(recs[['title', 'distance', 'overview']].head(5).to_string(index=False))


=== TRACE: User 9933 Profile ===
User has liked 6 movies.
--- Top 5 Liked Movies (Basis for User Vector) ---
                  title                                                                                                                                                                                                                                                                                                overview
      The Fifth Element                                                                                                                                                       In 2257, a taxi driver is unintentionally given the task of saving a young girl who is part of the key that will ensure the survival of humanity.
             Fight Club A ticking-time-bomb insomniac and a slippery soap salesman channel primal male aggression into a shocking new form of therapy. Their concept catches on, with underground "fight clubs" forming in every town, until an eccentric 

In [53]:
# Show the raw list of liked movie ids taken from target_user's profile row.
liked_ids

[18, 550, 85, 240, 13820, 289]

In [54]:
# # --- 6. SAVE THE SYSTEM (OPTIONAL) ---
# # This saves the "Map" (Vectors) so you don't have to wait next time.
# with open('movie_vectors.pkl', 'wb') as f:
#     pickle.dump(movie_id_to_vector, f)

# print("System Saved! You can find 'movie_vectors.pkl' in your files.")

In [55]:
# # 1. THE RED BOX: Filter the entire ratings table for high ratings (e.g. >= 3.5)
# high_ratings = ratings_data[ratings_data['rating'] >= 3.5]

# # 2. MERGE: Add movie titles so we see names instead of just ID numbers
# merged_df = high_ratings.merge(movies_data[['tmdbId', 'title']], on='tmdbId')

# # 3. GROUP BY: Compress the data so each user has one row with a list of items
# user_profiles = merged_df.groupby('userId').agg({
#     'tmdbId': list,     # Collect IDs into a list like [19995, 200, 300]
#     'title': list       # Collect Titles into a list like ['Avatar', 'Star Trek']
# }).reset_index()

# user_profiles['userId'] = user_profiles['userId'].astype(int)

# # 4. RESULT
# # print(user_profiles)

# # Print only UserID and Titles side-by-side
# print(user_profiles[['userId', 'title']].head(10))

In [56]:
# from sentence_transformers import SentenceTransformer

# # 1. Load the AI Model
# # 'all-MiniLM-L6-v2' is the industry standard for speed/performance balance
# model = SentenceTransformer('all-MiniLM-L6-v2')

# # 2. Prepare the Text (The "Soup" strategy is best)
# # We combine Title + Overview into one string for a richer vector (keywords are not included)
# movies_data['soup'] = movies_data['title'] + " " + movies_data['overview'].fillna('')

# # 3. Create the Vectors (This might take 1-2 minutes for 6k movies)
# print("Generating movie vectors...")
# movie_embeddings = model.encode(movies_data['soup'].tolist(), show_progress_bar=True, normalize_embeddings=True)

# # 4. Create a Lookup Dictionary (ID -> Vector)
# # This makes looking up "Avatar's vector" extremely fast
# movie_id_to_vector = dict(zip(movies_data['tmdbId'], movie_embeddings))

# print("Vector database ready.")

In [57]:
# from sklearn.metrics.pairwise import euclidean_distances  # <--- THIS IS THE FIX

# def get_recommendations(user_id, user_profiles_df, movies_df, vector_map):
#     # A. Get User History
#     user_row = user_profiles_df[user_profiles_df['userId'] == user_id]
#     if user_row.empty: return "User not found."

#     # B. Get Vectors
#     liked_ids = user_row['liked_movie_ids'].values[0]
#     liked_vectors = [vector_map[mid] for mid in liked_ids if mid in vector_map]

#     if not liked_vectors: return "No vectors found."

#     # C. Average Vector (User Profile)
#     user_vector = np.mean(liked_vectors, axis=0).reshape(1, -1)

#     # D. Euclidean Distance
#     all_vectors = np.array(list(vector_map.values()))
#     all_ids = list(vector_map.keys())

#     # The math happens here
#     dists = euclidean_distances(user_vector, all_vectors).flatten()

#     # E. Formatting
#     results = pd.DataFrame({'tmdbId': all_ids, 'distance': dists})
#     results = results.sort_values('distance')

#     # Filter out seen movies
#     recommendations = results[~results['tmdbId'].isin(liked_ids)].head(10)
#     return recommendations.merge(movies_df[['tmdbId', 'title', 'overview']], on='tmdbId')

# # Run it
# # print(get_recommendations(3, user_profiles, movies_data, movie_id_to_vector))

# # 1. SAVE the recommendations into a variable named 'recs'
# target_user = 16  # user to inspect (an earlier run looked at user 3)
# recs = get_recommendations(target_user, user_profiles, movies_data, movie_id_to_vector)

# # 2. PRINT the simple list (what you had before)
# print(f"--- Recommendations for User {target_user} ---")
# print(recs[['title', 'distance']])

# # 3. PRINT THE "WHY" (The new part)
# print(f"\n\n=== USER {target_user} LIKED THESE MOVIE PLOTS ===")
# user_liked_ids = user_profiles[user_profiles['userId'] == target_user]['liked_movie_ids'].values[0]
# user_liked_movies = movies_data[movies_data['tmdbId'].isin(user_liked_ids)]

# for i, row in user_liked_movies.iterrows():
#     print(f"Movie: {row['title']}")
#     print(f"Plot:  {str(row['overview'])[:150]}...") # Added str() to handle missing text safely
#     print("-" * 30)

# print(f"\n\n=== BECAUSE OF THAT, THE AI RECOMMENDS ===")
# for i, row in recs.head(3).iterrows():
#     print(f"Movie: {row['title']} (Distance: {row['distance']:.4f})")
#     print(f"Plot:  {str(row['overview'])[:150]}...")
#     print("-" * 30)