In [9]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Download (or reuse a cached copy of) the TMDB dataset through kagglehub and
# keep the value it returns so later cells can refer to it.
tmdb_tmdb_movie_metadata_path = kagglehub.dataset_download(
    'tmdb/tmdb-movie-metadata'
)

print('Data source import complete.')


Using Colab cache for faster access to the 'tmdb-movie-metadata' dataset.
Data source import complete.


In [10]:
import warnings
warnings.filterwarnings("ignore")


# ------------------ 1) Import libraries ------------------

In [11]:
import os
import time, ast, re
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize


# ------------------ 2) User options ------------------


In [12]:
# Directory holding the two TMDB CSV files.
# Prefer the directory returned by the kagglehub download cell so that a fresh
# kernel works without manually uploading files. Fall back to the previously
# hard-coded Colab folder when the download cell was removed or when the
# download directory does not contain the expected file.
# NOTE(review): assumes the Kaggle dataset ships 'tmdb_5000_movies.csv' and
# 'tmdb_5000_credits.csv' at its top level - the fallback covers the case where
# it does not.
_FALLBACK_DATA_DIR = '/content/sample_data'
try:
    DATA_DIR = tmdb_tmdb_movie_metadata_path
except NameError:
    DATA_DIR = _FALLBACK_DATA_DIR
if not os.path.exists(os.path.join(DATA_DIR, 'tmdb_5000_movies.csv')):
    DATA_DIR = _FALLBACK_DATA_DIR

MOVIES_CSV = os.path.join(DATA_DIR, 'tmdb_5000_movies.csv')
CREDITS_CSV = os.path.join(DATA_DIR, 'tmdb_5000_credits.csv')

# Optional settings
USE_KMEANS = True           # Whether to cluster movies; same-cluster candidates get a score boost
KMEANS_CLUSTERS = 12        # Number of clusters
TFIDF_MAX_FEATURES = 5000   # Max features for TF-IDF vectorizer
TOP_CAST = 3                 # Number of top cast members to include
TOP_GENRES = 2               # Number of top genres to include
TOP_KEYWORDS = 3             # Number of top keywords to include

# Timer to track total build time (read again by the "Ready" cell)
start_all = time.time()



# ------------------ 3) Load & merge data ------------------

In [13]:
# Read both CSV files, strip stray whitespace from the header names and force
# the join keys to plain integers.
movies = (
    pd.read_csv(MOVIES_CSV, low_memory=False)
    .rename(columns=str.strip)
    .astype({'id': int})
)
credits = (
    pd.read_csv(CREDITS_CSV, engine='python')
    .rename(columns=str.strip)
    .astype({'movie_id': int})
)

# Inner join: keep only movies that have a matching credits row.
df = movies.merge(credits, left_on='id', right_on='movie_id', how='inner')

# Both inputs may carry a 'title' column, in which case pandas suffixes them
# with _x / _y. Promote the first one available to a plain 'title' column.
if 'title' not in df.columns:
    _title_source = next(
        (name for name in ('title_x', 'title_y') if name in df.columns),
        None,
    )
    if _title_source is None:
        raise ValueError("After merging, 'title' column not found.")
    df = df.rename(columns={_title_source: 'title'})

print(f"Loaded {len(df)} movies. Columns sample: {list(df.columns[:12])}")

Loaded 4803 movies. Columns sample: ['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language', 'original_title', 'overview', 'popularity', 'production_companies', 'production_countries', 'release_date']



# ------------------ 4) Helper functions ------------------

In [14]:
def safe_eval(x):
    """Parse a stringified Python/JSON-like literal into an object.

    Parameters
    ----------
    x : object
        Usually a string such as ``'[{"id": 1, "name": "Action"}]'``.
        Values that are already a list, tuple or dict are returned unchanged,
        which keeps the parsing cell idempotent when it is re-run.

    Returns
    -------
    object
        The parsed literal, the input itself when it is already a container,
        or ``[]`` for missing values, non-string scalars and unparsable text.
    """
    # Already parsed (e.g. the cell was executed twice): do not wipe the data.
    if isinstance(x, (list, tuple, dict)):
        return x
    # NaN / None / numbers cannot hold a serialized list.
    if not isinstance(x, str):
        return []
    try:
        return ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Only the failures literal_eval is documented to raise are treated as
        # "unparsable"; KeyboardInterrupt and friends still propagate.
        return []

def clean_name(s):
    """Return ``s`` lowercased with all whitespace removed; '' for non-strings."""
    if not isinstance(s, str):
        return ''
    lowered = s.lower()
    return re.sub(r'\s+', '', lowered)

def get_top_items(list_of_dicts, key='name', top_n=3):
    """Collect cleaned ``key`` values from the first ``top_n`` entries.

    Only the first ``top_n`` positions of ``list_of_dicts`` are inspected.
    Entries in that window that are not dicts, or whose ``key`` value is
    missing/falsy, are skipped (they still use up a position). Anything other
    than a list or tuple yields an empty list.
    """
    if not isinstance(list_of_dicts, (list, tuple)):
        return []
    return [
        clean_name(entry.get(key))
        for position, entry in enumerate(list_of_dicts)
        if position < top_n and isinstance(entry, dict) and entry.get(key)
    ]

def extract_director(crew_list):
    """Return the cleaned name of the first crew entry whose job is 'Director'.

    Returns '' when ``crew_list`` is not a list/tuple or holds no such entry.
    """
    if not isinstance(crew_list, (list, tuple)):
        return ''
    directors = (
        clean_name(person.get('name'))
        for person in crew_list
        if isinstance(person, dict) and person.get('job') == 'Director'
    )
    # First match wins; fall back to the empty string.
    return next(directors, '')


# ------------------ 5) Preprocess columns ------------------

In [15]:
# Turn the serialized list columns into real Python lists; a column that is
# absent from the merged frame becomes a column of empty lists.
for col in ('genres', 'keywords', 'cast', 'crew'):
    df[col] = (
        df[col].apply(safe_eval)
        if col in df.columns
        else [[] for _ in range(len(df))]
    )

# Normalise the overview: missing -> '', lowercase, and every character that
# is not a-z, 0-9 or whitespace replaced by a space.
if 'overview' not in df.columns:
    df['overview'] = ''
_overview_lower = df['overview'].fillna('').astype(str).str.lower()
df['overview'] = _overview_lower.str.replace(r'[^a-z0-9\s]', ' ', regex=True)

# Derive the short token lists that feed the soup.
for _target, _source, _limit in (
    ('top_genres', 'genres', TOP_GENRES),
    ('top_keywords', 'keywords', TOP_KEYWORDS),
    ('top_cast', 'cast', TOP_CAST),
):
    df[_target] = df[_source].apply(
        lambda items, n=_limit: get_top_items(items, top_n=n)
    )
df['director'] = df['crew'].apply(extract_director)

def make_soup(row):
    """Join a row's genre, keyword, cast and director tokens with its overview.

    ``row`` only needs ``.get`` (a pandas row or a plain dict both work).
    The overview is cut to its first 200 whitespace-separated words, and the
    space-joined result is stripped of leading/trailing whitespace.
    """
    tokens = [
        *row.get('top_genres', []),
        *row.get('top_keywords', []),
        *row.get('top_cast', []),
    ]
    director = row.get('director')
    if director:
        tokens.append(director)
    overview_words = str(row.get('overview', '')).split()
    tokens.append(' '.join(overview_words[:200]))
    return ' '.join(tokens).strip()

# One soup string per movie, built row by row.
df['soup'] = df.apply(make_soup, axis=1)

# Drop rows without a title and renumber the index from 0.
df = df.dropna(subset=['title']).reset_index(drop=True)
print(f"Preprocessing done. Total movies: {len(df)}")

Preprocessing done. Total movies: 4803



# ------------------ 6) TF-IDF vectorization & cosine similarity ------------------

In [16]:
# Vectorise the soup text (English stop words removed, vocabulary capped at
# TFIDF_MAX_FEATURES terms).
tfidf = TfidfVectorizer(
    stop_words='english',
    max_features=TFIDF_MAX_FEATURES,
)
tfidf_matrix = tfidf.fit_transform(df['soup'])

# Pairwise dot products of every movie vector against every other one.
cosine_sim = linear_kernel(tfidf_matrix)

# ------------------ 7) Optional KMeans clustering ------------------

In [17]:
if not USE_KMEANS:
    # Clustering disabled: no model, and every movie gets the placeholder label -1.
    kmeans = None
    df['cluster'] = -1
else:
    X = normalize(tfidf_matrix)   # Normalize before clustering
    kmeans = KMeans(n_clusters=KMEANS_CLUSTERS, random_state=42, n_init=10).fit(X)
    df['cluster'] = kmeans.labels_
    print(f"KMeans clustering done (k={KMEANS_CLUSTERS})")

KMeans clustering done (k=12)



# ------------------ 8) Recommendation functions ------------------

In [18]:
# Map lowercase movie titles to row indices.
# Several movies can share a title. Series.drop_duplicates() would compare the
# *values* (the row numbers, which are all distinct) and therefore remove
# nothing, so de-duplicate on the index labels instead and keep the first
# occurrence of each title. This guarantees indices[title] is a scalar.
indices = pd.Series(df.index, index=df['title'].str.lower())
indices = indices[~indices.index.duplicated(keep='first')]

def find_title_fuzzy(query):
    """Resolve a free-text title to a row index of ``df``.

    Lookup order:
      1. exact, case-insensitive match through ``indices``;
      2. first movie whose lowercase title contains the query as a plain
         substring.

    Parameters
    ----------
    query : object
        Title text; converted with ``str()``, stripped and lowercased.

    Returns
    -------
    int or None
        Row index of the matched movie, or ``None`` when the query is
        None/blank or nothing matches.
    """
    if query is None:
        return None
    q = str(query).strip().lower()
    if not q:
        # An empty needle is contained in every title; treat it as "no match"
        # instead of silently returning the first movie.
        return None
    if q in indices.index:
        hit = indices[q]
        # When several movies share this title the lookup yields a Series;
        # take the first occurrence rather than failing in int().
        if isinstance(hit, pd.Series):
            hit = hit.iloc[0]
        return int(hit)
    # regex=False: titles/queries may contain characters such as ( ) ? + that
    # must be matched literally. na=False keeps the mask strictly boolean.
    mask = df['title'].str.lower().str.contains(q, regex=False, na=False)
    if mask.any():
        return int(df[mask].index[0])
    return None

def recommend_by_title(title, top_n=8, use_cluster_boost=True, boost_factor=0.15):
    """Recommend the movies most similar to one given title.

    Parameters
    ----------
    title : str
        Title to look up (resolved with ``find_title_fuzzy``).
    top_n : int
        Number of recommendations to return.
    use_cluster_boost : bool
        When True and ``df`` has a 'cluster' column, candidates in the same
        cluster as the query movie get ``boost_factor`` added to their score.
    boost_factor : float
        Size of the same-cluster bonus.

    Returns
    -------
    pandas.DataFrame or str
        A frame with title, overview, release_date, vote_average, vote_count
        and score, best match first; or a not-found message string when the
        title cannot be resolved.

    Notes
    -----
    Row indices of ``df`` are used as positions into ``cosine_sim``, so ``df``
    is expected to carry a 0..n-1 index.
    """
    idx = find_title_fuzzy(title)
    if idx is None:
        return f"Title '{title}' not found (try a different spelling)."

    sim_df = pd.DataFrame({
        'idx': np.arange(len(df)),
        'sim': np.asarray(cosine_sim[idx], dtype=float),
    })

    if use_cluster_boost and 'cluster' in df.columns:
        target_cluster = int(df.loc[idx, 'cluster'])
        # A negative label is the placeholder written when clustering is
        # disabled; boosting then would only inflate every score equally.
        if target_cluster >= 0:
            same_cluster = df['cluster'].to_numpy() == target_cluster
            sim_df['sim'] += boost_factor * same_cluster.astype(float)

    # Never recommend the query movie itself.
    sim_df = sim_df[sim_df['idx'] != idx]
    sim_df = sim_df.sort_values('sim', ascending=False).head(top_n)

    out = df.loc[sim_df['idx'], ['title','overview','release_date','vote_average','vote_count']].copy()
    out['score'] = sim_df['sim'].values
    return out.reset_index(drop=True)

def recommend_for_profile(liked_titles, top_n=10):
    """Recommend movies for a user described by a list of liked titles.

    The TF-IDF rows of the resolved titles are averaged into one profile
    vector, and every movie is scored by its dot product with that vector.

    Parameters
    ----------
    liked_titles : iterable of str, or str
        Titles the user likes; a single string is treated as one title.
        Unresolvable titles are ignored and repeated titles count once.
    top_n : int
        Number of recommendations to return.

    Returns
    -------
    pandas.DataFrame or str
        A frame with title, overview, release_date, vote_average, vote_count
        and score, best match first, excluding the liked movies themselves;
        or a message string when none of the titles can be resolved.
    """
    if isinstance(liked_titles, str):
        # A bare string would otherwise be iterated character by character.
        liked_titles = [liked_titles]

    # Resolve every title once; dict.fromkeys drops repeats but keeps order.
    resolved = (find_title_fuzzy(t) for t in liked_titles)
    idxs = list(dict.fromkeys(i for i in resolved if i is not None))
    if len(idxs) == 0:
        return "No liked titles found in database."

    # The mean of a scipy sparse matrix is a np.matrix, which recent
    # scikit-learn versions refuse as input; convert to a plain 2-D ndarray.
    user_vec = np.asarray(tfidf_matrix[idxs].mean(axis=0)).reshape(1, -1)
    sims = linear_kernel(user_vec, tfidf_matrix).flatten()

    # Rank all movies, then drop the ones the user already listed so they are
    # not recommended back.
    ranked = sims.argsort()[::-1]
    top_idx = ranked[~np.isin(ranked, idxs)][:top_n]

    out = df.loc[top_idx, ['title','overview','release_date','vote_average','vote_count']].copy()
    out['score'] = sims[top_idx]
    return out.reset_index(drop=True)


# ------------------ 9) Ready message ------------------

In [19]:
# Report the wall-clock time elapsed since start_all was recorded, followed by
# a short usage hint.
print(f"\nReady! Model built in {time.time()-start_all:.2f}s.")
print(
    "Use functions like:\n"
    "- recommend_by_title('Movie Name', top_n=5)\n"
    "- recommend_for_profile(['Movie A','Movie B'], top_n=10)"
)



Ready! Model built in 10.56s.
Use functions like:
- recommend_by_title('Movie Name', top_n=5)
- recommend_for_profile(['Movie A','Movie B'], top_n=10)


# ------------------ 10) Quick test ------------------

In [21]:

# Smoke test: recommend for one known title and show a compact view.
try:
    sample_title = 'Home Alone'
    print(f"\nSample title: {sample_title}")
    sample_recs = recommend_by_title(sample_title, top_n=5)
    if isinstance(sample_recs, str):
        # recommend_by_title returns a message string when the title is not
        # found; print it instead of trying to select columns from a string.
        print(sample_recs)
    else:
        display(sample_recs[['title','vote_average','score']])
except Exception as e:
    print("Sample run error:", e)



Sample title: Home Alone


Unnamed: 0,title,vote_average,score
0,Home Alone 2: Lost in New York,6.3,0.471142
1,Christmas with the Kranks,5.2,0.351067
2,Time Bandits,6.6,0.34835
3,Minions,6.4,0.287763
4,A Charlie Brown Christmas,7.5,0.278955
