In [1]:
# 02 Content Score
# Build content representations for movies: a Story (semantic) channel and a Creator (metadata) channel.
# Produces artifacts:
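# - story_embeddings.npy (n_movies x D), or story_tfidf.npz + story_tfidf_vectorizer.joblib (TF-IDF fallback when SBERT is unavailable)
# - creators_tfidf.npz (matrix) + creators_tfidf.joblib (fitted vectorizer)
# - movies_with_content_meta.csv (input metadata plus the story_text and creators_content columns)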

In [2]:
import os
import re
import unicodedata

import joblib
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# ---------- CONFIG: input / output file names ----------

# Input: CSV written by the numeric score notebook.
IN_CSV = "movies_with_numeric_score.csv"

# Output: metadata CSV with the content columns added by this notebook.
OUT_META = "movies_with_content_meta.csv"

# Story channel: sentence embeddings, or the TF-IDF vectorizer + matrix used as fallback.
STORY_EMB_NPY = "story_embeddings.npy"
STORY_TFIDF_VEC = "story_tfidf_vectorizer.joblib"
STORY_TFIDF_NPZ = "story_tfidf.npz"

# Creators channel: fitted TF-IDF vectorizer and its sparse matrix.
CREATORS_VECTORIZER = "creators_tfidf.joblib"
CREATORS_MATRIX = "creators_tfidf.npz"

In [4]:
# Parameters: how many people to keep per field and how often their tokens are repeated.
TOP_CAST = 5          # number of cast entries kept per movie
DIRECTOR_BOOST = 8    # times each director token is repeated
CAST_BOOST = 3        # times each cast token is repeated
WRITERS_BOOST = 3     # times each writer token is repeated
PRODUCERS_BOOST = 2   # times each producer token is repeated

# Keyword arguments for the creators-channel TfidfVectorizer.
CREATORS_TFIDF_PARAMS = dict(
    max_features=15000,
    ngram_range=(1, 1),
    stop_words='english',
    dtype=np.float32,
    norm='l2',
    min_df=3,
)

# Keyword arguments for the story-channel TfidfVectorizer (used only when SBERT is unavailable).
STORY_TFIDF_PARAMS = dict(
    max_features=20000,
    ngram_range=(1, 2),
    stop_words='english',
    dtype=np.float32,
    norm='l2',
    min_df=2,
)

In [5]:
# ---------- Shared constants for the tokenization helpers ----------
# Placeholder string for a missing value.
UNKNOWN_TOKEN = 'unknown'
# Pre-compiled matcher for runs of whitespace.
WHITESPACE_RE = re.compile(r'\s+')
# Multi-valued cells are split on one or more of: comma, pipe, semicolon, slash.
SEP_PATTERN = r'[,\|;/]+'


In [6]:
def normalize_token(s):
    """Turn one raw name or label into a lowercase, underscore-joined token.

    Processing order: stringify, trim and lowercase; remove parenthesised
    notes such as "(voice)"; fold accented letters to their base letter;
    collapse every run of characters that are not letters or digits into a
    single underscore.

    Pure-ASCII input yields exactly the token the earlier ASCII-only rule
    produced. Non-ASCII input is no longer mangled: accents are folded
    ("Penélope" -> "penelope") and letters/digits of other scripts are kept
    rather than being treated as separators.

    Parameters
    ----------
    s : object
        Raw value; it is converted with ``str`` first.

    Returns
    -------
    str or None
        The normalized token, or ``None`` when nothing is left after
        cleaning or when the token equals ``UNKNOWN_TOKEN``.
    """
    text = str(s).strip().lower()
    text = re.sub(r'\(.*?\)', '', text)
    # NFKD separates a letter from its accent; dropping the combining marks
    # keeps the base letter instead of splitting the name at the accent.
    text = unicodedata.normalize('NFKD', text)
    text = ''.join(ch for ch in text if not unicodedata.combining(ch))
    # [\W_] is the Unicode-aware form of "not a letter or digit"; one
    # substitution also collapses whitespace runs, so no second pass is needed.
    text = re.sub(r'[\W_]+', ' ', text).strip()
    token = text.replace(' ', '_')
    if not token or token == UNKNOWN_TOKEN:
        return None
    return token

In [7]:
def split_and_prefix(field_name, cell_value, max_items=None):
    """Split a multi-valued cell into normalized tokens prefixed with the field name.

    The cell is split on ``SEP_PATTERN``; each part goes through
    ``normalize_token`` and parts that normalize to ``None`` are skipped.

    ``max_items`` caps the number of tokens *returned*. The cap is applied
    after invalid parts are skipped, so a leading empty or "unknown" part no
    longer uses up the quota (previously "Unknown|Jane Doe" with
    ``max_items=1`` produced no token at all).

    Parameters
    ----------
    field_name : str
        Prefix placed before every token as ``"{field_name}_{token}"``.
    cell_value : object
        Raw cell content; NaN/None or a blank string gives an empty list.
    max_items : int or None, optional
        Maximum number of tokens to return (expected to be non-negative);
        ``None`` means no limit.

    Returns
    -------
    list of str
        Prefixed tokens in their original order.
    """
    if pd.isna(cell_value) or str(cell_value).strip() == '':
        return []
    tokens = []
    for part in re.split(SEP_PATTERN, str(cell_value)):
        # Stop once enough valid tokens were collected (also covers max_items=0).
        if max_items is not None and len(tokens) >= max_items:
            break
        norm = normalize_token(part)
        if norm is None:
            continue
        tokens.append(f"{field_name}_{norm}")
    return tokens

In [8]:
# ---------- Load dataframe ----------
# The numeric score notebook must have produced IN_CSV already; stop with a clear error otherwise.
if os.path.exists(IN_CSV):
    df = pd.read_csv(IN_CSV)
else:
    raise FileNotFoundError(f"Input not found: {IN_CSV}. Run numeric score notebook first.")

print(f"Loaded {IN_CSV} with {len(df)} rows")


Loaded movies_with_numeric_score.csv with 29937 rows


In [9]:
# Make sure every column listed here exists; a column missing from the input is created as empty strings.
for col in ('overview', 'tagline', 'genres', 'cast', 'director', 'writers',
            'producers', 'production_companies', 'spoken_languages'):
    if col in df.columns:
        continue
    df[col] = ''

In [10]:
# ---------- Build story_text column (overview + tagline) ----------
print("Building story_text (overview + tagline)...")
# Missing overview/tagline count as empty text; the two are joined with one space,
# then newlines become spaces and surrounding whitespace is trimmed.
df['story_text'] = (
    (df['overview'].fillna('') + ' ' + df['tagline'].fillna(''))
    .astype(str)
    .str.replace('\n', ' ')
    .str.strip()
)

# ---------- Try Sentence-BERT for story embeddings (recommended) ----------
# use_sbert and story_embeddings are module-level state: compute_story_sim_to_user()
# reads both to decide between the embedding path and the TF-IDF path.
use_sbert = True
story_embeddings = None
try:
    # Imported inside the try on purpose: a missing sentence_transformers package
    # must lead to the TF-IDF fallback instead of stopping the notebook.
    from sentence_transformers import SentenceTransformer
    print("SentenceTransformer available — building story embeddings (SBERT)...")
    model = SentenceTransformer('all-MiniLM-L6-v2')
    # One embedding per row of df, in row order (the list is built from df['story_text']).
    story_embeddings = model.encode(df['story_text'].tolist(), show_progress_bar=True, convert_to_numpy=True)
    np.save(STORY_EMB_NPY, story_embeddings)
    print(f"Saved story embeddings to {STORY_EMB_NPY} (shape={story_embeddings.shape})")
except Exception as e:
    # Any failure above (import, model construction, encoding, saving) ends up here;
    # the error is printed and the story channel switches to TF-IDF.
    # NOTE(review): if only np.save fails, story_embeddings stays populated while
    # use_sbert is False — the similarity helper checks use_sbert, so TF-IDF is still used.
    use_sbert = False
    print("SBERT not available or failed — will fallback to TF-IDF for story channel.", e)

if not use_sbert:
    # Fallback story channel: fit a TF-IDF model on story_text, then persist
    # both the fitted vectorizer and the resulting sparse matrix.
    print("Fitting TF-IDF on story_text (fallback)...")
    story_tfidf = TfidfVectorizer(**STORY_TFIDF_PARAMS)
    story_matrix = story_tfidf.fit_transform(
        df['story_text'].astype(str).values
    )
    joblib.dump(value=story_tfidf, filename=STORY_TFIDF_VEC)
    sparse.save_npz(file=STORY_TFIDF_NPZ, matrix=story_matrix)
    print(f"Saved story TF-IDF -> {STORY_TFIDF_VEC}, {STORY_TFIDF_NPZ}")

# ---------- Build creators/content tokens (weighted) ----------
print("Building creators content tokens (director, cast, writers, producers, genres, companies)...")

# One entry per source column, in emission order: (column, max_items, repeat).
# max_items=None keeps every value; repeat is how many times the column's
# token list is duplicated to raise its term frequency.
_CREATOR_FIELDS = (
    ('director', 1, DIRECTOR_BOOST),
    ('director_of_photography', 1, 3),
    ('cast', TOP_CAST, CAST_BOOST),
    ('writers', 3, WRITERS_BOOST),
    ('producers', 3, PRODUCERS_BOOST),
    ('genres', None, 1),
    ('production_companies', None, 1),
    ('production_countries', None, 1),
    ('spoken_languages', None, 1),
)

creator_texts = []
for row in df.itertuples(index=False):
    weighted = []
    for column, limit, repeat in _CREATOR_FIELDS:
        # A column absent from the frame is read as '' and contributes no tokens.
        raw_value = getattr(row, column, '')
        weighted.extend(split_and_prefix(column, raw_value, max_items=limit) * repeat)
    creator_texts.append(' '.join(weighted))

# Store the space-joined token string next to the rest of the metadata.
df['creators_content'] = creator_texts

Building story_text (overview + tagline)...
SBERT not available or failed — will fallback to TF-IDF for story channel. No module named 'sentence_transformers'
Fitting TF-IDF on story_text (fallback)...
Saved story TF-IDF -> story_tfidf_vectorizer.joblib, story_tfidf.npz
Building creators content tokens (director, cast, writers, producers, genres, companies)...


In [11]:
# Creators channel: fit TF-IDF on the weighted token strings, then persist
# the fitted vectorizer and the sparse document-term matrix.
print("Fitting TF-IDF on creators_content...")
creators_tfidf = TfidfVectorizer(**CREATORS_TFIDF_PARAMS)
creators_matrix = creators_tfidf.fit_transform(
    df['creators_content'].values
)
joblib.dump(value=creators_tfidf, filename=CREATORS_VECTORIZER)
sparse.save_npz(file=CREATORS_MATRIX, matrix=creators_matrix)
print(f"Saved creators tfidf -> {CREATORS_VECTORIZER}, {CREATORS_MATRIX}; shape={creators_matrix.shape}")

Fitting TF-IDF on creators_content...
Saved creators tfidf -> creators_tfidf.joblib, creators_tfidf.npz; shape=(29937, 15000)


In [12]:
# ---------- Save augmented metadata ----------
# Write a copy of the frame (now including story_text and creators_content) without the index column.
df_out = df.copy(deep=True)
df_out.to_csv(path_or_buf=OUT_META, index=False)
print(f"Saved metadata with content columns to {OUT_META}")


Saved metadata with content columns to movies_with_content_meta.csv


In [13]:
# ---------- Helper functions (for use in final notebook) ----------

def compute_story_sim_to_user(selected_indices=None, selected_genres=None):
    """Score every movie's story similarity to the movies a user selected.

    The user profile is the mean story vector of the selected rows. When the
    module-level ``use_sbert`` flag is true and ``story_embeddings`` is
    populated, similarity is the cosine between each embedding and that
    profile. Otherwise the story TF-IDF matrix is used (taken from the
    module-level ``story_matrix`` if present, else loaded from
    ``STORY_TFIDF_NPZ``) and similarity is the dot product with the profile.
    In both cases the scores are min-max scaled to the 0-1 range.

    Parameters
    ----------
    selected_indices : sequence of int
        Row positions of the selected movies; must be non-empty.
    selected_genres : optional
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    numpy.ndarray
        1-D array with one score per movie.

    Raises
    ------
    ValueError
        If ``selected_indices`` is ``None`` or empty.
    """
    if selected_indices is None or len(selected_indices) == 0:
        raise ValueError("Provide list of selected movie indices")

    # Read notebook state through globals() so the function also works where
    # use_sbert / story_embeddings were never defined (no NameError).
    state = globals()
    embeddings = state.get('story_embeddings')

    if state.get('use_sbert', False) and embeddings is not None:
        user_vec = np.mean(embeddings[selected_indices], axis=0)
        # True cosine: divide by both norms so rows with a large magnitude are
        # not favoured when the embeddings are not unit-length.
        row_norms = np.maximum(np.linalg.norm(embeddings, axis=1), 1e-12)
        user_norm = max(float(np.linalg.norm(user_vec)), 1e-12)
        sims = (embeddings @ user_vec) / (row_norms * user_norm)
    else:
        story_matrix = state.get('story_matrix')
        if story_matrix is None:
            # Only the matrix is needed here; the pickled vectorizer is not loaded.
            story_matrix = sparse.load_npz(STORY_TFIDF_NPZ)
        # Dense 1-D profile; the product then comes back as a dense vector, so
        # the code does not rely on the matrix-only `.A` attribute.
        user_vec = np.asarray(story_matrix[selected_indices].mean(axis=0)).reshape(-1)
        sims = np.asarray(story_matrix @ user_vec).reshape(-1)

    # Min-max scale; the epsilon avoids division by zero when all scores are equal.
    sims = (sims - sims.min()) / (sims.max() - sims.min() + 1e-12)
    return sims


In [14]:
def compute_creators_sim_to_user(selected_indices=None, selected_genres=None):
    """Score every movie's creators-channel similarity to the selected movies.

    The user profile is the mean creators TF-IDF row of the selected movies;
    each movie's score is the dot product of its row with that profile,
    min-max scaled to the 0-1 range. The matrix is taken from the
    module-level ``creators_matrix`` when present, otherwise it is loaded
    from ``CREATORS_MATRIX``.

    Parameters
    ----------
    selected_indices : sequence of int
        Row positions of the selected movies; must be non-empty.
    selected_genres : optional
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    numpy.ndarray
        1-D array with one score per movie.

    Raises
    ------
    ValueError
        If ``selected_indices`` is ``None`` or empty.
    """
    if selected_indices is None or len(selected_indices) == 0:
        raise ValueError("Provide list of selected movie indices")

    creators_matrix = globals().get('creators_matrix')
    if creators_matrix is None:
        # Only the matrix is needed; the pickled vectorizer is not loaded, so a
        # missing vectorizer file cannot break scoring.
        creators_matrix = sparse.load_npz(CREATORS_MATRIX)

    # Dense 1-D profile; the product then comes back as a dense vector, so the
    # code does not rely on the matrix-only `.A` attribute.
    user_vec = np.asarray(creators_matrix[selected_indices].mean(axis=0)).reshape(-1)
    sims = np.asarray(creators_matrix @ user_vec).reshape(-1)

    # Min-max scale; the epsilon avoids division by zero when all scores are equal.
    sims = (sims - sims.min()) / (sims.max() - sims.min() + 1e-12)
    return sims

print('\nContent score artifacts ready. Use compute_story_sim_to_user() and compute_creators_sim_to_user() in the final notebook to produce hybrid content similarity.')



Content score artifacts ready. Use compute_story_sim_to_user() and compute_creators_sim_to_user() in the final notebook to produce hybrid content similarity.
