In [12]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import joblib
from transformers import BertTokenizer
import torch
from transformers import Trainer, TrainingArguments
from sentence_transformers import SentenceTransformer
import numpy as np
from datetime import datetime

Loading the dataset and preprocessing the data

In [13]:
# Read the three raw tables: movie plots, cast/role listings and per-actor metadata.
movies_df = pd.read_csv("movies_with_plot.csv")
roles_df = pd.read_csv("malayalam_movie_cast_dataset.csv")
meta_df = pd.read_csv("actor_metadata.csv")

# Trim whitespace and lower-case every join key so the merges below are not
# defeated by differences in casing or stray padding.
movies_df = movies_df.assign(
    movie_name=movies_df["movie_name"].str.strip().str.lower()
)
roles_df = roles_df.assign(
    movie_name=roles_df["movie_name"].str.strip().str.lower(),
    actor_name=roles_df["actor_name"].str.strip().str.lower(),
)
meta_df = meta_df.assign(
    actor_name=meta_df["actor_name"].str.strip().str.lower()
)

# Inner join: keep only roles whose (movie_name, year) pair appears in movies_df.
# Left join: keep every such role even when the actor has no metadata row.
df = roles_df.merge(movies_df, on=["movie_name", "year"], how="inner").merge(
    meta_df, on="actor_name", how="left"
)

# Age is measured against the year in which the notebook is executed.
# NOTE(review): this is the actor's present-day age, not their age in the
# film's release year -- confirm that is the intended constraint.
CURRENT_YEAR = datetime.now().year
df["age"] = df["birth_year"].apply(
    lambda birth_year: None if pd.isna(birth_year) else CURRENT_YEAR - birth_year
)


def age_group(age):
    """Map an age in years to a coarse age-group label.

    Parameters
    ----------
    age : number or missing value
        Age in years. ``None``, float ``NaN`` and ``pd.NA`` are all treated as
        missing.

    Returns
    -------
    str
        ``"unknown"`` for a missing age, otherwise one of ``"teen"`` (< 20),
        ``"young"`` (20-30), ``"adult"`` (31-45), ``"middle"`` (46-60) or
        ``"senior"`` (> 60).
    """
    # A plain `age is None` test is not enough: a numeric pandas column stores
    # missing values as NaN, and every comparison with NaN is False, so a
    # missing age would fall through every branch and be labelled "senior".
    if pd.isna(age):
        return "unknown"
    elif age < 20:
        return "teen"
    elif age <= 30:
        return "young"
    elif age <= 45:
        return "adult"
    elif age <= 60:
        return "middle"
    else:
        return "senior"
    

# Bucket each row's age into its categorical label.
df["age_group"] = df["age"].map(age_group)

# Rows without a gender or character name get the literal placeholder "unknown"
# so the text built from these columns never contains a missing value.
df = df.fillna({"gender": "unknown", "character_name": "unknown"})

Building constraint-aware input text

In [14]:


# Build one sentence per row in the form
#   "<plot> Character: <name>. Gender: <gender>. Age group: <group>"
# Every piece is cast to str first, so a missing plot shows up as the text "nan".
df["input_text"] = (
    df["plot"].astype(str)
    .str.cat(df["character_name"].astype(str), sep=" Character: ")
    .str.cat(df["gender"].astype(str), sep=". Gender: ")
    .str.cat(df["age_group"].astype(str), sep=". Age group: ")
)




Encoding the input text into embeddings

In [15]:
# Sentence encoder used for both the dataset rows and, later, the user query.
model = SentenceTransformer("all-MiniLM-L6-v2")

# One embedding per row of df, in row order.
embeddings = model.encode(df["input_text"].to_list(), show_progress_bar=True)

# The same matrix is written to disk under two file names.
# NOTE(review): confirm whether both files are still needed.
np.save("final_embeddings.npy", embeddings)
np.save("castnet_embeddings.npy", embeddings)

Loading weights: 100%|██████████| 103/103 [00:00<00:00, 702.72it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m
Batches: 100%|██████████| 291/291 [03:24<00:00,  1.42it/s]


Suggesting the top-k actors

In [18]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def suggest_actors(user_plot, user_character, user_gender=None, user_age_group=None, top_k=5):
    """Return the dataset rows most similar to a described character.

    The query is encoded with the notebook-level ``model`` and compared by
    cosine similarity against the notebook-level ``embeddings`` matrix, whose
    rows correspond positionally to the rows of the notebook-level ``df``.

    Parameters
    ----------
    user_plot : str
        Plot description of the film being cast.
    user_character : str
        Name or description of the character.
    user_gender : str, optional
        Appended to the query as ". Gender: <value>" when truthy.
    user_age_group : str, optional
        Appended to the query as ". Age group: <value>" when truthy.
    top_k : int, default 5
        Maximum number of rows to return. Values above the number of
        available rows return every row; zero or negative values return an
        empty frame.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_k`` rows of ``df`` ordered from most to least similar,
        restricted to the columns actor_name, movie_name, character_name,
        gender and age_group. Each row is one role, so the same actor can
        appear more than once.
    """
    result_columns = ["actor_name", "movie_name", "character_name", "gender", "age_group"]

    # Clamp top_k to [0, number of rows]. Without this, top_k == 0 turns the
    # slice below into [-0:], which selects every row instead of none, and a
    # negative top_k selects almost every row.
    top_k = max(0, min(int(top_k), len(embeddings)))
    if top_k == 0:
        return df.iloc[:0][result_columns]

    # Build the query in the same layout as the dataset's input_text column;
    # optional constraints are left out entirely when not supplied.
    query_text = user_plot + " Character: " + user_character
    if user_gender:
        query_text += f". Gender: {user_gender}"
    if user_age_group:
        query_text += f". Age group: {user_age_group}"

    # Encode the single query and score it against every stored embedding.
    query_emb = model.encode([query_text])
    sims = cosine_similarity(query_emb, embeddings)[0]

    # argsort is ascending: take the last top_k positions and reverse them so
    # the best match comes first.
    top_idx = np.argsort(sims)[-top_k:][::-1]

    return df.iloc[top_idx][result_columns]
