In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import joblib
from transformers import BertTokenizer
import torch
from transformers import Trainer, TrainingArguments
from sentence_transformers import SentenceTransformer, InputExample, losses
import numpy as np
from datetime import datetime
import random
from torch.utils.data import DataLoader


Loading the datasets and preprocessing the data

In [3]:
# Read the three source tables: movie plots, cast/role listings and per-actor metadata.
movies_df = pd.read_csv("movies_with_plot.csv")
roles_df = pd.read_csv("malayalam_movie_cast_dataset.csv")
meta_df = pd.read_csv("actor_metadata.csv")

# Canonicalise the join keys (trim surrounding whitespace, lower-case) so the
# merges below match on one spelling per movie / actor.
for frame, key in (
    (movies_df, "movie_name"),
    (roles_df, "movie_name"),
    (meta_df, "actor_name"),
    (roles_df, "actor_name"),
):
    frame[key] = frame[key].str.strip().str.lower()

# Roles joined to their movie: the inner join keeps only (movie_name, year)
# pairs that are present in both tables.
df = roles_df.merge(movies_df, how="inner", on=["movie_name", "year"])

# Actor metadata joined on: the left join keeps roles whose actor has no
# metadata row, leaving the metadata columns empty for them.
df = df.merge(meta_df, how="left", on="actor_name")

# Age is derived from the `birth_year` column relative to the current calendar
# year (not the movie's release year). Rows without a birth year get no age.
CURRENT_YEAR = datetime.now().year

df["age"] = df["birth_year"].apply(
    lambda birth_year: None if pd.isna(birth_year) else CURRENT_YEAR - birth_year
)


def age_group(age):
    """Map an age in years to a coarse age-group label.

    Args:
        age: Age in years, or a missing value (``None``, float NaN, ``pd.NA``).

    Returns:
        str: ``"unknown"`` when the age is missing, otherwise ``"teen"``
        (< 20), ``"young"`` (20-30), ``"adult"`` (31-45), ``"middle"``
        (46-60) or ``"senior"`` (> 60).
    """
    # Missing ages usually reach this function as float NaN rather than None
    # (pandas stores missing numeric values as NaN). NaN is not None and fails
    # every comparison below, so an identity check alone would let it fall
    # through to "senior". pd.isna covers None, NaN and pd.NA.
    if pd.isna(age):
        return "unknown"
    if age < 20:
        return "teen"
    if age <= 30:
        return "young"
    if age <= 45:
        return "adult"
    if age <= 60:
        return "middle"
    return "senior"
    

# Label every row with its coarse age bucket.
df["age_group"] = df["age"].map(age_group)

# Give missing categorical fields an explicit "unknown" token so they can be
# embedded in the input text later on.
df = df.fillna({"gender": "unknown", "character_name": "unknown"})

Building the constraint-aware input text

In [5]:


# Upper bound on the number of rows kept for training/encoding, and the seed
# that makes the subsample repeatable.
SAMPLE_SIZE = 3000
SAMPLE_SEED = 42

# Rows without a plot or an actor label cannot form a useful example. Filter
# them BEFORE assembling the text: `astype(str)` can render a missing plot as
# the literal string "nan", after which a null check on `input_text` would no
# longer catch it. `.copy()` gives an independent frame to add the column to.
df = df.dropna(subset=["plot", "actor_name"]).copy()

# One text per role: the plot followed by the casting constraints.
df["input_text"] = (
    df["plot"].astype(str)
    + " Character: " + df["character_name"].astype(str)
    + ". Gender: " + df["gender"].astype(str)
    + ". Age group: " + df["age_group"].astype(str)
)

# Subsample to at most SAMPLE_SIZE rows. Capping at len(df) keeps the cell
# working on smaller datasets, where asking for more rows than exist (without
# replacement) raises a ValueError.
df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=SAMPLE_SEED)



Training the model

In [None]:
# Build (anchor, positive, negative) triplets:
#   anchor   - the input text of a row,
#   positive - the input text of a different row with the same actor,
#   negative - the input text of a row with a different actor.
# Rows whose actor appears only once have no positive and are skipped.

# Seed the samplers so the triplets (and the DataLoader shuffling, which draws
# from torch's global RNG) are repeatable across runs.
TRIPLET_SEED = 42
rng = np.random.default_rng(TRIPLET_SEED)
torch.manual_seed(TRIPLET_SEED)

texts = df["input_text"].to_numpy()
actors = df["actor_name"].to_numpy()
n_rows = len(df)

# actor -> positional row indices, computed once instead of re-filtering the
# whole DataFrame for every row.
positions_by_actor = df.groupby("actor_name").indices

train_examples = []

for pos in range(n_rows):
    actor = actors[pos]
    same_actor = positions_by_actor[actor]

    # Candidate positives: every other row of the same actor.
    positive_pool = same_actor[same_actor != pos]
    if positive_pool.size == 0:
        continue

    # If every row belongs to this actor there is no negative to draw.
    if same_actor.size == n_rows:
        continue

    positive = texts[rng.choice(positive_pool)]

    # Rejection sampling: uniform over the rows of other actors. It terminates
    # because the check above guarantees at least one such row exists.
    neg_pos = rng.integers(n_rows)
    while actors[neg_pos] == actor:
        neg_pos = rng.integers(n_rows)
    negative = texts[neg_pos]

    train_examples.append(InputExample(texts=[texts[pos], positive, negative]))

if not train_examples:
    raise ValueError(
        "No training triplets could be built: need at least one actor with two "
        "or more rows and at least two distinct actors."
    )

# Train once, after all triplets have been collected.
model = SentenceTransformer("all-MiniLM-L6-v2")
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=8)
train_loss = losses.TripletLoss(model)

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=2,
    warmup_steps=50,
    output_path="castnet_finetuned_model_cpu"
)

print("Fine-tuning complete (CPU).")


Encoding the input text into embeddings

In [6]:
# Load the fine-tuned checkpoint from the directory the training step writes to,
# embed every input text in the current row order of `df`, and persist the
# resulting matrix to disk.
model = SentenceTransformer("castnet_finetuned_model_cpu")
input_texts = df["input_text"].tolist()
embeddings = model.encode(input_texts, show_progress_bar=True)
np.save("castnet_embeddings.npy", embeddings)


Loading weights: 100%|██████████| 103/103 [00:00<00:00, 865.41it/s, Materializing param=pooler.dense.weight]
Batches: 100%|██████████| 94/94 [01:02<00:00,  1.50it/s]


Suggesting the top-k actors

In [7]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def suggest_actors(user_plot, user_character, user_gender=None, user_age_group=None, top_k=5):
    """Return the dataset rows whose embeddings are most similar to a casting query.

    Relies on three notebook-level objects: ``model`` (used to encode the
    query), ``embeddings`` (one vector per row of ``df``, in the same row
    order) and ``df`` (the frame the result rows are taken from).

    Args:
        user_plot: Plot description of the new movie.
        user_character: Name/description of the character to cast.
        user_gender: Optional gender constraint; omitted from the query text
            when falsy.
        user_age_group: Optional age-group constraint; omitted from the query
            text when falsy.
        top_k: Maximum number of rows to return. Values <= 0 yield an empty
            result.

    Returns:
        pandas.DataFrame: Up to ``top_k`` rows ordered from most to least
        similar, with the columns ``actor_name``, ``movie_name``,
        ``character_name``, ``gender`` and ``age_group``. Rows are not
        de-duplicated, so the same actor can appear more than once.
    """
    result_columns = ["actor_name", "movie_name", "character_name", "gender", "age_group"]

    # Guard non-positive requests explicitly: with top_k == 0 a "[-top_k:]"
    # slice is "[-0:]", which selects every row instead of none.
    if top_k <= 0:
        return df.iloc[0:0][result_columns]

    # 1. Build the query text, mirroring the "<plot> Character: ... Gender: ...
    #    Age group: ..." layout of the training texts; optional parts are
    #    appended only when provided.
    query_text = user_plot + " Character: " + user_character
    if user_gender:
        query_text += f". Gender: {user_gender}"
    if user_age_group:
        query_text += f". Age group: {user_age_group}"

    # 2. Encode the query and score it against every stored embedding.
    query_emb = model.encode([query_text])
    sims = cosine_similarity(query_emb, embeddings)[0]

    # 3. Positions of the best matches, highest similarity first. Reversing
    #    before truncating is equivalent to "[-top_k:][::-1]" for positive
    #    top_k and also copes with top_k larger than the number of rows.
    top_idx = np.argsort(sims)[::-1][:top_k]

    # 4. Return the matching rows with the columns relevant for casting.
    return df.iloc[top_idx][result_columns]


In [8]:
# Sanity check: list the distinct age-group labels present in the working frame.
distinct_age_groups = df["age_group"].unique()
print(distinct_age_groups)


['senior' 'middle' 'adult' 'teen' 'young']
