In [21]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import joblib
from transformers import BertTokenizer
import torch
from transformers import Trainer, TrainingArguments
from sentence_transformers import SentenceTransformer
import numpy as np

In [22]:
# Load the cleaned cast (roles) table and the cleaned movies table.
roles_df = pd.read_csv("roles_clean.csv")
movies_df = pd.read_csv("movies_clean.csv")

# Both tables carry a `year` column, which pandas exposes after the merge as
# `year_x` (from roles_df) and `year_y` (from movies_df). Joining on the title
# alone pairs a role with every same-titled film, so remakes / re-used titles
# would receive each other's plots. Keep only the pairs whose years agree.
# The column layout (movie_name, year_x, ..., year_y, ...) is left unchanged.
merged = roles_df.merge(movies_df, on="movie_name", how="inner")
same_year = merged["year_x"] == merged["year_y"]
df = merged.loc[same_year].reset_index(drop=True)

# Surface how many cross-year pairs were discarded so data issues are visible.
mismatched = len(merged) - len(df)
if mismatched:
    print(f"Dropped {mismatched} merged rows whose role year and movie year differ")

print(df.head())

           movie_name  year_x          actor_name  character_name  year_y  \
0  Sufi Paranja Katha    2010  Sharbani Mukherjee  Karthi, Suhara    2010   
1  Sufi Paranja Katha    2010       Thampi Antony     Sanku Menon    2010   
2  Sufi Paranja Katha    2010        Prakash Bare        Mamootty    2010   
3  Sufi Paranja Katha    2010   Jagathy Sreekumar  Avaru Musaliar    2010   
4  Sufi Paranja Katha    2010     V. K. Sreeraman    Saidu Mullah    2010   

                                                plot  
0  This film is a narrative by Sufi, a Muslim sch...  
1  This film is a narrative by Sufi, a Muslim sch...  
2  This film is a narrative by Sufi, a Muslim sch...  
3  This film is a narrative by Sufi, a Muslim sch...  
4  This film is a narrative by Sufi, a Muslim sch...  


In [23]:
# Build one text per merged row: the movie plot followed by the character name.
# Missing values are replaced with "" first; otherwise `str + NaN` evaluates to
# NaN and the list passed to the encoder would contain a float instead of text.
df["input_text"] = df["plot"].fillna("") + " " + df["character_name"].fillna("")

model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode in the frame's row order so embeddings[i] belongs to df row i.
embeddings = model.encode(df["input_text"].tolist(), show_progress_bar=True)

# Persist the embeddings and the frame together; index=False means the CSV is
# read back with a fresh 0..n-1 index that lines up with the embedding rows.
np.save("plot_embeddings.npy", embeddings)
df.to_csv("final_dataset_with_text.csv", index=False)

# NOTE(review): this pickles the whole model object. The file name is kept so
# any existing consumer of sbert_model.pkl keeps working, but a pickle should
# only ever be loaded from a trusted source, and it may not load across
# library versions. SentenceTransformer.save(path) is the library's own
# persistence format - consider switching once consumers are known.
joblib.dump(model, "sbert_model.pkl")

Loading weights: 100%|██████████| 103/103 [00:00<00:00, 839.23it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m
Batches: 100%|██████████| 291/291 [03:15<00:00,  1.49it/s]


['sbert_model.pkl']

In [24]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Reload the persisted dataset and its embedding matrix from disk.
df = pd.read_csv("final_dataset_with_text.csv")
embeddings = np.load("plot_embeddings.npy")

# suggest_actors looks up df rows by the position of the best-scoring
# embedding, so the two artifacts must have exactly one embedding per row.
# Fail loudly here instead of returning the wrong actors from stale files.
if len(df) != len(embeddings):
    raise ValueError(
        f"final_dataset_with_text.csv has {len(df)} rows but "
        f"plot_embeddings.npy has {len(embeddings)} embeddings; "
        "regenerate both files together"
    )

# The query encoder must be the same model that produced the stored embeddings.
model = SentenceTransformer("all-MiniLM-L6-v2")

def suggest_actors(user_plot: str, top_k: int = 5) -> pd.DataFrame:
    """Return the roles whose stored embedding is most similar to a plot text.

    The query is embedded with the module-level ``model`` and compared by
    cosine similarity against every row of the module-level ``embeddings``.
    The matching rows of the module-level ``df`` are returned, best match
    first.

    Args:
        user_plot: Free-text plot description used as the search query.
        top_k: Maximum number of rows to return. ``0`` yields an empty
            frame; a value larger than the number of rows returns all rows.

    Returns:
        DataFrame with the columns ``actor_name``, ``movie_name`` and
        ``character_name``, ordered by descending similarity.

    Raises:
        ValueError: If ``top_k`` is negative.
    """
    if top_k < 0:
        raise ValueError(f"top_k must be non-negative, got {top_k}")

    query = model.encode([user_plot])
    # cosine_similarity returns a (1, n_rows) matrix for a single query.
    scores = cosine_similarity(query, embeddings)[0]

    # Reverse first, then truncate. The previous `argsort()[-top_k:]` slice
    # degenerates to `[0:]` when top_k == 0 and returned every row; for
    # top_k > 0 both forms select the same indices in the same order.
    top_idx = scores.argsort()[::-1][:top_k]

    return df.iloc[top_idx][["actor_name", "movie_name", "character_name"]]


Loading weights: 100%|██████████| 103/103 [00:00<00:00, 411.63it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m
