In [19]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import torch
from tqdm.auto import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2').to(device)

def get_embeddings(text):
    return model.encode(text)

len(get_embeddings('"A trailer is burning in the middle of a plain. The bodies of two adulterous lovers are found. Scenes from both families, before and after the dramatic events, suggest an unusual connection between them. But what is their secret?'))

384

In [13]:
df = pd.read_csv('clean/movies.csv')

tqdm.pandas()

df['overview'] = df['overview'].astype(str)
df['embeddings'] = df['overview'].progress_apply(get_embeddings)

  df = pd.read_csv('clean/movies.csv')
100%|██████████| 45466/45466 [13:19<00:00, 56.87it/s]


In [15]:
df

Unnamed: 0,adult,budget,id,imdb_id,original_language,original_title,overview,popularity,poster_path,release_date,revenue,runtime,status,tagline,title,vote_average,vote_count,embeddings
0,False,30000000,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,1995-10-30,373554033.0,81.0,Released,,Toy Story,7.7,5415.0,"[0.063439, 0.0010268698, 0.093210146, -0.01494..."
1,False,65000000,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,1995-12-15,262797249.0,104.0,Released,Roll the dice and unleash the excitement!,Jumanji,6.9,2413.0,"[0.08630579, 0.04461486, -0.040496368, -0.0525..."
2,False,0,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,1995-12-22,0.0,101.0,Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,6.5,92.0,"[-0.10087598, 0.037441824, -0.00092460023, -0...."
3,False,16000000,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,1995-12-22,81452156.0,127.0,Released,Friends are the people who let you be yourself...,Waiting to Exhale,6.1,34.0,"[-0.055419028, -0.014511978, 0.03143248, 0.042..."
4,False,0,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,/e64sOI48hQXyru7naBFyssKFxVd.jpg,1995-02-10,76578911.0,106.0,Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,5.7,173.0,"[-0.031386092, -0.06930615, 0.064619444, 0.024..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45461,False,0,439050,tt6209470,fa,رگ خواب,Rising and falling between a man and woman.,0.072051,/jldsYflnId4tTWPx8es3uzsB1I8.jpg,,0.0,90.0,Released,Rising and falling between a man and woman,Subdue,4.0,1.0,"[-0.013269588, -0.04402164, -5.973119e-05, 0.0..."
45462,False,0,111109,tt2028550,tl,Siglo ng Pagluluwal,An artist struggles to finish his work while a...,0.178241,/xZkmxsNmYXJbKVsTRLLx3pqGHx7.jpg,2011-11-17,0.0,360.0,Released,,Century of Birthing,9.0,3.0,"[0.01217238, -0.015613253, -0.016708953, -0.01..."
45463,False,0,67758,tt0303758,en,Betrayal,"When one of her hits goes wrong, a professiona...",0.903007,/d5bX92nDsISNhu3ZT69uHwmfCGw.jpg,2003-08-01,0.0,90.0,Released,A deadly game of wits.,Betrayal,3.8,6.0,"[-0.018731084, 0.031341173, -0.03445673, 0.038..."
45464,False,0,227506,tt0008536,en,Satana likuyushchiy,"In a small town live two brothers, one a minis...",0.003503,/aorBPO7ak8e8iJKT5OcqYxU3jlK.jpg,1917-10-21,0.0,87.0,Released,,Satan Triumphant,0.0,0.0,"[-0.0097106, 0.10046432, -0.061511513, 0.04093..."


In [16]:
df.to_csv('final.csv', index=False)

In [None]:
import json
import ast

df_final = pd.read_csv('final.csv')


def make_json(row):
    return json.dumps({
        'id': row['id'],
        'imbd_id': row['imdb_id'],
        'overview': row['overview'],
    })

df_final['metadata'] = df_final.apply(make_json, axis=1)
df_final = df_final[['id', 'embeddings', 'metadata']]


df_final['id_num'] = pd.to_numeric(df_final['id'], errors='coerce')
df_final = df_final[df_final['id_num'].notna()]

def parse_vec(s):
    nums = s.strip("[]").split()
    return [float(x) for x in nums]
df_final["vector"] = df_final["embeddings"].apply(parse_vec)
df_final["metadata_dict"] = df_final["metadata"].apply(json.loads)

with open("data.jsonl", "w") as f:
    for _, row in df_final.iterrows():
        rec = {
            "id": int(row["id_num"]),
            "vector": row["vector"],
            **row["metadata_dict"]
        }
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")

  df_final = pd.read_csv('final.csv')


: 