In [22]:
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
import json


In [11]:
# Load the pretrained 'bert-base-uncased' checkpoint: the encoder weights and
# the matching tokenizer. The two calls are independent of each other.
model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


In [12]:
# Read the previously encoded dataset from the working directory.
df = pd.read_csv(filepath_or_buffer="Encoded_Data.csv")

In [13]:
# Display the column labels of the loaded frame.
df.columns

Index(['tmdb_id', 'adult', 'belongs_to_collection', 'collection_id', 'budget',
       'overview', 'production_company_id', 'production_company_name',
       'release_month', 'release_year', 'revenue', 'tagline', 'director',
       'actor_0_name', 'actor_0_id', 'actor_0_gender', 'actor_1_name',
       'actor_1_id', 'actor_1_gender', 'actor_2_name', 'actor_2_id',
       'actor_2_gender', 'actor_3_name', 'actor_3_id', 'actor_3_gender',
       'actor_4_name', 'actor_4_id', 'actor_4_gender', 'actor_birthdays',
       'Gender Ratio', 'Average Actor Age', 'runtime', 'Crime', 'Drama', 'War',
       'Western', 'Family', 'Thriller', 'Fantasy', 'Mystery', 'Animation',
       'Music', 'Romance', 'Adventure', 'History', 'Science Fiction', 'Comedy',
       'TV Movie', 'Documentary', 'Horror', 'Action', 'actor0_rev',
       'actor0_movies', 'actor1_rev', 'actor1_movies', 'actor2_rev',
       'actor2_movies', 'actor_0_name_encoded', 'actor_1_name_encoded',
       'actor_2_name_encoded'],
      dtype='

In [14]:
# Replace missing text with empty strings, then coerce every value to `str`
# so both columns can be handed to the tokenizer.
#
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on `df[col]`: an in-place call on a column
# selection is chained assignment, which warns on pandas 2.x and does not
# modify `df` under Copy-on-Write. If the fill were skipped, `apply(str)`
# would turn NaN into the literal text 'nan'.
df['overview'] = df['overview'].fillna('').apply(str)
df['tagline'] = df['tagline'].fillna('').apply(str)


In [15]:
def encode_text_batch(text_list, tokenizer, model, batch_size=32):
    """Embed a sequence of texts with an encoder model, one mini-batch at a time.

    Each batch is tokenized (padded, truncated to at most 512 tokens), run
    through ``model`` without gradient tracking, and the hidden state at
    token position 0 of ``last_hidden_state`` is kept as the embedding of
    each text (for BERT tokenizers position 0 is the ``[CLS]`` token).

    Args:
        text_list: Iterable of strings to embed. It is materialised into a
            list so that slicing is positional and the tokenizer receives
            plain lists.
        tokenizer: Callable taking a list of strings plus the keyword
            arguments ``padding``, ``truncation``, ``return_tensors`` and
            ``max_length``, returning a mapping of tensors.
        model: ``torch.nn.Module`` whose output exposes ``last_hidden_state``.
            It is moved to CUDA when available, otherwise CPU, and stays on
            that device after the call.
        batch_size: Number of texts per forward pass; must be >= 1.

    Returns:
        A list with one 1-D NumPy array per input text, in input order.
        An empty input yields an empty list.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    texts = list(text_list)

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)

    # Dropout must be disabled while extracting features, otherwise the
    # embeddings are randomly perturbed and differ from run to run. Remember
    # the caller's mode so it can be restored afterwards.
    was_training = model.training
    model.eval()

    all_embeddings = []
    try:
        with torch.no_grad():
            for start in range(0, len(texts), batch_size):
                batch_texts = texts[start:start + batch_size]
                inputs = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
                inputs = {k: v.to(device) for k, v in inputs.items()}

                outputs = model(**inputs)
                # Keep only the vector at token position 0 for every text.
                embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()

                # Extending with a 2-D array appends one row per text.
                all_embeddings.extend(embeddings)
    finally:
        model.train(was_training)

    return all_embeddings


In [16]:
# Embed each free-text column and store the per-row vectors in a new
# '<column>_embed' column next to the source text.
features = ['overview', 'tagline']
for feature in features:
    embeddings = encode_text_batch(
        df.loc[:, feature].tolist(),
        tokenizer,
        model,
        batch_size=32,
    )
    df[f'{feature}_embed'] = embeddings
 

In [17]:
# Length of the overview embedding stored under index label 0.
len(df['overview_embed'][0])

768

In [19]:
# Length of the tagline embedding stored under index label 0.
len(df['tagline_embed'][0])

768

In [20]:
# Inspect the raw tagline embedding stored under index label 0.
df['tagline_embed'][0]

array([-1.05340111e+00, -4.17696059e-01, -3.74945998e-01, -1.95810929e-01,
       -4.25235480e-02, -1.69917792e-01,  2.70007730e-01,  4.61596638e-01,
       -8.00336123e-01, -8.90525401e-01,  2.09924594e-01, -2.04895258e-01,
        4.75863338e-01,  9.11633730e-01, -1.00525267e-01,  8.33216533e-02,
       -1.22421825e+00,  4.67156798e-01,  3.22507769e-01, -8.61270487e-01,
        6.16715670e-01, -5.03304750e-02, -2.31207177e-01,  1.14422292e-01,
        5.97352147e-01, -3.55271518e-01,  3.81158203e-01,  2.74519950e-01,
       -5.55211484e-01,  4.93980795e-01, -1.87905312e-01,  5.55693924e-01,
       -4.29514349e-01,  1.36305833e+00,  1.03442177e-01, -6.51053369e-01,
        3.49200577e-01,  2.05711648e-02,  6.54314101e-01,  7.90403843e-01,
        3.46953777e-04, -4.64193732e-01,  6.85759783e-01, -3.69947404e-01,
        1.98885590e-01,  2.98309118e-01, -2.34396315e+00,  2.67198294e-01,
        8.84705931e-02,  1.57402724e-01, -4.94269729e-02, -4.08771671e-02,
        5.50287962e-01,  

In [24]:
# Serialise every embedding array to a JSON list string so it can be written
# to CSV and parsed back with json.loads. Each value must still be an array
# here (it needs `.tolist()`), so this cell cannot be run twice in a row.
df['tagline_embed'] = df['tagline_embed'].map(lambda vec: json.dumps(vec.tolist()))
df['overview_embed'] = df['overview_embed'].map(lambda vec: json.dumps(vec.tolist()))


In [25]:
# Show the first row to check the serialised embedding columns.
df.head(n=1)

Unnamed: 0,tmdb_id,adult,belongs_to_collection,collection_id,budget,overview,production_company_id,production_company_name,release_month,release_year,...,actor0_movies,actor1_rev,actor1_movies,actor2_rev,actor2_movies,actor_0_name_encoded,actor_1_name_encoded,actor_2_name_encoded,overview_embed,tagline_embed
0,96128,False,False,,5700,"A woman, with the aid of her police officer sw...",111965.0,Independent Moving Pictures,11.0,1913.0,...,0,0,0,0,0,0.0,0.0,0.0,"[-0.6060653924942017, 0.15855203568935394, -0....","[-1.0534011125564575, -0.4176960587501526, -0...."


In [28]:
# Write the frame, including the JSON-encoded embedding columns, without the index.
df.to_csv(path_or_buf="Encoded_Data_encoded.csv", index=False)

## Example: reading Encoded_Data_encoded.csv back in

In [6]:
# Reload the exported file; the embedding columns come back as JSON strings.
df = pd.read_csv(filepath_or_buffer='Encoded_Data_encoded.csv')

In [29]:
# Show the first row of the reloaded frame.
df.head(n=1)

Unnamed: 0,tmdb_id,adult,belongs_to_collection,collection_id,budget,overview,production_company_id,production_company_name,release_month,release_year,...,actor0_movies,actor1_rev,actor1_movies,actor2_rev,actor2_movies,actor_0_name_encoded,actor_1_name_encoded,actor_2_name_encoded,overview_embed,tagline_embed
0,96128,False,False,,5700,"A woman, with the aid of her police officer sw...",111965.0,Independent Moving Pictures,11.0,1913.0,...,0,0,0,0,0,0.0,0.0,0.0,"[-0.6060653924942017, 0.15855203568935394, -0....","[-1.0534011125564575, -0.4176960587501526, -0...."


In [32]:
# Parse the JSON strings back into Python lists of floats. The values must
# still be strings here, so this cell cannot be run twice in a row.
df['tagline_embed'] = df['tagline_embed'].map(json.loads)
df['overview_embed'] = df['overview_embed'].map(json.loads)

In [33]:
# Show the first row after decoding the embedding columns.
df.head(n=1)

Unnamed: 0,tmdb_id,adult,belongs_to_collection,collection_id,budget,overview,production_company_id,production_company_name,release_month,release_year,...,actor0_movies,actor1_rev,actor1_movies,actor2_rev,actor2_movies,actor_0_name_encoded,actor_1_name_encoded,actor_2_name_encoded,overview_embed,tagline_embed
0,96128,False,False,,5700,"A woman, with the aid of her police officer sw...",111965.0,Independent Moving Pictures,11.0,1913.0,...,0,0,0,0,0,0.0,0.0,0.0,"[-0.6060653924942017, 0.15855203568935394, -0....","[-1.0534011125564575, -0.4176960587501526, -0...."


In [34]:
# Length of the decoded overview embedding stored under index label 0.
len(df['overview_embed'][0])

768

In [35]:
# Inspect the decoded overview embedding stored under index label 0.
df['overview_embed'][0]

[-0.6060653924942017,
 0.15855203568935394,
 -0.30640944838523865,
 0.14413851499557495,
 0.009374142624437809,
 0.15875253081321716,
 0.05634134262800217,
 0.13884319365024567,
 0.09205900877714157,
 0.0580303855240345,
 -0.2989707291126251,
 0.5111544132232666,
 0.1500828117132187,
 1.1291334629058838,
 0.6911958456039429,
 0.42272934317588806,
 0.004518207162618637,
 0.25257575511932373,
 0.49423667788505554,
 -0.326384037733078,
 -0.1685331165790558,
 -0.597131073474884,
 0.39774656295776367,
 0.5422343611717224,
 -0.13373897969722748,
 0.3793506622314453,
 -0.3869115710258484,
 0.14979946613311768,
 -0.06045251712203026,
 0.4717200994491577,
 0.003959898371249437,
 -0.3826150894165039,
 -0.6329940557479858,
 -0.9973541498184204,
 0.35046836733818054,
 0.23389017581939697,
 0.36640211939811707,
 0.21136681735515594,
 0.045608509331941605,
 0.6200339198112488,
 -0.3297504782676697,
 -0.1313825249671936,
 -0.023528676480054855,
 0.46954822540283203,
 0.22736218571662903,
 -0.91700452