In [1]:
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch

In [2]:
# Load the pretrained uncased BERT base encoder together with its matching
# tokenizer; both must come from the same checkpoint name so the vocabulary
# ids line up with the model's embedding table.
model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [3]:
# Read the combined TMDB metadata table into memory.
# NOTE(review): this is an absolute path on a shared filesystem, so the
# notebook only runs on a machine where that mount exists.
df = pd.read_csv(
    filepath_or_buffer="/shared/3/projects/bowenyi/Floyd_Month/28/combined_tmdb_data.csv"
)

In [4]:
# Show the column labels of the loaded table so the free-text fields
# available for encoding can be identified.
df.columns

Index(['adult', 'budget', 'genres', 'id', 'imdb_id', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')

In [5]:
# Normalise the free-text columns so every row holds a plain Python string.
#
# Missing values are replaced with '' *before* the string cast: casting first
# would turn NaN into the literal text 'nan', which would then be encoded as
# if it were real content.
#
# The result is assigned back to the column instead of calling
# `df[col].fillna(..., inplace=True)`. That form mutates an intermediate
# Series obtained through chained indexing; it is deprecated in pandas 2.x
# and leaves `df` unchanged when Copy-on-Write is enabled.
for text_column in ('overview', 'tagline'):
    df[text_column] = df[text_column].fillna('').astype(str)

In [6]:
def encode_text_batch(text_list, tokenizer, model, batch_size=32):
    """Encode texts into one vector each, processing them in mini-batches.

    Each batch is tokenized (padded to the longest item in the batch and
    truncated to at most 512 tokens), passed through ``model`` without
    gradient tracking, and the hidden state at sequence position 0 is kept
    as the text's embedding (for BERT-style tokenizers that position is the
    ``[CLS]`` token).

    Args:
        text_list: Sequence of strings to encode. Any sequence is accepted
            (list, tuple, pandas Series, ...); it is copied into a list.
        tokenizer: Callable tokenizer returning a mapping of tensors when
            invoked with ``return_tensors="pt"``.
        model: ``torch.nn.Module`` whose output exposes
            ``last_hidden_state``. It is moved to the GPU when CUDA is
            available (otherwise to the CPU) and stays on that device.
        batch_size: Number of texts per forward pass; must be >= 1.

    Returns:
        A list with one 1-D numpy array per input text, in input order.
        An empty input yields an empty list.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. (A negative value
            would otherwise produce an empty range and silently return no
            embeddings at all.)
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Materialise as a list so slicing always hands the tokenizer a list,
    # whatever sequence type the caller supplied.
    texts = list(text_list)

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)

    # Run in eval mode so dropout is disabled and embeddings are
    # deterministic; the caller's training flag is restored afterwards.
    was_training = model.training
    model.eval()

    all_embeddings = []
    try:
        with torch.no_grad():
            for start in range(0, len(texts), batch_size):
                batch_texts = texts[start:start + batch_size]
                inputs = tokenizer(
                    batch_texts,
                    padding=True,
                    truncation=True,
                    return_tensors="pt",
                    max_length=512,
                )
                inputs = {k: v.to(device) for k, v in inputs.items()}

                outputs = model(**inputs)
                # Keep only the first token's hidden state for each text.
                embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()

                # Extending with a 2-D array appends one row per text.
                all_embeddings.extend(embeddings)
    finally:
        model.train(was_training)

    return all_embeddings



In [None]:
# Free-text columns to embed; each gets a sibling '<name>_embed' column
# holding one embedding vector per row.
features = ['overview', 'tagline']
for text_column in features:
    column_texts = df[text_column].tolist()
    df[f"{text_column}_embed"] = encode_text_batch(column_texts, tokenizer, model, batch_size=32)

In [8]:
# Persist the table, including the new embedding columns, to the working
# directory.
# NOTE(review): the '_embed' cells are numpy arrays, which CSV can only store
# as text; confirm the consumer can parse that, or consider a binary format.
df.to_csv(path_or_buf="combined_with_encodings.csv")

In [10]:
# Report (rows, columns) of the table after the '_embed' columns were added.
df.shape

(8887, 22)