In [24]:
!pip install sentence_transformers tqdm



In [26]:
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim
import pandas as pd
import dask.dataframe as dd
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import numpy as np
import torch

In [5]:
# Load the movie description CSV into a DataFrame.
desc_df = pd.read_csv(
    filepath_or_buffer='/content/sample_data/movie_description_clean.csv',
)

In [15]:
def encode_and_format(text):
    """Embed ``text`` with mxbai-embed-large-v1 and return it as a Python list.

    The SentenceTransformer is constructed on the first call only and cached
    on the function object, so applying this function row by row (for example
    through ``Series.apply``) does not reload the model for every row.

    Args:
        text: Input passed unchanged to ``SentenceTransformer.encode``.

    Returns:
        The result of ``encode(text)`` converted with ``.tolist()``.
    """
    model = getattr(encode_and_format, "_model", None)
    if model is None:
        # Building the model is the expensive step; do it once and reuse it.
        model = SentenceTransformer('mixedbread-ai/mxbai-embed-large-v1')
        encode_and_format._model = model
    # Generate the embedding and convert it to a plain Python list
    embedding = model.encode(text)
    return embedding.tolist()

In [16]:
desc_df.head()

Unnamed: 0.1,Unnamed: 0,tconst,primaryTitle,About
0,0,tt0000012,The Arrival of a Train,"""The Arrival of a Train"" is a 1896 silent shor..."
1,1,tt0000417,A Trip to the Moon,"""A Trip to the Moon"" (1902) is a classic silen..."
2,2,tt0000439,The Great Train Robbery,"""The Great Train Robbery"" is a 1978 British cr..."
3,3,tt0006864,Intolerance,"""Intolerance"" (1916) is a silent epic directed..."
4,4,tt0009968,Broken Blossoms,"""Broken Blossoms"" is a 1919 silent film direct..."


In [17]:
# Smoke test: embed only the first row's title to check the function end to end.
Temp = desc_df['primaryTitle'].iloc[:1].apply(encode_and_format)

In [18]:
Temp

0    [-0.4307757616043091, 0.1945396214723587, 0.45...
Name: primaryTitle, dtype: object

In [19]:
# Register a Dask ProgressBar globally so that subsequent Dask computations
# print their progress.
from dask.diagnostics import ProgressBar
ProgressBar().register()

In [23]:
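# NOTE: abandoned Dask experiment, kept for reference only. The recorded run was
# interrupted at 0% after ~114 s; the batched encoding further down is used instead.
# dask_df = dd.from_pandas(desc_df, npartitions=50)  # Adjust npartitions based on your dataset size and memory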

# # Apply the function using Dask with correct meta syntax
# dask_df['Embedding'] = dask_df.map_partitions(lambda df: df['About'].apply(encode_and_format),
#                                                meta=('Embedding', object))  # Correctly specify meta as a tuple

# # Compute the result (trigger computation)
# result_df = dask_df.compute()

# # Display the first few rows of the DataFrame
# print(result_df.head())

[                                        ] | 0% Completed | 114.01 s
[                                        ] | 0% Completed | 114.10 s


KeyboardInterrupt: 

In [27]:
# Use the GPU when torch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the embedding model once, then move it to the selected device.
model = SentenceTransformer('mixedbread-ai/mxbai-embed-large-v1')
model = model.to(device)

class TextDataset(Dataset):
    """Map-style dataset that exposes a sequence of texts by position."""

    def __init__(self, texts):
        # Keep a reference to the caller's sequence; nothing is copied.
        self.texts = texts

    def __len__(self):
        """Return the number of texts held by the dataset."""
        count = len(self.texts)
        return count

    def __getitem__(self, idx):
        """Return the text stored at position ``idx``."""
        item = self.texts[idx]
        return item

# Batch size for encoding; pick a value that suits your GPU memory.
batch_size = 100

# Pull the About column out as a plain Python list and wrap it for batching.
texts = desc_df['About'].to_list()
dataset = TextDataset(texts)

# shuffle=False keeps the batches in DataFrame row order.
data_loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=False)

In [28]:
def encode_texts(data_loader):
    """Embed every batch yielded by ``data_loader`` using the global ``model``.

    Args:
        data_loader: Iterable that yields batches of texts.

    Returns:
        A numpy array built from all per-text embeddings, in iteration order.
    """
    model.eval()  # switch the model to evaluation mode
    collected = []

    with torch.no_grad():  # no gradient tracking while encoding
        progress = tqdm(data_loader, desc="Encoding", unit="batch")
        for batch in progress:
            batch_tensor = model.encode(batch, convert_to_tensor=True, show_progress_bar=False)
            # Bring the batch back to the CPU as numpy, then append it row by row.
            batch_array = batch_tensor.cpu().numpy()
            collected += list(batch_array)

    return np.array(collected)

# Encode all texts (tqdm reports per-batch progress) and keep the resulting array
embeddings = encode_texts(data_loader)


Encoding: 100%|██████████| 75/75 [02:38<00:00,  2.11s/batch]


In [29]:
# Convert the embeddings array to nested Python lists so each DataFrame cell can
# hold one embedding. Assumes `embeddings` has one row per DataFrame row, in the
# same order (the loader above was built from desc_df with shuffle=False).
embeddings_list = embeddings.tolist()  # Convert each embedding numpy array to a list
# Adds the column to desc_df in place.
desc_df['Embeddings'] = embeddings_list

# Display the updated DataFrame
print(desc_df.head())

   Unnamed: 0     tconst             primaryTitle  \
0           0  tt0000012   The Arrival of a Train   
1           1  tt0000417       A Trip to the Moon   
2           2  tt0000439  The Great Train Robbery   
3           3  tt0006864              Intolerance   
4           4  tt0009968          Broken Blossoms   

                                               About  \
0  "The Arrival of a Train" is a 1896 silent shor...   
1  "A Trip to the Moon" (1902) is a classic silen...   
2  "The Great Train Robbery" is a 1978 British cr...   
3  "Intolerance" (1916) is a silent epic directed...   
4  "Broken Blossoms" is a 1919 silent film direct...   

                                          Embeddings  
0  [-0.6185780167579651, 0.3374672532081604, 0.87...  
1  [-0.09322287887334824, 0.7676082849502563, 0.3...  
2  [0.700336217880249, 0.25397372245788574, 0.077...  
3  [-0.0787254050374031, 0.4562748074531555, 0.61...  
4  [-1.0412063598632812, -0.3301662802696228, 0.7...  


In [30]:
# Persist the descriptions together with their embeddings (row index omitted).
# NOTE(review): the Embeddings column holds Python lists, which CSV stores as
# text; a reader will need to parse them back (e.g. with ast.literal_eval) —
# confirm how downstream code loads this file.
desc_df.to_csv('/content/sample_data/movie_description_clean_embeddings.csv', index=False)

In [31]:
# Report the DataFrame's in-memory footprint. deep=True also counts the Python
# objects referenced from object-dtype columns.
df_size_bytes = desc_df.memory_usage(index=True, deep=True).sum()
df_size_mb = df_size_bytes / 1024 / 1024  # bytes -> MiB
print(f"DataFrame size: {df_size_mb:.2f} MB")

DataFrame size: 62.36 MB
