In [None]:
# !pip install -q sentence-transformers einops

In [2]:
import torch
# Ask PyTorch to empty its CUDA cache. NOTE(review): presumably run to reclaim
# GPU memory left over from earlier notebook executions before the embedding
# model is loaded in the next cell -- confirm it is still needed.
torch.cuda.empty_cache()

In [None]:
import os

import pandas as pd
import torch
from datasets import Dataset, DatasetDict, load_dataset
from huggingface_hub import login
from sentence_transformers import SentenceTransformer

# Hub locations and model settings used by this cell.
SOURCE_DATASET_REPO = "daparasyte/gpt4_dataset_prompt_scores"
TARGET_DATASET_REPO = "daparasyte/gpt4_dataset_prompt_scores_with_embeddings"
EMBEDDING_MODEL_NAME = "jinaai/jina-embeddings-v3"
ENCODE_BATCH_SIZE = 32


def _embed_split(split, split_name, model, batch_size=ENCODE_BATCH_SIZE):
    """Return a DataFrame of prompts, scores and prompt embeddings for a split.

    Args:
        split: One split of the source dataset; its "prompt" and "score"
            columns are read.
        split_name: Name of the split, used only in the progress message.
        model: Object whose ``encode`` method turns the list of prompts into
            an array of embeddings (here a ``SentenceTransformer``).
        batch_size: Number of prompts passed to the model per batch.

    Returns:
        A ``pandas.DataFrame`` with the columns "prompt", "score" and
        "embedding", where each embedding is a plain list of floats.
    """
    print(f"Generating embeddings for the {split_name} split...")

    # Materialise the columns as plain lists so that both ``encode`` and
    # pandas receive ordinary Python sequences.
    prompts = list(split["prompt"])
    scores = list(split["score"])

    embeddings = model.encode(
        prompts, batch_size=batch_size, show_progress_bar=True
    )

    return pd.DataFrame({
        "prompt": prompts,
        "score": scores,
        # One list of floats per prompt, so the column can be stored in the
        # resulting Hugging Face dataset.
        "embedding": embeddings.tolist(),
    })


# Authenticate with Hugging Face. The token is read from the HF_TOKEN
# environment variable instead of being written into the notebook, so the
# credential cannot leak when the notebook is shared or committed. If the
# variable is unset, None is passed and login() is expected to fall back to
# its interactive prompt.
login(token=os.environ.get("HF_TOKEN"))

# Load the dataset from Hugging Face with train and validation splits
dataset = load_dataset(SOURCE_DATASET_REPO)

# Initialize the Jina Embeddings v3 model, on GPU when one is available
device = "cuda" if torch.cuda.is_available() else "cpu"
embedding_model = SentenceTransformer(
    EMBEDDING_MODEL_NAME, trust_remote_code=True, device=device
)

# Embed both splits with the same pipeline
train_data = _embed_split(dataset["train"], "train", embedding_model)
val_data = _embed_split(dataset["validation"], "validation", embedding_model)

# Combine train and validation data into a Hugging Face Dataset
hf_train_dataset = Dataset.from_pandas(train_data)
hf_val_dataset = Dataset.from_pandas(val_data)

dataset_dict = DatasetDict({
    "train": hf_train_dataset,
    "validation": hf_val_dataset,
})

# Push to Hugging Face
dataset_dict.push_to_hub(TARGET_DATASET_REPO)
print("Dataset with embeddings uploaded successfully!")