In [3]:
from typing import List, Optional
from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel
from google.colab import auth
# Colab-only step: authenticate the current notebook user before any Vertex AI call.
# NOTE(review): "CREDENTIAL" looks like a placeholder — presumably it should be
# replaced with the real Google Cloud project ID before running; confirm.
PROJECT_ID = "CREDENTIAL"
auth.authenticate_user(project_id=PROJECT_ID)

def embed_text(
    texts: List[str] = ["banana muffins? ", "banana bread? banana muffins?"],
    task: str = "SEMANTIC_SIMILARITY",
    model_name: str = "text-embedding-004",
    dimensionality: Optional[int] = 256,
) -> List[List[float]]:
    """Embed texts with a pre-trained text-embedding model.

    Args:
        texts: Strings to embed. The default list is only read, never mutated.
        task: Task type attached to every ``TextEmbeddingInput``.
        model_name: Name passed to ``TextEmbeddingModel.from_pretrained``.
        dimensionality: Forwarded to ``get_embeddings`` as
            ``output_dimensionality`` when truthy; ``None`` (or ``0``) omits
            the argument altogether.

    Returns:
        The ``values`` of each embedding returned by ``get_embeddings``, one
        list of floats per embedding. Returns ``[]`` when ``texts`` is empty.
    """
    # Materialize the inputs first so any iterable works and emptiness can be
    # checked reliably before touching the model.
    inputs = [TextEmbeddingInput(text, task) for text in texts]
    if not inputs:
        # Nothing to embed: skip the model load and the remote request.
        return []

    model = TextEmbeddingModel.from_pretrained(model_name)
    # Only pass output_dimensionality when a (non-zero) value was requested.
    kwargs = {"output_dimensionality": dimensionality} if dimensionality else {}
    embeddings = model.get_embeddings(inputs, **kwargs)
    return [embedding.values for embedding in embeddings]

In [4]:
import datasets
import pandas as pd
from scipy import spatial

# Columns the input CSV is expected to provide.
REQUIRED_COLUMNS = ["Prompt", "Reference Answer", "Candidate Answer 1", "Candidate Answer 2"]

df = pd.read_csv("dataset.csv")

# Fail early with a clear message instead of a bare KeyError deep inside the loop.
missing_columns = [column for column in REQUIRED_COLUMNS if column not in df.columns]
if missing_columns:
    raise KeyError(f"dataset.csv is missing required columns: {missing_columns}")

# Preference pairs: for every prompt, the candidate answer whose embedding is
# closer (by cosine similarity) to the reference answer is "chosen", the other
# one is "rejected".
preference_pairs = {"prompt": [], "chosen": [], "rejected": []}

# Iterate over the column values directly: this avoids per-row label lookups
# through df.loc (which return a Series rather than a scalar when the index is
# not unique) and keeps each column's own dtype.
for prompt, reference, candidate_1, candidate_2 in zip(
    *(df[column] for column in REQUIRED_COLUMNS)
):
    # One embedding request per row; results come back in the order submitted.
    cand1_vec, cand2_vec, ref_vec = embed_text([candidate_1, candidate_2, reference])

    sim1 = 1 - spatial.distance.cosine(cand1_vec, ref_vec)
    sim2 = 1 - spatial.distance.cosine(cand2_vec, ref_vec)

    # Strict comparison: on a tie, candidate 2 is chosen.
    if sim1 > sim2:
        chosen, rejected = candidate_1, candidate_2
    else:
        chosen, rejected = candidate_2, candidate_1

    # Append all three fields together, only after the embedding call succeeded,
    # so the lists can never end up with different lengths.
    preference_pairs["prompt"].append(prompt)
    preference_pairs["chosen"].append(chosen)
    preference_pairs["rejected"].append(rejected)

dataset = datasets.Dataset.from_dict(preference_pairs)


In [6]:
import pickle

# Persist the `dataset` object built in the previous cell to dataset.pkl.
# Anyone loading this file later must trust it: unpickling executes code.
with open('dataset.pkl', 'wb') as pickle_file:
    pickle.Pickler(pickle_file).dump(dataset)