In [1]:
import pandas as pd
import tiktoken
from openai import OpenAI

# Initialize the OpenAI client.
# No api_key argument is passed on purpose: the client then reads the key from
# the OPENAI_API_KEY environment variable, so no credential (or placeholder
# string mistaken for one) lives in the notebook source.
client = OpenAI()

def get_embedding(text: str, model: str = "text-embedding-3-small"):
    """Return the embedding vector for ``text``.

    Args:
        text: String to embed. Newlines are replaced with spaces before the
            request is sent.
        model: Model name forwarded to the embeddings endpoint. The default is
            an embedding model; chat model names such as "GPT-4o" are not
            embedding models and should not be passed here.

    Returns:
        The ``embedding`` attribute of the first item in the response's
        ``data`` (presumably a list of floats -- confirm against the API docs).
    """
    # Clean and preprocess the text: collapse newlines into spaces.
    cleaned = text.replace("\n", " ")
    # Request a single embedding; the input is sent as a one-element batch.
    response = client.embeddings.create(input=[cleaned], model=model)
    return response.data[0].embedding


In [2]:
# Load the dataset
file_path = "/home1/nhuynh2023/datasets/PDF_HEA_Gibbs/HEA_Dataset.csv"
dataset = pd.read_csv(file_path)

# Build the composition text as "Fe=<v>, Co=<v>, Ni=<v>, Cu=<v>, Zn=<v>".
# Every element -- including the first -- carries its "<symbol>=" label so the
# prompt is unambiguous about which value belongs to which element.
element_columns = ["Fe", "Co", "Ni", "Cu", "Zn"]
composition = element_columns[0] + "=" + dataset[element_columns[0]].astype(str)
for element in element_columns[1:]:
    composition = composition + ", " + element + "=" + dataset[element].astype(str)

# Create combined prompt column.
# NOTE(review): the active-site columns are concatenated without a cast, so
# they are assumed to already hold strings with no missing values -- confirm.
dataset["prompt"] = (
    "Alloy composition: " + composition +
    "\nActive sites: " + dataset["active_site_1"] + ", " + dataset["active_site_2"]
)


In [3]:
# Token limit and encoding setup.
# The model must be an embeddings model: it is forwarded to the embeddings
# endpoint by get_embedding(). "text-embedding-3-small" is tokenized with
# cl100k_base, which is the encoding used for the token counts below.
embedding_model = "text-embedding-3-small"
embedding_encoding = "cl100k_base"
max_tokens = 8000  # Kept slightly under the embedding model's input limit (8191 per OpenAI docs -- confirm)

encoding = tiktoken.get_encoding(embedding_encoding)

# Calculate the number of tokens in each prompt
dataset["n_tokens"] = dataset["prompt"].apply(lambda x: len(encoding.encode(x)))

# Filter out prompts that exceed the token limit.
# .copy() makes the result an independent frame, so adding columns to it later
# does not raise SettingWithCopyWarning.
dataset = dataset[dataset["n_tokens"] <= max_tokens].copy()


In [4]:
# Generate embeddings for all samples (one get_embedding call per row).
# assign() returns a new frame instead of writing a column into a frame that
# may be a filtered slice, which avoids pandas' SettingWithCopyWarning.
dataset = dataset.assign(
    embedding=dataset["prompt"].apply(lambda x: get_embedding(x, model=embedding_model))
)

# Save the dataset with embeddings
output_path = "/home1/nhuynh2023/datasets/PDF_HEA_Gibbs/HEA_Dataset_with_embeddings.csv"
dataset.to_csv(output_path, index=False)

print(f"Embeddings saved to {output_path}")


Embeddings saved to /home1/nhuynh2023/datasets/PDF_HEA_Gibbs/HEA_Dataset_with_embeddings.csv
