In [1]:
import pandas as pd
import os
from openai import OpenAI
import tiktoken
import numpy as np
from tqdm import tqdm

In [72]:
# Load the page dataset from the working directory; the preview further down shows
# one row per page with a `url` and a `text` column.
df = pd.read_csv("ucdavis_health_details.csv")

In [73]:
# (rows, columns) of the freshly loaded frame
df.shape

(3085, 2)

In [74]:
# Preview the first five rows to sanity-check the loaded columns
df.head()

Unnamed: 0,url,text
0,https://health.ucdavis.edu/,"Personalized, compassionate care and the exper..."
1,https://health.ucdavis.edu//welcome/,"Personalized, compassionate care and the exper..."
2,https://health.ucdavis.edu//patients-visitors/,Our highly rated doctors partner with you and ...
3,https://health.ucdavis.edu//patients-visitors/...,Our highly rated doctors partner with you and ...
4,https://health.ucdavis.edu//healthcare-profess...,"Discover news, research, and innovations from ..."


In [82]:
## Data cleaning
# Keep the first row for every distinct page text, then discard rows that have a
# missing value in any column. The `url` column is kept.
df = df.drop_duplicates(subset=['text']).dropna()

In [83]:
# Keep only rows whose 'text' value is a non-blank string.
# The explicit .copy() makes the result an independent DataFrame, so the column
# assignments in later cells (df['text'] = ..., df['embedding'] = ...) do not
# trigger pandas' SettingWithCopyWarning on a filtered slice.
is_usable_text = df['text'].apply(lambda value: isinstance(value, str) and value.strip() != '')
df = df[is_usable_text].copy()

In [87]:
# (rows, columns) after de-duplication, NaN removal and the non-string filter
df.shape

(2300, 2)

In [88]:
# Character-based truncation used as a cheap stand-in for a token limit
def truncate_text(text, max_tokens=6000):
    """Return `text` cut down to a character budget derived from `max_tokens`.

    No tokenizer is involved: the budget is estimated with the rule of thumb
    of roughly four characters per token, so the result holds at most
    ``max_tokens * 4`` characters.

    Args:
        text: The string to shorten.
        max_tokens: Approximate token allowance (default 6000).

    Returns:
        `text` unchanged when it fits the budget, otherwise its leading
        ``max_tokens * 4`` characters.
    """
    char_budget = 4 * max_tokens
    shortened = text[:char_budget]
    return shortened

# Cap every page's text at truncate_text's default character budget
df['text'] = df['text'].apply(truncate_text)

In [89]:
# Create the OpenAI client.
# The key is read from the `api_key` environment variable and handed to the client
# explicitly: OpenAI() does not consult the module-level `openai.api_key`, and the
# `openai` module itself is not imported in this notebook (only the OpenAI class is).
# If `api_key` is unset, None is passed and the client falls back to its own default,
# the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.getenv("api_key"))

In [90]:
# Tokenizer for the embedding model. The text-embedding-3-* models tokenize with
# cl100k_base, so counting with that encoding makes the truncation below match what
# the API will actually see.
tokenizer = tiktoken.get_encoding('cl100k_base')

def get_embedding(text, model='text-embedding-3-small', max_tokens=7000):
    """Return the embedding vector for `text`, truncating over-long input first.

    Args:
        text: The string to embed.
        model: Name of the OpenAI embedding model to call.
        max_tokens: Maximum number of tokens sent to the API. Longer inputs are
            cut to their first `max_tokens` tokens. The default is meant to stay
            under the model's input limit (8191 tokens per OpenAI's embeddings
            guide; confirm if the model is changed).

    Returns:
        The `embedding` field of the first item in the API response.

    Raises:
        Any exception raised by the OpenAI client call is propagated unchanged.
    """
    # disallowed_special=() makes text that merely looks like a special token
    # (e.g. "<|endoftext|>" inside page content) encode as ordinary text instead
    # of raising ValueError.
    tokens = tokenizer.encode(text, disallowed_special=())
    if len(tokens) > max_tokens:
        # Cut on token boundaries, then turn the kept tokens back into a string.
        text = tokenizer.decode(tokens[:max_tokens])

    response = client.embeddings.create(input=[text], model=model)
    return response.data[0].embedding

In [91]:
# Register tqdm's `progress_apply` on pandas objects, then embed each row's text.
# get_embedding issues one API request per row, so this cell is the slow one.
tqdm.pandas()
df['embedding'] = df['text'].progress_apply(get_embedding)

100%|███████████████████████████████████████| 2300/2300 [08:20<00:00,  4.59it/s]


In [93]:
# Discard any row that has a missing value in any column after the embedding step
df = df.dropna()

In [94]:
# Persist the frame (including the new 'embedding' column) without the index.
# NOTE(review): the 'embedding' cells hold the objects returned by the API
# (presumably lists of floats); CSV stores them as text, so whoever reads this file
# back will need to parse that column — confirm downstream handling.
df.to_csv('ucdavis_health_embeddings.csv',index=False)