### Generate Embeddings to JSON Files

In [1]:
import os

from dotenv import load_dotenv
from openai import AzureOpenAI

In [4]:
# Read configuration via python-dotenv; override=True lets values loaded by
# load_dotenv replace variables that are already set in the environment.
load_dotenv(override=True)

# Azure OpenAI client used for every embedding request in this notebook.
# All connection settings come from environment variables, never literals.
client = AzureOpenAI(
    azure_endpoint=os.getenv("AZURE_OPENAI_EMBEDDING_ENDPOINT"),
    api_version=os.getenv("AZURE_OPENAI_EMBEDDING_VERSION"),
    api_key=os.getenv("AZURE_OPENAI_EMBEDDING_API_KEY"),
)

In [5]:
# Smoke test: embed a short fixed string to confirm the deployment responds,
# then show the vector's dimensionality followed by the vector itself.
response = client.embeddings.create(
    model=os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME"),
    input="Hello world",
)

sample_embedding = response.data[0].embedding
print(len(sample_embedding))
print(sample_embedding)

1536
[-0.002078542485833168, -0.04908587411046028, 0.020946789532899857, 0.03135102614760399, -0.04530530795454979, -0.026402482762932777, -0.028999701142311096, 0.06030462309718132, -0.02571091614663601, -0.01482258178293705, 0.015444992110133171, -0.029983261600136757, -0.020393535494804382, -0.03334888815879822, 0.025833861902356148, 0.014207855798304081, -0.0700787678360939, 0.012432834133505821, 0.014791845344007015, 0.048839982599020004, 0.02073163539171219, -0.008890475146472454, -0.015114576555788517, -0.01661297120153904, 0.025926070287823677, -0.002902659587562084, -0.024327782914042473, 0.024281678721308708, 0.0017433245666325092, -0.055724915117025375, 0.02308296225965023, -0.045489728450775146, -0.008652268908917904, 0.0031619970686733723, 0.004583551082760096, 0.001794231589883566, 0.02669447846710682, 0.010158347897231579, -0.012056314386427402, -0.011472324840724468, -0.01491479016840458, -0.023129066452383995, 0.025357449427247047, 0.036822088062763214, -0.035500429570

In [6]:
import json
from ipywidgets import IntProgress
from IPython.display import display


In [7]:
# Input directory holding the transformed review files, and the output
# directory that receives the same files with an embedding added.
transformed_dir = "../data/transformed"
embedded_dir = "../data/embedded"

# exist_ok=True keeps this cell safe to re-run and removes the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(embedded_dir, exist_ok=True)
    

In [8]:
def prepare_embedding_str(review_json):
    """Build the single-line text that is sent to the embedding model.

    Args:
        review_json: Mapping with the keys ``review_title``, ``review_text``,
            ``hotel_name``, ``city`` and ``hotel_state``.

    Returns:
        A string of ``LABEL: value`` pairs separated by single spaces, in the
        order title, text, hotel name, hotel city, hotel state.

    Raises:
        KeyError: If any of the five keys is missing from ``review_json``.
    """
    title = review_json['review_title']
    text = review_json['review_text']
    hotel = review_json['hotel_name']
    # The city is stored under the bare 'city' key, unlike the other hotel fields.
    city = review_json['city']
    state = review_json['hotel_state']

    return (
        f"REVIEW_TITLE: {title} "
        f"REVIEW_TEXT: {text} "
        f"HOTEL_NAME: {hotel} "
        f"HOTEL_CITY: {city} "
        f"HOTEL_STATE: {state}"
    )

In [9]:
# Take one listing of the input directory and report how many entries it holds.
files = os.listdir(transformed_dir)
total_files = len(files)
print(f"Total files to process: {total_files}")

Total files to process: 999


In [10]:
# Embed every transformed review and write it, with its vector attached, to
# embedded_dir under the same file name.

# List the directory exactly once so the progress bar and the loop agree.
# Only regular *.json files are treated as reviews (anything else would fail
# in json.load); sorting makes the processing order deterministic.
review_files = sorted(
    name
    for name in os.listdir(transformed_dir)
    if name.endswith(".json")
    and os.path.isfile(os.path.join(transformed_dir, name))
)

max_count = len(review_files)
progress_bar = IntProgress(min=0, max=max_count, description="Progress:")

display(progress_bar)

# The deployment name is loop-invariant, so read the environment once.
embedding_model = os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME")

for file in review_files:
    # Read and close the input before the network call so no file handle is
    # held open while waiting on the embedding service.
    # NOTE(review): assumes the transformed files are UTF-8 encoded JSON - confirm.
    with open(os.path.join(transformed_dir, file), "r", encoding="utf-8") as source:
        review = json.load(source)

    embedding_str = prepare_embedding_str(review)
    response = client.embeddings.create(
        input=embedding_str,
        model=embedding_model,
    )

    review['embedding'] = response.data[0].embedding

    # A distinct handle name avoids shadowing the input handle.
    with open(os.path.join(embedded_dir, file), "w", encoding="utf-8") as target:
        json.dump(review, target, indent=2)

    print(
        f"Processed file {file} with embedding length {len(review['embedding'])}")

    progress_bar.value += 1

IntProgress(value=0, description='Progress:', max=999)

Processed file review_1.json with embedding length 1536
Processed file review_10.json with embedding length 1536
Processed file review_100.json with embedding length 1536
Processed file review_101.json with embedding length 1536
Processed file review_102.json with embedding length 1536
Processed file review_103.json with embedding length 1536
Processed file review_104.json with embedding length 1536
Processed file review_105.json with embedding length 1536
Processed file review_106.json with embedding length 1536
Processed file review_107.json with embedding length 1536
Processed file review_108.json with embedding length 1536
Processed file review_109.json with embedding length 1536
Processed file review_11.json with embedding length 1536
Processed file review_110.json with embedding length 1536
Processed file review_111.json with embedding length 1536
Processed file review_112.json with embedding length 1536
Processed file review_113.json with embedding length 1536
Processed file rev