In [1]:
import csv
import json
import uuid
import os

raw_reviews_file = "../data/raw/hotel_reviews_1000.csv"
transformed_dir = "../data/transformed"

raw_reviews = open(raw_reviews_file, "r").readlines()

if not os.path.exists(transformed_dir):
    os.makedirs(transformed_dir)

def process_reviews(file_path):
    with open(file_path, 'r', newline='', encoding='utf-8') as csvfile:
        # Read the first line to get the header
        header = next(csv.reader(csvfile))
        
        # Create a mapping of expected column names to actual column names
        column_mapping = {
            'dateAdded': 'dateAdded',
            'city': 'city',
            'hotel_name': 'name',
            'hotel_state': 'province',
            'review_text': 'reviews.text',
            'review_title': 'reviews.title'
        }
        
        # Find the index of each required column
        column_indices = {}
        for expected_name, actual_name in column_mapping.items():
            try:
                column_indices[expected_name] = header.index(actual_name)
            except ValueError:
                print(f"Warning: Column '{actual_name}' not found in the CSV. Some data may be missing.")
        
        # Reset file pointer to the beginning
        csvfile.seek(0)
        
        # Skip the header row
        next(csvfile)
        
        # Use csv.reader instead of DictReader
        reader = csv.reader(csvfile)
        
        for i, row in enumerate(reader, start=1):
            review_json = {}
            for key, index in column_indices.items():
                if index < len(row):
                    review_json[key] = row[index]
                else:
                    review_json[key] = ""  # or None, depending on your preference
            
            # Generate a unique identifier
            review_json['id'] = str(uuid.uuid4())
            
            # print(json.dumps(review_json, indent=2))
            print(f"processed record [{i}] with id [{review_json['id']}]")

            with open(f"{transformed_dir}/review_{i}.json", "w+") as f:
                json.dump(review_json, f, indent=2)
            
process_reviews(raw_reviews_file)

processed record [1] with id [e27b143c-73dd-4dd9-8a27-1b430987b0b8]
processed record [2] with id [a8f5b71b-bf2e-4b7a-b655-4bafebdfb157]
processed record [3] with id [d4db11a2-295c-4074-912f-e1e15929771d]
processed record [4] with id [c6de4507-419b-4b82-8de3-233348b0c15a]
processed record [5] with id [79f2ded5-3e6f-49ce-be04-713da3492bc8]
processed record [6] with id [5818ebb0-830a-45a3-a6ba-39ee06ff029e]
processed record [7] with id [79384e39-b0a5-4fd7-a70f-462421942f02]
processed record [8] with id [9542f58a-9399-429d-a764-f71b6a47bfda]
processed record [9] with id [651ba391-ab61-42d2-9f81-9359460d27ef]
processed record [10] with id [1c57de6f-81f0-41f7-8a5c-8c0ce3287341]
processed record [11] with id [c5b6af24-3773-41da-b012-26b03194dd7c]
processed record [12] with id [ae0896b3-ec4d-4fee-98d2-9e9d48efc6d6]
processed record [13] with id [acc0a404-7fe5-457e-a954-4cea619f34c8]
processed record [14] with id [c71fd8b7-2b68-474a-9d40-b51be6b07097]
processed record [15] with id [da4187dd-ad9

In [2]:
%pip install -q python-dotenv openai

Note: you may need to restart the kernel to use updated packages.


In [3]:
from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()

# print(os.getenv("OPENAI_API_KEY"))

client = OpenAI()

response = client.embeddings.create(
    input="Hello World!",
    model="text-embedding-3-small"
)

print(len(response.data[0].embedding))
print(response.data[0].embedding)

1536
[-0.003034229390323162, -0.056672804057598114, 0.029482627287507057, 0.042976152151823044, -0.04082879424095154, -0.025202423334121704, -0.012789830565452576, 0.03522825613617897, -0.031571947038173676, -0.011135785840451717, -0.015887537971138954, -0.031107652932405472, -0.020269306376576424, -0.0247381292283535, 0.029642228037118912, 0.03586665913462639, -0.03813008964061737, 0.017817256972193718, 0.011396950110793114, 0.0407707579433918, 0.047561049461364746, 0.0025100859347730875, -0.006355015095323324, -0.013943308964371681, 0.0348220020532608, -0.01222397293895483, -0.044398050755262375, 0.018499188125133514, 0.02363543212413788, -0.04338240996003151, 0.045007433742284775, -0.036272916942834854, -0.010221707634627819, 0.0059560127556324005, 0.006220805458724499, 0.0006257077911868691, -0.0016730882925912738, 0.006442070007324219, -0.0014282461488619447, -0.023954633623361588, 0.0121006453409791, -0.027756035327911377, 0.010286998935043812, 0.040277447551488876, -0.0520588904

In [5]:
import os
import json

transformed_dir = "../data/transformed"
embedded_dir = "../data/embedded"

if not os.path.exists(embedded_dir):
    os.makedirs(embedded_dir)
    
def prepare_embedding_str(review_json):
    return f"REVIEW_TITLE: {review_json['review_title']} REVIEW_TEXT: {review_json['review_text']} HOTEL_NAME: {review_json['hotel_name']} HOTEL_CITY: {review_json['city']} HOTEL_STATE: {review_json['hotel_state']}"
    
client = OpenAI()
for file in os.listdir(transformed_dir):
    with open(f"{transformed_dir}/{file}", "r") as f:
        review = json.load(f)
        embedding_str = prepare_embedding_str(review)
        
        response = client.embeddings.create(
            input=embedding_str,
            model="text-embedding-3-small"
        )
        
        review['embedding'] = response.data[0].embedding
        
        with open(f"{embedded_dir}/{file}", "w") as f:
            json.dump(review, f, indent=2)