### Step 1 - Converting CSV to JSON

In [1]:
import csv
import json
import uuid
import os

In [2]:
# Location of the raw hotel-review CSV and the directory that receives the
# per-review JSON files.
raw_reviews_file = "../../ai-foundry-trg-data/hotel_reviews_1000.csv"
transformed_dir = "../data/transformed"

# Read every physical line of the CSV up front; the list is used later in the
# notebook to report how many reviews were processed. The context manager
# closes the file handle deterministically, and the explicit encoding keeps
# the read independent of the platform's default locale.
with open(raw_reviews_file, "r", encoding="utf-8") as raw_file:
    raw_reviews = raw_file.readlines()

# exist_ok=True makes the cell idempotent: re-running it after the directory
# has been created is a no-op, with no separate existence check needed.
os.makedirs(transformed_dir, exist_ok=True)

In [3]:
def process_reviews(file_path, output_dir=None):
    """Convert every review row of a CSV file into its own JSON file.

    The first CSV record is treated as the header. For each following record a
    dict is built with the keys ``dateAdded``, ``city``, ``hotel_name``,
    ``hotel_state``, ``review_text`` and ``review_title`` (taken from the
    columns ``dateAdded``, ``city``, ``name``, ``province``, ``reviews.text``
    and ``reviews.title``), plus a freshly generated UUID4 string under ``id``.
    The dict is written to ``<output_dir>/review_<n>.json``, where ``n`` is the
    1-based record number.

    A mapped column that is absent from the header triggers a printed warning
    and its key is left out of every record. A row that is too short to
    contain a mapped column gets an empty string for that key.

    Args:
        file_path: Path of the CSV file to read.
        output_dir: Directory that receives the JSON files. Defaults to the
            notebook-level ``transformed_dir``. It is created if missing.

    Returns:
        The number of records written (0 for an empty file or a file that
        contains only a header).
    """
    # Fall back to the notebook-level output directory when none is given.
    if output_dir is None:
        output_dir = transformed_dir
    os.makedirs(output_dir, exist_ok=True)

    # Output field name -> column name as it appears in the CSV header.
    column_mapping = {
        'dateAdded': 'dateAdded',
        'city': 'city',
        'hotel_name': 'name',
        'hotel_state': 'province',
        'review_text': 'reviews.text',
        'review_title': 'reviews.title'
    }

    # 'utf-8-sig' decodes plain UTF-8 unchanged but strips a leading byte-order
    # mark, which would otherwise be glued to the first header name and make
    # that column look like it is missing.
    with open(file_path, 'r', newline='', encoding='utf-8-sig') as csvfile:
        # A single reader consumes the header and then the data rows, so a
        # header record is skipped as a CSV record rather than as one
        # physical line.
        reader = csv.reader(csvfile)

        header = next(reader, None)
        if header is None:
            print(f"Warning: '{file_path}' is empty. No reviews were processed.")
            return 0

        # Find the index of each required column
        column_indices = {}
        for expected_name, actual_name in column_mapping.items():
            if actual_name in header:
                column_indices[expected_name] = header.index(actual_name)
            else:
                print(
                    f"Warning: Column '{actual_name}' not found in the CSV. Some data may be missing.")

        processed = 0
        for i, row in enumerate(reader, start=1):
            # Short rows yield "" for the columns they do not reach.
            review_json = {
                key: (row[index] if index < len(row) else "")
                for key, index in column_indices.items()
            }

            # Generate a unique identifier
            review_json['id'] = str(uuid.uuid4())

            print(f"processed record [{i}] with id [{review_json['id']}]")

            # Write-only mode is enough; the explicit encoding keeps the
            # output independent of the platform default.
            out_path = os.path.join(output_dir, f"review_{i}.json")
            with open(out_path, "w", encoding="utf-8") as f:
                json.dump(review_json, f, indent=2)

            processed = i

    return processed

In [4]:
# Run the conversion over the raw CSV file.
process_reviews(raw_reviews_file)

# The reported total is the number of physical lines read from the CSV minus
# one for the header line.
review_total = len(raw_reviews) - 1
print(f"Processed {review_total} reviews and saved to {transformed_dir}")
print("All reviews have been processed and saved as individual JSON files.")

processed record [1] with id [14e42e6a-d6ba-475f-86a5-13e1c82bed94]
processed record [2] with id [4b9bc073-68e7-4060-8d22-df04be2c3f30]
processed record [3] with id [de654721-47f5-4477-8f47-e39ada410a84]
processed record [4] with id [e4d2963f-3347-4b87-9295-63f5ef45654d]
processed record [5] with id [36c4c145-5b33-4446-b2f2-6737d3ab8d7d]
processed record [6] with id [7dc09d04-49e1-44c7-a620-913ffda35d7b]
processed record [7] with id [39d7635f-322f-447f-8077-98b373f805e2]
processed record [8] with id [ac020a85-b465-479b-83ab-aa42f8607fac]
processed record [9] with id [56911a67-e3e3-440c-ad8a-7ec55ad107e6]
processed record [10] with id [6568d61e-5fcd-4a6e-a05b-92a1d9696708]
processed record [11] with id [82ac4aad-2ccf-4cff-b104-b6b3f8ce7e03]
processed record [12] with id [15d51527-b176-4399-992f-becfb728ac27]
processed record [13] with id [a79531c0-4ee5-4c32-ba40-f7f794b55c7a]
processed record [14] with id [6a78cc76-b82c-43f3-b0f3-675679639304]
processed record [15] with id [1a3cd246-2bc