# [STARTER] Udaplay Project

## Part 01 - Offline RAG

In this part of the project, you'll build your VectorDB using Chroma.

The data is in the `project/starter/games` folder. Each JSON file there will become one document in the collection you'll create.
Example:
```json
{
  "Name": "Gran Turismo",
  "Platform": "PlayStation 1",
  "Genre": "Racing",
  "Publisher": "Sony Computer Entertainment",
  "Description": "A realistic racing simulator featuring a wide array of cars and tracks, setting a new standard for the genre.",
  "YearOfRelease": 1997
}
```


### Setup

In [10]:
# Only needed for Udacity workspace

import importlib.util
import sys

# When the optional `pysqlite3` package is installed, register it under the
# name `sqlite3` so that later `import sqlite3` statements resolve to it.
# NOTE(review): presumably needed because Chroma requires a newer SQLite than
# the hosted workspace ships - confirm against Chroma's troubleshooting docs.
_pysqlite3_available = importlib.util.find_spec("pysqlite3") is not None
if _pysqlite3_available:
    import pysqlite3
    # Move the module entry: afterwards only the 'sqlite3' key refers to it.
    sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')

In [11]:
import os
import json
import chromadb
from chromadb.utils import embedding_functions
from dotenv import load_dotenv

In [12]:
# TODO: Create a .env file with the following variables
# OPENAI_API_KEY="YOUR_KEY"
# CHROMA_OPENAI_API_KEY="YOUR_KEY"
# TAVILY_API_KEY="YOUR_KEY"

In [None]:
# Load environment variables (e.g. from the .env file described above)
load_dotenv()

# The OpenAI key is mandatory: fail fast with a helpful message when missing.
openai_api_key = os.getenv('OPENAI_API_KEY')
if not openai_api_key:
    raise ValueError(
        'OPENAI_API_KEY not found in environment variables. '
        'Please create a .env file with OPENAI_API_KEY="your_key"'
    )

# CHROMA_OPENAI_API_KEY is optional: when it is unset or empty we fall back to
# the OpenAI key, which is guaranteed to be non-empty at this point. A second
# "missing key" check could therefore never fire and has been removed.
chroma_api_key = os.getenv('CHROMA_OPENAI_API_KEY') or openai_api_key

print('✅ API keys loaded successfully!')

### VectorDB Instance

In [None]:
# Instantiate ChromaDB Client
# "chromadb" is a relative path, so it is resolved against the notebook's
# current working directory. Per Chroma's documentation a PersistentClient
# saves the database under this path and reloads it on later runs.
chroma_client = chromadb.PersistentClient(path="chromadb")

### Collection

In [16]:
# TODO: Pick one embedding function
# If picking something different than openai,
# make sure you use the same when loading it
#
# The API key is read from the OPENAI_API_KEY environment variable at the
# moment this cell runs.
# NOTE(review): requests are sent to the Vocareum endpoint below; a key issued
# directly by OpenAI presumably will not work against it - confirm before
# running outside the Udacity/Vocareum workspace.
embedding_fn = embedding_functions.OpenAIEmbeddingFunction(
    api_key=os.getenv("OPENAI_API_KEY"),
    api_base="https://openai.vocareum.com/v1",  # Vocareum-hosted OpenAI-compatible endpoint
    model_name="text-embedding-ada-002"
)

In [17]:
# TODO: Create a collection
# Choose any name you want
#
# get_or_create_collection is called with the name "udaplay", the embedding
# function defined above and the metadata entry "hnsw:space": "cosine"
# (per Chroma's collection configuration docs this selects cosine distance
# for the index).
collection = chroma_client.get_or_create_collection(
    name="udaplay",
    embedding_function=embedding_fn,
    metadata={"hnsw:space": "cosine"}
)
print(f"Successfully loaded or created collection '{collection.name}'.")

Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


Successfully loaded or created collection 'udaplay'.


### Game Data Processing and Importing

In [None]:
# Load and process game data
def load_game_data(data_dir):
    """Read every ``*.json`` file in *data_dir* and return the parsed objects.

    Files are visited in sorted file-name order; entries whose name does not
    end in ``.json`` are ignored.

    Args:
        data_dir: Path of the directory that holds the game JSON files.

    Returns:
        A list with one parsed JSON object per file, in file-name order.
    """
    json_names = [
        entry for entry in sorted(os.listdir(data_dir))
        if entry.endswith('.json')
    ]

    records = []
    for entry in json_names:
        with open(os.path.join(data_dir, entry), 'r', encoding='utf-8') as handle:
            records.append(json.load(handle))
    return records

# Insert data into ChromaDB
def insert_data_into_chromadb(collection, game_data):
    """Index every game in *game_data* into *collection* with one batched upsert.

    Each game becomes one document whose text is
    ``[Platform] Name (YearOfRelease) - Description`` and whose ID is the
    game's name with spaces replaced by underscores. The whole game dict is
    stored as that document's metadata.

    All documents are sent in a single ``upsert`` call instead of one ``add``
    call per game, and ``upsert`` refreshes entries whose ID already exists,
    so the function can be re-run safely.

    Args:
        collection: Object exposing
            ``upsert(ids=..., documents=..., metadatas=...)`` (a Chroma
            collection in this notebook).
        game_data: Iterable of dicts containing the keys ``Name``,
            ``Platform``, ``YearOfRelease`` and ``Description``.

    Raises:
        KeyError: If a game dict lacks one of the keys listed above.
    """
    batch = {}
    for game in game_data:
        content = f"[{game['Platform']}] {game['Name']} ({game['YearOfRelease']}) - {game['Description']}"
        doc_id = game['Name'].replace(' ', '_')
        # Chroma rejects duplicate IDs within one call, so only the first
        # game seen for a given ID is kept.
        if doc_id not in batch:
            batch[doc_id] = (content, game)

    # An empty batch is skipped: there is nothing to send.
    if batch:
        collection.upsert(
            ids=list(batch),
            documents=[content for content, _ in batch.values()],
            metadatas=[game for _, game in batch.values()],
        )
    print('Data inserted into ChromaDB.')

# Load game data and insert
# `data_dir` is a relative path, so it is resolved against the notebook's
# current working directory.
data_dir = 'games'
game_data = load_game_data(data_dir)
insert_data_into_chromadb(collection, game_data)


In [None]:
# Ensure the directory "games" exists before reading from it
data_dir = "games"
if not os.path.isdir(data_dir):
    raise FileNotFoundError(
        f"Game data directory '{data_dir}' not found; "
        "run this notebook from the folder that contains it."
    )

# Collect all JSON files first, then send them to ChromaDB in one batch.
# IDs are derived from the game name (spaces -> underscores), the same scheme
# insert_data_into_chromadb uses, so running both cells refreshes the same
# entries instead of storing every game twice under two different IDs.
# NOTE: collections filled by the earlier version of this cell may still hold
# entries keyed by file name (e.g. "001"); delete those if duplicates show up.
batch_ids = []
batch_documents = []
batch_metadatas = []
seen_ids = set()
for file_name in sorted(os.listdir(data_dir)):
    if not file_name.endswith(".json"):
        continue
    file_path = os.path.join(data_dir, file_name)
    with open(file_path, "r", encoding="utf-8") as f:
        game = json.load(f)
    content = f"[{game['Platform']}] {game['Name']} ({game['YearOfRelease']}) - {game['Description']}"
    doc_id = game["Name"].replace(" ", "_")
    # Chroma rejects duplicate IDs within one call; keep the first occurrence.
    if doc_id in seen_ids:
        continue
    seen_ids.add(doc_id)
    batch_ids.append(doc_id)
    batch_documents.append(content)
    batch_metadatas.append(game)

# upsert (rather than add) keeps the cell idempotent across re-runs; an empty
# batch is skipped because there is nothing to send.
if batch_ids:
    collection.upsert(
        ids=batch_ids,
        documents=batch_documents,
        metadatas=batch_metadatas,
    )
print("Documents added to ChromaDB.")