### Loading the necessary libraries and configuration

In [1]:
from __future__ import annotations

import os
from pathlib import Path

from dotenv import load_dotenv
from openai import OpenAI

# Configuration cell: everything below is read from environment variables
# after load_dotenv() has run (the error message below points at a .env file).
load_dotenv()

# The API key is mandatory; fail fast with an actionable message.
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError("OPENAI_API_KEY is not set. Create a .env file with OPENAI_API_KEY=...")

# Single source of truth for the fallback chat model.
DEFAULT_CHAT_MODEL = "gpt-4o-mini"

env_model = os.getenv("OPENAI_MODEL", DEFAULT_CHAT_MODEL)

# Chat models this notebook accepts for OPENAI_MODEL.
SUPPORTED_ASSISTANT_MODELS = {
    "gpt-4o",
    "gpt-4o-mini",
    "gpt-4-turbo",
    "gpt-4-turbo-preview",
    "gpt-4",
    "gpt-3.5-turbo",
}

if env_model in SUPPORTED_ASSISTANT_MODELS:
    model_chat = env_model
else:
    # Say so explicitly: silently swapping the configured model makes a typo
    # in OPENAI_MODEL very hard to notice.
    print(
        f"OPENAI_MODEL={env_model!r} is not in SUPPORTED_ASSISTANT_MODELS; "
        f"falling back to {DEFAULT_CHAT_MODEL!r}."
    )
    model_chat = DEFAULT_CHAT_MODEL

# Optional settings: both are None when the variable is not set.
model_embedding = os.getenv("OPENAI_MODEL_EMBEDDING")

vector_store_id_env = os.getenv("VECTOR_STORE_ID")


### Creating an OpenAI client

In [2]:
# Build the shared API client used by every later cell.
client = OpenAI(api_key=api_key)

# Collect the status lines first, then emit them together; optional settings
# are only reported when they were actually provided.
status_lines = [f"Client ready. Model (chat): {model_chat}"]
if model_embedding:
    status_lines.append(f"Embedding model: {model_embedding}")
if vector_store_id_env:
    status_lines.append(f"VECTOR_STORE_ID found in env: {vector_store_id_env}")
print("\n".join(status_lines))


Client ready. Model (chat): gpt-4o-mini
Embedding model: text-embedding-3-small
VECTOR_STORE_ID found in env: vs_696938b687f88191a79aa6b70c012dbb


### Vector store helpers

In [3]:
def list_vector_stores(limit: int = 100):
    """Print and return one page of vector stores.

    Args:
        limit: Forwarded as ``limit`` to ``client.vector_stores.list``.

    Returns:
        The object returned by ``client.vector_stores.list``; the printed
        summary covers only the entries in its ``data`` attribute.
    """
    page = client.vector_stores.list(limit=limit)
    entries = page.data
    print(f"Total vector stores returned: {len(entries)}")
    for store in entries:
        print(f"{store.id} | {store.name} | created_at={store.created_at}")
    return page


def create_or_get_vector_store(name: str, description: str | None = None):
    """Return the vector store called ``name``, creating it if none exists.

    Args:
        name: Exact name to look up; must contain non-whitespace characters.
        description: Forwarded to ``client.vector_stores.create`` when a new
            store has to be created.

    Returns:
        The first existing vector store whose ``name`` matches, otherwise the
        newly created store.

    Raises:
        ValueError: If ``name`` is empty or only whitespace.
    """
    if not name.strip():
        raise ValueError("Vector store name must be non-empty")

    # Iterate the list result itself instead of its ``.data`` attribute: the
    # SDK's page objects fetch further pages when iterated, so a store that is
    # not on the first page of 100 is still found rather than duplicated.
    for vs in client.vector_stores.list(limit=100):
        if vs.name == name:
            print("Vector store already exists:")
            print(f"  Name: {vs.name}")
            print(f"  ID:   {vs.id}")
            return vs

    vs = client.vector_stores.create(name=name, description=description)
    print("Vector store created:")
    print(f"  Name: {vs.name}")
    print(f"  ID:   {vs.id}")
    return vs


### Creating a vector store

In [4]:
VECTOR_STORE_NAME = "my-knowledge-base"
VECTOR_STORE_DESCRIPTION = "Documents for the Streamlit RAG app"

# Stays None unless the store referenced by VECTOR_STORE_ID is retrieved
# and reported without error.
vector_store = None

if vector_store_id_env:
    try:
        env_store = client.vector_stores.retrieve(vector_store_id_env)
        print("Using existing vector store from .env")
        print(f"  Name: {env_store.name}")
        print(f"  ID:   {env_store.id}")
        # Commit only after everything above succeeded, so a failure leaves
        # vector_store as None and the fallback below takes over.
        vector_store = env_store
    except Exception as e:
        print(f"Could not retrieve VECTOR_STORE_ID from env ({vector_store_id_env}): {e}")

# Fallback: look the store up by name, creating it when it does not exist.
if vector_store is None:
    vector_store = create_or_get_vector_store(
        name=VECTOR_STORE_NAME,
        description=VECTOR_STORE_DESCRIPTION,
    )

vector_store_id = vector_store.id
print(f"Using vector_store_id: {vector_store_id}")


Using existing vector store from .env
  Name: Course outlines
  ID:   vs_696938b687f88191a79aa6b70c012dbb
Using vector_store_id: vs_696938b687f88191a79aa6b70c012dbb


### Listing the vector stores

In [5]:
_ = list_vector_stores()


Total vector stores returned: 1
vs_696938b687f88191a79aa6b70c012dbb | Course outlines | created_at=1768503478


### Uploading and attaching file(s) to the vector store

In [6]:
FILE_PATHS = [
    # Add one or more paths to files you want to upload
    # Example: "course_outlines/1.docx",
]

# Defined on both branches so the name exists whether or not anything is uploaded.
uploaded_file_ids: list[str] = []

if not FILE_PATHS:
    print("No FILE_PATHS set. Skipping upload.")
else:
    upload_paths = [Path(p) for p in FILE_PATHS]

    # Validate every path before the first API call, so a bad entry cannot
    # leave the run half-finished with only some files uploaded.
    for path in upload_paths:
        if not path.exists():
            raise FileNotFoundError(f"File not found: {path}")

    for path in upload_paths:
        # Step 1: upload the raw file.
        with path.open("rb") as file_data:
            uploaded_file = client.files.create(file=file_data, purpose="assistants")

        # Step 2: attach the uploaded file to the vector store by id.
        client.vector_stores.files.create(vector_store_id=vector_store_id, file_id=uploaded_file.id)
        uploaded_file_ids.append(uploaded_file.id)
        print(f"Uploaded and attached: {path.name} (file_id={uploaded_file.id})")

    print(f"Total uploaded in this run: {len(uploaded_file_ids)}")


No FILE_PATHS set. Skipping upload.


### Checking the number of files attached to the vector store

In [7]:
files = client.vector_stores.files.list(vector_store_id=vector_store_id)

# ``files.data`` holds a single page only. Iterating the page object lets the
# SDK fetch the remaining pages, so the total reflects the whole store.
vector_store_files = list(files)
print(f"Total files in vector store: {len(vector_store_files)}")

for vs_file in vector_store_files:
    print(f"{vs_file.id} | status={vs_file.status}")


Total files in vector store: 3
file-P2iAjjEcbztVSW3rwWLLov | status=completed
file-5GBLTfn9rwronU6HCYKkdu | status=completed
file-L3icTrkHxC5bJRs8m9KK1N | status=completed


In [None]:
# Re-sync: detach every file currently in the vector store, then upload and
# attach every file found under the local source directory.
# Destructive: run this cell deliberately.
source_dir = Path("course_outlines")

if not source_dir.exists() or not source_dir.is_dir():
    raise FileNotFoundError(f"Directory not found: {source_dir.resolve()}")

# Gather the local files BEFORE clearing anything, so an empty directory
# cannot wipe the store with nothing to put back. Sorted for a stable order.
paths = sorted(p for p in source_dir.rglob("*") if p.is_file())
print(f"Local files found: {len(paths)}")
if not paths:
    raise FileNotFoundError(f"No files found under: {source_dir.resolve()}")

# Take a full snapshot of the listing first: iterating the page object pulls
# every page (not just the first), and deleting from a fixed list avoids
# mutating the collection while it is still being paginated.
files_before = list(client.vector_stores.files.list(vector_store_id=vector_store_id))
print(f"Vector store files before: {len(files_before)}")

for attached in files_before:
    client.vector_stores.files.delete(vector_store_id=vector_store_id, file_id=attached.id)
# NOTE(review): only the vector-store attachment is removed here; whether the
# underlying uploaded files should also be deleted via client.files is not
# decided by this cell. Confirm.

files_after_clear = list(client.vector_stores.files.list(vector_store_id=vector_store_id))
print(f"Vector store files after clear: {len(files_after_clear)}")

uploaded = 0
for path in paths:
    with path.open("rb") as file_data:
        created = client.files.create(file=file_data, purpose="assistants")
    client.vector_stores.files.create(vector_store_id=vector_store_id, file_id=created.id)
    uploaded += 1
    print(f"Attached: {path} | file_id={created.id}")

files_final = list(client.vector_stores.files.list(vector_store_id=vector_store_id))
print(f"Vector store files after re-add: {len(files_final)}")
print(f"Uploaded in this run: {uploaded}")


### Searching the vector store

In [8]:
QUERY = ""  # set a query before running

if QUERY.strip():
    results = client.vector_stores.search(vector_store_id=vector_store_id, query=QUERY)
    hits = results.data
    print(f"Results: {len(hits)}")

    separator = "-" * 50
    for i, result in enumerate(hits, start=1):
        print(f"Result {i} | score={result.score}")
        # Only the first content item of each hit is shown, and only when it
        # carries non-empty text.
        first_text = getattr(result.content[0], "text", None) if result.content else None
        if first_text:
            print(result.content[0].text)
        print(separator)
else:
    # Nothing to search for; remind the user instead of calling the API.
    print("QUERY is empty. Set QUERY to a non-empty string to run a search.")


QUERY is empty. Set QUERY to a non-empty string to run a search.


In [9]:
# This cell is intentionally left blank.
