# One Knowledge Base to Rule Them All Workflow

## 0. Setup

In [52]:
import requests


def get_auth_headers(email: str, password: str) -> dict[str, str]:
    """Log in with email/password and build the Authorization header.

    Posts the credentials to the password-grant token endpoint of the auth
    service and wraps the returned access token in a ``Bearer`` header.

    Args:
        email: Email of the account to log in with.
        password: Password of that account.

    Returns:
        A dict holding a single ``Authorization`` entry.

    Raises:
        requests.HTTPError: If the token endpoint answers with an error status.
    """
    supabase_auth_url = "https://sb.stack-ai.com"
    anon_key = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6ImZic3VhZGZxaGtseG9rbWxodHNkIiwicm9sZSI6ImFub24iLCJpYXQiOjE2NzM0NTg5ODAsImV4cCI6MTk4OTAzNDk4MH0.Xjry9m7oc42_MsLRc1bZhTTzip3srDjJ6fJMkwhXQ9s"

    credentials = {"email": email, "password": password, "gotrue_meta_security": {}}
    token_response = requests.post(
        f"{supabase_auth_url}/auth/v1/token?grant_type=password",
        json=credentials,
        headers={"Content-Type": "application/json", "Apikey": anon_key},
        timeout=10,
    )
    # Surface a failed login as an HTTPError before touching the payload.
    token_response.raise_for_status()

    return {"Authorization": f"Bearer {token_response.json()['access_token']}"}

### Login to your account to get your auth headers

In [53]:
from getpass import getpass

email = "stackaitest@gmail.com"
# Prompt without echoing so the password never lands in the notebook's saved
# output. Do not paste the password into the notebook itself: comments are
# committed along with the code.
password = getpass(f"Introduce the password for {email}: ")

auth_headers = get_auth_headers(email, password)

### Create a request session

In [54]:
# One shared session so every request below carries the auth header.
session = requests.Session()

session.headers.update(auth_headers)

# Show the session headers, but mask the bearer token so it does not end up in
# the notebook's saved output.
printable_headers = {
    name: "<redacted>" if name.lower() == "authorization" else value
    for name, value in session.headers.items()
}
print(printable_headers)

{'User-Agent': 'python-requests/2.32.3', 'Accept-Encoding': 'gzip, deflate, br, zstd', 'Accept': '*/*', 'Connection': 'keep-alive', 'Authorization': 'Bearer <redacted>'}

### Set the correct url for the backend you want to use

In [55]:
# Base URL of the backend; the API URLs in the cells below are built from it.
backend_url = "https://api.stack-ai.com"

In [56]:
# Fetch the current organization. Check the status first so an HTTP error is
# reported as such instead of as a confusing KeyError on "org_id".
org_response = session.get(f"{backend_url}/organizations/me/current")
org_response.raise_for_status()
org_id = org_response.json()["org_id"]

## 1. Connections

### 1.1 List all the connections for the selected user

Your newly created connection will be listed here

In [57]:
# Ask for (at most) one Google Drive connection of the logged-in user.
connection_list_url = f"{backend_url}/connections?connection_provider=gdrive&limit=1"
response = session.get(connection_list_url)

response.raise_for_status()

connections = response.json()
if not connections:
    # An empty list would otherwise fail with a bare IndexError on [0].
    raise RuntimeError(
        "No Google Drive connection found for this user. "
        "Create one in the Stack AI dashboard (Connections > Google Drive) and re-run this cell."
    )

connection = connections[0]

In [58]:
# Human-readable summary of the connection picked in the previous cell.
print("Connection information:")
print("----------------------")
for label, field in (
    ("Connection ID", "connection_id"),
    ("Connection name", "name"),
    ("Created at", "created_at"),
    ("Updated at", "updated_at"),
):
    print(f"{label}: {connection[field]}")

# Deliberately left out to avoid leaking sensitive information
# print(f"Connection provider: {connection['connection_provider_data']}")

Connection information:
----------------------
Connection ID: 96891794-4313-42f1-9d98-237e526165b8
Connection name: Google Drive
Created at: 2025-06-19T02:28:05.881189+00:00
Updated at: 2025-09-11T11:41:52.532801+00:00


> If you don't see any connection above, then do the following:
> 1. Go to the Stack AI dashboard (you have to log in with the given credentials)
> 2. On the left sidebar, click on Connections
> 3. Create a Google Drive connection (you have to use the given credentials)
> 4. Click on connect to Google Drive on the node and follow the authorization steps.

### 1.2 List available resources under the connection

In [59]:
connection_id = connection["connection_id"]
# The children endpoint lives directly under the connection's resources endpoint.
connection_resources_url = f"{backend_url}/connections/{connection_id}/resources"
children_resources_url = f"{connection_resources_url}/children"


**Root resources** 

Let's start with the root resources. To list them, we do not specify any resource in the request, so we get the resources at the root of the connection.

In [60]:
# No query parameters: list what sits at the root of the connection.
print("Pinging: ", children_resources_url)
root_resources_response = session.get(children_resources_url)
root_resources_response.raise_for_status()
root_resources = root_resources_response.json()

for resource in root_resources["data"]:
    # Folders and files get different icons.
    if resource["inode_type"] == "directory":
        emoji = "📁"
    else:
        emoji = "📄"

    resource_path = resource["inode_path"]["path"]
    print(f"{emoji} {resource_path:30} (resource_id: {resource['resource_id']})")

Pinging:  https://api.stack-ai.com/connections/96891794-4313-42f1-9d98-237e526165b8/resources/children
📁 acme                           (resource_id: 1UdRAmc-fBWRfYY4Z7XHhi_-tVDJsmGkf)
📁 books                          (resource_id: 14fYmcLp_T4jV5bSL2CZiezOY6a6vGW0d)
📁 clients                        (resource_id: 1xPb4DwqeCcsppG7qAULiiXugOvovMX70)
📁 projects                       (resource_id: 10aWbQss04gCh9kJCuxYCYteyXnsd6Nzu)
📁 references                     (resource_id: 1dKlaJIxEb9ENFEufLutmsT7QqsfOwYpm)
📄 Copy of ACME_Earnings_Report_Q2_2024.pdf (resource_id: 1TJ3bD80IYFixw4Col514XxquZrDzFiPr)
📄 Copy of ACME_Inc_Customer_Data.csv (resource_id: 11WIQPhVbGTH3oCynCKt_7wG0_2-06KpG)
📄 Copy of ACME_Information_Security_Policies.pdf (resource_id: 1Rlbkh6yA1VG97Gv1aiBhVeB5Mv6s6Ncm)
📄 Copy of ACME_Investment_Committee_Memo_Q3_2024.pdf (resource_id: 1izCa2twODi8mLZt73-SUQxxC0lA-a6PV)
📄 Copy of ACME_Knowledge_Base_RFP_Responses.pdf (resource_id: 1hv9rdtMa-Uu0xMbigver3mE8hH1hrhUy)
📄 rootfile1.

**Let's take a look at the top-level keys of the raw response from the API**

In [61]:
# Iterating over a dict yields its keys, so this prints the top-level field
# names of the response payload, one per line.
for resource in root_resources.keys():
    print(resource)

data
next_cursor
current_cursor


**Get the information about a specific file, like, 'Very Important notes.txt'**

In [62]:
from urllib.parse import urlencode

# Look a single resource up by its id.
data = {"resource_ids": "1GYpHUOiSYXGz_9GeUGgQkwQUJqCAxibGd9szwMJQSIg"}

# Turn the parameters into a query string and append it to the resources endpoint
encoded_query_params = urlencode(data, doseq=True)
url = "?".join((connection_resources_url, encoded_query_params))

print("Pinging: ", url)
resource = session.get(url)
resource.raise_for_status()

# Header and body are printed on separate lines, exactly as two print calls would.
print("\n\nRaw response:", resource.text, sep="\n")

Pinging:  https://api.stack-ai.com/connections/96891794-4313-42f1-9d98-237e526165b8/resources?resource_ids=1GYpHUOiSYXGz_9GeUGgQkwQUJqCAxibGd9szwMJQSIg


Raw response:
{"1GYpHUOiSYXGz_9GeUGgQkwQUJqCAxibGd9szwMJQSIg":null}


**Get the resources in a directory, like Classes**


In [63]:
from urllib.parse import urlencode

# List the children of one directory, identified by its resource id.
data = {"resource_id": "1GrHAPg2LVnx78y7diTMC_6AVQV1sehk2"}

# Encode the query parameters
encoded_query_params = urlencode(data, doseq=True)
url = f"{children_resources_url}?{encoded_query_params}"

print("Pinging: ", url)
response = session.get(url)

response.raise_for_status()

resources = response.json()

for resource in resources["data"]:
    emoji = "📁" if resource["inode_type"] == "directory" else "📄"

    print(f"{emoji} {resource['inode_path']['path']:30} (resource_id: {resource['resource_id']})")

# Print the body of this request. The loop variable `resource` would only show
# the last listed item, or a stale value from an earlier cell when the
# directory is empty.
print("\n\nRaw response:")
print(response.text)

Pinging:  https://api.stack-ai.com/connections/96891794-4313-42f1-9d98-237e526165b8/resources/children?resource_id=1GrHAPg2LVnx78y7diTMC_6AVQV1sehk2


HTTPError: 404 Client Error: Not Found for url: https://api.stack-ai.com/connections/96891794-4313-42f1-9d98-237e526165b8/resources/children?resource_id=1GrHAPg2LVnx78y7diTMC_6AVQV1sehk2

**Get the resources in a directory, like classes (nested)**



In [None]:
from urllib.parse import urlencode

# List the children of a nested directory, identified by its resource id.
data = {"resource_id": "1HPF28wtRZaJpsj9M_BapdBhr3tRa8aUJ"}

# Encode the query parameters
encoded_query_params = urlencode(data, doseq=True)
url = f"{children_resources_url}?{encoded_query_params}"

print("Pinging: ", url)
response = session.get(url)

response.raise_for_status()

resources = response.json()

for resource in resources["data"]:
    emoji = "📁" if resource["inode_type"] == "directory" else "📄"

    print(f"{emoji} {resource['inode_path']['path']:30} (resource_id: {resource['resource_id']})")

# Print the body of this request. The loop variable `resource` would only show
# the last listed item, or a stale value from an earlier cell when the
# directory is empty.
print("\n\nRaw response:")
print(response.text)

Pinging:  https://api.stack-ai.com/connections/96891794-4313-42f1-9d98-237e526165b8/resources/children?resource_id=1HPF28wtRZaJpsj9M_BapdBhr3tRa8aUJ


HTTPError: 404 Client Error: Not Found for url: https://api.stack-ai.com/connections/96891794-4313-42f1-9d98-237e526165b8/resources/children?resource_id=1HPF28wtRZaJpsj9M_BapdBhr3tRa8aUJ

# 2. Knowledge Bases

Once the user has decided which resources they want to index, they can create a knowledge base. A knowledge base is a collection of resources that are indexed in our vector database. 


In this example, we will suppose that the user has decided to index the following resources:
- 📁 papers                         (resource_id: 1YeS8H92ZmTZ3r2tLn1m43GG58gRzvYiM)
- 📄 Very Important notes.txt       (resource_id: 1GYpHUOiSYXGz_9GeUGgQkwQUJqCAxibGd9szwMJQSIg)


This means that the papers folder and all of its subfolders will be indexed, as well as the Very Important notes.txt file.

It is important that the frontend contains logic to avoid passing both a resource and its children in the list of resources to be indexed. For example, suppose the frontend passes both:
- 📁 test_folder                    (resource_id: 1cGeHFazvfHDSOfDJ_SRZEzkm5q1-Zn41)
- 📄 test_folder/Contrato_pagos_inmediatos.pdf (resource_id: 18nr8ZUE0QQZgNITw1JeEV1ZaobMDxUNC)

While the backend will work fine and index everything under test_folder, there will be duplicate work to get the metadata of the Contrato_pagos_inmediatos.pdf file both as a child of test_folder and as an independent resource.

## 2.1 Creating a knowledge base
Let's create a knowledge base that will be synced to the selected resources.

In [64]:
import json

create_kb_url = f"{backend_url}/knowledge_bases"


# Resources of the connection that the knowledge base should index.
connection_source_ids = [
    "1YeS8H92ZmTZ3r2tLn1m43GG58gRzvYiM",  # The papers folder
    "1GYpHUOiSYXGz_9GeUGgQkwQUJqCAxibGd9szwMJQSIg",  # Very Important Notes.txt file
]


data = {
    "connection_id": connection_id,
    "connection_source_ids": connection_source_ids,
    "name": "Test Knowledge Base",
    "description": "This is a test knowledge base",
    "indexing_params": {
        "ocr": False,
        "unstructured": True,
        "embedding_params": {"embedding_model": "text-embedding-ada-002", "api_key": None},
        # NOTE(review): the recorded response echoes this setting back as
        # "chunker_type" -- confirm that "chunker" is the field name the API expects.
        "chunker_params": {"chunk_size": 1500, "chunk_overlap": 500, "chunker": "sentence"},
    },
    "org_level_role": None,
    "cron_job_id": None,
}

print("Pinging: ", create_kb_url)
# `json=` serializes the payload and sets the Content-Type header to
# application/json; `data=json.dumps(...)` sends the same bytes without it.
kb_create_response = session.post(create_kb_url, json=data)
# Stop here on an HTTP error instead of carrying an error payload forward.
kb_create_response.raise_for_status()

new_kb_json = kb_create_response.json()

Pinging:  https://api.stack-ai.com/knowledge_bases


In [65]:
# Show the full creation response for reference.
print(new_kb_json)

# Keep the id of the new knowledge base; the URLs in the following cells are built from it.
knowledge_base_id = new_kb_json["knowledge_base_id"]

{'knowledge_base_id': 'b85fed6b-54b0-493a-b7f5-0c39198a7713', 'connection_id': '96891794-4313-42f1-9d98-237e526165b8', 'created_at': '2025-10-15T21:41:33.066883Z', 'updated_at': '2025-10-15T21:41:33.066892Z', 'connection_source_ids': ['1GYpHUOiSYXGz_9GeUGgQkwQUJqCAxibGd9szwMJQSIg', '1YeS8H92ZmTZ3r2tLn1m43GG58gRzvYiM'], 'website_sources': [], 'connection_provider_type': 'gdrive', 'is_empty': True, 'total_size': 0, 'name': 'Test Knowledge Base', 'description': 'This is a test knowledge base', 'indexing_params': {'ocr': False, 'unstructured': True, 'embedding_params': {'api': None, 'base_url': None, 'embedding_model': 'text-embedding-ada-002', 'provider': None, 'batch_size': 300, 'track_usage': True, 'timeout': 5}, 'chunker_params': {'chunk_size': 1500, 'chunk_overlap': 500, 'chunker_type': 'sentence'}}, 'cron_job_id': None, 'org_id': '0d582f36-52dd-403f-a38a-ccf4dfa06180', 'org_level_role': None, 'user_metadata_schema': None, 'dataloader_metadata_schema': None}


## 2.2 Sync Knowledge Base

To load the resources from the connection into the knowledge base, we need to call the `sync` endpoint of the knowledge base. The syncing will be done on a background task, so we need to wait for the task to finish before we can access the resources.

In [66]:
# The sync trigger endpoint is addressed by knowledge base id and organization id.
kb_sync_url = f"{backend_url}/knowledge_bases/sync/trigger/{knowledge_base_id}/{org_id}"

print("Pinging: ", kb_sync_url)
sync_response = session.get(kb_sync_url)

print(sync_response.status_code)
print(sync_response.text)

# Raise after printing so the details above stay visible, but a failed trigger
# still stops the notebook instead of letting later cells list an unsynced
# knowledge base.
sync_response.raise_for_status()

Pinging:  https://api.stack-ai.com/knowledge_bases/sync/trigger/b85fed6b-54b0-493a-b7f5-0c39198a7713/0d582f36-52dd-403f-a38a-ccf4dfa06180
200
null


## 2.3 Get the list of files in the knowledge base

At first, the files will be in the pending state as their indexing is not yet complete. If you wait for about a minute, you should see the files in the indexed state.


In [67]:
import time

# Pause for a few seconds before listing the knowledge base files in the next cell.
time.sleep(5)

In [71]:
from urllib.parse import urlencode

kb_children_resources_url = f"{backend_url}/knowledge_bases/{knowledge_base_id}/resources/children"

# List what sits at the root of the knowledge base.
data = {
    "resource_path": "/",
}

encoded_query_params = urlencode(data)
url = f"{kb_children_resources_url}?{encoded_query_params}"
print("Pinging: ", url)
# The path already travels in the query string; a GET request should not also
# carry it as a request body.
kb_resources = session.get(url)

# Fail on an HTTP error rather than on a missing "data" key below.
kb_resources.raise_for_status()

for resource in kb_resources.json()["data"]:
    emoji = "📁" if resource["inode_type"] == "directory" else "📄"

    print(
        f"{emoji} {resource['inode_path']['path']:30} (resource_id: {resource['resource_id']}) status: {resource.get('status')}"
    )

Pinging:  https://api.stack-ai.com/knowledge_bases/b85fed6b-54b0-493a-b7f5-0c39198a7713/resources/children?resource_path=%2F


In [70]:
kb_children_resources_url = f"{backend_url}/knowledge_bases/{knowledge_base_id}/resources/children"

# NOTE(review): the recorded run of this cell returned 400 Bad Request for
# "/papers", while the cells further down use "papers/" without a leading
# slash -- confirm which path format the API expects.
data = {
    "resource_path": "/papers",
}
encoded_query_params = urlencode(data)
url = f"{kb_children_resources_url}?{encoded_query_params}"
print("Pinging: ", url)
# The path already travels in the query string; a GET request should not also
# carry it as a request body.
kb_resources = session.get(url)

kb_resources.raise_for_status()

for resource in kb_resources.json()["data"]:
    emoji = "📁" if resource["inode_type"] == "directory" else "📄"

    print(
        f"{emoji} {resource['inode_path']['path']:30} (resource_id: {resource['resource_id']}) status: {resource.get('status')}"
    )

Pinging:  https://api.stack-ai.com/knowledge_bases/b85fed6b-54b0-493a-b7f5-0c39198a7713/resources/children?resource_path=%2Fpapers


HTTPError: 400 Client Error: Bad Request for url: https://api.stack-ai.com/knowledge_bases/b85fed6b-54b0-493a-b7f5-0c39198a7713/resources/children?resource_path=%2Fpapers

## 2.4 Manually manipulate the knowledge base

### Delete a file
For now, only files can be deleted.

In [None]:
# Despite the variable name, this is the plain /resources endpoint (no /children).
kb_children_resources_url = f"{backend_url}/knowledge_bases/{knowledge_base_id}/resources"

# The file to remove is sent both as a query parameter and as a JSON body.
data = {"resource_path": "papers/react_paper.pdf"}
encoded_query_params = urlencode(data)
delete_url = kb_children_resources_url + "?" + encoded_query_params
response = session.delete(delete_url, data=json.dumps(data))


print(response.status_code)

In [None]:
import time

# Wait for the deletion to finish
time.sleep(5)

# list the resources again and make sure that the document is gone
kb_children_resources_url = f"{backend_url}/knowledge_bases/{knowledge_base_id}/resources/children"

data = {
    "resource_path": "papers/",
}
encoded_query_params = urlencode(data)
# The path already travels in the query string; a GET request should not also
# carry it as a request body.
kb_resources = session.get(f"{kb_children_resources_url}?{encoded_query_params}")

# Fail on an HTTP error rather than on a missing "data" key below.
kb_resources.raise_for_status()

for resource in kb_resources.json()["data"]:
    emoji = "📁" if resource["inode_type"] == "directory" else "📄"

    print(
        f"{emoji} {resource['inode_path']['path']:30} (resource_id: {resource['resource_id']}) status: {resource.get('status')}"
    )

### Create a file
For now, only files can be created.

In [None]:

# Form fields describing where the new file goes, plus the bytes to store there
create_request_metadata = dict(
    resource_type="file",
    resource_path="papers/demo_file.txt",
)
file_content = b"test file content"

# Multipart part: (filename, content, content type)
files = {"file": ("file.txt", file_content, "text/plain")}

# Multipart form-data upload: metadata goes in `data` (not `json`), the file in `files`
upload_url = f"{backend_url}/knowledge_bases/{knowledge_base_id}/resources"
response = session.post(upload_url, data=create_request_metadata, files=files)

print(response.status_code)

In [None]:

import time

# Wait for the upload to be processed
time.sleep(5)

# list the resources again and make sure that the new document shows up
kb_children_resources_url = f"{backend_url}/knowledge_bases/{knowledge_base_id}/resources/children"

data = {
    "resource_path": "papers/",
}
encoded_query_params = urlencode(data)
# The path already travels in the query string; a GET request should not also
# carry it as a request body.
kb_resources = session.get(f"{kb_children_resources_url}?{encoded_query_params}")

# Fail on an HTTP error rather than on a missing "data" key below.
kb_resources.raise_for_status()

for resource in kb_resources.json()["data"]:
    emoji = "📁" if resource["inode_type"] == "directory" else "📄"

    print(
        f"{emoji} {resource['inode_path']['path']:30} (resource_id: {resource['resource_id']}) status: {resource.get('status')}"
    )