In [1]:
%load_ext pycozo.ipyext_direct

*****

### Gotchas

- never add a default value to created_at fields. All default values are treated as being computed on EVERY insert
- basically just DO NOT use default values in cozo -- unless you know what you're doing
- normalize as much as possible
- use time travel sparingly and add it AFTERWARD when deemed necessary coz it's so easy to add
- keep keys as small as possible; remember that the ENTIRE key (all key columns together) is treated as the primary key
- almost NEVER use default values for uuids
- although, sometimes default values are okay when used in the key part
- a default value for Validity works well, e.g. `[floor(now()), true]`
- make sure to check for `is_nan` when comparing similarities coz zero vectors give `NAN`
- for one to many relations, use `parent_id, item_id` as key prefix
- for many-to-many or polymorphic relations, use association tables only
- optimize lookup via simple indices LATER
- a good schema should ideally not need them

*****

### Docs



- These are documents that can belong to either a user or an agent (future: or a session)
- Retrieved according to scope
- identified by doc_id
- no time travel
- key = doc id

- use association tables for fetching
- use jobs table for associating bg jobs

**Note**:
- Since these are going to be split up, "updating" is not allowed
- to "update", let the user edit in frontend but to commit, add new doc THEN remove old doc

#### Snippets
- snippet is split at `\n\n`
- key = doc id + snippet id
- (Async) Embedded with multiple different variants (list of embeddings)
- `doc_id` links each snippet back to its parent doc
- hnsw index over all embeddings
- fts
- use higher ef construction for these and extend candidates for higher accuracy

#### Jobs (Docs)
- table that keeps track of jobs added for additional documents processing
- this table is NOT supposed to track state
- the external system is responsible for it

In [2]:
:create docs {
    # One row per document. Columns before `=>` form the key, so doc_id alone identifies a doc.
    # There is no Validity column, i.e. no time travel on this relation.
    # Ownership (user or agent) is not stored here; it is resolved through association relations.
    doc_id: Uuid,
    =>
    title: String,
    content: String,
    # Deliberately no default: defaults are recomputed on every insert, so the caller supplies it.
    created_at: Float,
}

  return res.style.applymap(_colour_code_type)


Unnamed: 0,status
0,OK


In [3]:
:create docs_jobs {
    # Jobs registered for additional processing of a document; key is (doc_id, job_id).
    # Per the design notes this relation does NOT track job state; the external system does.
    doc_id: Uuid,
    job_id: Uuid default random_uuid_v4(),  # normally shouldnt be supplying this
    =>
    # Free-form job details; the schema does not constrain its shape.
    metadata: Json,
    # No default here, so the caller supplies the timestamp.
    created_at: Float,
}

Unnamed: 0,status
0,OK


In [4]:
:create docs_snippets {
    # Pieces of a document (the design notes say docs are split at `\n\n`).
    # Key is (doc_id, snippet_id), so all snippets of one doc share a key prefix.
    doc_id: Uuid,
    snippet_id: Uuid default random_uuid_v4(),  # normally shouldnt be supplying this
    =>
    title: String,
    content: String,

    # A list of 768-dim F32 vectors (multiple embedding variants per snippet), filled in
    # asynchronously per the design notes. The HNSW index skips rows where the list is empty.
    embeddings: [<F32; 768>],
    # NOTE(review): presumably one instruction string per entry in `embeddings`, in the same
    # order - the schema does not enforce this; confirm against the writer.
    embedding_instructions: [String],
}

Unnamed: 0,status
0,OK


In [5]:
::hnsw create docs_snippets:embedding_space {
    # HNSW vector index over the `embeddings` column of docs_snippets.
    # dim / dtype match the column's <F32; 768> vectors.
    dim: 768,
    m: 50,
    dtype: F32,
    fields: embeddings,
    # Only rows that already have at least one embedding are indexed.
    filter: length(embeddings) > 0,
    distance: Cosine,
    # The design notes ask for a higher ef_construction and extended candidates on this index
    # for accuracy; the trailing comments keep the alternative values (20 / false).
    ef_construction: 50,  # 20,
    extend_candidates: true,  # false,
    keep_pruned_connections: false,
}

Unnamed: 0,status
0,OK


In [6]:
::fts create docs_snippets:content {
    # Full-text search index over each snippet's title and content.
    # `+` is numeric addition in CozoScript and errors on strings, so the text is joined
    # with concat(). The ' ' separator keeps the last word of the title and the first word
    # of the content from fusing into a single token.
    extractor: concat(title, ' ', content),
    tokenizer: Simple,
    # Token filters: ASCII folding, alphanumeric-only, lowercasing, English stemming and
    # English stop-word removal.
    filters: [AsciiFolding, AlphaNumOnly, Lowercase, Stemmer('english'), Stopwords('en')],
}

Unnamed: 0,status
0,OK


*****

### Agents



- agent_id
- time travel is POSTPONED (the `updated_at` Validity key column is commented out for now)
- instructions and tools stored separately

#### agent jobs
- should store task ids of workflows that process instructions etc

In [7]:
:create agents {
    # One row per agent, keyed by agent_id. Instructions and tools live in their own relations.
    agent_id: Uuid,
    # Time travel is postponed: the Validity key column below is intentionally left disabled.
    # updated_at: Validity default [floor(now()), true],
    =>
    name: String,
    about: String,
    # No default here, so the caller supplies the timestamp.
    created_at: Float,
}

Unnamed: 0,status
0,OK


In [8]:
:create agent_jobs {
    # Jobs associated with an agent; key is (agent_id, job_id).
    agent_id: Uuid,
    # Generated on insert when the caller does not supply one.
    job_id: Uuid default random_uuid_v4(),
    =>
    # NOTE(review): the design notes say workflow task ids should be stored for agent jobs -
    # presumably inside `metadata`; confirm against the writer.
    metadata: Json,
    created_at: Float,
}

Unnamed: 0,status
0,OK


In [9]:
:create agent_docsrmation {
    # Association relation linking agents to docs. There is no `=>`, so both columns are
    # key columns and each (agent_id, doc_id) pair is stored at most once.
    # NOTE(review): the relation name looks like a search-and-replace artifact
    # ("...information" -> "...docsrmation"). Renaming would break existing queries,
    # so confirm with callers before changing it.
    agent_id: Uuid,
    doc_id: Uuid,
}

Unnamed: 0,status
0,OK


*****

### Instructions



- embedding is calculated asynchronously and then updated

In [10]:
:create instructions {
    # Instructions belonging to an agent; key is (agent_id, instruction_id).
    agent_id: Uuid,
    instruction_id: Uuid default random_uuid_v4(),
    =>
    content: String,
    important: Bool,
    # Nullable: the embedding is calculated asynchronously and written back later.
    # The HNSW index on this relation skips rows where it is still null.
    embedding: <F32; 768>?,
    # Constant default, so re-evaluating it on every insert is harmless.
    embedding_instruction: String default "Embed this historical text chunk for retrieval: "
}

Unnamed: 0,status
0,OK


In [11]:
::hnsw create instructions:embedding_space {
    # HNSW vector index over the single `embedding` column of instructions.
    # dim / dtype match the column's <F32; 768> type.
    dim: 768,
    m: 50,
    dtype: F32,
    fields: embedding,
    # Rows whose embedding has not been computed yet (null) are left out of the index.
    filter: !is_null(embedding),
    distance: Cosine,
    ef_construction: 20,
    extend_candidates: false,
    keep_pruned_connections: false,
}

Unnamed: 0,status
0,OK


*****

### Tools



- embedding is calculated asynchronously and then updated

In [12]:
:create tools {
    # Tools belonging to an agent; key is (agent_id, name), so a tool name is unique per agent.
    agent_id: Uuid,
    name: String,
    =>
    description: String,
    parameters: Json,
    # No default here, so the caller supplies the timestamp.
    created_at: Float,
    # Nullable: the embedding is calculated asynchronously and written back later.
    embedding: <F32; 768>?,
    # Constant default, so re-evaluating it on every insert is harmless.
    embedding_instruction: String default "Transform this tool description for retrieval: "
}

Unnamed: 0,status
0,OK


In [13]:
::hnsw create tools:embedding_space {
    # HNSW vector index over the single `embedding` column of tools.
    # dim / dtype match the column's <F32; 768> type.
    dim: 768,
    m: 50,
    dtype: F32,
    fields: embedding,
    # Rows whose embedding has not been computed yet (null) are left out of the index.
    filter: !is_null(embedding),
    distance: Cosine,
    ef_construction: 20,
    extend_candidates: false,
    keep_pruned_connections: false,
}

Unnamed: 0,status
0,OK


*****

### Cache



- omitted proximity search since running ANN before every inference is definitely too wasteful
- use xxhash64 on chatml
- may choose to use relaxed or no filter on params for perf

In [14]:
:create lm_cache {
    # Exact-match cache of model responses; key is (chatml_xxhash64, model, params).
    # Per the design notes the hash is xxhash64 computed over the chatml, stored as a String.
    chatml_xxhash64: String,
    model: String,
    # Part of the key, so different params yield separate cache rows. The design notes
    # mention that lookups may relax or drop the params filter for performance.
    params: Json,
    =>
    chatml: Json,
    response: Json,
}

Unnamed: 0,status
0,OK


*****

### Users

In [15]:
:create users {
    # One row per user, keyed by user_id.
    user_id: Uuid,
    # Validity key column left disabled, so this relation currently has no time travel.
    # updated_at: Validity default [floor(now()), true],
    =>
    name: String,
    about: String,
    # No default here, so the caller supplies the timestamp.
    created_at: Float,
}

Unnamed: 0,status
0,OK


In [16]:
:create user_jobs {
    # Jobs associated with a user; key is (user_id, job_id).
    user_id: Uuid,
    # Generated on insert when the caller does not supply one.
    job_id: Uuid default random_uuid_v4(),
    =>
    # Free-form job details; the schema does not constrain its shape.
    metadata: Json,
    created_at: Float,
}

Unnamed: 0,status
0,OK


In [17]:
:create user_docsrmation {
    # Association relation linking users to docs. There is no `=>`, so both columns are
    # key columns and each (user_id, doc_id) pair is stored at most once.
    # NOTE(review): the relation name looks like a search-and-replace artifact
    # ("...information" -> "...docsrmation"). Renaming would break existing queries,
    # so confirm with callers before changing it.
    user_id: Uuid,
    doc_id: Uuid,
}

Unnamed: 0,status
0,OK


*****

### Sessions



- entries are stored in their own relation
- should update summary after every run
- metadata should contain workflow id for summary jobs

In [18]:
:create sessions {
    # One session per session_id. Entries of a session are stored in their own relation.
    session_id: Uuid,
    # Validity as the last key column enables time travel on this relation; the default
    # stamps each write with the current time (floored) and marks it as asserted.
    updated_at: Validity default [floor(now()), true],
    =>
    situation: String,
    # Nullable; per the design notes the summary should be updated after every run.
    summary: String?,
    # No default here, so the caller supplies the timestamp.
    created_at: Float,
}

Unnamed: 0,status
0,OK


In [19]:
:create session_lookup {
    # Lookup relation from (agent, user) to session. There is no `=>`, so all three
    # columns are key columns and rows are ordered by agent_id first.
    agent_id: Uuid,
    # Nullable with a constant null default, so a session can be recorded without a user.
    user_id: Uuid? default null,
    session_id: Uuid,
}

Unnamed: 0,status
0,OK


In [20]:
:create session_jobs {
    # Jobs associated with a session; key is (session_id, job_id).
    session_id: Uuid,
    # Generated on insert when the caller does not supply one.
    job_id: Uuid default random_uuid_v4(),
    =>
    # Per the design notes, metadata should contain the workflow id for summary jobs.
    metadata: Json,
    created_at: Float,
}


Unnamed: 0,status
0,OK


In [33]:
:create session_episodes {
    # Association relation linking sessions to episodes. There is no `=>`, so both
    # columns are key columns and each (session_id, episode_id) pair is stored at most once.
    session_id: Uuid,
    episode_id: Uuid,
}

  return res.style.applymap(_colour_code_type)


Unnamed: 0,status
0,OK


*****

### Entries



- entries are building blocks of context
- session has many entries
- entry may be produced via different sources:
  + api_user: original message sent by the api_user
  + summarizer: produced as a summary of one or more other entries
  + memory_access: produced as a part of the `retrieval phase`
- follow the chatml format
- relationships between entries in `entry_relations` table

In [21]:
:create entries {
    # Context entries of a session (a session has many entries), following the chatml format.
    # Key is (session_id, entry_id, source, role, name).
    # NOTE(review): entry_id is already a random UUID, so source / role / name being key
    # columns widens the key without adding uniqueness - confirm this is intentional, given
    # the "keep keys as small as possible" guideline in the notes.
    session_id: Uuid,
    entry_id: Uuid default random_uuid_v4(),
    # Origin of the entry; the design notes list api_user, summarizer and memory_access.
    source: String,
    role: String,
    name: String?,
    =>
    content: String,
    token_count: Int,
    # Name of the tokenizer; presumably the one used to compute token_count - confirm.
    tokenizer: String,
    # No default here, so the caller supplies the timestamp.
    created_at: Float,
}

Unnamed: 0,status
0,OK


In [22]:
:create entry_relations {
    # Relationships between entries, stored as (head, op, tail) triples. With the `=>`
    # below commented out, all three columns are key columns.
    # NOTE(review): head / tail are presumably entry ids - the schema does not say; confirm.
    head: Uuid,
    op: String,
    tail: Uuid,
    # Optional value column, currently disabled:
    # =>
    # weight: Float,
}

Unnamed: 0,status
0,OK


*****

### Beliefs

In [23]:
:create beliefs {
    # One row per belief, keyed by belief_id. Which agent / user a belief belongs to is
    # recorded in belief_lookup rather than here.
    belief_id: Uuid,
    =>
    belief: String,
    valence: Float,
    emotion: String,
    rationale: String?,
    # A list of 768-dim F32 vectors; the HNSW index skips rows where the list is empty.
    embeddings: [<F32; 768>],
    # NOTE(review): presumably one instruction string per entry in `embeddings` - confirm.
    embedding_instructions: [String],
    # No default here, so the caller supplies the timestamp.
    created_at: Float,
}

Unnamed: 0,status
0,OK


In [24]:
:create belief_lookup {
    # Lookup relation from (agent, user) to belief. There is no `=>`, so all three
    # columns are key columns and rows are ordered by agent_id first.
    agent_id: Uuid,
    # Nullable with a constant null default, so a belief can be recorded without a user.
    user_id: Uuid? default null,
    belief_id: Uuid,
}

Unnamed: 0,status
0,OK


In [25]:
::hnsw create beliefs:embedding_space {
    # HNSW vector index over the `embeddings` list column of beliefs.
    # dim / dtype match the column's <F32; 768> vectors.
    dim: 768,
    m: 50,
    dtype: F32,
    fields: embeddings,
    # Only rows that already have at least one embedding are indexed.
    filter: length(embeddings) > 0,
    distance: Cosine,
    # Same tuning as the docs_snippets index; the trailing comments keep the
    # alternative values (20 / false).
    ef_construction: 50,  # 20,
    extend_candidates: true,  # false,
    keep_pruned_connections: false,
}

Unnamed: 0,status
0,OK


*****

### Models

In [26]:
:create models {
    # Model registry. Key is (model_name, tokenizer_name, max_length); all three sit before
    # `=>`, so the same model_name may appear with different tokenizer / length values.
    model_name: String,
    tokenizer_name: String,
    max_length: Int,
    =>
    # Constant empty-object default, so re-evaluating it on every insert is harmless.
    default_settings: Json default {},
}

Unnamed: 0,status
0,OK


*****

### Episodes

In [27]:
:create episodes {
    # One row per episode, keyed by episode_id. Agent / user ownership is recorded in
    # episode_lookup; links to beliefs and sessions are in their own association relations.
    episode_id: Uuid,
    =>
    summary: String,
    happened_at: Float,
    # The one place a time-based default is accepted: defaults are recomputed on every
    # insert, so this value is refreshed whenever the row is written without it.
    last_accessed_at: Float default now(),  # Special exception to the rule
    weight: Float,
    # A list of 768-dim F32 vectors; the HNSW index skips rows where the list is empty.
    embeddings: [<F32; 768>],
    # NOTE(review): presumably one instruction string per entry in `embeddings` - confirm.
    embedding_instructions: [String],
    # No default here, so the caller supplies the timestamp.
    created_at: Float,
}

Unnamed: 0,status
0,OK


In [28]:
:create episode_lookup {
    # Lookup relation from (agent, user) to episode. There is no `=>`, so all three
    # columns are key columns and rows are ordered by agent_id first.
    agent_id: Uuid,
    # Nullable with a constant null default, so an episode can be recorded without a user.
    user_id: Uuid? default null,
    episode_id: Uuid,
}

Unnamed: 0,status
0,OK


In [29]:
:create episode_beliefs {
    # Association relation linking episodes to beliefs. There is no `=>`, so both
    # columns are key columns and each (episode_id, belief_id) pair is stored at most once.
    episode_id: Uuid,
    belief_id: Uuid,
}

Unnamed: 0,status
0,OK


In [30]:
::hnsw create episodes:embedding_space {
    # HNSW vector index over the `embeddings` list column of episodes.
    # dim / dtype match the column's <F32; 768> vectors.
    dim: 768,
    m: 50,
    dtype: F32,
    fields: embeddings,
    # Only rows that already have at least one embedding are indexed.
    filter: length(embeddings) > 0,
    distance: Cosine,
    # NOTE(review): the other list-of-embeddings indices in this notebook (docs_snippets,
    # beliefs) use ef_construction 50 with extend_candidates true; this one keeps 20 / false.
    # Confirm whether the lower-accuracy setting is intentional here.
    ef_construction: 20,
    extend_candidates: false,
    keep_pruned_connections: false,
}

Unnamed: 0,status
0,OK


In [31]:
::fts create episodes:summary {
    # Full-text search index over the `summary` column of episodes.
    extractor: summary,
    tokenizer: Simple,
    # Same token-filter list as the docs_snippets full-text index.
    filters: [AsciiFolding, AlphaNumOnly, Lowercase, Stemmer('english'), Stopwords('en')],
}

Unnamed: 0,status
0,OK


*****

In [32]:
%%py
# `1 // 0` is evaluated first and raises ZeroDivisionError (see the cell output), so
# `raise` never receives a value.
# NOTE(review): presumably a deliberate tripwire so that "Run All" stops here - confirm.
raise 1 // 0

ZeroDivisionError: integer division or modulo by zero