# Resources
* [AI-Scientist](https://github.com/SakanaAI/AI-Scientist)
* [LangChain + Neo4j GRG Tutorial](https://python.langchain.com/docs/tutorials/graph/)
* [Enhancing RAG-based application accuracy by constructing and leveraging knowledge graphs](https://blog.langchain.dev/enhancing-rag-based-applications-accuracy-by-constructing-and-leveraging-knowledge-graphs/)

# Preliminaries + Installs

These instructions are for Python 3.10
### Install Notebook 04 Dependencies!
* `sudo apt install docker.io`
* `sudo chmod 666 /var/run/docker.sock`
* `python3.10 -m pip install docker --user`
### Install ArangoDB + Docker Container
* `sudo docker pull arangodb`
* `python3.10 -m pip install python-arango adb-cloud-connector --user`



# Inspiration
* [Co-STORM @ Stanford](https://storm.genie.stanford.edu/)
* [Elicit](https://elicit.com/)
* [Research Rabbit](https://www.researchrabbit.ai/)

# Init + Env

In [1]:
########## INIT ####################################################################################
import os
from os import path, makedirs, environ
from utils import copy_pdfs



########## ENVIRONMENT #############################################################################

##### Settings shared through the process environment #####################
environ.update({
    # 04: Basic RAG
    "_RAG_DOC_DBASE"  : "lit_pdf",
    "_RAG_DOC_EMBED"  : "all-minilm",
    # 05: Graph-RAG (GRG)
    "_GRG_MODEL_NAME" : "llama3.2-vision",
    "_GRG_EMBED_NAME" : "all-minilm",
    "_GRG_GRAPH_DB"   : "grg_rel",
})

##### Flags ###############################################################
_LINK_PAGES = True  # Gates the page-embedding and page-linking cells

##### Files ###############################################################
_DOC_EMBEDS = "data/DocVectors.pkl"     # Pickle cache of the page embeddings
_PAGE_LINKS = "data/PageLinksDONE.txt"  # Marker file: page linking is skipped once it exists

### You may need to manually tune these parameters

In [2]:
# Extremes of page-to-page cosine similarity.
# NOTE(review): presumably copied by hand from the [simMin, simMax] printout of the range probe — confirm
_SIM_MAX         =  0.7329191963152777
_SIM_MIN         = -0.24439346017677427
# Each linking pass stops adding links for a page after this many hits
_MAX_BRANCH      = 10
_PAGE_CSN_FRAC   =  0.60 # Fraction of the [_SIM_MIN, _SIM_MAX] span; original note: "1.5% of all possible links" (presumably the share of links admitted at this setting — confirm)
# Minimum cosine similarity for two pages to be linked: _SIM_MIN plus the chosen fraction of the span
_PAGE_CSN_THRESH = (_SIM_MAX-_SIM_MIN) * _PAGE_CSN_FRAC + _SIM_MIN

# Depth 1: Link PDF Pages by Similarity

## Retrieve 04 Embeddings

In [3]:
%%capture
# Loading the vector store sometimes spews warnings
import sys

# Register pysqlite3 under the name `sqlite3` so that the imports below pick it up instead of the stdlib module.
# NOTE(review): presumably chromadb needs a newer SQLite than the system build provides — confirm
__import__('pysqlite3')
sys.modules['sqlite3'] = sys.modules.pop( 'pysqlite3' )
import chromadb
from langchain_chroma import Chroma


# Open Chroma's persistent client with its default settings, then fetch (or create)
# the collection whose name is stored in the `_RAG_DOC_DBASE` environment variable
persistent_client = chromadb.PersistentClient()
collection        = persistent_client.get_or_create_collection(
    environ["_RAG_DOC_DBASE"]
)

In [4]:
import pickle
from sklearn.metrics.pairwise import cosine_similarity

# Reuse the pickled page embeddings when the cache file exists; otherwise `vecPairs` stays None
vecPairs = None

if _LINK_PAGES:
    if os.path.isfile( _DOC_EMBEDS ):
        with open( _DOC_EMBEDS, 'rb' ) as f:
            vecPairs = pickle.load( f )
        print( f"Got {len( vecPairs )} vectors!" )

Got 213725 vectors!


In [5]:
from langchain_ollama import OllamaEmbeddings

# Ollama concurrency settings, exported through the process environment
# NOTE(review): presumably these only take effect if the Ollama server is started from this environment — confirm
environ.update({
    "OLLAMA_NUM_PARALLEL"      : "8",
    "OLLAMA_MAX_LOADED_MODELS" : "8",
})

from utils import pull_ollama_model

# Pull the embedding model named in `_RAG_DOC_EMBED`, then wrap it in a LangChain embedder
pull_ollama_model( environ["_RAG_DOC_EMBED"] )
local_embeddings = OllamaEmbeddings( model = environ["_RAG_DOC_EMBED"] )

About to save 'all-minilm'.
This will spew a lot of text on the first run...


pulling manifest
pulling 797b70c4edf8... 100% ▕████████████████▏  45 MB                         
pulling c71d239df917... 100% ▕████████████████▏  11 KB                         
pulling 85011998c600... 100% ▕████████████████▏   16 B                         
pulling 548455b72658... 100% ▕████████████████▏  407 B                         
verifying sha256 digest 
writing manifest 
success


## Recalculate Embeddings (NOT exposed by ChromaDB!)

In [6]:
# NOTE: If you don’t have enough VRAM it will use the CPU. 

import time, os
now = time.time
from collections import deque

import numpy as np



# Re-embed every document of the Chroma collection with `local_embeddings`.
# Runs only when page linking is enabled and no cached embedding pickle exists yet.
# NOTE(review): chromadb's `Collection.get` reportedly accepts `include=["embeddings"]`, which
#               might make this recalculation unnecessary — confirm against the Chroma docs
if _LINK_PAGES and (not os.path.isfile( _DOC_EMBEDS )):

    vecPairs = deque()
    docBatch = 1 # Number of documents sent to the embedder per call

    # Fetch the whole collection; only the document texts and their IDs are used below
    all_ids = deque() # NOTE(review): never filled or read in the visible cells
    allData = collection.get()
    totDocs = allData['documents']
    totIDs  = allData['ids']
    print( f"Fetched {len(totDocs)} documents" )
    
    tBgn = now()
    N    = len( totDocs )
    bgn  = 0
    end  = 0
    # Walk the documents in windows of `docBatch`: [bgn, end)
    while bgn < N:
        end = min( bgn+docBatch, N )
        try:
            vec = local_embeddings.embed_documents( totDocs[ bgn:end ] )
            for i in range( bgn, end ):
                # One record per document: embedding vector, source text, Chroma ID
                vecPairs.append( {'vec' : np.array( vec[i-bgn] ), 'doc': totDocs[i], 'id' : totIDs[i]} )
                # Progress: a dot every 100 documents, a count + elapsed time every 10000
                if ((i+1)%100) == 0:
                    print('.',end='',flush=True)
                if ((i+1)%10000) == 0:
                    m,s = divmod( now()-tBgn, 60 )
                    print(f"\n{i+1},{int(m)}:{s:.2f}",end=' ',flush=True)
            bgn = end
        except Exception as e:
            # Best effort: report the error and move the window start forward by one document.
            # With `docBatch = 1` this skips the failing document and leaves no record for it.
            print(e,end=', ',flush=True)
            bgn += 1
    print( f"\nPage embedding recalc took {(now()-tBgn)/60.0:.2f} minutes!" )

    # Convert to a plain list, which later cells slice and pickle
    vecPairs = list( vecPairs )
    print( f"Got {len( vecPairs )} vectors!" )

In [7]:
# Probe the range of page-to-page cosine similarity by comparing the first page against
# every other page, then print the extremes.
# NOTE(review): presumably the printed pair is what gets copied into `_SIM_MIN` / `_SIM_MAX` — confirm
if _LINK_PAGES and (not os.path.isfile( _PAGE_LINKS )):
    # Cosine similarity can be negative, so the running extremes must start at +/- infinity;
    # starting the maximum at 0.0 would misreport a range that lies entirely below zero.
    simMin = float( 'inf' )
    simMax = float( '-inf' )
    print( len(vecPairs) )
    vectrs = [item['vec'] for item in vecPairs]
    vec0   = vectrs[0]

    for vec_i in vectrs[1:]:
        # `cosine_similarity` returns the full 2x2 matrix; [0,1] is the cross term
        sim_i  = cosine_similarity( [vec0, vec_i,] )[0,1]
        simMin = min( sim_i, simMin )
        simMax = max( sim_i, simMax )

    diffSpan = simMax - simMin

    print( [simMin, simMax,] )

In [8]:
# Write the page embeddings to the pickle cache, but only if the cache file is not there yet
if _LINK_PAGES:
    if not os.path.isfile( _DOC_EMBEDS ):
        with open( _DOC_EMBEDS, 'wb' ) as f:
            pickle.dump( vecPairs, f )

## Calculate Page Similarity

In [9]:


# Build undirected "similar page" links, working through `vecPairs` in chunks of `Nchunk` pages.
# Each chunk is compared against the chunk that follows it (wrapping to the start for the last
# chunk) and against itself. The links found for a chunk are pickled to their own file. When all
# chunks are done, `_PAGE_LINKS` is created as a completion marker so the cell is skipped next run.
if _LINK_PAGES and (not os.path.isfile( _PAGE_LINKS )):

    Ndocs  = len( vecPairs )
    tBgn   = now()
    Nchunk = 500

    bgn1 = 0
    end1 = 0
    bgn2 = 0
    end2 = 0

    def attempt_connect( pair_i, pair_j ):
        """ Return a link record when the two pages reach `_PAGE_CSN_THRESH`, otherwise `None` """
        sim_ij = cosine_similarity( [pair_i['vec'], pair_j['vec'],] )[0,1]
        if sim_ij >= _PAGE_CSN_THRESH:
            return {
                'type'   : "Page_Cosine_Similarity",
                'idTail' : pair_i['id'],
                'idHead' : pair_j['id'],
                'dir'    : False,
                'coSim'  : sim_ij,
            }
        return None

    def links_from( pair_i, candidates ):
        """ Collect links from `pair_i` to `candidates` in order, stopping after `_MAX_BRANCH` hits """
        found = []
        for pair_j in candidates:
            res_ij = attempt_connect( pair_i, pair_j )
            if res_ij is not None:
                found.append( res_ij )
                if len( found ) >= _MAX_BRANCH:
                    break
        return found

    while bgn1 < Ndocs:

        pageLinks = []

        ## Define Chunks 1 & 2 ##
        end1 = min( bgn1+Nchunk, Ndocs )
        bgn2 = end1 if (end1 < Ndocs) else 0 # Chunk 2 wraps at the end
        end2 = min( bgn2+Nchunk, Ndocs )

        chnk1 = vecPairs[ bgn1:end1 ]
        chnk2 = vecPairs[ bgn2:end2 ]

        ## Connections between Chunks 1 & 2 ##
        # When every page fits in a single chunk, the wrap makes chunk 2 the very same pages as
        # chunk 1. Skip this pass in that case so pages are not linked to themselves or twice.
        if bgn2 != bgn1:
            for i, pair_i in enumerate( chnk1 ):
                if ((i+1)%10==0):
                    print('.',end='',flush=True)
                pageLinks.extend( links_from( pair_i, chnk2 ) )

        ## Connections within Chunk 1 ##
        # Only pages after `pair_i` are candidates, so each unordered pair is tried once
        for i, pair_i in enumerate( chnk1[:-1] ):
            if ((i+1)%10==0):
                print('~',end='',flush=True)
            pageLinks.extend( links_from( pair_i, chnk1[i+1:] ) )

        # One pickle per chunk, so partial results are on disk if the run is interrupted
        with open( f"data/PageLinks_{bgn1}-{end2}.pkl", 'wb' ) as f:
            pickle.dump( pageLinks, f )
        
        m,s = divmod( now()-tBgn, 60 )
        print(f"\n{bgn1}:{end1}/{bgn2}:{end2}, {int(m)}:{s:.2f}, {len(pageLinks)},",end=' ',flush=True)
        bgn1 = end1

    # Create the empty marker file directly instead of shelling out to `touch`
    with open( _PAGE_LINKS, 'a' ):
        pass
    print( f"\nBuilt page graph in {(now()-tBgn)/60.0/60.0:.2f} hours!" )
    print()

    
            

## Build Page Graph @ ArangoDB

### Build Page Collection

### Build Page Graph

In [10]:
# print( f"There are {len(pageLinks)} connections between pages!" )

# Depth 2: Link Passages by Similarity

In [11]:
# Gates the cell that reloads the page vectors for passage generation
_GEN_PASSAGES = True
# NOTE(review): not referenced in the visible cells; presumably the pickle path for generated passages — confirm
_PASSAGE_FNAM = "data/Passages.pkl"
_PSG_DIST_DIV = 1.41 # Divides the spread of a page's sentence embeddings to get the segment radius: lower number, fewer segments

## Segment Pages Into Passages

In [12]:
from pprint import pprint

def page_to_sentences( pageText ):
    """ Split page text into sentences, keeping the closing punctuation on each one

    A sentence ends at '?', '!', a newline, or a period that is followed by a space.
    Text after the last separator is returned as the final sentence.

    Args:
        pageText: The page contents; converted with `str()` before parsing.

    Returns:
        List of sentences with surrounding whitespace stripped. Chunks that are empty
        after stripping (e.g. from consecutive newlines) are left out.
    """
    text     = str( pageText )
    sepChars = ('?', '!', '\n')
    chunks   = []
    current  = []
    prev     = ""

    for c in text:
        current.append( c ) # Include punctuation in the sentence
        # Compare against the previous character, not an accumulated buffer:
        # a period only ends a sentence when the very next character is a space
        if (c in sepChars) or (prev == '.' and c == ' '):
            chunks.append( "".join( current ) )
            current = []
        prev = c

    # Flush the trailing text that no separator closed
    if current:
        chunks.append( "".join( current ) )

    stripped = ( part.strip() for part in chunks )
    return [part for part in stripped if part]

In [13]:
from utils import gen_ID

def sentences_to_passages( chunkList, embedder, parentID, segDiv = 10.0 ):
    """ Group consecutive sentences into passages by embedding distance

    Sentences are added to the current passage while their embedding stays within a
    radius of the passage's mean embedding. The radius is the largest pairwise distance
    between the sentence embeddings divided by `segDiv`. A sentence outside the radius
    closes the current passage and starts a new one. The passage still open after the
    last sentence is emitted too.

    Args:
        chunkList: Sentences in reading order.
        embedder:  Object with `embed_documents( list_of_str )` returning one vector per string.
        parentID:  ID of the page the sentences came from; stored as `str( parentID )`.
        segDiv:    Divisor for the segment radius. Lower values give fewer, longer passages.

    Returns:
        List of dicts with keys 'id' (from `gen_ID()`), 'vec' (embedding of the joined
        passage text), 'txt' (sentences joined by single spaces), and 'pageID'.
        Empty when `chunkList` is empty.
    """
    if not chunkList:
        return []

    vectors = embedder.embed_documents( chunkList )

    def vec_diff( v1, v2 ):
        """ Distance between 2 pnts """
        return np.linalg.norm( np.subtract( v1, v2 ) )

    def get_total_width():
        """ Largest distance between any two sentence embeddings (0.0 for a single sentence) """
        N    = len( vectors )
        dMax = 0.0
        for i in range( N-1 ):
            for j in range( i+1, N ):
                dMax = max( dMax, vec_diff( vectors[i], vectors[j] ) )
        return dMax

    def make_passage( sentences ):
        """ Build the passage record for a run of sentences """
        text = ' '.join( sentences )
        return {
            'id'    : gen_ID(),
            'vec'   : embedder.embed_documents( [text,] )[0], # Embedding might be different than the average?
            'txt'   : text,
            'pageID': str( parentID ),
        }

    segRad = get_total_width() / segDiv

    rtnSeg = []
    vecs_j = []   # Embeddings of the sentences in the open passage
    sens_j = []   # Text of the sentences in the open passage
    mean_j = None # Mean embedding of the open passage

    for vec_i, sen_i in zip( vectors, chunkList ):
        # The first sentence always opens a passage; later ones are tested against the mean
        if vecs_j and (vec_diff( vec_i, mean_j ) > segRad):
            rtnSeg.append( make_passage( sens_j ) )
            vecs_j = []
            sens_j = []
        vecs_j.append( vec_i )
        sens_j.append( sen_i )
        mean_j = np.mean( vecs_j, axis = 0 )

    # Emit the passage that was still open when the sentences ran out
    if sens_j:
        rtnSeg.append( make_passage( sens_j ) )

    return rtnSeg
        

In [14]:
# Reload the pickled page embeddings for passage generation; empty list when unavailable
vecPairs = []
if _GEN_PASSAGES:
    if os.path.isfile( _DOC_EMBEDS ):
        with open( _DOC_EMBEDS, 'rb' ) as f:
            vecPairs = pickle.load( f )
        print( f"Got {len( vecPairs )} vectors!" )

Got 213725 vectors!


In [1]:
from pprint import pprint

# 2024-11-11: THIS IS A 30 DAY CALCULATION!!!  3.6 HOURS PER 10k PAGES!  RUN JOBS AS AVAILABLE!
# 2024-11-12: Various sources claim Ollama handles parallel processing?  Is it thread safe?  ATTEMPT AND SEE!!

def embed_large_passage_collection( manyVecPairs, fragmentID = None ):
    """ Segment every page record into passages, using an embedder created for this call

    Args:
        manyVecPairs: Iterable of page records with keys 'id', 'doc', and 'vec'.
        fragmentID:   Optional label for this batch; one is generated when omitted.
                      NOTE(review): the value is not used further inside this function.

    Returns:
        List of all passage records produced by `sentences_to_passages`. A page that
        raises is reported on stdout and contributes no passages.
    """
    # NOTE: Collection should be large enough to justify loading a new instance of the embedding model!
    print( f"\nBegin processing passages @ {os.getpid()}!" )
    embedder  = OllamaEmbeddings( model = environ["_RAG_DOC_EMBED"] )
    collected = deque()
    tStart    = now() # NOTE(review): start time is recorded but never read

    fragmentID = gen_ID() if (fragmentID is None) else fragmentID

    for record in manyVecPairs:
        try:
            # All three keys are read up front, so a record missing any of them is skipped
            pageID, pageTxt, _pageVec = record['id' ], record['doc'], record['vec']

            sentences = page_to_sentences( pageTxt )
            collected.extend(
                sentences_to_passages( sentences, embedder, pageID, segDiv = _PSG_DIST_DIV )
            )
        except Exception as err:
            print( f"\nBAD THING @ {os.getpid()}: {err}\n" )

    print( f"\nFINISHED processing passages @ {os.getpid()}!" )
    return list( collected )

# FIXME, START HERE: NEED A POOL OF WORKERS

# FIXME: NEED A SEMAPHORE FOR WRITING TO RAG STATE?

# FIXME: IS THERE A WAY TO STOP PROCESSING AND STILL PRESERVE STATE?

# FIXME: PICK UP FROM LAST 10k INDEX @ RAGSTATE



## Link Passages to Parent Pages

## For each passage, Gather pages from N hops

In [None]:
# for result in collection.get()['documents']:
# # for result in collection.get()['ids']: # 6f83d661-6cdf-46bf-83a4-27f6c36d948f
#     # print( result )
#     print( dir( result ) )
#     break
#     # for res in result['ids']:
#     #     all_ids.append( res )
# # docIDs = list( all_ids )
# # print( f"There are {len(docIDs)} documents!" )
# # print( docIDs[0] )

## Calc ranked passage similarity from those pages, Create up to M connections per passage
### (This is a separate collection from page links)

## Build Passage Graph @ ArangoDB

### Build Passage Collection

### Build Passage Graph

# Create Retriever

## What decisions does the retriever have to make?
* Following connections
* Ranking pages and passages
* Stitching passages in proper order.
    - What is proper order?

## What is our token budget for the LLM summary?

## What would it look like to extend the budget with overlapping summaries?

# Create Summarizer

## What would it look like for the summarizer to extend the token budget with Chain of Thought "Reasoning"?

# Depth 3: Build Statements (Assumptions and Claims) about Keywords

## Segment Keywords
* Statistically important/rare phrases
* Ask LLM to isolate {jargon, technical terms}

## Build Statements (Assumptions and Claims)
* S-V-O sentence-by-sentence

# Depth 4: Support/Refute Statements (Assumptions and Claims)

## How to determine support?  How to determine contradiction?

## How to weigh support/contradiction based on our level of trust in existing documents?

# Advanced Knowledge Graph Queries

## Statements: What is the difference between an Assumption and a Claim?

## What assumptions do we trust?

## What claims are supported by assumptions we trust?

## What questions are being asked?

## What questions are being answered?

## Does the document create new trustworthy connections?

## Can we find a similar subgraph in a different field of research?

## What questions CAN be asked based on the movement of passages and concepts through vector space?

## Can we track trajectories in vector space?

## Can we PREDICT trajectories in vector space?

# KNNEST: Knowledge Graph Structure Notes
* How to know sources are in different fields? ~ Cluster embeddings?
* New hierarchical embedding per field?
* Do embeddings need to be compressed by PCA?

# ANTs: Search Agent Swarm

## Can we TRAIN an agent to make advantageous traversals on the KG based on vector space deltas?

## Swarm-Level Load Management
* Agent Instantiation Condition(s)
* Agent Deletion Condition(s)
* Task Start Condition(s)
* Task Stop Condition(s)

## ANT Decision-Making Architecture
* Resource allotment: {Time, Compute}
* Critical: What edge to follow?
* A strong line of reasoning can be a demonstration trajectory? LfD?
* Inverse-RL to produce an evaluation function for trajectories thru the graph

# Connect to Graph Database

In [None]:
# from time import sleep

# import subprocess

# def start_arango_container():
#     command = [ "docker", "run", "-p", 
#                 "8529:8529", "-e", "ARANGO_ROOT_PASSWORD=", "arangodb/arangodb"]  
#     subprocess.Popen( command )

# start_arango_container()
# sleep( 15.0 )

In [None]:
# # Instantiate ArangoDB Database
# import json

# from adb_cloud_connector import get_temp_credentials
# from arango import ArangoClient

# con = get_temp_credentials()

# db = ArangoClient(hosts=con["url"]).db(
#     con["dbName"], con["username"], con["password"], verify=True
# )

# print(json.dumps(con, indent=2))

In [None]:
# # Instantiate the ArangoDB-LangChain Graph
# from langchain_community.graphs import ArangoGraph

# graph = ArangoGraph( db )

In [None]:
# if not db.has_graph( environ["_GRG_GRAPH_DB"] ):
#     db.create_graph(
#         environ["_GRG_GRAPH_DB"],
#         edge_definitions=[
#             {
#                 "from_vertex_collections": ["subjects"],
#                 "edge_collection": "verbs",
#                 "to_vertex_collections": ["subjects"],
#             },
#         ],
#     )

In [None]:
# import os

# import docker

# client     = docker.from_env()
# containers = client.containers.list()

# for container in containers:
#     print(container.name, container.short_id, container.status)

# os.system( "docker stop 8ed55910d8cc" )

# Wouldn't it be cool if ...
* A manifold (high-order hypersurface) could be fit to the motion of human knowledge
* We could trace paths on that manifold
* Given a position and curvature (+jerk+snap+crackle+pop), we can extrapolate motion beyond the edge of the mapped manifold