# Resources
* [AI-Scientist](https://github.com/SakanaAI/AI-Scientist)
* [LangChain + Neo4j GRG Tutorial](https://python.langchain.com/docs/tutorials/graph/)
* [Enhancing RAG-based application accuracy by constructing and leveraging knowledge graphs](https://blog.langchain.dev/enhancing-rag-based-applications-accuracy-by-constructing-and-leveraging-knowledge-graphs/)

# Preliminaries + Installs

These instructions are for Python 3.10
### Install Notebook 04 Dependencies!
* `sudo apt install docker.io`
* `sudo chmod 666 /var/run/docker.sock`
* `python3.10 -m pip install docker --user`
### Install ArangoDB + Docker Container
* `sudo docker pull arangodb`
* `python3.10 -m pip install python-arango adb-cloud-connector --user`



# Inspiration
* [Co-STORM @ Stanford](https://storm.genie.stanford.edu/)
* [Elicit](https://elicit.com/)
* [Research Rabbit](https://www.researchrabbit.ai/)

# Init + Env

In [1]:
########## INIT ####################################################################################
import os
from os import path, makedirs, environ
from utils import copy_pdfs



########## ENVIRONMENT #############################################################################

##### 04: Basic RAG #######################################################
# Name of the Chroma collection opened later in this notebook via `get_or_create_collection`
environ["_RAG_DOC_DBASE"]  = "lit_pdf"
# Ollama embedding model name; later pulled and handed to `OllamaEmbeddings`
environ["_RAG_DOC_EMBED"]  = "all-minilm"

##### 05: Graph-RAG (GRG) #################################################
# NOTE(review): neither of these two is read by any visible cell -- presumably reserved for later Graph-RAG steps
environ["_GRG_MODEL_NAME"] = "llama3.2-vision"
environ["_GRG_EMBED_NAME"] = "all-minilm"

# Graph name; only referenced by the commented-out ArangoDB `create_graph` cell
environ["_GRG_GRAPH_DB"] = "grg_rel"

##### Flags ###############################################################
# Master switch checked by every page-linking cell (cache load, recalc, cache save, graph build)
_LINK_PAGES = True
# Pickle file caching the per-page embedding records
_DOC_EMBEDS = "data/DocVectors.pkl"

# Depth 1: Link PDF Pages by Similarity

## Retrieve 04 Embeddings

In [2]:
%%capture
# Loading the vector store sometimes spews warnings
import sys

# Register `pysqlite3` under the module name `sqlite3` BEFORE chromadb is imported,
# so any later `import sqlite3` resolves to it.
# NOTE(review): presumably the system SQLite is too old for chromadb -- confirm
__import__('pysqlite3')
sys.modules['sqlite3'] = sys.modules.pop( 'pysqlite3' )
import chromadb
from langchain_chroma import Chroma


# No path argument is given -- TODO confirm the default storage location is the one notebook 04 wrote to
persistent_client = chromadb.PersistentClient();
# Open the collection named by `_RAG_DOC_DBASE`, creating it if it does not exist
collection        = persistent_client.get_or_create_collection( environ["_RAG_DOC_DBASE"] );

In [3]:
import pickle
from sklearn.metrics.pairwise import cosine_similarity

# Stays `None` unless the cache is loaded here or the embeddings are recomputed in a later cell
vecPairs = None

# Reuse the pickled page embeddings when page linking is enabled and the cache file exists.
# NOTE: `pickle.load` can execute arbitrary code -- only load a cache file written by this notebook.
if _LINK_PAGES and os.path.isfile( _DOC_EMBEDS ):
    with open( _DOC_EMBEDS, 'rb' ) as f:
        vecPairs = pickle.load( f )
    print( f"Got {len( vecPairs )} vectors!" )

Got 213725 vectors!


In [4]:
from langchain_ollama import OllamaEmbeddings

from utils import pull_ollama_model

# Project helper from `utils`; the recorded cell output shows it pulling the model named here
pull_ollama_model( environ["_RAG_DOC_EMBED"] )

# Embedding client used by later cells to re-embed the stored pages
local_embeddings = OllamaEmbeddings( model = environ["_RAG_DOC_EMBED"] )

About to save 'all-minilm'.
This will spew a lot of text on the first run...


[?25lpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25l[2K[1Gpulling manifest ⠦ [?25h[?25l[2K[1Gpulling manifest ⠧ [?25h[?25l[2K[1Gpulling manifest ⠧ [?25h[?25l[2K[1Gpulling manifest ⠏ [?25h[?25l[2K[1Gpulling manifest 
pulling 797b70c4edf8... 100% ▕████████████████▏  45 MB                         
pulling c71d239df917... 100% ▕████████████████▏  11 KB                         
pulling 85011998c600... 100% ▕████████████████▏   16 B                         
pulling 548455b72658... 100% ▕████████████████▏  407 B                         
verifying sha256 digest 
writing manifest 
success [?25h


## Recalculate Embeddings (NOT exposed by ChromaDB!)

In [5]:
# Re-embed every stored page when page linking is enabled and no embedding cache file exists yet.
# NOTE: If you don’t have enough VRAM it will use the CPU. 

import time, os
now = time.time # Alias: `now()` returns the current timestamp in seconds
from collections import deque

import numpy as np



if _LINK_PAGES and (not os.path.isfile( _DOC_EMBEDS )):

    vecPairs = deque()
    docBatch = 1 # Number of documents sent to the embedder per call

    # Fetch the whole collection; only its 'documents' and 'ids' entries are used
    # NOTE(review): `all_ids` is never read by any visible cell
    all_ids = deque()
    allData = collection.get()
    totDocs = allData['documents']
    totIDs  = allData['ids']
    print( f"Fetched {len(totDocs)} documents" )
    
    tBgn = now()
    N    = len( totDocs )
    bgn  = 0
    end  = 0
    # Walk the documents in windows of `docBatch`, [bgn, end)
    while bgn < N:
        end = min( bgn+docBatch, N )
        try:
            vec = local_embeddings.embed_documents( totDocs[ bgn:end ] )
            for i in range( bgn, end ):
                # One record per page: embedding vector, page text, and its collection ID
                vecPairs.append( {'vec' : np.array( vec[i-bgn] ), 'doc': totDocs[i], 'id' : totIDs[i]} )
                # Progress: a dot every 100 pages, a count + elapsed min:sec line every 10000
                if ((i+1)%100) == 0:
                    print('.',end='',flush=True)
                if ((i+1)%10000) == 0:
                    m,s = divmod( now()-tBgn, 60 )
                    print(f"\n{i+1},{int(m)}:{s:.2f}",end=' ',flush=True)
            bgn = end
        except Exception as e:
            # Best effort: print the error and advance one document so a failure cannot stall the loop
            print(e,end=', ',flush=True)
            bgn += 1
    print( f"\nPage embedding recalc took {(now()-tBgn)/60.0:.2f} minutes!" )

    # Later cells index `vecPairs` by position, so convert the deque to a list
    vecPairs = list( vecPairs )
    print( f"Got {len( vecPairs )} vectors!" )

In [6]:
# Disabled (`if 0`) exploratory scan: smallest and largest cosine similarity between
# the first page vector and every other page vector.
# NOTE(review): the `_SIM_MIN` / `_SIM_MAX` constants in the next cell look like results of this scan -- confirm
if 0:
    simMin = 1e9
    simMax = 0.0
    print( len(vecPairs) )
    vectrs = [item['vec'] for item in vecPairs]
    vec0   = vectrs[0]
    
    for vec_i in vectrs[1:]:
        # `cosine_similarity` returns a 2x2 matrix for the pair; [0,1] is the cross term
        sim_i  = cosine_similarity( [vec0, vec_i,] )[0,1]
        simMin = min( sim_i, simMin )
        simMax = max( sim_i, simMax )
    
    # Computed but not used within this cell
    diffSpan = simMax - simMin
    
    print( [simMin, simMax,] )

In [7]:
# Write the embedding records to the cache file, but only when that file does not exist yet
if _LINK_PAGES and (not os.path.isfile( _DOC_EMBEDS )):
    with open( _DOC_EMBEDS, 'wb' ) as f:
        pickle.dump( vecPairs, f )

In [8]:
# NOTE(review): hard-coded similarity extremes, presumably measured on this corpus -- re-measure if the documents change
_SIM_MAX         =  0.7329191963152777
_SIM_MIN         = -0.24439346017677427
# Cap on the number of links recorded per page during the page-graph build
_MAX_BRANCH      = 10
# Link threshold: the point 95% of the way from `_SIM_MIN` up to `_SIM_MAX`
_PAGE_CSN_THRESH = (_SIM_MAX-_SIM_MIN) * 0.95 + _SIM_MIN

## Calculate Page Similarity && Build Page Graph

In [None]:
_PAGE_LINKS = "data/PageLinks.pkl"

# Build the page-similarity edge list once; an existing pickle file means it was already built.
# Each page `i` is compared against every LATER page `j`, and the first `_MAX_BRANCH` pages whose
# cosine similarity reaches `_PAGE_CSN_THRESH` are linked (edges are flagged undirected: 'dir' False).
if _LINK_PAGES and (not os.path.isfile( _PAGE_LINKS )):
    import numpy as np # Local import so this cell does not rely on an earlier cell's import

    pageLinks = deque()
    Ndocs     = len( vecPairs )
    tBgn      = now()

    if Ndocs > 1:
        # Normalize every page vector ONCE so that cosine similarity becomes a plain dot product.
        # This replaces one `cosine_similarity` call per page pair with one matrix-vector product per page.
        # NOTE: holds all page vectors in a single dense float matrix.
        pageIDs = [pair['id'] for pair in vecPairs]
        unitVec = np.array( [pair['vec'] for pair in vecPairs], dtype = float )
        norms   = np.linalg.norm( unitVec, axis = 1, keepdims = True )
        norms[ norms == 0.0 ] = 1.0 # All-zero vectors keep similarity 0 instead of dividing by zero
        unitVec /= norms

    for i in range( Ndocs-1 ):
        # Progress: '.' every 10 pages, '~' every 100, '>' every 1000, count + elapsed min:sec every 10000
        if ((i+1)%10000) == 0:
            m,s = divmod( now()-tBgn, 60 )
            print(f"\n{i+1},{int(m)}:{s:.2f}",end=' ',flush=True)
        elif ((i+1)%1000==0):
            print('>',end='',flush=True)
        elif ((i+1)%100==0):
            print('~',end='',flush=True)
        elif ((i+1)%10==0):
            print('.',end='',flush=True)

        # Similarity of page `i` to every later page, in page order
        sims_i = unitVec[ i+1: ] @ unitVec[i]
        # Keep only the FIRST `_MAX_BRANCH` later pages that reach the threshold
        hits_i = np.flatnonzero( sims_i >= _PAGE_CSN_THRESH )[ :_MAX_BRANCH ]
        for k in hits_i:
            pageLinks.append( {
                'type'   : "Page_Cosine_Similarity",
                'idTail' : pageIDs[i],
                'idHead' : pageIDs[i+1+k], # `k` is an offset into the slice that starts at `i+1`
                'dir'    : False,
                'coSim'  : sims_i[k],
            } )
    pageLinks = list( pageLinks )
    print( f"\nBuilt page graph in {(now()-tBgn)/60.0/60.0:.2f} hours!" )
    print()

    with open( _PAGE_LINKS, 'wb' ) as f:
        pickle.dump( pageLinks, f )
            

.........~..

In [None]:
# FIXME, START HERE: CONNECT TO GRAPH DB 
    # * STORE DOCUMENTS WITH EMBEDDINGS
    # * STORE EDGES WITH SIMILARITY

# FIXME: SAVE THE DB AND BACK UP THE FILES

# Depth 2: Link Passages by Similarity

## Segment Pages Into Passages

In [None]:
def page_to_sentences( pageText ):
    """ Parse the page into individual sentences

    A sentence ends at a run of '.', '?' or '!' that is followed by whitespace or by the end of
    the text, so a period inside a number such as "3.14" does NOT end a sentence.

    Args:
        pageText: Page content; converted with `str()` before parsing.

    Returns:
        list of str: The sentences in page order, each keeping its own terminating punctuation,
        with surrounding whitespace removed. Blank pieces are dropped, so empty or
        whitespace-only input gives an empty list. Trailing text without a terminator is
        returned unchanged as the last sentence.

    NOTE: Abbreviations followed by a space (e.g. "Fig. 3") still split; handling them needs a real tokenizer.
    """
    import re # Local import: keeps this definition usable regardless of which cells have been run

    # Split on the whitespace that FOLLOWS a terminator; the lookbehind leaves the punctuation on the sentence
    pieces = re.split( r'(?<=[.?!])\s+', str( pageText ) )

    # Trim each piece and discard the empties produced by leading/trailing whitespace
    stripped = (piece.strip() for piece in pieces)
    return [piece for piece in stripped if piece]


# FIXME: REMEMBER TO LINK PASSAGES TO THEIR PARENT PAGES!!!!!


def sentences_to_passages( chunkList, embedder, segDiv = 10.0 ):
    """ Group consecutive sentences into passages of similar meaning

    Sentences are embedded, then walked in order. A sentence joins the current passage while its
    embedding stays within `segRad` of the running mean of that passage; otherwise the current
    passage is closed and the sentence starts a new one. `segRad` is the largest pairwise
    distance between any two sentence embeddings divided by `segDiv`.

    Args:
        chunkList: Sentences (str) in reading order.
        embedder:  Object with `embed_documents( list_of_str )` returning one vector per string.
        segDiv:    Positive divisor of the total embedding spread; larger values give shorter passages.

    Returns:
        list of dict: One `{'vec': ..., 'txt': ...}` per passage, in order. 'txt' is the passage's
        sentences joined by single spaces and 'vec' is the embedder's vector for that joined text.
        Empty input gives an empty list.

    Raises:
        ValueError: If `segDiv` is not positive.
    """
    if segDiv <= 0:
        raise ValueError( f"segDiv must be positive, got {segDiv}" )

    chunkList = list( chunkList )
    if not chunkList:
        return list()

    vectors = embedder.embed_documents( chunkList )

    def vec_diff( v1, v2 ):
        """ Distance between 2 pnts """
        return np.linalg.norm( np.subtract( v1, v2 ) )

    def get_total_width():
        """ Largest distance between any 2 sentence embeddings, 0.0 when there is only one """
        N    = len( vectors )
        dMax = 0.0 # Never negative, so a lone sentence cannot produce a negative radius
        for i in range( N-1 ):
            for j in range( i+1, N ):
                dMax = max( dMax, vec_diff( vectors[i], vectors[j] ) )
        return dMax

    segRad = get_total_width() / segDiv
    rtnSeg = list()

    def push_passage( sentences ):
        """ Join the sentences into one passage, embed it, and store it """
        txt = ' '.join( sen.strip() for sen in sentences )
        rtnSeg.append({
            'vec' : embedder.embed_documents( [txt,] )[0], # Embedding might be different than the average?
            'txt' : txt,
        })

    vecs_j = list() # Embeddings of the sentences in the open passage
    sens_j = list() # Text of the sentences in the open passage
    mean_j = None   # Running mean of `vecs_j`

    for vec_i, sen_i in zip( vectors, chunkList ):
        # Too far from the open passage: close it and start fresh (never closes an empty passage)
        if vecs_j and (vec_diff( vec_i, mean_j ) > segRad):
            push_passage( sens_j )
            vecs_j = list()
            sens_j = list()
        vecs_j.append( vec_i )
        sens_j.append( sen_i )
        mean_j = np.mean( vecs_j, axis = 0 )

    # The last open passage has no following sentence to close it, so flush it here
    if sens_j:
        push_passage( sens_j )

    return rtnSeg
        

In [None]:
# for result in collection.get()['documents']:
# # for result in collection.get()['ids']: # 6f83d661-6cdf-46bf-83a4-27f6c36d948f
#     # print( result )
#     print( dir( result ) )
#     break
#     # for res in result['ids']:
#     #     all_ids.append( res )
# # docIDs = list( all_ids )
# # print( f"There are {len(docIDs)} documents!" )
# # print( docIDs[0] )

# Depth 3: Build Statements (Assumptions and Claims) about Keywords

## Segment Keywords
* Statistically important/rare phrases
* Ask LLM to isolate {jargon, technical terms}

## Build Statements (Assumptions and Claims)
* S-V-O sentence-by-sentence

# Depth 4: Support/Refute Statements (Assumptions and Claims)

# Advanced Knowledge Graph Queries

## Statements: What is the difference between an Assumption and a Claim?

## What assumptions do we trust?

## What claims are supported by assumptions we trust?

## What questions are being asked?

## What questions are being answered?

## Does the document create new trustworthy connections?

## Can we find a similar subgraph in a different field of research?

# KNNEST: Knowledge Graph Structure Notes
* How to know sources are in different fields?
* New hierarchical embedding per field?

# ANTs: Search Agent Swarm

## Swarm-Level Load Management
* Agent Instantiation Condition(s)
* Agent Deletion Condition(s)
* Task Start Condition(s)
* Task Stop Condition(s)

## ANT Decision-Making Architecture
* Resource allotment: {Time, Compute}
* Critical: What edge to follow?
* A strong line of reasoning can be a demonstration trajectory? LfD?
* Inverse-RL to produce an evaluation function for trajectories thru the graph

# Connect to Graph Database

In [None]:
# from time import sleep

# import subprocess

# def start_arango_container():
#     command = [ "docker", "run", "-p", 
#                 "8529:8529", "-e", "ARANGO_ROOT_PASSWORD=", "arangodb/arangodb"]  
#     subprocess.Popen( command )

# start_arango_container()
# sleep( 15.0 )

In [None]:
# # Instantiate ArangoDB Database
# import json

# from adb_cloud_connector import get_temp_credentials
# from arango import ArangoClient

# con = get_temp_credentials()

# db = ArangoClient(hosts=con["url"]).db(
#     con["dbName"], con["username"], con["password"], verify=True
# )

# print(json.dumps(con, indent=2))

In [None]:
# # Instantiate the ArangoDB-LangChain Graph
# from langchain_community.graphs import ArangoGraph

# graph = ArangoGraph( db )

In [None]:
# if not db.has_graph( environ["_GRG_GRAPH_DB"] ):
#     db.create_graph(
#         environ["_GRG_GRAPH_DB"],
#         edge_definitions=[
#             {
#                 "from_vertex_collections": ["subjects"],
#                 "edge_collection": "verbs",
#                 "to_vertex_collections": ["subjects"],
#             },
#         ],
#     )

In [None]:
# import os

# import docker

# client     = docker.from_env()
# containers = client.containers.list()

# for container in containers:
#     print(container.name, container.short_id, container.status)

# os.system( "docker stop 8ed55910d8cc" )