# Resources
* [AI-Scientist](https://github.com/SakanaAI/AI-Scientist)
* [LangChain + Neo4j GRG Tutorial](https://python.langchain.com/docs/tutorials/graph/)
* [Enhancing RAG-based application accuracy by constructing and leveraging knowledge graphs](https://blog.langchain.dev/enhancing-rag-based-applications-accuracy-by-constructing-and-leveraging-knowledge-graphs/)

# Preliminaries + Installs

These instructions are for Python 3.10
### Install Notebook 04 Dependencies!
* `sudo apt install docker.io`
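* `sudo chmod 666 /var/run/docker.sock` (caution: this makes the Docker socket writable by every local user; on a shared machine prefer `sudo usermod -aG docker $USER` and log in again)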
* `python3.10 -m pip install docker --user`
### Install ArangoDB + Docker Container
* `sudo docker pull arangodb`
* `python3.10 -m pip install python-arango adb-cloud-connector --user`



# Inspiration
* [Co-STORM @ Stanford](https://storm.genie.stanford.edu/)
* [Elicit](https://elicit.com/)
* [Research Rabbit](https://www.researchrabbit.ai/)

# Init + Env

In [1]:
########## INIT ####################################################################################
from os import path, makedirs, environ
from utils import copy_pdfs



########## ENVIRONMENT #############################################################################
# Settings are published through the process environment so every cell reads them the same way.

##### 04: Basic RAG #######################################################
# Vector-store collection name and embedding model for the page-level RAG store
environ.update({
    "_RAG_DOC_DBASE" : "lit_pdf",
    "_RAG_DOC_EMBED" : "all-minilm",
})

##### 05: Graph-RAG (GRG) #################################################
# Chat model, embedding model, and graph name for the Graph-RAG stage
environ.update({
    "_GRG_MODEL_NAME" : "llama3.2-vision",
    "_GRG_EMBED_NAME" : "all-minilm",
    "_GRG_GRAPH_DB"   : "grg_rel",
})

##### Flags ###############################################################
# `_LINK_PAGES` gates the page-embedding pass; `_DOC_EMBEDS` is the pickle file that caches its result
_LINK_PAGES, _DOC_EMBEDS = True, "data/DocVectors.pkl"

# Depth 1: Link PDF Pages by Similarity

## Retrieve 04 Embeddings

In [2]:
%%capture
# Loading the vector store sometimes spews warnings, so this cell's output is captured (hidden).
import sys

# Register the `pysqlite3` module under the name `sqlite3`, so any later `import sqlite3`
# resolves to it. The swap is deliberately placed ahead of the `chromadb` import below.
# NOTE(review): presumably chromadb needs a newer SQLite than the system build provides — confirm.
__import__('pysqlite3')
sys.modules['sqlite3'] = sys.modules.pop( 'pysqlite3' )
import chromadb
from langchain_chroma import Chroma


# Open the on-disk Chroma store (no path is passed, so the client's default location is used),
# then fetch — or create if it does not exist yet — the collection named by `_RAG_DOC_DBASE`.
persistent_client = chromadb.PersistentClient();
collection        = persistent_client.get_or_create_collection( environ["_RAG_DOC_DBASE"] );

In [3]:
from collections import deque

# Accumulator reserved for document IDs (nothing is pushed into it in this cell)
all_ids = deque()

# One unfiltered `get()` returns the whole collection as a dict;
# the page texts and their IDs are the two fields used downstream.
allData = collection.get()
totDocs, totIDs = allData['documents'], allData['ids']

# print( type(allData) )

In [4]:
from langchain_ollama import OllamaEmbeddings

from utils import pull_ollama_model

# Name of the embedding model configured for the page store
embedModelName = environ["_RAG_DOC_EMBED"]

# NOTE(review): presumably fetches the model through Ollama if it is not present yet — see `utils.pull_ollama_model`
pull_ollama_model( embedModelName )

# Embedding client bound to that model
local_embeddings = OllamaEmbeddings( model = embedModelName )

About to save 'all-minilm'.
This will spew a lot of text on the first run...


[?25lpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25l[2K[1Gpulling manifest ⠴ [?25h[?25l[2K[1Gpulling manifest ⠦ [?25h[?25l[2K[1Gpulling manifest ⠧ [?25h[?25l[2K[1Gpulling manifest ⠇ [?25h[?25l[2K[1Gpulling manifest ⠏ [?25h[?25l[2K[1Gpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠸ [?25h[?25l[2K[1Gpulling manifest ⠴ [?25h[?25l[2K[1Gpulling manifest ⠴ [?25h[?25l[2K[1Gpulling manifest ⠧ [?25h[?25l[2K[1Gpulling manifest ⠇ [?25h[?25l[2K[1Gpulling manifest ⠇ [?25h[?25l[2K[1Gpulling manifest 
pulling 797b70c4edf8...   0% ▕                ▏    0 B/ 45 MB                  [?25h[?25l[2K[1G[A[2K[1Gpulling manifest 
pulling 797b70c4edf8...   0% ▕                ▏    0 B/ 45 MB                  [?25h[?25l[2K[1G[A[2K

In [7]:
# NOTE: If you don’t have enough VRAM it will use the CPU.

import time, os
import pickle
now = time.time
import numpy as np

# Build `vecPairs`: one {'vec', 'doc', 'id'} record per stored page.
#   * Cache present  -> load the records pickled by an earlier run.
#   * Cache missing  -> embed every page in batches of `docBatch`.
#   * `_LINK_PAGES` off -> leave `vecPairs` empty.

vecPairs = deque()
docBatch = 128 # Pages sent to the embedding model per request
maxRetry =   5 # Consecutive failures tolerated on one batch before giving up

if _LINK_PAGES and os.path.isfile( _DOC_EMBEDS ):
    # Cache hit: reuse the stored vectors rather than leaving `vecPairs` empty.
    # NOTE: `pickle.load` can execute arbitrary code; only point `_DOC_EMBEDS` at a file this notebook wrote.
    with open( _DOC_EMBEDS, 'rb' ) as f:
        vecPairs = deque( pickle.load( f ) )
    print( f"Loaded cached page embeddings from {_DOC_EMBEDS}" )

elif _LINK_PAGES:
    tBgn  = now() # Wall-clock start; kept separate from the batch index `bgn` so timings stay valid
    N     = len( totDocs )
    bgn   = 0
    end   = 0
    nFail = 0
    # Loop on `bgn` (not `end`) so a failure on the final batch cannot end the loop early
    while bgn < N:
        end = min( bgn+docBatch, N )
        try:
            vec = np.array( local_embeddings.embed_documents( totDocs[ bgn:end ] ) )
        except Exception as e:
            # Retry the same batch, but only a bounded number of times so a
            # persistent failure surfaces instead of spinning forever.
            nFail += 1
            print(e,end=', ',flush=True)
            if nFail >= maxRetry:
                raise RuntimeError( f"Embedding pages {bgn}-{end-1} failed {nFail} times in a row" ) from e
            continue
        nFail = 0
        for i in range( bgn, end ):
            vecPairs.append( {'vec' : vec[i-bgn], 'doc': totDocs[i], 'id' : totIDs[i]} )
            # Progress: a dot every 100 pages, a count + elapsed time every 10000
            if ((i+1)%100) == 0:
                print('.',end='',flush=True)
            if ((i+1)%10000) == 0:
                m,s = divmod( now()-tBgn, 60 )
                print(f"\n{i+1},{int(m)}:{s:.2f}",end=' ',flush=True)
        bgn = end
    print( f"\nPage embedding recalc took {(now()-tBgn)/60.0:.2f} minutes!" )

vecPairs = list( vecPairs )
print( f"Got {len( vecPairs )} vectors!" )

.................

KeyboardInterrupt: 

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Report the smallest and largest cosine similarity between the first page vector and every other one.
# Start from +/- infinity: cosine similarity can be negative, so 0.0 is not a safe floor for the maximum.
simMin = float( 'inf' )
simMax = float( '-inf' )
vectrs = [item['vec'] for item in vecPairs]

if len( vectrs ) > 1:
    vec0 = vectrs[0]
    # One call compares `vec0` against all remaining vectors; the result has shape (1, len(vectrs)-1)
    sims   = cosine_similarity( [vec0,], vectrs[1:] )[0]
    simMin = float( sims.min() )
    simMax = float( sims.max() )
    print( [simMin, simMax,] )
else:
    # Nothing to compare (embeddings were not computed or loaded)
    print( f"Need at least 2 page vectors to compare, got {len( vectrs )}." )

In [None]:
import pickle
# Write the embeddings cache once, and only for a complete run: the embedding cell skips
# recomputation whenever this file exists, so an empty or partial dump would stick.
if os.path.isfile( _DOC_EMBEDS ):
    print( f"{_DOC_EMBEDS} already exists, not overwriting." )
elif (len( vecPairs ) == 0) or (len( vecPairs ) != len( totDocs )):
    print( f"Not saving: have {len( vecPairs )} of {len( totDocs )} page vectors." )
else:
    # The parent directory may not exist on a fresh checkout
    os.makedirs( os.path.dirname( _DOC_EMBEDS ) or '.', exist_ok = True )
    with open( _DOC_EMBEDS, 'wb' ) as f:
        pickle.dump( list( vecPairs ), f )
    print( f"Saved {len( vecPairs )} page vectors to {_DOC_EMBEDS}" )

In [None]:
# print( list( allData.keys() ) )
# print( type( allData['documents'] ) )
# print( type( allData['documents'][0] ), len( allData['documents'] ) )
# print( allData['documents'][0] )

In [None]:
# for result in collection.get()['documents']:
# # for result in collection.get()['ids']: # 6f83d661-6cdf-46bf-83a4-27f6c36d948f
#     # print( result )
#     print( dir( result ) )
#     break
#     # for res in result['ids']:
#     #     all_ids.append( res )
# # docIDs = list( all_ids )
# # print( f"There are {len(docIDs)} documents!" )
# # print( docIDs[0] )

# Connect to Graph Database

In [None]:
# from time import sleep

# import subprocess

# def start_arango_container():
#     command = [ "docker", "run", "-p", 
#                 "8529:8529", "-e", "ARANGO_ROOT_PASSWORD=", "arangodb/arangodb"]  
#     subprocess.Popen( command )

# start_arango_container()
# sleep( 15.0 )

In [None]:
# # Instantiate ArangoDB Database
# import json

# from adb_cloud_connector import get_temp_credentials
# from arango import ArangoClient

# con = get_temp_credentials()

# db = ArangoClient(hosts=con["url"]).db(
#     con["dbName"], con["username"], con["password"], verify=True
# )

# print(json.dumps(con, indent=2))

In [None]:
# # Instantiate the ArangoDB-LangChain Graph
# from langchain_community.graphs import ArangoGraph

# graph = ArangoGraph( db )

In [None]:
# if not db.has_graph( environ["_GRG_GRAPH_DB"] ):
#     db.create_graph(
#         environ["_GRG_GRAPH_DB"],
#         edge_definitions=[
#             {
#                 "from_vertex_collections": ["subjects"],
#                 "edge_collection": "verbs",
#                 "to_vertex_collections": ["subjects"],
#             },
#         ],
#     )

In [None]:
# import os

# import docker

# client     = docker.from_env()
# containers = client.containers.list()

# for container in containers:
#     print(container.name, container.short_id, container.status)

# os.system( "docker stop 8ed55910d8cc" )