### Short list of objectives

* Import pdf of Proposed Rule XXXXXXX
* Query rule for baseline responses 
* Fine-tune embedding model, recheck responses
* Summarize rule

In [1]:
# Set up ollama
# Environment bootstrap: system tools, the Python dependencies used by this
# notebook, and the Ollama runtime (installed via its official install script).
# NOTE(review): none of the pip packages are version-pinned, so a fresh run may
# resolve different releases than the ones the saved outputs were produced with.
!apt-get update && apt-get install tmux vim -y
!pip3 install llama-index llama-parse llama_deploy llama-index-llms-huggingface llama-index-embeddings-huggingface llama-index-llms-ollama llama-index-embeddings-ollama llama-index-vector-stores-neo4jvector llama-index-graph-stores-neo4j llama-index-finetuning llama-index-utils-workflow 
!pip3 install sentencepiece protobuf evaluate rouge_score absl-py tensorboardX bitsandbytes peft accelerate python-dotenv graspologic
# flash-attn is installed without build isolation so its build can see the
# packages already installed in this environment.
!pip3 install flash-attn --no-build-isolation
# This only installs Ollama; the later `ollama pull` still needs a running server.
!curl -fsSL https://ollama.com/install.sh | sh


Get:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1581 B]
Get:2 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]      
Get:3 http://archive.ubuntu.com/ubuntu jammy InRelease [270 kB]                
Get:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [999 kB]
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]        
Get:6 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1150 kB]
Get:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]      
Get:8 http://archive.ubuntu.com/ubuntu jammy/restricted amd64 Packages [164 kB]
Get:9 http://archive.ubuntu.com/ubuntu jammy/universe amd64 Packages [17.5 MB] 
Get:10 http://security.ubuntu.com/ubuntu jammy-security/restricted amd64 Packages [3097 kB]
Get:11 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease [18.1 kB]
Get:12 http://security.ubuntu.com/ubuntu jammy-sec

In [None]:
# Set up Neo4j
# Registers the Neo4j apt repository, installs Neo4j 5.23.0 plus a Java 17
# runtime, and drops the matching APOC core plugin into the plugins directory.
# NOTE(review): `tee -a` appends, so re-running this cell adds a duplicate
# entry to /etc/apt/sources.list.d/neo4j.list.
# NOTE(review): the debconf license selections target `neo4j-enterprise` and run
# after the `neo4j` install; confirm whether they are needed at all, and if so
# whether they should be set before the install step.
!apt install dialog apt-utils -y 
!wget -O - https://debian.neo4j.com/neotechnology.gpg.key | gpg --dearmor -o /etc/apt/keyrings/neotechnology.gpg
!echo 'deb [signed-by=/etc/apt/keyrings/neotechnology.gpg] https://debian.neo4j.com stable latest' | tee -a /etc/apt/sources.list.d/neo4j.list
!apt list -a neo4j
!add-apt-repository universe -y
!apt install neo4j=1:5.23.0 -y
!echo "neo4j-enterprise neo4j/question select I ACCEPT" | debconf-set-selections
!echo "neo4j-enterprise neo4j/license note" | debconf-set-selections
!apt install openjdk-17-jre -y
# The APOC version is kept in lock-step with the Neo4j version installed above.
!cd /var/lib/neo4j/plugins/ && wget https://github.com/neo4j/apoc/releases/download/5.23.0/apoc-5.23.0-core.jar

In [2]:
import nest_asyncio
nest_asyncio.apply()

In [3]:
import subprocess, os
from llama_index.core import Settings
from llama_index.llms.ollama import Ollama
from llama_index.llms.openai import OpenAI as LOpenAI
from dotenv import load_dotenv
load_dotenv('/workspace/repos/agentic-ai/.env')

# Pick the generation model. Exactly one (model_name, ctx_len) pair is active;
# "gpt-4o" routes to OpenAI, anything else is served by the local Ollama server.
# model_name, ctx_len = "gpt-4o", 128000
# model_name, ctx_len = "llama3.1:latest", 128000
model_name, ctx_len = "hermes3:8b", 128000
# model_name, ctx_len = "gemma2:27b", 8192


if model_name == "gpt-4o":
    openai_key = os.getenv("OPENAI_API_KEY")
    if not openai_key:
        # Fail with a clear message instead of the TypeError raised by
        # assigning None into os.environ.
        raise RuntimeError("OPENAI_API_KEY is not set; add it to the .env file or the environment.")
    os.environ["OPENAI_API_KEY"] = openai_key

    print(f"Using OpenAI {model_name}...")
    llm = LOpenAI(model=model_name, max_tokens=4000)
else:
    try:
        print("Pulling Ollama model...")
        sub_out = subprocess.run(['ollama', 'pull', model_name], capture_output=True, text=True)
        # subprocess.run does not raise on a non-zero exit status, so a failed
        # pull has to be detected explicitly.
        if sub_out.returncode != 0:
            print(f"Error pulling model: Is the Ollama server running?\n{sub_out.stderr}")
    except Exception as e:
        # Reached when the `ollama` binary itself cannot be started.
        print(f"Error pulling model: Is the Ollama server running?\n{e}")

    # NOTE(review): the printed metadata reports num_output=256 with this
    # setting, so "max_new_tokens" does not appear to raise the output limit.
    # Ollama's own option for this is `num_predict` -- confirm and switch if intended.
    addtion_kwargs = {"max_new_tokens": 2000}
    # system_prompt = "You are an expert at answering questions about rules and regulations regarding Title 17—Commodity and Securities Exchanges: CHAPTER II—SECURITIES AND EXCHANGE COMMISSION. Please provide a summary of the following text, and cite any sections, rules, acts or laws (e.g. § 230.503, § 240.13a-15, Act (15 U.S.C. 781), Investment Company Act of 1940) from context that support the answer. Be detailed in your response."
    # The server address is passed as `base_url`, the parameter the Ollama
    # integration documents for it.
    llm = Ollama(model=model_name, base_url="http://127.0.0.1:11434", context_window=ctx_len, model_type="chat", is_function_calling_model=False,
                 request_timeout=4000.0, additional_kwargs=addtion_kwargs) #, system_prompt=system_prompt)
    print(llm.metadata)

# Make this the default LLM for every LlamaIndex component created afterwards.
Settings.llm = llm

Pulling Ollama model...
context_window=128000 num_output=256 is_chat_model=True is_function_calling_model=False model_name='hermes3:8b' system_role=<MessageRole.SYSTEM: 'system'>


## Import data

In [4]:
# SEC rules and regulations
!cd /workspace/data && curl -X GET "https://www.ecfr.gov/api/versioner/v1/full/2024-07-23/title-17.xml?chapter=II" -H "accept: application/xml" > title-17.xml

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 16.3M  100 16.3M    0     0  10.2M      0  0:00:01  0:00:01 --:--:-- 10.2M


In [5]:
import xml.etree.ElementTree as ET
from sec_utils import get_tree_data

# Path to your XML file
xml_file_path = '/workspace/data/title-17.xml'

# Parse the XML file
tree = ET.parse(xml_file_path)

# Get the root element of the XML document
root = tree.getroot()

sec_data = get_tree_data(root)


In [6]:
from sec_utils import get_metadata
from llama_index.core import Document

# Wrap every entry of sec_data in a LlamaIndex Document. The metadata template
# is created fresh for each entry so get_metadata never shares a dict between
# documents.
# NOTE(review): sec_data is presumably keyed by section heading with the
# section text as value -- verify against sec_utils.get_tree_data.
documents = []
for sec_key, sec_text in sec_data.items():
    sec_metadata = get_metadata(
        sec_key,
        sec_text,
        metadata={"section":None, "description":None, "mentioned_sections":None},
    )
    documents.append(
        Document(
            text=sec_text,
            text_template='{metadata_str}\n\n{content}',
            metadata=sec_metadata,
        )
    )

In [7]:
from llamaindex_data_utils import extract_text_from_pdf
import os
from dotenv import load_dotenv
load_dotenv('/workspace/repos/agentic-ai/.env')

# Credentials come from the .env file loaded above.
access_token = os.getenv('HF_TOKEN')
llama_api_key = os.getenv('LLAMA_API_KEY')

# The two lists are parallel: descriptions[k] describes pdf_urls[k].
pdf_urls = [
    "/workspace/data/compliance/FSI_Factsheet.pdf",
    "/workspace/data/compliance/FSI_Press_Release.pdf",
    "/workspace/data/compliance/FSI_Rule_proposal.pdf",
]
descriptions = [
    "Factsheet about newly proposed SEC rule.",
    "Press release regarding newly proposed SEC rule.",
    "The full text of the newly proposed SEC rule.",
]
documents_proposal = extract_text_from_pdf(
    pdf_urls,
    llama_api_key,
    llamaparse_kwargs={"split_by_page":False},
    save_json_path=None,
)
# Tag each parsed document with its source path and description.
# NOTE(review): assumes the parser returns one document per PDF, in input order.
for doc_pos, parsed_doc in enumerate(documents_proposal):
    parsed_doc.metadata["source"] = pdf_urls[doc_pos]
    parsed_doc.metadata["description"] = descriptions[doc_pos]
    
# documents_proposal.extend(docs)

processing pdf: /workspace/data/compliance/FSI_Factsheet.pdf
Started parsing the file under job_id 2828f768-2b3c-42ae-bc64-01aa436d2599
.processing pdf: /workspace/data/compliance/FSI_Press_Release.pdf
Started parsing the file under job_id fd1e9869-6e4e-4dc4-a114-1af3b518bfdf
processing pdf: /workspace/data/compliance/FSI_Rule_proposal.pdf
Started parsing the file under job_id b6f0513b-8967-478b-88b6-ffa7c7873fa4


### Create Distillation Datasets

In [None]:
from llama_index.core.node_parser import SentenceSplitter

# Split the parsed proposal documents into nodes (chunk_size=300,
# chunk_overlap=50); these nodes are the inputs for QA-pair generation below.
parser = SentenceSplitter(chunk_size=300, chunk_overlap=50)
nodes = parser.get_nodes_from_documents(documents_proposal, show_progress=True)
# Displayed as the cell output: how many nodes were produced.
len(nodes)

In [None]:
# Template handed to generate_qa_embedding_pairs as qa_generate_prompt_tmpl.
# It must keep the {context_str} and {num_questions_per_chunk} placeholders.
# A stray double quote that used to trail the final sentence has been removed.
system_prompt = (
    'Context information is below.\n\n'
    '---------------------\n'
    '{context_str}\n'
    '---------------------\n\n'
    'Given the context information and no prior knowledge, generate {num_questions_per_chunk} questions based on the below query.\n\n'
    'You are a FINRA certified Compliance Specialist that writes exams for firm compliance professionals. '
    'Your task is to write precise questions for an upcoming Compliance Officer certification examination. '
    'The questions should be diverse across the context with no multiple choice. '
    'Restrict the questions to the context information provided.\n'
)

In [None]:
from llama_index.finetuning import generate_qa_embedding_pairs
from llama_index.core.evaluation import EmbeddingQAFinetuneDataset
from llama_index.llms.openai import OpenAI
import random

# Generate the synthetic question/context training set once; if the JSON file
# already exists this cell does nothing and the fine-tuning cell loads it.
output_path = "/workspace/data/train_dataset_proposal.json"
dataset_on_disk = os.path.exists(output_path)
if not dataset_on_disk:
    # Random permutation of node positions; train_perc selects the leading share.
    node_total = len(nodes)
    rand_index = random.sample(range(node_total), node_total)
    train_perc = 1.0
    train_size = int(node_total * train_perc)
    train_nodes = [nodes[pos] for pos in rand_index[:train_size]]

    train_dataset = generate_qa_embedding_pairs(
        nodes=train_nodes,
        llm=llm,
        qa_generate_prompt_tmpl=system_prompt,
        num_questions_per_chunk=5,
        save_every=20,
        output_path=output_path,
        verbose=False,
    )
    train_dataset.save_json(output_path)

    # val_dataset = generate_qa_embedding_pairs(
    #     qa_generate_prompt_tmpl = system_prompt,
    #     num_questions_per_chunk=5,
    #     save_every=500,
    #     output_path="/workspace/data/val_dataset.json",
    #     llm=llm, 
    #     nodes=[nodes[x] for x in rand_index[train_size:]],
    #     verbose=False
    # )
    # # assert len(val_dataset.relevant_docs) == len(val_dataset.queries)
    # assert (np.unique(list(val_dataset.relevant_docs.values()), return_counts=True)[1]==5).all()
    # val_dataset.save_json("/workspace/data/val_dataset.json")



### Fine-tune Embeddings

In [None]:
%load_ext autoreload
%autoreload 2

#####################
# in llama_index/embeddings/huggingface/base need to add "show_progress_bar=False" to 204, 213
#####################

import os, json

from llama_index.finetuning import EmbeddingAdapterFinetuneEngine, SentenceTransformersFinetuneEngine
from llama_index.core.evaluation import EmbeddingQAFinetuneDataset
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings


# Load the QA dataset produced by the dataset-generation cell and configure a
# sentence-transformers fine-tuning run on top of the base embedding model.
train_path = "/workspace/data/train_dataset_proposal.json"
val_path = None #"/workspace/data/val_dataset.json"
if not os.path.exists(train_path):
    # Without this guard a fresh kernel fails later with a NameError on
    # `train_dataset`, which hides the real cause.
    raise FileNotFoundError(
        f"Training dataset not found at {train_path}; run the dataset-generation cell first."
    )
train_dataset = EmbeddingQAFinetuneDataset.from_json(train_path)
if val_path is not None and os.path.exists(val_path):
    val_dataset = EmbeddingQAFinetuneDataset.from_json(val_path)
else:
    # No validation set configured: the engine is built without one.
    val_dataset = None

embed_model_name = "dunzhang/stella_en_1.5B_v5"  # 1.5B parameters according to the model name
print("loading embed model...")

finetune_engine = SentenceTransformersFinetuneEngine(
    train_dataset,
    embed_model_name,
    batch_size=6,
    model_output_path="/workspace/data/rule_proposal_embedding_model",
    val_dataset=val_dataset,
    epochs=4,
    show_progress_bar=True,
    # can optionally pass along any parameters that go into `train_model`
    # optimizer_class=torch.optim.SGD,
    # optimizer_params={"lr": 0.0001, "weight_decay": 0.01}
)


# finetune_engine = EmbeddingAdapterFinetuneEngine(
#     train_dataset,
#     base_embed_model,
#     batch_size=10,
#     model_output_path="/workspace/data/adapter_model",
#     # val_dataset=val_dataset,
#     epochs=4,
#     verbose=False,
#     # can optionally pass along any parameters that go into `train_model`
#     # optimizer_class=torch.optim.SGD,
#     # optimizer_params={"lr": 0.001}
# )

In [None]:
# Run the fine-tuning job configured above.
# NOTE(review): confirm that SentenceTransformersFinetuneEngine.finetune forwards
# these keyword arguments to the underlying trainer; if it does not, the lr and
# weight_decay values given here have no effect.
finetune_engine.finetune(**{"lr": 0.001, "weight_decay": 0.0001})

In [6]:
import gc, torch

# Clear memory
del finetune_engine, train_dataset
gc.collect()
torch.cuda.empty_cache()

## Embedding Model

In [8]:

import os
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings

# Base embedding model (1.5B parameters according to its name, not 7B).
embed_model_name = "dunzhang/stella_en_1.5B_v5"
# NOTE(review): this path differs from the model_output_path used by the
# fine-tuning cell ("/workspace/data/rule_proposal_embedding_model"); presumably
# a separately trained checkpoint -- confirm it exists before running.
finetuned_embed_model_name = "/workspace/data/compliance/rule_proposal_embedding_model_gpt"
print("loading embed model...")
# Fine-tuned model embeds the proposed-rule documents; the base model embeds
# the existing SEC rules.
proposal_embed_model = HuggingFaceEmbedding(model_name=finetuned_embed_model_name)
rules_embed_model = HuggingFaceEmbedding(model_name=embed_model_name)

# Settings.embed_model = embed_model
# Settings.chunk_size = 300
# Settings.chunk_overlap = 50

You try to use a model that was created with version 3.0.1, however, your version is 2.7.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





loading embed model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

modules.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/397 [00:00<?, ?B/s]

You try to use a model that was created with version 3.0.1, however, your version is 2.7.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





README.md:   0%|          | 0.00/174k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/844 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/6.17G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.31k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/80.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/370 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

2_Dense_1024/config.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/6.30M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/6.30M [00:00<?, ?B/s]

In [6]:
from typing import Literal

# Entity labels the schema-guided extractor is allowed to emit.
entities = Literal[
    # "PROPOSED_RULE",
    "ACTS",
    "SECTIONS",
    "RULES",
    # "AMENDMENTS",
]

# Relation labels allowed between those entities.
relations = Literal[
    "MENTIONS",
    # "CHANGES",
    # "AMENDS",
    "REFERS_TO",
]

# Which relations each entity label may take part in. The keys must be spelled
# exactly like the labels in `entities`; the previous title-case keys
# ("Acts", "Sections", "Rules") matched none of them.
validation_schema = {
    # "PROPOSED_RULE": ["CHANGES", "AMENDS"],
    "ACTS": ["MENTIONS", "REFERS_TO"],
    "SECTIONS": ["MENTIONS", "REFERS_TO"],
    "RULES": ["MENTIONS", "REFERS_TO"],
    # "AMENDMENTS": ["REFERS_TO", "AMENDS"],
}


In [None]:
from rag_utils import set_neo4j_password, add_lines_to_conf
# Take the password from the environment (populated by load_dotenv) instead of
# hardcoding it in the notebook; the graph-store cell connects with the same
# NEO4J_PWD value. Indexing os.environ fails loudly if the variable is missing.
set_neo4j_password(os.environ["NEO4J_PWD"])
# add_lines_to_conf()

###### START NEO4J SERVER ######


In [None]:
# GraphRAG Database
from llama_index.core.indices.property_graph import SchemaLLMPathExtractor, SimpleLLMPathExtractor
from rag_utils import create_neo4j_graph_store, create_neo4j_graphrag, neo4j_query, dump_neo4j_database
import random
from dotenv import load_dotenv
load_dotenv('/workspace/repos/agentic-ai/.env')

access_token = os.getenv('HF_TOKEN')
llama_api_key = os.getenv('LLAMA_API_KEY')

# Where the property-graph index is persisted; its presence also decides below
# whether the Neo4j database is wiped before indexing.
graph_idx_persist_dir = "/workspace/data/compliance/graph_idx_testfull"
graph_store_persist_dir= None #"/workspace/data/graph_store"

# Chunking parameters used while building the SEC rules graph index.
Settings.chunk_size = 500
Settings.chunk_overlap = 20

# kg_extractor = SchemaLLMPathExtractor(
#     llm=llm,
#     possible_entities=entities,
#     possible_relations=relations,
#     kg_validation_schema=validation_schema,
#     strict=False,  # if false, will allow triples outside of the schema``
#     num_workers=10,
#     max_triplets_per_chunk=10,
# )


# extract_prompt = "You are an expert compliance officer with vast knowledge of SEC Title 17 Chapter II. Your job is to read each section and link mentions of other sections, rules, or acts (e.g. § 230.503, § 240.13a-15, Act (15 U.S.C. 781), Investment Company Act of 1940) mentioned. If there are no mentions of other sections, rules, or acts, return an empty list."
# extract_prompt = "You are an expert compliance officer with vast knowledge of SEC Title 17 Chapter II. Your job is to read each section and link other sections, rules, and acts mentioned. What sections, rules, and acts (e.g. § 230.503, § 240.13a-15, Act (15 U.S.C. 781), Investment Company Act of 1940) are mentioned in the content? If there are no mentions, return an empty list."
# Switch function calling off on the shared LLM object before extraction.
# This mutates `llm` globally; the agent cell later switches it back on.
llm.is_function_calling_model = False
extract_prompt = "You are an expert compliance officer with vast knowledge of SEC Title 17 Chapter II. Your job is to find semantic, referential, and literal relationships between the sections. If there are no relationships, return an empty list."
# Free-form (schema-less) path extractor driven by the prompt above.
kg_extractor = SimpleLLMPathExtractor(
        extract_prompt=extract_prompt,
        llm=llm,
        max_paths_per_chunk=10,
        num_workers=6,
    )

# random.shuffle(documents)

print("Creating graph store...")
# Connects to the local Neo4j server; the password is read from NEO4J_PWD.
graph_store = create_neo4j_graph_store(neo_url="bolt://localhost:7687", 
                                       password=os.getenv("NEO4J_PWD"), 
                                       config={"connection_timeout": 1000, "connection_acquisition_timeout": 1000, "max_connection_pool_size": 1000})

# DESTRUCTIVE: when no persisted index directory exists, everything currently
# in the Neo4j database is deleted before indexing.
if not os.path.exists(graph_idx_persist_dir):
    print("Deleting all nodes and relationships...")
    neo4j_query(graph_store, query="""MATCH n=() DETACH DELETE n""")

print("Creating graphrag index...")
# NOTE(review): presumably loads from graph_idx_persist_dir when it exists and
# builds from `documents` otherwise -- confirm in rag_utils.create_neo4j_graphrag.
graph_index = create_neo4j_graphrag(documents, llm, rules_embed_model, kg_extractor, graph_store, graph_idx_persist_dir=graph_idx_persist_dir, graph_store_persist_dir=graph_store_persist_dir, similarity_top_k=3)

# NOTE(review): presumably dumps the 'neo4j' database under /workspace/data/;
# verify against rag_utils.dump_neo4j_database.
dump_neo4j_database('neo4j', '/workspace/data/')

In [None]:
query_engine = graph_index.as_query_engine()

In [7]:
neo4j_query(graph_store, query="""MATCH n=() DETACH DELETE n""")
graph_store.close()

In [9]:
# Vector Database RAG
from llama_index.core.postprocessor import LLMRerank
from rag_utils import create_llama_vector_index_rag

# --- Proposed-rule index (chunk_size=300), embedded with the fine-tuned model ---
Settings.chunk_size, Settings.chunk_overlap = 300, 20

vector_index = create_llama_vector_index_rag(
    llm,
    proposal_embed_model,
    documents=documents_proposal,
    persist_dir="/workspace/data/compliance/vector_idx_proposed",
)
# Available response modes are listed at
# https://github.com/run-llama/llama_index/blob/f7c5ee5efbb6172e819f26d1705fcdf6114b11a3/llama-index-core/llama_index/core/response_synthesizers/type.py#L4
# ("accumulate", "compact_accumulate", "compact", "simple_summarize", "tree_summarize").
# An LLMRerank(choice_batch_size=5, top_n=2) node postprocessor can be added
# through `node_postprocessors=[...]` if reranking is wanted.
query_engine = vector_index.as_query_engine(similarity_top_k=3, response_mode="tree_summarize")

# --- Existing SEC rules index (chunk_size=500), embedded with the base model ---
Settings.chunk_size, Settings.chunk_overlap = 500, 20

rules_vector_index = create_llama_vector_index_rag(
    llm,
    rules_embed_model,
    documents=documents,
    persist_dir="/workspace/data/compliance/vector_idx_sec",
)
# Despite its name, `graph_index` is bound here to a vector query engine over
# the SEC rules; later cells consume it under this name.
graph_index = rules_vector_index.as_query_engine(similarity_top_k=3, response_mode="tree_summarize")

Parsing nodes:   0%|          | 0/3 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/194 [00:00<?, ?it/s]

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


Parsing nodes:   0%|          | 0/3390 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/1828 [00:00<?, ?it/s]

In [None]:
response = query_engine.query("Give a summary of the entire proposed rule. What are the key points?")

In [11]:
from llama_index.core.agent.react import ReActAgent
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.llms import ChatMessage
from llama_index.core.tools import ToolSelection, ToolOutput
from llama_index.core import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    StorageContext,
    load_index_from_storage
)
from llama_index.core.workflow import (
    step,
    Context,
    Workflow,
    Event,
    StartEvent,
    StopEvent
)
from llama_index.core.postprocessor.rankGPT_rerank import RankGPTRerank
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.chat_engine import SimpleChatEngine
from llama_index.utils.workflow import draw_all_possible_flows
from llama_index.embeddings.huggingface import HuggingFaceEmbedding


In [28]:
dir(Workflow)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_done',
 '_get_steps',
 '_start',
 '_step_functions',
 '_validate',
 'add_step',
 'add_workflows',
 'is_done',
 'run',
 'run_step',
 'send_event',
 'stream_events']

In [12]:
#### Define the workflow and all events

# --- Events used by the function-calling agent workflow ---
# These two must be defined: the agent's step signatures reference them, and
# annotations are evaluated when the class body runs.
class InputEvent(Event):
    """Chat history to hand to the LLM."""
    input: list[ChatMessage]

class ToolCallEvent(Event):
    """Tool invocations requested by the LLM."""
    tool_calls: list[ToolSelection]

# class FunctionOutputEvent(Event):
#     output: ToolOutput

# --- Events used by the compliance RAG workflow ---
class InitializeEvent(Event):
    """Placeholder for a dedicated initialisation step (no payload)."""
    pass

class JudgeEvent(Event):
    """A (re)written query that should be judged again."""
    query: str

class BadQueryEvent(Event):
    """A query the judge considered too vague to answer well."""
    query: str

class NaiveRAGEvent(Event):
    """Request to answer the query with the plain retrieval strategy."""
    query: str

class HighTopKEvent(Event):
    """Request to answer the query with a large similarity_top_k."""
    query: str

class RerankEvent(Event):
    """Request to answer the query with retrieval followed by reranking."""
    query: str

class ResponseEvent(Event):
    """Answer produced by one strategy; `source` names that strategy."""
    query: str
    # Declared explicitly because every producer passes it and the final judge
    # step reads it.
    source: str
    response: str

class SummarizeEvent(Event):
    """A query together with a response to be summarised."""
    query: str
    response: str

In [None]:
class ComplianceWorkflow(Workflow):
    """Vet a question, answer it with three RAG strategies, and return the best answer.

    Flow: ``judge_query`` either routes a vague query to ``improve_query``
    (which loops back through ``JudgeEvent``) or fans the query out to
    ``naive_rag``, ``high_top_k`` and ``rerank``. ``judge`` waits for all three
    ``ResponseEvent`` objects and returns the answer the judge LLM prefers.
    """

    def load_or_create_index(self, persist_dir, documents=None):
        """Load the index stored at ``persist_dir`` or build and persist a new one.

        Args:
            persist_dir: Directory the index is loaded from / persisted to.
            documents: Documents to index when ``persist_dir`` does not exist.

        Returns:
            The loaded or newly created index.

        Raises:
            ValueError: If no persisted index exists and ``documents`` is None.
        """
        # Check if the index already exists
        if os.path.exists(persist_dir):
            print("Loading existing index...")
            # Load the index from disk
            storage_context = StorageContext.from_defaults(persist_dir=persist_dir)
            index = load_index_from_storage(storage_context)
        else:
            print("Creating new index...")
            if documents is None:
                raise ValueError(
                    f"No persisted index at {persist_dir} and no documents were provided to build one."
                )

            # Create a new index from the documents
            index = VectorStoreIndex.from_documents(documents)

            # Persist the index to disk
            index.storage_context.persist(persist_dir=persist_dir)

        return index

    # TODO: move the lazy initialisation inside judge_query into its own
    # initialize step driven by InitializeEvent.

    @step(pass_context=True)
    async def judge_query(self, ctx: Context, ev: StartEvent | JudgeEvent ) -> BadQueryEvent | NaiveRAGEvent | HighTopKEvent | RerankEvent:
        """Initialise shared state on first use, then decide how to route the query.

        Returns a ``BadQueryEvent`` when the judge calls the query bad. Otherwise
        the three strategy events are emitted with ``send_event`` and nothing is
        returned.
        """
        # ctx.data is used as a mapping everywhere else, so test for the key;
        # hasattr() on it never finds "llm" and re-ran this expensive setup on
        # every call.
        if "llm" not in ctx.data:
            # Open source work horse model
            model_name, ctx_len = "hermes3:8b", 128000
            addtion_kwargs = {"max_new_tokens": 2000}
            # system_prompt = "You are an expert at answering questions about rules and regulations regarding Title 17—Commodity and Securities Exchanges: CHAPTER II—SECURITIES AND EXCHANGE COMMISSION. Please provide a summary of the following text, and cite any sections, rules, acts or laws (e.g. § 230.503, § 240.13a-15, Act (15 U.S.C. 781), Investment Company Act of 1940) from context that support the answer. Be detailed in your response."
            # The server address is passed as `base_url`, the parameter the
            # Ollama integration documents for it.
            ctx.data["llm"] = Ollama(model=model_name, base_url="http://127.0.0.1:11434", context_window=ctx_len, model_type="chat", is_function_calling_model=True,
                                     request_timeout=4000.0, additional_kwargs=addtion_kwargs) #, system_prompt=system_prompt)

            # Expert closed model
            ctx.data["expert_llm"] = ctx.data["llm"] # OpenAI(model="gpt-4o",temperature=0.1)

            # Embedding models
            embed_model_name = "dunzhang/stella_en_1.5B_v5"
            finetuned_embed_model_name = "/workspace/data/compliance/rule_proposal_embedding_model_gpt"
            proposal_embed_model = HuggingFaceEmbedding(model_name=finetuned_embed_model_name)
            sec_embed_model = HuggingFaceEmbedding(model_name=embed_model_name)

            ctx.data["new_rule_index"] = create_llama_vector_index_rag(llm=ctx.data["llm"],
                                                                       embed_model=proposal_embed_model,
                                                                       persist_dir="/workspace/data/compliance/vector_idx_proposed",
                                                                       vector_store_kwargs={"chunk_size": 300, "chunk_overlap": 20})

            ctx.data["sec_index"] = create_llama_vector_index_rag(llm=ctx.data["llm"],
                                                                  embed_model=sec_embed_model,
                                                                  persist_dir="/workspace/data/compliance/vector_idx_sec",
                                                                  vector_store_kwargs={"chunk_size": 500, "chunk_overlap": 20})

            # The strategy steps read ctx.data["index"], which was never set and
            # made every one of them fail with a KeyError.
            # NOTE(review): the proposed-rule index is used as the default
            # target here -- confirm this is the intended corpus.
            ctx.data["index"] = ctx.data["new_rule_index"]

            # we use a chat engine so it remembers previous interactions
            ctx.data["judge"] = SimpleChatEngine.from_defaults(llm=ctx.data["expert_llm"])

        response = ctx.data["judge"].chat(f"""
            Given a user query, determine if this is likely to yield good results from a RAG system as-is. If it's good, return 'good', if it's bad, return 'bad'.
            Good queries use a lot of relevant keywords and are detailed. Bad queries are vague or ambiguous.

            Here is the query: {ev.query}
            """)
        # The chat engine returns a response object, not a str, so compare its
        # text; quotes, trailing punctuation and case are tolerated.
        verdict = str(response).strip().strip("'\"`. \n").lower()
        if verdict.startswith("bad"):
            # try again
            return BadQueryEvent(query=ev.query)
        else:
            # send query to all 3 strategies
            self.send_event(NaiveRAGEvent(query=ev.query))
            self.send_event(HighTopKEvent(query=ev.query))
            self.send_event(RerankEvent(query=ev.query))

    @step(pass_context=True)
    async def improve_query(self, ctx: Context, ev: BadQueryEvent) -> JudgeEvent:
        """Ask the LLM to rewrite a vague query and send it back for judging."""
        response = ctx.data["llm"].complete(f"""
            This is a query to a RAG system: {ev.query}

            The query is bad because it is too vague. Please provide a more detailed query that includes specific keywords and removes any ambiguity.
        """)
        return JudgeEvent(query=str(response))

    @step(pass_context=True)
    async def naive_rag(self, ctx: Context, ev: NaiveRAGEvent) -> ResponseEvent:
        """Answer the query with a default query engine (similarity_top_k=5)."""
        index = ctx.data["index"]
        engine = index.as_query_engine(similarity_top_k=5)
        response = engine.query(ev.query)
        print("Naive response:", response)
        return ResponseEvent(query=ev.query, source="Naive", response=str(response))

    @step(pass_context=True)
    async def high_top_k(self, ctx: Context, ev: HighTopKEvent) -> ResponseEvent:
        """Answer the query with a wide retrieval (similarity_top_k=20)."""
        index = ctx.data["index"]
        engine = index.as_query_engine(similarity_top_k=20)
        response = engine.query(ev.query)
        print("High top k response:", response)
        return ResponseEvent(query=ev.query, source="High top k", response=str(response))

    @step(pass_context=True)
    async def rerank(self, ctx: Context, ev: RerankEvent) -> ResponseEvent:
        """Retrieve 20 candidates, rerank them down to 5 with RankGPT, then answer."""
        index = ctx.data["index"]
        reranker = RankGPTRerank(
            top_n=5,
            llm=ctx.data["expert_llm"]
        )
        retriever = index.as_retriever(similarity_top_k=20)
        engine = RetrieverQueryEngine.from_args(
            retriever=retriever,
            node_postprocessors=[reranker],
        )
        response = engine.query(ev.query)
        print("Reranker response:", response)
        return ResponseEvent(query=ev.query, source="Reranker", response=str(response))

    @step(pass_context=True)
    async def judge(self, ctx: Context, ev: ResponseEvent) -> StopEvent:
        """Once all three answers have arrived, let the judge pick the best one."""
        ready = ctx.collect_events(ev, [ResponseEvent]*3)
        if ready is None:
            # Still waiting for the remaining strategies.
            return None

        response = ctx.data["judge"].chat(f"""
            A user has provided a query and 3 different strategies have been used
            to try to answer the query. Your job is to decide which strategy best
            answered the query. The query was: {ev.query}

            Response 1 ({ready[0].source}): {ready[0].response}
            Response 2 ({ready[1].source}): {ready[1].response}
            Response 3 ({ready[2].source}): {ready[2].response}

            Please provide the number of the best response (1, 2, or 3).
            Just provide the number, with no other text or preamble.
        """)

        # LLMs often add text around the number, so take the first 1/2/3 in the
        # reply instead of int() on the whole string (which raised ValueError).
        best_response = next((int(ch) for ch in str(response) if ch in "123"), None)
        if best_response is None:
            print(f"Could not parse a choice from judge reply {str(response)!r}; defaulting to response 1")
            best_response = 1
        print(f"Best response was number {best_response}, which was from {ready[best_response-1].source}")
        return StopEvent(result=str(ready[best_response-1].response))

In [None]:
# Render the step graph of the workflow defined in this notebook. The earlier
# name `ConciergeWorkflow` is not defined here and raised a NameError.
draw_all_possible_flows(ComplianceWorkflow,filename="concierge_flows.html")

In [None]:

# Tool over the existing SEC Title 17 Chapter II rules. `graph_index` must be a
# query engine by the time this cell runs.
graph_engine_tools = QueryEngineTool(
            query_engine=graph_index,
            metadata=ToolMetadata(
                name="sec_title_17_chapter_ii_tool",
                description=(
                    "Contains all the current sections, rules, and relationships of SEC Title 17 Chapter II."
                ),
            ),
        )

# Tool over the newly proposed rule documents.
query_engine_tools = QueryEngineTool(
            query_engine=query_engine,
            metadata=ToolMetadata(
                name="new_rule_proposal_tool",
                description=(
                    "Contains all the information about the newly proposed SEC rule."
                ),
            ),
        )

# The agent cell passes `all_tools`, which no cell defined; collect both tools
# here so the notebook runs top to bottom.
all_tools = [graph_engine_tools, query_engine_tools]


In [13]:
from typing import Any, List

from llama_index.core.llms.function_calling import FunctionCallingLLM
from llama_index.core.memory import ChatMemoryBuffer
from llama_index.core.tools.types import BaseTool
from llama_index.core.workflow import Workflow, StartEvent, StopEvent, step


class FuncationCallingAgent(Workflow):
    """Tool-using chat agent implemented as a workflow.

    Loop: ``prepare_chat_history`` stores the user message, ``handle_llm_input``
    asks the LLM (with tools attached) for the next message, and
    ``handle_tool_calls`` executes any requested tools and feeds their output
    back until the LLM answers without requesting a tool.
    """

    def __init__(
        self,
        *args: Any,
        llm: FunctionCallingLLM | None = None,
        tools: List[BaseTool] | None = None,
        **kwargs: Any,
    ) -> None:
        """Store the LLM and tools and create an empty chat memory.

        Args:
            llm: LLM whose metadata must report function-calling support.
            tools: Tools the LLM may call; defaults to an empty list.
            *args, **kwargs: Forwarded to ``Workflow.__init__``.
        """
        super().__init__(*args, **kwargs)
        self.tools = tools or []

        self.llm = llm
        assert self.llm.metadata.is_function_calling_model

        self.memory = ChatMemoryBuffer.from_defaults(llm=llm)
        # Tool outputs gathered while answering the current request.
        self.sources = []

    @step
    async def prepare_chat_history(self, ev: StartEvent) -> InputEvent:
        """Record the user's message and emit the chat history so far."""
        # clear sources
        self.sources = []

        # get user input
        user_input = ev.input
        user_msg = ChatMessage(role="user", content=user_input)
        self.memory.put(user_msg)

        # get chat history
        chat_history = self.memory.get()
        return InputEvent(input=chat_history)

    @step
    async def handle_llm_input(
        self, ev: InputEvent
    ) -> ToolCallEvent | StopEvent:
        """Take in the chat history and use tools to generate a response.

        Returns a ``ToolCallEvent`` when the LLM requested tools, otherwise a
        ``StopEvent`` whose result holds the response and collected sources.
        """
        chat_history = ev.input
        response = await self.llm.achat_with_tools(
            self.tools, chat_history=chat_history
        )
        self.memory.put(response.message)

        tool_calls = self.llm.get_tool_calls_from_response(
            response, error_on_no_tool_call=False
        )

        if not tool_calls:
            return StopEvent(
                result={"response": response, "sources": [*self.sources]}
            )
        else:
            return ToolCallEvent(tool_calls=tool_calls)

    @step
    async def handle_tool_calls(self, ev: ToolCallEvent) -> InputEvent:
        """Run every requested tool and append the results to the chat memory.

        Unknown tools and tool exceptions are reported back to the LLM as tool
        messages instead of raising.
        """
        tool_calls = ev.tool_calls
        tools_by_name = {tool.metadata.get_name(): tool for tool in self.tools}

        tool_msgs = []

        # call tools -- safely!
        for tool_call in tool_calls:
            tool = tools_by_name.get(tool_call.tool_name)
            # Use the requested name rather than tool.metadata: `tool` is None
            # for an unknown tool, and dereferencing it here raised
            # AttributeError before the guard below could run. For a known tool
            # the two names are equal because the lookup key is the tool's name.
            additional_kwargs = {
                "tool_call_id": tool_call.tool_id,
                "name": tool_call.tool_name,
            }
            if not tool:
                tool_msgs.append(
                    ChatMessage(
                        role="tool",
                        content=f"Tool {tool_call.tool_name} does not exist",
                        additional_kwargs=additional_kwargs,
                    )
                )
                continue

            try:
                tool_output = tool(**tool_call.tool_kwargs)
                self.sources.append(tool_output)
                tool_msgs.append(
                    ChatMessage(
                        role="tool",
                        content=tool_output.content,
                        additional_kwargs=additional_kwargs,
                    )
                )
            except Exception as e:
                tool_msgs.append(
                    ChatMessage(
                        role="tool",
                        content=f"Encountered error in tool call: {e}",
                        additional_kwargs=additional_kwargs,
                    )
                )

        for msg in tool_msgs:
            self.memory.put(msg)

        chat_history = self.memory.get()
        return InputEvent(input=chat_history)

In [None]:
from llama_index.core.tools import FunctionTool
from llama_index.llms.openai import OpenAI

# Mark the LLM object as function-calling capable before handing it to the agent.
llm.is_function_calling_model = True

# Build the workflow agent over all registered tools with a 120 s timeout.
agent = FuncationCallingAgent(
    llm=llm, tools=all_tools, timeout=120, verbose=True
)

# Top-level await: relies on the notebook's running event loop.
ret = await agent.run(input="What is a summary of the proposed rule, and what SEC rules does it change?")

In [None]:
# Show the "response" entry of the result dictionary returned by the agent run.
print(ret["response"])


In [None]:
from llama_index.utils.workflow import draw_all_possible_flows
from llama_index.utils.workflow import draw_most_recent_execution
from IPython.display import display, HTML
# Render the agent workflow's possible step transitions to an HTML file.
# NOTE(review): the output path is relative, while later cells read
# /workspace/repos/agentic-ai/first_func_agent.html -- confirm the kernel's
# working directory is that folder.
draw_all_possible_flows(FuncationCallingAgent, "first_func_agent.html")

In [None]:
from IPython.display import IFrame

# Embed the generated workflow diagram in a 700x600 frame.
# NOTE(review): src is an absolute filesystem path -- confirm the notebook server can serve it.
IFrame(src='/workspace/repos/agentic-ai/first_func_agent.html', width=700, height=600)

In [None]:
from IPython.display import display, HTML

# Read the HTML file content
# (hard-coded absolute path; presumably the file written by draw_all_possible_flows -- verify)
with open('/workspace/repos/agentic-ai/first_func_agent.html', 'r') as file:
    html_content = file.read()

# Render the HTML content in a Jupyter cell
display(HTML(html_content))

In [5]:
from typing import Optional
import asyncio
from pydantic_settings import BaseSettings
import signal
from llama_deploy.deploy.deploy import (
                        _get_message_queue_client,
                        _deploy_local_message_queue,
                        _get_shutdown_handler
                    )

from llama_deploy import (
    deploy_core,
    ControlPlaneConfig,
    SimpleMessageQueueConfig,
    SimpleOrchestratorConfig,
    ControlPlaneServer,
    SimpleOrchestrator,
    LlamaDeployClient
)


In [None]:
from llama_deploy import (
    deploy_core,
    ControlPlaneConfig,
    SimpleMessageQueueConfig,
)


async def main():
    """Deploy the core services: control plane on port 8002, simple message queue on port 8003."""
    core_settings = {
        "control_plane_config": ControlPlaneConfig(port=8002),
        "message_queue_config": SimpleMessageQueueConfig(port=8003),
    }
    await deploy_core(**core_settings)


# if __name__ == "__main__":
#     import asyncio

#     asyncio.run(main())

In [None]:
from llama_deploy import (
    deploy_workflow,
    WorkflowServiceConfig,
    ControlPlaneConfig,
    SimpleMessageQueueConfig,
)
from llama_index.core.workflow import Workflow, StartEvent, StopEvent, step


# create a dummy workflow
class MyWorkflow(Workflow):
    """Minimal single-step workflow used to try out workflow deployment."""

    @step()
    async def run_step(self, ev: StartEvent) -> StopEvent:
        """Return the start event's ``arg1`` (stringified, empty if absent) with ``_result`` appended."""
        # Your workflow logic here
        payload = ev.get("arg1", "")
        return StopEvent(result=f"{payload!s}_result")


async def main():
    """Deploy ``MyWorkflow`` as the ``my_workflow`` service on 127.0.0.1:8004.

    Redefines the ``main`` coroutine declared in the core-deployment cell.
    NOTE(review): the control plane config is left at its defaults, whereas the
    core-deployment cell uses port 8002 -- confirm the two are meant to match.
    """
    workflow = MyWorkflow()
    service_config = WorkflowServiceConfig(
        host="127.0.0.1", port=8004, service_name="my_workflow"
    )
    control_plane = ControlPlaneConfig()
    await deploy_workflow(
        workflow=workflow,
        workflow_config=service_config,
        control_plane_config=control_plane,
    )


# Entry point: start the workflow deployment defined by main().
if __name__ == "__main__":
    import asyncio

    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No event loop is running (plain script): let asyncio.run() own one.
        asyncio.run(main())
    else:
        # An event loop is already running (as in a Jupyter kernel), where
        # asyncio.run() raises RuntimeError. Schedule main() on that loop and
        # keep a reference so the task is not garbage-collected.
        workflow_deploy_task = asyncio.ensure_future(main())

In [None]:
from llama_index.core.workflow import (
    Event,
    StartEvent,
    StopEvent,
    Workflow,
    step,
    Context,
)
import random
from llama_index.core.workflow import draw_all_possible_flows
from llama_index.utils.workflow import draw_most_recent_execution

In [22]:
from typing import Optional
import asyncio
from pydantic_settings import BaseSettings
import signal
from llama_deploy.deploy.deploy import (
                        _get_message_queue_client,
                        _deploy_local_message_queue,
                        _get_shutdown_handler
                    )

from llama_deploy import (
    deploy_core,
    ControlPlaneConfig,
    SimpleMessageQueueConfig,
    SimpleOrchestratorConfig,
    ControlPlaneServer,
    SimpleOrchestrator,
    LlamaDeployClient
)


async def deploy_core(
    control_plane_config: ControlPlaneConfig,
    message_queue_config: BaseSettings,
    orchestrator_config: Optional[SimpleOrchestratorConfig] = None,
) -> None:
    """Launch the control plane (and, if configured, a local message queue).

    This definition shadows the ``deploy_core`` imported from ``llama_deploy``
    earlier in the same cell.

    Args:
        control_plane_config: Settings expanded into ``ControlPlaneServer``.
        message_queue_config: Settings used to build the message queue client;
            a ``SimpleMessageQueueConfig`` additionally starts a local queue.
        orchestrator_config: Settings expanded into ``SimpleOrchestrator``;
            a default ``SimpleOrchestratorConfig`` is used when omitted.

    Raises:
        Whatever exception a finished background task ended with.
    """
    orchestrator_config = orchestrator_config or SimpleOrchestratorConfig()

    message_queue_client = _get_message_queue_client(message_queue_config)

    control_plane = ControlPlaneServer(
        message_queue_client,
        SimpleOrchestrator(**orchestrator_config.model_dump()),
        **control_plane_config.model_dump(),
    )

    # Only the simple (in-process) queue is deployed from here.
    message_queue_task = None
    if isinstance(message_queue_config, SimpleMessageQueueConfig):
        message_queue_task = _deploy_local_message_queue(message_queue_config)

    control_plane_task = asyncio.create_task(control_plane.launch_server())

    # let services spin up
    await asyncio.sleep(1)

    # register the control plane as a consumer
    control_plane_consumer_fn = await control_plane.register_to_message_queue()

    consumer_task = asyncio.create_task(control_plane_consumer_fn())

    # let things sync up
    await asyncio.sleep(1)

    # let things run
    if message_queue_task:
        all_tasks = [control_plane_task, consumer_task, message_queue_task]
    else:
        all_tasks = [control_plane_task, consumer_task]

    shutdown_handler = _get_shutdown_handler(all_tasks)
    loop = asyncio.get_event_loop()
    # Poll every 0.1 s for as long as the event loop runs.
    while loop.is_running():
        await asyncio.sleep(0.1)
        # The SIGINT handler is (re)installed on every iteration.
        # NOTE(review): possibly to override handlers installed by the launched
        # servers -- confirm before hoisting this out of the loop.
        signal.signal(signal.SIGINT, shutdown_handler)

        # Surface the first failure of any background task to the caller.
        for task in all_tasks:
            if task.done() and task.exception():  # type: ignore
                raise task.exception()  # type: ignore

In [24]:
import httpx
from llama_index.core.workflow import Workflow
from llama_deploy import (
    WorkflowServiceConfig,
    WorkflowService,
)

from llama_deploy.deploy.deploy import (
                        _get_message_queue_config,
                    )

async def deploy_workflow(
    workflow: Workflow,
    workflow_config: WorkflowServiceConfig,
    control_plane_config: ControlPlaneConfig,
) -> None:
    """Launch ``workflow`` as a service and register it with the control plane.

    Relies on ``asyncio``, ``signal``, ``_get_message_queue_client`` and
    ``_get_shutdown_handler`` being available from earlier notebook cells;
    this cell does not import them itself.

    Args:
        workflow: The workflow instance to serve.
        workflow_config: Settings expanded into ``WorkflowService``.
        control_plane_config: Locates the control plane whose queue
            configuration is fetched and with which the service registers.

    Raises:
        Whatever exception a finished background task ended with.
    """
    control_plane_url = control_plane_config.url

    # Ask the running control plane which message queue it uses.
    async with httpx.AsyncClient() as client:
        response = await client.get(f"{control_plane_url}/queue_config")
        queue_config_dict = response.json()

    message_queue_config = _get_message_queue_config(queue_config_dict)
    message_queue_client = _get_message_queue_client(message_queue_config)

    service = WorkflowService(
        workflow=workflow,
        message_queue=message_queue_client,
        **workflow_config.model_dump(),
    )

    service_task = asyncio.create_task(service.launch_server())

    # let service spin up
    await asyncio.sleep(1)

    # register to message queue
    consumer_fn = await service.register_to_message_queue()

    # register to control plane
    # NOTE(review): control_plane_url is rebuilt here from host and port,
    # replacing the value read from control_plane_config.url above -- confirm
    # the two forms are equivalent.
    control_plane_url = (
        f"http://{control_plane_config.host}:{control_plane_config.port}"
    )
    await service.register_to_control_plane(control_plane_url)

    # create consumer task
    consumer_task = asyncio.create_task(consumer_fn())

    # let things sync up
    await asyncio.sleep(1)

    all_tasks = [consumer_task, service_task]

    shutdown_handler = _get_shutdown_handler(all_tasks)
    loop = asyncio.get_event_loop()
    # Poll every 0.1 s for as long as the event loop runs.
    while loop.is_running():
        await asyncio.sleep(0.1)
        # The SIGINT handler is (re)installed on every iteration.
        signal.signal(signal.SIGINT, shutdown_handler)

        # Surface the first failure of any background task to the caller.
        for task in all_tasks:
            if task.done() and task.exception():  # type: ignore
                raise task.exception()  # type: ignore

In [27]:
from llama_agents import (
    AgentService,
    ToolService,
    MetaServiceTool,
    ControlPlaneServer,
    SimpleMessageQueue,
    AgentOrchestrator,
)

from llama_index.core.agent import FunctionCallingAgentWorker
from llama_index.core.agent import ReActAgentWorker, ReActAgent



# create our multi-agent framework components
# (one message queue shared by the control plane, the tool service and the agent service)
message_queue = SimpleMessageQueue()
control_plane = ControlPlaneServer(
    message_queue=message_queue,
    orchestrator=AgentOrchestrator(llm=llm),
)

# define Tool Service
# (serves every tool in all_tools over the message queue)
tool_service = ToolService(
    message_queue=message_queue,
    tools=all_tools,
    running=True,
    step_interval=0.5,
)

# define meta-tools here
# (one MetaServiceTool per tool, created by tool name; the top-level await
# inside the comprehension relies on the notebook's running event loop)
meta_tools = [
    await MetaServiceTool.from_tool_service(
        t.metadata.name,
        message_queue=message_queue,
        tool_service=tool_service,
    )
    for t in all_tools
]


# define Agent and agent service
# (a ReAct worker is used; the function-calling variant is left commented out)
# worker1 = FunctionCallingAgentWorker.from_tools(
worker1 = ReActAgentWorker.from_tools(
    meta_tools,
    llm=llm,
)
agent1 = worker1.as_agent()
agent_server_1 = AgentService(
    agent=agent1,
    message_queue=message_queue,
    description="Agent that answers questions based on the newly proposed SEC rule.",
    service_name="rule_proposal_agent",
)

In [None]:
# Ask the agent directly (synchronous chat call) and print the response's .response attribute.
response = agent1.chat('When are written comments on this notice of joint proposed rulemaking need to be submitted?')
print(response.response)

In [None]:
# questions
# Name of rule 
# Is it a proposed rule or a final rule
# Issue date and Federal Register date
# What agency(ies) is the Rule coming from
# If a proposed rule, when are public comments due by and where should they be sent (this info is in the Rule document under Dates and Addresses)

## Tree summarizer

In [None]:
from llama_index.core.response_synthesizers import TreeSummarize
summarizer = TreeSummarize(llm=llm, verbose=True)
# prompt_summary = "You are a professional executive of AlphaTrAI. Your job is to summarize this text in great detail from a video transcription. The summary will be distributed to investors and stakeholders, so give a lot of details and examples from the transcription."
prompt_summary = f"""You are a professional executive at AlphaTrAI. Your job is to summarize the text from a video transcription. The summary will be a memo distributed to investors and stakeholders. Be sure it the memo has the following items:
1. Extract all the names of new hires and their position, and/or new advisors mentioned in the transcription.
2. Create a section to mention the personnel new to AlphaTrAI.
3. Include other highlights and progress made by AlphaTrAI.
4. Ensure the memo and ensure it is factual, optimistic, and any values mention come directly from the text. 

The transcription is as follows:\n{full_doc}"""

response = await summarizer.aget_response(prompt_summary, [doc.text for doc in documents])

In [None]:
# Show the summary returned by the tree summarizer.
print(response)

## LLM direct summarization

In [None]:
# Single-call summary: the instructions and the full transcription (full_doc)
# are sent to the LLM in one prompt.
prompt_summary = f"""You are a professional executive at AlphaTrAI. Your job is to summarize the text from a video transcription. The summary will be a memo distributed to investors and stakeholders. Be sure it the memo has the following items:
1. Extract all the names of new hires and their position, and/or new advisors mentioned in the transcription.
2. Include other highlights and progress made by AlphaTrAI.
3. Ensure the memo is professional, fluid, factual, and optimistic. 

The transcription is as follows:\n{full_doc}"""

# NOTE(review): confirm the configured LLM backend honours the max_tokens keyword.
response = llm.complete(prompt_summary, max_tokens=5000)

In [None]:
# Show the text of the completion returned by llm.complete.
print(response.text)

## Agentic Summary

In [None]:
# Python packages for HuggingFace embeddings and the Neo4j vector / graph stores.
!pip3 install llama-index-embeddings-huggingface llama-index-vector-stores-neo4jvector llama-index-graph-stores-neo4j
# The trailing note is a shell comment; as bare "(done above)" it was passed
# to the shell and made the command fail with a syntax error.
!apt install dialog apt-utils -y  # done above
# Register the Neo4j signing key and apt repository.
!wget -O - https://debian.neo4j.com/neotechnology.gpg.key | gpg --dearmor -o /etc/apt/keyrings/neotechnology.gpg
!echo 'deb [signed-by=/etc/apt/keyrings/neotechnology.gpg] https://debian.neo4j.com stable latest' | tee -a /etc/apt/sources.list.d/neo4j.list
!apt list -a neo4j
!add-apt-repository universe -y
# Install the pinned Neo4j release.
!apt install neo4j=1:5.22.0 -y
# Debconf selections for the neo4j-enterprise package.
# NOTE(review): these run after the neo4j install above -- confirm the order is intended.
!echo "neo4j-enterprise neo4j/question select I ACCEPT" | debconf-set-selections
!echo "neo4j-enterprise neo4j/license note" | debconf-set-selections
!apt install openjdk-17-jre -y
# Download the APOC core plugin matching the Neo4j version into the plugins folder.
!cd /var/lib/neo4j/plugins/ && wget https://github.com/neo4j/apoc/releases/download/5.22.0/apoc-5.22.0-core.jar

In [None]:
# Configure the local Neo4j instance. The password is read from NEO4J_PWD --
# the same variable the graph-store connection uses -- instead of being
# hard-coded in the notebook.
import os

from dotenv import load_dotenv

load_dotenv()  # make NEO4J_PWD from a local .env file visible to os.getenv

neo4j_password = os.getenv("NEO4J_PWD")
if not neo4j_password:
    raise RuntimeError(
        "NEO4J_PWD is not set; define it in the environment or in a .env file."
    )

set_neo4j_password(neo4j_password)
add_lines_to_conf()


In [None]:
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment.
load_dotenv()

from llama_index.core.agent import ReActAgent
# from llama_index.llms.openai import OpenAI
from llama_index.core.tools import FunctionTool
# NOTE(review): set_neo4j_password and add_lines_to_conf are called in the
# preceding cell, so this cell has to run first on a fresh kernel.
from rag_utils import create_neo4j_graph_store, create_neo4j_graphrag, neo4j_query, set_neo4j_password, add_lines_to_conf
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

import nest_asyncio
# Patch asyncio for use inside the notebook's already-running event loop.
nest_asyncio.apply()



In [None]:

# Mark the LLM object as function-calling capable.
llm.is_function_calling_model = True

# Load the HuggingFace embedding model by name.
embed_model_name = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
print("loading embed model...")
embed_model = HuggingFaceEmbedding(model_name=embed_model_name)

# Global LlamaIndex settings: embedding model plus chunking parameters.
Settings.embed_model = embed_model
Settings.chunk_size = 300
Settings.chunk_overlap = 50

In [None]:
from typing import Literal
from llama_index.core.indices.property_graph import SchemaLLMPathExtractor

# Entity labels offered to the schema-guided path extractor.
entities = Literal["PEOPLE", "PLACE"]

# Relation labels offered to the schema-guided path extractor.
relations = Literal["ROLE", "COMPANY"]

# Relations allowed per entity type.
# NOTE(review): the keys are title-case while the entity labels above are
# upper-case -- confirm the extractor matches them as intended.
validation_schema = dict(People=["ROLE"], Place=["COMPANY"])


In [None]:
# Chunking parameters (same values as set in the embedding-settings cell).
Settings.chunk_size = 300
Settings.chunk_overlap = 50

# Schema-guided triple extractor restricted to the entities/relations defined above.
kg_extractor = SchemaLLMPathExtractor(
    llm=llm,
    possible_entities=entities,
    possible_relations=relations,
    kg_validation_schema=validation_schema,
    strict=True,  # if false, will allow triples outside of the schema
    num_workers=4,
    max_triplets_per_chunk=10,
)

# Connect to the local Neo4j instance; the password comes from NEO4J_PWD.
graph_store = create_neo4j_graph_store(neo_url="bolt://localhost:7687", 
                                       password=os.getenv("NEO4J_PWD"), 
                                       config={"connection_timeout": 240, "connection_acquisition_timeout": 240, "max_connection_pool_size": 1000})
# WARNING: deletes every node and relationship in the database before rebuilding.
neo4j_query(graph_store, query="""MATCH (n) DETACH DELETE n""")


# Build the graph index from the documents with the extractor and store above.
graph_index = create_neo4j_graphrag(documents, llm, embed_model, kg_extractor, graph_store)

In [None]:
from llama_index.core.tools import QueryEngineTool, ToolMetadata

# Expose the graph index to agents as a single tool named "graph_tool".
# The assignment must not end with a trailing comma: that would make
# query_engine_tools a 1-tuple, and the agent cells that read `.metadata`
# from it would fail.
# NOTE(review): QueryEngineTool expects a query engine; if
# create_neo4j_graphrag returns an index rather than a query engine, pass
# graph_index.as_query_engine() here -- confirm against rag_utils.
query_engine_tools = QueryEngineTool(
    query_engine=graph_index,
    metadata=ToolMetadata(
        name="graph_tool",
        description=(
            "Useful for finding people names and roles, and the company they work for."
        ),
    ),
)


In [None]:
# Install llama-agents, which provides the llama_agents imports used by the multi-agent cells.
# NOTE(review): an earlier cell already imports llama_agents, so on a fresh
# kernel this install has to run before that cell.
!pip3 install llama-agents

In [None]:
from llama_agents import (
    AgentService,
    ToolService,
    MetaServiceTool,
    ControlPlaneServer,
    SimpleMessageQueue,
    AgentOrchestrator,
)

from llama_index.core.agent import FunctionCallingAgentWorker, ReActAgentWorker, ReActAgent, LATSAgentWorker



# create our multi-agent framework components
# (one message queue shared by the control plane, the tool service and the agent service)
message_queue = SimpleMessageQueue()
control_plane = ControlPlaneServer(
    message_queue=message_queue,
    orchestrator=AgentOrchestrator(llm=llm),
)

# define Tool Service
# (serves only the graph query tool)
tool_service = ToolService(
    message_queue=message_queue,
    tools=[query_engine_tools],#, adding_tool],
    running=True,
    step_interval=0.5,
)

# define meta-tools here
# (one MetaServiceTool per tool, created by tool name; each element of the
# list must expose `.metadata.name`; the top-level await relies on the
# notebook's running event loop)
meta_tools = [
    await MetaServiceTool.from_tool_service(
        t.metadata.name,
        message_queue=message_queue,
        tool_service=tool_service,
    )
    for t in [query_engine_tools]#, adding_tool]
]


# define Agent and agent service
# (a ReAct worker is used; the function-calling variant is left commented out)
# worker1 = FunctionCallingAgentWorker.from_tools(
worker1 = ReActAgentWorker.from_tools(
    meta_tools,
    llm=llm,
)

# Alternative LATS worker over the same tools; not wired into an agent in this cell.
worker2 = LATSAgentWorker.from_tools(
    meta_tools,
    llm=llm,
    num_expansions=2,
    max_rollouts=3,
    verbose=True
)

agent1 = worker1.as_agent()
agent_server_1 = AgentService(
    agent=agent1,
    message_queue=message_queue,
    description="Summarize a transcription as a memo for investors and stakeholders.",
    service_name="summarize_transcription",
)