# Ollama

In [None]:
# Download and install Ollama with the official install script.
# NOTE(review): this pipes a remote script straight into `sh`; review install.sh first if the environment is sensitive.
!curl -fsSL https://ollama.com/install.sh | sh

In [None]:
# enter docker container in terminal
# docker exec -it <DOCKER_CONTAINER_ID> /bin/bash

# In a separate terminal (or tmux screen) run the following command to start the Ollama server:
# ollama start
# ---or pull and run the model at the same time in the terminal---
# ollama run <MODEL_NAME>

Browse the models available for Ollama: https://ollama.com/library?sort=popular

Compare models on the Open LLM Leaderboard: https://huggingface.co/spaces/open-llm-leaderboard/comparator

In [None]:
# Pull the model used by the cells below (start the Ollama server first, see the terminal commands above).
!ollama pull qwen2.5:3b-instruct-q8_0

In [None]:
# Show the models Ollama currently has available, to confirm the pull succeeded.
!ollama list

In [None]:
import subprocess, os
from llama_index.core import Settings
from llama_index.llms.ollama import Ollama
from llama_index.llms.openai import OpenAI
from dotenv import load_dotenv
load_dotenv('/workspace/repos/agentic-ai/.env')

# Pick exactly one (model_name, ctx_len) pair; ctx_len is passed to Ollama as its context window.
# model_name, ctx_len = "gpt-4o-2024-08-06", 128000
# model_name, ctx_len = "llama3.2:1b", 128000
# model_name, ctx_len = "bespoke-minicheck", 32000
model_name, ctx_len = "qwen2.5:3b-instruct-q8_0", 32000

if "gpt-4o" in model_name:
    # Fail with a clear message when the key is missing; assigning None into
    # os.environ would otherwise raise an opaque TypeError.
    openai_key = os.getenv("OPENAI_API_KEY")
    if not openai_key:
        raise RuntimeError("OPENAI_API_KEY is not set; add it to the .env file or the environment.")
    os.environ["OPENAI_API_KEY"] = openai_key

    print(f"Using OpenAI {model_name}...")
    llm = OpenAI(model=model_name, max_tokens=8000)
else:
    # Only build the Ollama client when an OpenAI model was NOT selected; without this
    # `else` the OpenAI client created above was always overwritten.
    # NOTE(review): "max_new_tokens" is forwarded to Ollama as a request option; Ollama's
    # own name for the generation limit is presumably "num_predict" — confirm it takes effect.
    addtion_kwargs = {"max_new_tokens": 8000}
    # `base_url` is the field the Ollama client reads for the server address (`url` is not a field).
    # NOTE(review): `model_type` does not appear to be an Ollama field either; kept as-is.
    llm = Ollama(model=model_name, base_url="http://127.0.0.1:11434", context_window=ctx_len, model_type="chat", is_function_calling_model=False,
                 request_timeout=4000.0, additional_kwargs=addtion_kwargs)
print(llm.metadata)

In [None]:
# Smoke-test the configured LLM with a single completion request.
prompt = "What is the meaning of life?"
response = llm.complete(prompt)
print(response)

### Convert any model to GGUF

In [None]:
# For faster compilation, use ccache (only if you will recompile llama.cpp)

# import os

# # Install package
# !apt update && apt install -y ccache

# # Update symlinks
# !/usr/sbin/update-ccache-symlinks

# # Prepend ccache into the PATH
# os.environ['PATH'] = "/usr/lib/ccache:" + os.environ['PATH']

# !echo $PATH

In [None]:
# Download and build llama.cpp (15-20 mins), then install the Python requirements of its conversion scripts.
# NOTE(review): `git clone` fails if /workspace/repos/llama.cpp already exists (e.g. on a re-run);
# the next line still updates and rebuilds the existing checkout.
!cd /workspace/repos/ && git clone https://github.com/ggerganov/llama.cpp
# Update to the latest commit and rebuild from scratch.
# LLAMA_CUDA=0 presumably requests a build without CUDA — confirm against the llama.cpp build docs.
!cd /workspace/repos/llama.cpp && git pull && make clean && LLAMA_CUDA=0 make
# NOTE(review): the chmod on requirements.txt looks unnecessary for `pip3 install -r`; kept as-is.
!chmod 755 /workspace/repos/llama.cpp/requirements.txt && pip3 install -r /workspace/repos/llama.cpp/requirements.txt

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel

# Directory the checkpoint is written to; the GGUF conversion cell reads from the same path.
save_dir = "/workspace/data/new_embed_model"

# Tokenizer and model come from the same Hub repository.
# Alternative checkpoint:
# tokenizer_name = model_name = "HuggingFaceTB/SmolLM-135M-Instruct"
tokenizer_name = model_name = "mixedbread-ai/mxbai-embed-large-v1"

tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
model = AutoModel.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map="auto",
    torch_dtype=torch.float16,
    return_dict=True,
)

# Placeholder for model customisation before export:
#   - fine tune the model
#   - train a new model
#   - peft the model
#   - ...

# Persist the weights and the tokenizer files side by side in save_dir.
print('Saving model')
model.save_pretrained(save_dir)
print('Saving tokenizer')
tokenizer.save_pretrained(save_dir)

In [None]:
# Convert the Hugging Face checkpoint saved in /workspace/data/new_embed_model to GGUF
# using the converter script from the llama.cpp checkout.
!python3 /workspace/repos/llama.cpp/convert_hf_to_gguf.py /workspace/data/new_embed_model

In [None]:
# SmolLM-135M-Instruct-F16.gguf is the resulting name in save_dir from the gguf conversion (see inside new_embed_model directory)
# make a Modelfile for the new_embed_model
!cd /workspace/data/new_embed_model && echo 'FROM "/workspace/data/new_embed_model/mxbai-embed-large-v1-F16.gguf"' >> Modelfile
# add new_embed_model to the Ollama registry
!ollama create new_embed_model -f /workspace/data/new_embed_model/Modelfile

In [None]:
# List the models again to confirm new_embed_model was registered.
!ollama list

# Army of specialized models

In [None]:
# Run multiple Ollama models 
from llama_index.embeddings.ollama import OllamaEmbedding

# The embedding model is served by a second Ollama server so it can run next to the LLM server.
# The default server listens on http://127.0.0.1:11434; port 11435 (default + 1) is an arbitrary choice.
# Start the second server from another tmux screen with:
#   OLLAMA_HOST="http://127.0.0.1:11435" ollama start

embed_model = OllamaEmbedding(
    base_url="http://127.0.0.1:11435",
    model_name="new_embed_model:latest",
    ollama_additional_kwargs={"mirostat": 0},
)

# LlamaIndex

In [None]:
# TODO: notify if ollama server is running with model loaded
import subprocess, os
from llama_index.llms.ollama import Ollama
from llama_index.llms.openai import OpenAI as LOpenAI
from dotenv import load_dotenv
load_dotenv('/workspace/repos/agentic-ai/.env')

# Pick exactly one (model_name, ctx_len) pair; ctx_len is passed to Ollama as its context window.
# model_name, ctx_len = "gpt-4o-2024-08-06", 128000
# model_name, ctx_len = "llama3.1:8b-instruct-q8_0", 128000
# model_name, ctx_len = "bespoke-minicheck", 32000
# NOTE(review): the first setup cell of this notebook uses 32000 for this same model — confirm which is intended.
model_name, ctx_len = "qwen2.5:3b-instruct-q8_0", 128000

if "gpt-4o" in model_name:
    # Fail with a clear message when the key is missing; assigning None into
    # os.environ would otherwise raise an opaque TypeError.
    openai_key = os.getenv("OPENAI_API_KEY")
    if not openai_key:
        raise RuntimeError("OPENAI_API_KEY is not set; add it to the .env file or the environment.")
    os.environ["OPENAI_API_KEY"] = openai_key

    print(f"Using OpenAI {model_name}...")
    llm = LOpenAI(model=model_name, max_tokens=8000)
else:
    # Ask the Ollama CLI which models are already available locally.
    subout = subprocess.run(['ollama', 'list'], capture_output=True, text=True)
    if model_name in subout.stdout:
        print('Model loaded...')
    else:
        try:
            print("Pulling Ollama model...")
            sub_out = subprocess.run(['ollama', 'pull', model_name], capture_output=True, text=True)
            # subprocess.run does not raise on a non-zero exit status, so a failed pull
            # has to be reported explicitly or it goes unnoticed.
            if sub_out.returncode != 0:
                print(f"Error pulling model: Is the Ollama server running?\n{sub_out.stderr}")
        except Exception as e:
            # Reached when the `ollama` executable itself cannot be started.
            print(f"Error pulling model: Is the Ollama server running?\n{e}")

    # NOTE(review): "max_new_tokens" is forwarded to Ollama as a request option; Ollama's
    # own name for the generation limit is presumably "num_predict" — confirm it takes effect.
    addtion_kwargs = {"max_new_tokens": 8000}
    # `base_url` is the field the Ollama client reads for the server address (`url` is not a field).
    llm = Ollama(model=model_name, base_url="http://127.0.0.1:11434", context_window=ctx_len, model_type="chat", is_function_calling_model=False,
                 request_timeout=4000.0, additional_kwargs=addtion_kwargs)
    print(llm.metadata)


### Know Your Rule Proposal (KYRP)

In [None]:
from federal_register.client import FederalRegister
import requests

# Initialize the client.
federal_register_client = FederalRegister()

# Grab a specific document with every available field.
federal_document = federal_register_client.document_by_id(
    document_id='2024-10738',
    fields='all'
)

# Download the raw text of the document and preview it.
raw_text_url = federal_document['raw_text_url']
title = federal_document['title']
headers = {"accept": "*/*"}
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status stops an HTTP error page from being treated as the rule text downstream.
response = requests.get(raw_text_url, headers=headers, timeout=60)
response.raise_for_status()
print("TITLE: ",title)
print()
print(response.text[:500])

In [None]:
from bs4 import BeautifulSoup

def remove_html_tags(html):
    """Return the text content of ``html`` with the markup removed.

    The input is parsed with BeautifulSoup's "html.parser" backend and the
    result of ``get_text()`` is returned unchanged.
    """
    return BeautifulSoup(html, "html.parser").get_text()

# Strip the markup from the downloaded document and preview the first 500 characters.
rule_proposal = remove_html_tags(response.text)
print(rule_proposal[:500])

In [None]:
# Sample rule text (Subpart A, Sec. 1032.100 "Definitions") used as the input document
# for the chunking, question-generation and RAG cells below.
excerpt = """Subpart A--General

Sec.  1032.100  Definitions.

    Refer to Sec.  1010.100 of this chapter for general definitions not 
noted herein. To the extent there is a differing definition in Sec.  
1010.100, the definition in this section is what applies to part 1032. 
Unless otherwise indicated, for purposes of this part:
    (a) Account. For purposes of Sec.  1032.220:
    (1) Account means any contractual or other business relationship 
between a person and an investment adviser under which the investment 
adviser provides investment advisory services.
    (2) Account does not include:
    (i) An account that the investment adviser acquires through any 
acquisition, merger, purchase of assets, or assumption of liabilities.
    (ii) [Reserved]
    (b) Commission means the United States Securities and Exchange 
Commission.
    (c) Customer. For purposes of Sec.  1032.220:
    (1) Customer means:
    (i) A person that opens a new account; and
    (ii) An individual who opens a new account for:
    (A) An individual who lacks legal capacity, such as a minor; or
    (B) An entity that is not a legal person, such as a civic club.
    (2) Customer does not include:
    (i) A financial institution regulated by a Federal functional 
regulator or a bank regulated by a State bank regulator;
    (ii) A person described in Sec.  1020.315(b)(2) through (4) of this 
chapter; or
    (iii) A person that has an existing account with the investment 
adviser, provided the investment adviser has a reasonable belief that 
it knows the true identity of the person.
    (d) Financial institution is defined at 31 U.S.C. 5312(a)(2) and 
(c)(1) and its implementing regulation in Chapter X of Title 31.
    (e) Investment adviser. Any person who is registered or required to 
register with the Commission under section 203 of the Investment 
Advisers Act of 1940 (15 U.S.C. 80b-3(a)), or any person that is exempt 
from Commission registration under sections 203(l) or 203(m) of the 
Investment Advisers Act of 1940 (15 U.S.C. 80b-3(l), (m))."""

In [None]:
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import Document

# Wrap the excerpt in a single Document and split it into small overlapping chunks.
documents = [Document(text=excerpt, metadata={"title": "CFR Rule Proposal"})]
parser = SentenceSplitter(
    chunk_size=200,
    chunk_overlap=20,
    tokenizer=None,
)
nodes = parser.get_nodes_from_documents(documents)
# Displayed as the cell output: how many chunks the splitter produced.
len(nodes)

In [None]:
# Generate question/answer pairs for every chunk; the raw completion text is kept per chunk.
# If the LLM is a closed model, then this becomes distillation
result_collection = []
for node in nodes:
    # The prompt text (including its indentation) is sent to the model exactly as written here.
    qa_prompt = f"""Create 5 questions with answers based on the text below.
                              Answers should vary in length and complexity.
                              Include your reasoning for each answer.
                              Return the questions in the following format:
                              Q1: ...? 
                              A1: ... 
                              Reasoning: ...
                        
                              Q2: ...? 
                              A2: ...
                              Reasoning: ...
                              
                              Text:
                              {node.text}"""
    result = llm.complete(qa_prompt)
    result_collection.append(result.text)

In [None]:
# Inspect the generated questions and answers for the first chunk.
print(result_collection[0])

# Fine tune model...

# LlamaIndex RAG

### Embeddings

https://huggingface.co/spaces/mteb/leaderboard

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel

# Directory holding the tokenizer and model saved in the GGUF conversion section.
save_dir = "/workspace/data/new_embed_model"

tokenizer = AutoTokenizer.from_pretrained(save_dir)
# NOTE(review): this binds a raw transformers model to embed_model; the VectorStoreIndex cell
# below presumably needs a LlamaIndex embedding object, which the "Or" cell provides — confirm
# before relying on this variant.
embed_model = AutoModel.from_pretrained(
    save_dir,
    trust_remote_code=True,
    device_map="auto",
    torch_dtype=torch.float16,
    return_dict=True,
)

### Or

In [None]:
# Run multiple Ollama models 
from llama_index.embeddings.ollama import OllamaEmbedding

# Same embedding setup as in the "Army of specialized models" section of this notebook.
# The default Ollama server listens on http://127.0.0.1:11434; the embedding model is served by a
# second server on port 11435 (default + 1, an arbitrary choice). Start it from a second tmux screen with:
#   OLLAMA_HOST="http://127.0.0.1:11435" ollama start

embed_model = OllamaEmbedding(
    base_url="http://127.0.0.1:11435",
    model_name="new_embed_model:latest",
    ollama_additional_kwargs={"mirostat": 0},
)

In [None]:
# Vector Database RAG
from llama_index.core.postprocessor import LLMRerank
from llama_index.core import VectorStoreIndex

# Build the vector index over the chunked nodes using the configured embedding model.
vector_index = VectorStoreIndex(
    nodes=nodes,
    embed_model=embed_model,
    show_progress=True,
)

# The query engine retrieves 5 chunks; the LLM re-ranker then keeps the best 2 of them.
reranker = LLMRerank(llm=llm, choice_batch_size=5, top_n=2)

query_engine = vector_index.as_query_engine(
    llm=llm,
    similarity_top_k=5,
    node_postprocessors=[reranker],
    # Available modes: "accumulate", "compact_accumulate", "compact", "simple_summarize", "tree_summarize"
    # see https://github.com/run-llama/llama_index/blob/f7c5ee5efbb6172e819f26d1705fcdf6114b11a3/llama-index-core/llama_index/core/response_synthesizers/type.py#L4
    response_mode="tree_summarize",
)


In [None]:
# Ask a question that the excerpt answers directly, to sanity-check retrieval.
question = "What is the definition of 'Account'?"
response = query_engine.query(question)
print(response)


In [None]:
# Print every chunk that backed the answer, followed by its score and node id.
separator = '-'*100
for source in response.source_nodes:
    print('TEXT:\n',source.text)
    print()
    print('SCORE:',source.score)
    print('ID:',source.node_id)
    print(separator)
    print()

# LlamaIndex Workflows

In [None]:
from typing import List
import subprocess, os
from llama_index.core import Settings
from llama_index.llms.ollama import Ollama
from llama_index.llms.openai import OpenAI as LOpenAI
from dotenv import load_dotenv
load_dotenv('/workspace/repos/agentic-ai/.env')

from utils import parse_list_from_output_string, extract_list_from_string
from llama_index.core.workflow import (
    Context,
    Event,
    StartEvent,
    StopEvent,
    Workflow,
    step,
)

from llama_index.core.agent.react import ReActAgent
from llama_index.core.llms import ChatMessage
from llama_index.core.tools import ToolSelection, ToolOutput

from llama_index.core.chat_engine import SimpleChatEngine
from llama_index.utils.workflow import draw_all_possible_flows
from llama_index.core.response_synthesizers import TreeSummarize

from llama_index.llms.ollama import Ollama
from llama_index.llms.openai import OpenAI as LOpenAI
from rag_utils import create_llama_vector_index_rag

import os
import xml.etree.ElementTree as ET
from sec_utils import get_tree_data, get_metadata
from llamaindex_data_utils import extract_text_from_pdf

import os, re
from llama_index.readers.file import PDFReader
from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.ollama import OllamaEmbedding

from dotenv import load_dotenv
load_dotenv('/workspace/repos/agentic-ai/.env')

import nest_asyncio
nest_asyncio.apply()

class QueryQualityEvent(Event):
    """Workflow event carrying a string ``result``.

    Not emitted or consumed by the steps of RuleSummarizationFlow visible in this notebook.
    """
    result: str

class InitializationEvent(Event):
    """Payload-free workflow event.

    Not emitted or consumed by the steps of RuleSummarizationFlow visible in this notebook.
    """
    pass

class InitializationCleanupEvent(Event):
    """Workflow event carrying a string ``result``.

    Not emitted or consumed by the steps of RuleSummarizationFlow visible in this notebook.
    """
    result: str

class RegulationsExtractionEvent(Event):
    """Payload-free event returned by ``initialize`` and consumed by ``extract_regulations``."""
    pass

class FormatCorrectionEvent(Event):
    """Event returned by ``extract_regulations`` and consumed by ``correct_format``."""
    # One chat response per section, each expected to contain a Python list of strings.
    result: list

class SummarizationEvent(Event):
    """Event consumed by ``summarize_sections``.

    Returned by ``correct_format`` and again by ``validate_summaries``.
    """
    # One parsed list of regulatory references per section.
    result: list

class SummarizationNumericalValidationEvent(Event):
    """Event returned by ``summarize_sections`` and consumed by ``validate_summaries``."""
    # The per-section reference lists, passed through unchanged.
    result: list
    # The per-section summaries whose numbers are checked against the section text.
    summaries: list

class RuleSummarizationFlow(Workflow):
    """Extract regulatory references from rule-proposal sections and summarize each section.

    Step order: ``initialize`` -> ``extract_regulations`` -> ``correct_format`` ->
    ``summarize_sections`` <-> ``validate_summaries``. The last two steps loop until
    no summary is flagged by the numerical check.

    Shared state kept in ``ctx.data``:
      - ``llm`` / ``expert_llm`` / ``embed_model``: model handles created in ``initialize``
      - ``chat_llm``: SimpleChatEngine used for extraction and list-format repair
      - ``summarizer_llm``: TreeSummarize used for the section summaries
      - ``new_rule_documents``: one Document per section
      - ``num_sections``: number of sections
      - ``section_summaries``: latest summary per section (set by ``summarize_sections``)
      - ``bad_summaries``: indices flagged by ``validate_summaries`` for another attempt
    """

    # A summary is flagged for a rewrite when more than this many of its numbers are
    # missing from the section text.
    MAX_UNMATCHED_NUMBERS = 7
    # correct_format gives up once the rewrite counter exceeds this value.
    MAX_LIST_REWRITES = 5

    # Similar to __init__, but for workflows
    @step
    async def initialize(self, ctx: Context, ev: StartEvent) -> RegulationsExtractionEvent:
        """Create the models, load the section documents and reset the shared state.

        Reads ``sections`` from the notebook's global scope; it is not defined in this
        cell and must exist before the workflow is run.
        """
        # Open source work horse model
        model_name, ctx_len = "llama3.1:8b-instruct-q8_0", 128000
        addtion_kwargs = {"max_new_tokens": 8000}
        # `base_url` is the field the Ollama client reads for the server address (`url` is not a field).
        ctx.data["llm"] = Ollama(model=model_name,
                                 base_url="http://127.0.0.1:11434",
                                 context_window=ctx_len,
                                 model_type="chat",
                                 is_function_calling_model=True,
                                 request_timeout=4000.0,
                                 additional_kwargs=addtion_kwargs)

        # Expert closed model
        ctx.data["expert_llm"] = ctx.data["llm"] # OpenAI(model="gpt-4o",temperature=0.1)

        # Embedding model
        # NOTE(review): this reuses the chat model name on the second server (port 11435);
        # other cells serve "new_embed_model:latest" there — confirm which is intended.
        ctx.data["embed_model"] = OllamaEmbedding(model_name, base_url="http://localhost:11435")

        # Load the document: one Document per section
        documents_proposal = [Document(text=t, text_template='{metadata_str}\n\n{content}') for t in sections]

        # The first line of each section is used as its "section" metadata
        for doc in documents_proposal:
            doc.metadata["section"] = doc.text.split("\n")[0].strip()

        # Global state context
        ctx.data["chat_llm"] = SimpleChatEngine.from_defaults(llm=ctx.data["expert_llm"])
        ctx.data["summarizer_llm"] = TreeSummarize(llm=ctx.data["expert_llm"], verbose=False)
        ctx.data["new_rule_documents"] = documents_proposal
        ctx.data["num_sections"] = len(documents_proposal)

        # Drop leftovers of an earlier run so summarize_sections starts with a first pass.
        ctx.data.pop("section_summaries", None)
        ctx.data.pop("bad_summaries", None)

        ctx.data["initialized"] = True
        return RegulationsExtractionEvent()

    # steps only take in one Event and return one Event, but can have multiple types of return Events
    @step
    async def extract_regulations(self, ctx: Context, ev: RegulationsExtractionEvent) -> FormatCorrectionEvent:
        """Ask the chat engine to list the regulatory references found in every section.

        Returns a FormatCorrectionEvent whose ``result`` holds one response string per section.
        """
        assert ctx.data["initialized"], "Workflow not initialized."

        regs_extraction=[]
        for sec in ctx.data["new_rule_documents"]:

            # Interpolate sec.text, not the Document itself: str(Document) is a
            # truncated preview, so the model would only see the start of the section.
            rule_prompt = f"""
            Extract all mentions of any regulatory sections, rules, and acts in the following text.
            Compile all extracted items into a single Python List of Strings object.
            For example, ["Item 1", "Item 2", "Item 3"].
            Only return the Python List of Strings.

            Here is the text:
            {sec.text}
            """
            # NOTE(review): the chat engine presumably keeps conversation history between
            # calls, so later sections may see earlier prompts — confirm this is wanted.
            response = ctx.data["chat_llm"].chat(rule_prompt)
            # Keep plain text so the list parser in correct_format receives a string.
            regs_extraction.append(str(response))

        return FormatCorrectionEvent(result=regs_extraction)

    @step
    async def correct_format(self, ctx: Context, ev: FormatCorrectionEvent) -> SummarizationEvent:
        """Parse every response into a Python list, asking the chat engine to repair bad ones.

        Raises:
            ValueError: when a response still cannot be parsed after the allowed rewrites.
        """
        regs_list = []
        for response in ev.result:
            response = str(response)
            # List checker
            rewrite_counter=0
            while True:
                try:
                    # extracted_reg = parse_list_from_output_string(response)
                    extracted_reg = extract_list_from_string(response)
                    break
                except Exception:
                    # Any parser failure triggers a rewrite; unlike a bare `except:` this
                    # lets KeyboardInterrupt / SystemExit through.
                    print("Correcting list format...")
                    response = str(ctx.data["chat_llm"].chat(f"""The following text does not contain a valid Python List of Strings? 
                                                Rewrite the text so that the List is in a valid Python format.
                                                For example, ["Item 1", "Item 2", "Item 3"].\nText:\n\n{response}
                                            """))
                    rewrite_counter+=1
                    print(f"   Rewrote list {rewrite_counter} times.")
                    if rewrite_counter>self.MAX_LIST_REWRITES:
                        raise ValueError("Could not correct list format.")
            regs_list.append(extracted_reg)

        return SummarizationEvent(result=regs_list)


    @step
    async def summarize_sections(self, ctx: Context, ev: SummarizationEvent) -> SummarizationNumericalValidationEvent | StopEvent: #SummarizationValidationEvent:
        """Summarize all sections on the first pass, then only the flagged ones.

        Returns a StopEvent with the summaries once validation has flagged none of them.
        NOTE(review): there is no cap on correction rounds; a section that keeps failing
        validation loops until the workflow times out.
        """
        if "section_summaries" not in ctx.data:
            # First pass: nothing has been summarized yet, so do every section.
            section_summaries = [None]*ctx.data["num_sections"]
            summary_count = range(ctx.data["num_sections"])
            prompt_suffix = ""
        elif len(ctx.data.get("bad_summaries", []))==0:
            # Validation flagged nothing: the summaries are final.
            return StopEvent(result=ctx.data["section_summaries"])
        else:
            print("   Bad summaries found. Correcting...")
            section_summaries = ctx.data["section_summaries"]
            summary_count = ctx.data["bad_summaries"]
            prompt_suffix = f"""\nThe first attempt at summarizing this section had numerical copy mistakes. Copy numbers exactly as they appear in the text."""

        documents = ctx.data["new_rule_documents"]
        # The splitter does not depend on the section, so build it once.
        parser = SentenceSplitter(chunk_size=500, chunk_overlap=20)

        for i in summary_count:
            extracted_reg = ev.result[i]
            prompt_summary = f"""
            Summarize the content of the following section from a new SEC rule proposal or amendment. Ensure that the summary:

            1. Includes every reference to any specific regulatory section, rule, or act mentioned in the text (e.g., 12 U.S.C. 1843(k)(4)(C), 240.13a-15, Unfunded Mandates Reform Act (section 202(a)), Investment Company Act of 1940).
            2. Stays strictly within the information presented in the text, without incorporating any outside or prior knowledge.
            3. Is detailed, thorough, and specific in its coverage of all key points, definitions and exclusions.
            4. Avoids adding any introductory or concluding remarks outside the scope of the summary itself.

            Here are some of the regulatory sections, rules, and acts:
            {extracted_reg} 
            """
            print(f"   Summarizing section {i+1}/{ctx.data['num_sections']}...")
            # Chunk only section i; chunking every document here made each
            # "section" summary a summary of the whole proposal.
            nodes = parser.get_nodes_from_documents([documents[i]], show_progress=True)
            response = ctx.data["summarizer_llm"].get_response(prompt_summary+prompt_suffix, [doc.text for doc in nodes])
            section_summaries[i] = response

        ctx.data["section_summaries"] = section_summaries
        return SummarizationNumericalValidationEvent(result=ev.result, summaries=section_summaries)

    @step
    async def validate_summaries(self, ctx: Context, ev: SummarizationNumericalValidationEvent) -> SummarizationEvent:
        """Flag summaries that contain too many numbers missing from their section text.

        Stores the flagged indices in ``ctx.data["bad_summaries"]`` and hands control
        back to ``summarize_sections``.
        """
        section_texts = [x.text for x in ctx.data["new_rule_documents"]]

        # First validation round: nothing has been flagged yet, so check every summary.
        # Later rounds only re-check the summaries that were just rewritten.
        to_check = ctx.data.get("bad_summaries")
        if to_check is None:
            to_check = range(len(ev.summaries))

        bad_summaries = []
        for i in to_check:
            numbers = re.findall(r'(?<!\w)(\d{1,3}(?:,\d{3})*(?:\.\d+)?|\d+\.\d+|\d+)(?!\w)', str(ev.summaries[i]))
            # Single-character matches are skipped
            numbers = [x for x in numbers if len(x)>1]
            # A number counts as bad when it does not appear verbatim in the section text.
            bads = sum(1 for number in numbers if number not in section_texts[i])
            goods = len(numbers) - bads

            if bads>self.MAX_UNMATCHED_NUMBERS:
                bad_summaries.append(i)
            print(f"   Section {i} Goods: {goods}, Bads: {bads}")

        ctx.data["bad_summaries"] = bad_summaries
        return SummarizationEvent(result=ev.result)
