In [37]:
# TODO: notify if ollama server is running with model loaded
import subprocess, os
from llama_index.core import Settings
from llama_index.llms.ollama import Ollama
from llama_index.llms.openai import OpenAI as LOpenAI
from dotenv import load_dotenv
load_dotenv('/workspace/repos/agentic-ai/.env')

# model_name, ctx_len = "llama3.1:8b-instruct-q8_0", 128000
model_name, ctx_len = "qwen2.5:3b-instruct-q8_0", 128000
# ollama pull hf.co/mradermacher/SaulLM-54B-Instruct-i1-GGUF:Q6_K
# ollama pull hf.co/mradermacher/SaulLM-54B-Instruct-i1-GGUF:Q4_K_M
# ollama pull hf.co/bartowski/Llama-3.1-Nemotron-70B-Instruct-HF-GGUF:Q4_K_M


# Build `llm`: an OpenAI client when a gpt-4o model is selected, otherwise a
# local Ollama client (pulling the model first if `ollama list` does not show it).
if "gpt-4o" in model_name:
    # load_dotenv() above already exported the key into the environment, so
    # there is nothing to copy; just fail early and clearly when it is missing
    # (assigning None into os.environ would raise an opaque TypeError).
    openai_key = os.getenv("OPENAI_API_KEY")
    if not openai_key:
        raise EnvironmentError(
            "OPENAI_API_KEY is not set; add it to the .env file or the environment."
        )

    print(f"Using OpenAI {model_name}...")
    llm = LOpenAI(model=model_name, max_tokens=8000)
else:
    try:
        subout = subprocess.run(['ollama', 'list'], capture_output=True, text=True)
        if subout.returncode != 0:
            # The CLI exists but the command failed; surface its stderr instead
            # of silently treating the model as missing.
            print(f"Error listing models: Is the Ollama server running?\n{subout.stderr}")
        elif model_name in subout.stdout:
            print('Model loaded...')
        else:
            print("Pulling Ollama model...")
            sub_out = subprocess.run(['ollama', 'pull', model_name], capture_output=True, text=True)
            # subprocess.run() does not raise on a non-zero exit status unless
            # check=True, so a failed pull has to be detected explicitly.
            if sub_out.returncode != 0:
                print(f"Error pulling model: Is the Ollama server running?\n{sub_out.stderr}")
    except OSError as e:
        # Raised when the `ollama` executable itself cannot be started
        # (e.g. it is not installed / not on PATH).
        print(f"Error pulling model: Is the Ollama server running?\n{e}")

    system_prompt = "You are training an new Portfolio Manager of a hedgefund."
    additional_kwargs = {"num_predict": 4000}
    # `base_url` is the server-address argument (same name OllamaEmbedding
    # takes elsewhere in this notebook); the previous `url=` keyword was not it.
    llm = Ollama(model=model_name, base_url="http://127.0.0.1:11434", context_window=ctx_len, model_type="chat", is_function_calling_model=True,
                 request_timeout=4000.0, additional_kwargs=additional_kwargs, json_mode=False) #, system_prompt=system_prompt)
    print(llm.metadata)

# Settings.llm = llm

Model loaded...
context_window=128000 num_output=256 is_chat_model=True is_function_calling_model=True model_name='qwen2.5:3b-instruct-q8_0' system_role=<MessageRole.SYSTEM: 'system'>


In [3]:
import json, os
from llama_parse import LlamaParse
from llama_index.core import Document
from dotenv import load_dotenv
load_dotenv('/workspace/repos/project-mayhem/.env')

import nest_asyncio
nest_asyncio.apply()

llama_api_key = os.getenv("LLAMA_API_KEY")

def extract_text_from_pdf(pdf_urls, llama_api_key, llamaparse_kwargs=None, save_json_path=None):
    """Parse one or more PDFs with LlamaParse and return the resulting documents.

    Args:
        pdf_urls: Iterable of PDF locations handed one by one to
            ``LlamaParse.load_data``. A single string is accepted and treated
            as a one-element list (otherwise it would be iterated per character).
        llama_api_key: API key passed to ``LlamaParse``.
        llamaparse_kwargs: Optional extra keyword arguments forwarded to the
            ``LlamaParse`` constructor. ``None`` means no extra arguments.
        save_json_path: Optional path to a JSON file. When truthy, the file is
            READ and its ``'text'`` entry is appended as one more ``Document``.
            NOTE(review): the parameter name suggests writing, but the code
            only reads from this path — confirm the intended direction.

    Returns:
        list: The documents returned by the parser for every PDF, in input
        order, followed by the document built from ``save_json_path`` if given.
    """
    if isinstance(pdf_urls, str):
        pdf_urls = [pdf_urls]

    # A None default avoids sharing one mutable dict between calls.
    parser = LlamaParse(api_key=llama_api_key, **(llamaparse_kwargs or {}))

    documents = []
    for pdf in pdf_urls:
        print('processing pdf:', pdf)
        documents.extend(parser.load_data(pdf))

    if save_json_path:
        with open(save_json_path, "r") as f:
            result = json.load(f)
        documents.append(Document(text=result['text']))

    return documents

# Page range to parse: every page number from `beginning_of_chapter` up to,
# but not including, `end_of_chapter`, joined into one comma-separated string.
beginning_of_chapter = 21
end_of_chapter = 24 # 644
pages_to_extract = ",".join(
    str(page_number)
    for page_number in range(beginning_of_chapter, end_of_chapter)
)
# Source PDF to parse. Only the pages listed in `pages_to_extract` are requested
# (passed as LlamaParse's "target_pages"), and page splitting is turned off.
principles_of_finance = "https://assets.openstax.org/oscms-prodcms/media/documents/PrinciplesofFinance-WEB.pdf"
# `documents` is the list consumed by the node-splitting cell further down.
documents = extract_text_from_pdf([principles_of_finance], llama_api_key, llamaparse_kwargs={"split_by_page": False, "target_pages": pages_to_extract}, save_json_path=None)


processing pdf: https://assets.openstax.org/oscms-prodcms/media/documents/PrinciplesofFinance-WEB.pdf
Started parsing the file under job_id bca0ff4f-ad37-4a77-99a7-988ca9351bb7
............

In [4]:
from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.embeddings.ollama import OllamaEmbedding

# Embedding model
# The embedding endpoint below listens on port 11435, i.e. a different Ollama
# server than the LLM one configured on 11434. Per the original note it is
# started with:
#   OLLAMA_HOST="http://127.0.0.1:11435" ollama start
embed_model_name = "bge-m3"
embed_model = OllamaEmbedding(embed_model_name, base_url="http://localhost:11435")

# Split the parsed `documents` into nodes with the semantic splitter, which uses
# the embedding model above. `nodes` is what the generation cells iterate over.
splitter = SemanticSplitterNodeParser(buffer_size=1, embed_model=embed_model, include_metadata=True)
nodes = splitter.get_nodes_from_documents(documents, show_progress=True)
print(f"Number of nodes: {len(nodes)}")

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/75 [00:00<?, ?it/s]

Number of nodes: 5


In [None]:
from datasets import load_dataset
import json

# Exploratory peek at a reference instruction dataset: load it, pick the second
# key of the returned mapping, and decode the first record's 'messages' field.
data_path = "microsoft/orca-agentinstruct-1M-v1"
dataset = load_dataset(data_path)
orca_keys = list(dataset.keys())

# Index 1 is an arbitrary choice of key — presumably one split/subset of the
# dataset; verify against the dataset card.
okey = orca_keys[1]
teststr = dataset[okey][0]['messages']
# Convert the string to a list of dictionaries
# ('messages' is handled here as a JSON-encoded string, hence json.loads).
list_of_dicts = json.loads(teststr)

print(okey)
# Bare last expression so the notebook renders the decoded messages.
list_of_dicts

In [5]:
from instruction_prompts import all_questions
from parse_instruction_output import all_parsers

In [6]:
# Names of all available instruction tasks (the keys of `all_questions`).
question_list = list(all_questions)

In [48]:
# Single-prompt check: fill the 'word_definition' template with the text of the
# first node and send it to the configured `llm`. Requires `llm`, `nodes` and
# `all_questions` from the earlier cells.
response = llm.complete(all_questions['word_definition'].format(financial_text=nodes[0].text))

In [None]:
import hashlib
import uuid

def generate_random_hash():
    """Return the SHA-256 hex digest of a freshly generated random UUID (uuid4)."""
    return hashlib.sha256(uuid.uuid4().bytes).hexdigest()
import os, json
from instruction_prompts import all_questions
from parse_instruction_output import all_parsers

# exist_ok makes the cell safe to re-run and avoids a check-then-create race.
os.makedirs('/workspace/data/uvu_lit/pm_synth_train_dataset', exist_ok=True)

# Number of LLM calls allowed per node before that node is given up on.
MAX_PARSE_ATTEMPTS = 3


def _flush_json(path, pending):
    """Merge ``pending`` into the JSON object stored at ``path`` and rewrite it.

    The file always holds exactly one JSON object, so it stays loadable with
    ``json.load`` (appending successive ``json.dump`` outputs would not).
    An existing file that cannot be read as a JSON object is moved aside to
    ``path + '.corrupt'`` rather than overwritten.
    """
    merged = {}
    if os.path.exists(path):
        try:
            with open(path, 'r') as f:
                merged = json.load(f)
            if not isinstance(merged, dict):
                raise ValueError("top-level JSON value is not an object")
        except ValueError:  # json.JSONDecodeError is a subclass of ValueError
            backup_path = path + '.corrupt'
            os.replace(path, backup_path)
            print(f"    Existing {path} is not a valid JSON object; moved to {backup_path}")
            merged = {}
    merged.update(pending)

    # Write to a temporary file first so an interrupted run cannot leave a
    # half-written dataset behind.
    tmp_path = path + '.tmp'
    with open(tmp_path, 'w') as f:
        json.dump(merged, f)
    os.replace(tmp_path, path)


# for task in all_questions:
############################################################
task='question_answering_generation_from_facts'
############################################################
print(f"  Processing {task}...")
corpus = {}      # node_id -> node text, buffered until the next flush
collection = {}  # hash_id -> generated record, buffered until the next flush
output_path = f"/workspace/data/uvu_lit/pm_synth_train_dataset/dataset_{task}.json"
corpus_path = f"/workspace/data/uvu_lit/pm_synth_train_dataset/corpus_{task}.json"

# Prompt template and parser depend only on the task, not on the node.
instruct_prompt = all_questions[task]
parser = all_parsers[task]

# Checkpointing and progress are computed from the nodes actually processed,
# so the final flush always happens even when only a slice is used.
nodes_to_process = nodes[:3]
num_to_process = len(nodes_to_process)
checkpoint_every = max(5, num_to_process // 10)

for node_counter, node in enumerate(nodes_to_process):
    prompt = instruct_prompt.format(financial_text=node.text)

    # Bounded retry: ask the LLM again while the parser yields nothing usable.
    clean_response = None
    for _attempt in range(MAX_PARSE_ATTEMPTS):
        response = llm.complete(prompt)
        clean_response = parser(response.text)
        if clean_response:
            break
    else:
        # Not every node carries a 'filename' entry; fall back to the node id.
        source_name = node.metadata.get('filename', node.node_id)
        print(f"    Failed to parse {task} for {source_name}...")
        clean_response = None

    if clean_response:
        if isinstance(clean_response, str):
            clean_response = [clean_response]

        corpus[node.node_id] = node.text
        for item in clean_response:
            hash_id = generate_random_hash()
            collection[hash_id] = {'response': item,
                                   'hash_id': hash_id,
                                   'relevant_doc': node.node_id,
                                   'task': task}

    is_last_node = node_counter == num_to_process - 1
    if node_counter % checkpoint_every == 0 or is_last_node:
        # Buffers are cleared after every flush so nothing is written twice.
        _flush_json(output_path, collection)
        collection = {}
        _flush_json(corpus_path, corpus)
        corpus = {}
        print(f"    Processed {node_counter+1}/{num_to_process} nodes...")

In [None]:
question_list[0]