# for vast ai - enter in terminal
# Reinstall ipykernel for the current user, refresh the apt package index and
# install pip for Python 3.
!python3 -m pip install ipykernel -U --user --force-reinstall && apt update && apt install -y python3-pip

# Python dependencies used by the notebook cells below.
!pip3 install flash-attn --no-build-isolation
!pip3 install llama-index llama-parse llama-index-embeddings-huggingface llama-index-llms-huggingface llama-agents dspy-ai openpyxl langchain chromadb
!pip3 install sentencepiece protobuf evaluate rouge_score absl-py tensorboardX bitsandbytes peft accelerate
# Copy the scoring workbook and the PPM PDF from the repo checkout into /workspace/data.
!cp /workspace/repos/agentic-ai/MASTER\ -\ PYTHON\ -\ SCORING\ MODEL\ -\ MCG\ MADISON\ RIDGE\ DST\ -\ v2.0.xlsx /workspace/data
!cp /workspace/repos/agentic-ai/PPM\ -\ MCG\ MADISON\ RIDGE\ DST.pdf /workspace/data
# Replace whatever torch is installed with the build from the cu118 wheel index.
# NOTE(review): flash-attn is installed above, before torch is swapped here;
# confirm the flash-attn build still matches the final torch, or install torch first.
!pip3 uninstall -y torch
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [None]:
import gc
import os

import pandas as pd
import numpy as np
from transformers import BitsAndBytesConfig

import dspy
from dspy.evaluate import Evaluate
from dspy.datasets.hotpotqa import HotPotQA
from dspy.teleprompt import BootstrapFewShotWithRandomSearch

from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings, StorageContext
from llama_index.core.embeddings import resolve_embed_model
from llama_parse import LlamaParse
from llama_index.llms.huggingface import HuggingFaceLLM

import chromadb
from chromadb.utils import embedding_functions
from langchain.text_splitter import SentenceTransformersTokenTextSplitter
from llama_index.readers.file import PandasExcelReader
# CHROMA_COLLECTION_NAME = "blockchain_and_ai"
# CHROMADB_DIR = "/workspace/data/db/"
# from dspy.retrieve.chromadb_rm import ChromadbRM

from typing import List, Any, Callable, Optional
from pydantic import BaseModel

import torch
from transformers import AutoModelForCausalLM

from train_utils import get_csv_string, randomize_row_values, operators_dict, range_description_json, split_df_by_empty_columns, split_df_by_empty_rows, print_trainable_parameters
from models import SpreadSheetAnalyzer

from dotenv import load_dotenv
# Load environment variables from the repo's .env file; later cells read
# HF_TOKEN and LLAMA_API_KEY through os.getenv.
load_dotenv('/workspace/repos/agentic-ai/.env')

import nest_asyncio
# Patch asyncio so nested event loops are allowed — presumably required because
# the notebook kernel already runs a loop; confirm against the libraries used.
nest_asyncio.apply()


In [None]:
# Workbook that holds the scoring model; only the disposition sheet is read here.
filepath = "/workspace/data/MASTER - PYTHON - SCORING MODEL - MCG MADISON RIDGE DST - v2.0.xlsx"

# Labels looked up in column 1 of the sheet to build the ground truth.
disposition_inputs = [
    "Selling Costs",
    "Disposition Fee",
    "Net Operating Income",
    "Loan Assumption/Payoff",
    "Return of Forecasted Reserves",
    "CF Y 11",
    "Return of Maximum Offering Amount",
    "Projected Terminal Cap Rate",
    "Cash Flows",
]

# header=None keeps integer column labels, which the lookups below rely on.
dfs = pd.read_excel(filepath, sheet_name="5 - Disposition Analysis", header=None)

# Cut the sheet at empty columns, then cut every column group at empty rows,
# and keep the CSV text of each non-empty piece.
sub_dfs_by_columns = split_df_by_empty_columns(dfs)
final_split_dfs = [
    get_csv_string(piece)
    for column_group in sub_dfs_by_columns
    for piece in split_df_by_empty_rows(column_group)
    if not piece.empty
]

# Strip fully empty rows and columns from the sheet itself.
for axis_to_clean in (0, 1):
    dfs.dropna(axis=axis_to_clean, how='all', inplace=True)

fee_columns = ['Disposition Fee', 'Selling Costs']
cashflow_columns = list(range(1, 10))

# Rows whose column-1 entry is a wanted label or a cash-flow number; only the
# first two columns are kept.
wanted_labels = disposition_inputs + cashflow_columns
label_mask = dfs[1].isin(wanted_labels)
ground_truth = dfs[label_mask].iloc[:, :2]
# Index labels 16 and 17 are removed; per the original note they are the
# duplicate Selling / Disposition cost rows.
ground_truth.drop(labels=[16, 17], axis=0, inplace=True)



In [None]:
# Credentials come from the environment.
access_token = os.getenv('HF_TOKEN')
llama_api_key = os.getenv('LLAMA_API_KEY')

# Step 1: build the dspy.HFModel wrapper for the chosen checkpoint. The model
# object it holds is dropped and replaced further down in this cell.
print('first model load...')
# model_name = "EleutherAI/gpt-neo-125m"
# model_name = "microsoft/Phi-3-mini-128k-instruct" # 128K context window
# model_name = "meta-llama/Meta-Llama-3-8B-Instruct" # 8K context window
# model_name = "clibrain/mamba-2.8b-instruct-openhermes" # 8K context window
# model_name = "Qwen/Qwen2-1.5B-Instruct"
model_name = "mistralai/Mistral-7B-Instruct-v0.3" # 32K context window
llm = dspy.HFModel(model=model_name, hf_device_map='auto', token=access_token)
# Generation settings: at most 100 new tokens, sampling switched off, and the
# sampling-only knobs (temperature, top_k) cleared.
llm.kwargs['max_new_tokens']=100
# llm.kwargs['repetition_penalty']=1.1
llm.kwargs['temperature']=None
llm.kwargs['do_sample']=False
llm.kwargs['top_k']=None
# llm.kwargs['typical_p']=0.9

# Step 2: release the wrapper's reference to the first model before reloading.
print('deleting model...')
llm.model=None
gc.collect()
print('reloading model...')

# NOTE(review): this 4-bit config is built but not used in this cell — the
# from_pretrained call below passes quantization_config=None. Confirm intent.
quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
)

# Step 3: reload the same checkpoint in bfloat16 with the flash_attention_2
# attention implementation and attach it to the existing wrapper.
llm.model=AutoModelForCausalLM.from_pretrained(model_name, quantization_config=None, 
                                               trust_remote_code=True, device_map="auto", 
                                               attn_implementation="flash_attention_2",  
                                               torch_dtype=torch.bfloat16)


# from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# config = LoraConfig(
#     r=16,
#     lora_alpha=32,
#     target_modules=["k_proj", "v_proj", "q_proj", "o_proj"], # Mistral param names
#     lora_dropout=0.05,
#     bias="none", #"none", "all", "lora_only"
#     task_type="CAUSAL_LM", 
    
# )

# llm.model = prepare_model_for_kbit_training(llm.model)
# llm.model = get_peft_model(llm.model, config)
# print_trainable_parameters(llm.model)

# For the Mistral checkpoint, reuse the EOS token id as the padding id on both
# the tokenizer and the model's generation config.
if model_name == 'mistralai/Mistral-7B-Instruct-v0.3':
    eos_id = llm.tokenizer.eos_token_id
    llm.model.generation_config.pad_token_id = eos_id
    llm.tokenizer.pad_token_id = llm.tokenizer.eos_token_id

# Register the wrapper as dspy's default language model.
dspy.settings.configure(lm=llm)

######## RAG model
# chroma_client = chromadb.PersistentClient(path=CHROMADB_DIR)
# collection = chroma_client.get_or_create_collection(name=CHROMA_COLLECTION_NAME)
# # text_splitter = SentenceTransformersTokenTextSplitter(tokens_per_chunk=100)

# ids = []
# documents = []
# metadatas = []
# # dfs_str = get_csv_string(dfs)
# # chunks = text_splitter.create_documents([dfs_str], )
# for chunk_no, chunk in enumerate(final_split_dfs):
#     ids.append(f"{chunk_no}")
#     documents.append(chunk)
#     # metadatas.append({"title":})
# if ids:
#     collection.upsert(ids=ids, documents=documents)#, metadatas=metadatas)

# retriever = dspy.ColBERTv2(url='http://20.102.90.50:2017/wiki17_abstracts')
# default_ef = embedding_functions.HuggingFaceEmbeddingFunction(model_name='colbert-ir/colbertv2.0', api_key=access_token)
# default_ef = embedding_functions.DefaultEmbeddingFunction()
# retriever = ChromadbRM(CHROMA_COLLECTION_NAME, CHROMADB_DIR, default_ef, k=3)

# dspy.settings.configure(lm=llm, rm=retriever)

In [None]:
# Credentials come from the environment.
access_token = os.getenv('HF_TOKEN')
llama_api_key = os.getenv('LLAMA_API_KEY')

# LlamaParse client that returns plain English text.
parser = LlamaParse(
    api_key=llama_api_key,
    result_type="text",
    language="en",
    verbose=True,  # fixed: the keyword was misspelled "varbose"
)

# Delete the Hugging Face hub cache directory under /root/.cache.
# NOTE(review): anything cached there has to be downloaded again afterwards.
!rm -rf /root/.cache/huggingface/hub/

# Install the llama-index text-embeddings-inference embeddings integration.
!pip3 install llama-index-embeddings-text-embeddings-inference

In [None]:
from llama_index.core import VectorStoreIndex, get_response_synthesizer
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine


# model_name = "mistralai/Mistral-7B-Instruct-v0.3" # 32K context window
# NOTE(review): this rebinds the notebook-wide `model_name`; any cell run after
# this one that reads `model_name` will see the Qwen id.
model_name = "Qwen/Qwen2-1.5B-Instruct"
tokenizer_name = model_name
# rm_llm = HuggingFaceLLM(model=dspy_llm.llm, tokenizer_name=tokenizer_name, is_chat_model=True, device_map='auto', max_new_tokens=50, context_window=8000)
# LlamaIndex LLM wrapper built around the already-loaded `llm.model`.
# NOTE(review): the tokenizer name is the Qwen id while `llm.model` is whatever
# an earlier cell loaded — confirm the tokenizer and model belong together.
rm_llm = HuggingFaceLLM(model_name=model_name, tokenizer_name=tokenizer_name, model=llm.model, is_chat_model=True, device_map='auto', max_new_tokens=50, context_window=8000)
# Read the disposition sheet of the workbook into LlamaIndex documents.
filepath = "/workspace/data/MASTER - PYTHON - SCORING MODEL - MCG MADISON RIDGE DST - v2.0.xlsx"
documents = PandasExcelReader(sheet_name="5 - Disposition Analysis").load_data(filepath)
# documents = parser.load_data("/workspace/data/PPM - MCG MADISON RIDGE DST.pdf")
print("Documents created")

# Global LlamaIndex settings: default LLM plus chunk size and overlap.
Settings.llm = rm_llm
Settings.chunk_size = 100
Settings.chunk_overlap = 25

# Embedding model used when the vector index is built.
embed_model_name = "BAAI/bge-small-en-v1.5"
# embed_model_name = "Alibaba-NLP/gte-Qwen1.5-7B-instruct"
embed_model = HuggingFaceEmbedding(model_name=embed_model_name)

# # for p in embed_model._model.named_parameters():
# #     p[1].requires_grad = False

# # KeywordTableSimpleRetriever
# index = VectorStoreIndex.from_documents(documents, embed_model=embed_model)
# # configure retriever
# retriever = VectorIndexRetriever(
#     index=index,
#     similarity_top_k=2,
# )
# # configure response synthesizer
# response_synthesizer = get_response_synthesizer(
#     response_mode="tree_summarize",
# )
# # assemble query engine
# query_engine = RetrieverQueryEngine(
#     retriever=retriever,
#     response_synthesizer=response_synthesizer,
# )

# Build a vector index over the loaded documents with the chosen embedder.
index = VectorStoreIndex.from_documents(documents, embed_model=embed_model)
# index = VectorStoreIndex.from_documents(final_split_dfs, embed_model=embed_model)
# index.storage_context.persist(persist_dir="/workspace/data/storage/alpha")
# query_engine = index.as_query_engine(llm=rm_llm)
# Despite its name, `query_engine` holds a retriever (top-2 similarity), not a
# query engine.
query_engine = index.as_retriever(similarity_top_k=2)

# Settings.embed_model = embed_model

# index.set_index_id("vector_index")
# index.storage_context.persist("/workspace/data/storage")
# storage_context = StorageContext.from_defaults(persist_dir="/workspace/data/storage")
# index = load_index_from_storage(storage_context, index_id="vector_index")
# query_engine = index.as_query_engine(response_mode="tree_summarize")

# Wrap the LlamaIndex LLM for dspy and make it dspy's default language model.
# NOTE(review): `DspyLlamaIndexWrapper` is not defined or imported in the
# visible part of this notebook — confirm where it comes from.
dspy_llm = DspyLlamaIndexWrapper(rm_llm, model_type='chat', max_new_tokens=30)
dspy.settings.configure(lm=dspy_llm)


In [None]:
# Question: Get the value for Return of Maximum Offering Amount.
# Extracted values: Return of Maximum Offering Amount: 44386706.96773932
# Question: What is the return on maximum offering amount? Please provide a floating point number less than zero.
# Extracted values: Return of Maximum Offering Amount: -77670566.54709445

# Fine Tuning

import sys
# Raise Python's recursion limit to 10000.
# NOTE(review): presumably needed for deep copies of the dspy program made
# during compilation — confirm and document the actual trigger.
sys.setrecursionlimit(10000)

In [None]:
# Map each ground-truth label to its value rendered as a string. Integer labels
# are stored under the key "Cashflows <n>".
gt_collect = {}
for _, record in ground_truth.iterrows():
    label, raw_value = record.values[0], record.values[1]
    key = f"Cashflows {label}" if isinstance(label, int) else label
    gt_collect[key] = str(raw_value)

In [None]:
import random

# Build `num_rounds` copies of one question/answer example per ground-truth
# entry, then shuffle the combined list in place.
# TODO: gradually increase n_samples, random fill in of values in range
num_rounds = 2
train_data = []
for _ in range(num_rounds):
    train_data.extend(
        dspy.Example(
            question=f"Extract the value for the variable name '{label}'?",
            answer=f"{label}: {expected}",
        ).with_inputs('question')
        for label, expected in gt_collect.items()
    )

random.shuffle(train_data)


In [None]:
# Reload edited modules automatically before each cell execution.
%load_ext autoreload
%autoreload 2

from train_utils import operators_dict, range_description_json
# This import rebinds `SpreadSheetAnalyzer` to the class from `models_testing`.
from models_testing import SpreadSheetAnalyzer
# Instantiate the analyzer program; `query_engine` is whatever an earlier cell
# bound to that name.
spreadsheeet_ananlyst = SpreadSheetAnalyzer(range_description_json, operators_dict, query_engine=query_engine, num_passages=3)

In [None]:
# in dspy.primitives.module.py
# def reset_copy(self):
#     import llama_index
#     obj = copy.deepcopy(self)
#     ######################################################
#     for attribute_name in dir(obj):
#         if not attribute_name.startswith('_'):
#             attribute_value = getattr(obj, attribute_name)
#             if isinstance(attribute_value, llama_index.core.indices.vector_store.retrievers.retriever.VectorIndexRetriever):
#                 setattr(obj, attribute_name, getattr(self, attribute_name))
#     ######################################################
#     for param in obj.parameters():
#         param.reset()

#     return obj

############################or############################# 

# in llama_index/core/schema.py comment out line 84
# 84  state["__private_attribute_values__"] = {}


In [None]:
from dspy.teleprompt import BootstrapFewShotWithRandomSearch, BootstrapFinetune, BootstrapFewShot, MIPRO, MIPROv2

# Fraction of `train_data` treated as training examples.
perc_train = 0.7
# Index of the first example after that fraction.
num_train = int(len(train_data) * perc_train)
def validate_answer(pred, example, trace=None):
    """Return True when both answers match after lower-casing.

    Args:
        pred: Object exposing a string ``answer`` attribute.
        example: Object exposing a string ``answer`` attribute.
        trace: Accepted but not used.

    Returns:
        bool: Whether the two lower-cased answers are equal.
    """
    expected = example.answer.lower()
    predicted = pred.answer.lower()
    return expected == predicted
# Metric used for compilation and evaluation: dspy's exact-match on `answer`.
metric = dspy.evaluate.metrics.answer_exact_match
# metric = dspy.evaluate.metrics.answer_passage_match
# metric = validate_answer
# NUM_THREADS and TRAIN_NUM are only referenced by the commented-out optimizer
# variants below.
NUM_THREADS=1
TRAIN_NUM=17

#Configure model to finetune
# config = dict(bf16=True, bsize=1, accumsteps=3, lr=8e-5) #path_prefix=None

#Compile program on BootstrapFinetune

# finetune_optimizer = MIPROv2(prompt_model=llm, task_model=llm, metric=metric, num_candidates=10, init_temperature=1.2, minibatch_size=1)
# kwargs = dict(num_threads=NUM_THREADS, display_progress=True, display_table=0)
# compiled_prompt_opt = finetune_optimizer.compile(spreadsheeet_ananlyst, trainset=train_data[:TRAIN_NUM], 
#                                                  num_batches=200, max_bootstrapped_demos=3, max_labeled_demos=5, 
#                                                  eval_kwargs=kwargs, requires_permission_to_run=False)

# finetune_optimizer = MIPRO(prompt_model=llm, task_model=llm, metric=metric, num_candidates=10, init_temperature=1.2)
# kwargs = dict(num_threads=NUM_THREADS, display_progress=True, display_table=0)
# compiled_prompt_opt = finetune_optimizer.compile(spreadsheeet_ananlyst, trainset=train_data[:TRAIN_NUM], num_trials=100, 
#                                                  max_bootstrapped_demos=3, max_labeled_demos=5, eval_kwargs=kwargs,
#                                                  requires_permission_to_run=False)

# Fine-tuning settings passed to compile(); `target` is whatever `model_name`
# currently holds in the notebook.
config = dict(target=model_name, epochs=3, bf16=True, bsize=1, accumsteps=3, lr=7e-5) #path_prefix=None
finetune_optimizer = BootstrapFinetune(metric=metric)
# NOTE(review): the full `train_data` list is used as the training set here,
# while a later cell scores on train_data[num_train:...] — the scored examples
# are therefore part of training. Confirm whether train_data[:num_train] was intended.
finetune_program = finetune_optimizer.compile(spreadsheeet_ananlyst, trainset=train_data, **config)

# finetune_optimizer = BootstrapFewShot(metric=metric, max_bootstrapped_demos=8, max_labeled_demos=8)
# finetune_program = finetune_optimizer.compile(spreadsheeet_ananlyst, trainset=train_data)

# #Load program and activate model's parameters in program before evaluation
# ckpt_path = "saved_checkpoint_path_from_finetuning"
# LM = dspy.HFModel(checkpoint=ckpt_path, model=model_name)

# for p in finetune_program.predictors():
#     p.lm = LM
#     p.activated = False

In [None]:
# Load a fine-tuned checkpoint into a fresh dspy.HFModel and attach it to every
# predictor of `spreadsheeet_ananlyst`.
# NOTE(review): the scoring loop below calls `finetune_program`, not
# `spreadsheeet_ananlyst`, so the LM attached here is not obviously the one
# being scored — confirm which program should receive the checkpoint.
ckpt_path = "/workspace/repos/finetuning_ckpts/NPBHZ93JIRMS2.all/checkpoint-120"
LM = dspy.HFModel(checkpoint=ckpt_path, model=model_name, hf_device_map='cuda:0')

for p in spreadsheeet_ananlyst.predictors():
    p.lm = LM
    p.activated = False

# perc_train = 0.7
# num_train = int(len(train_data) * perc_train)
# metric = dspy.evaluate.metrics.answer_exact_match

# Score up to 34 examples starting at index `num_train`.
scores = []
for x in train_data[num_train:num_train+34]:
    pred = finetune_program(**x.inputs())
    score = metric(x, pred)
    scores.append(score)
# A bare expression in the middle of a cell is evaluated and thrown away, so the
# mean is printed explicitly. An empty slice is reported as nan rather than
# emitting a NumPy empty-mean warning.
mean_score = np.mean(scores) if scores else float("nan")
print(f"mean score over {len(scores)} examples: {mean_score}")

# Replace the model held by `llm` with weights from a second fine-tuning
# checkpoint. The reference is cleared first so the old model can be released
# before the new one is loaded.
saved_checkpoint_path_from_finetuning = '/workspace/repos/finetuning_ckpts/NFAI903XCHAMQ.all/checkpoint-53'
llm.model=None
llm.model=AutoModelForCausalLM.from_pretrained(saved_checkpoint_path_from_finetuning, quantization_config=None,
                                               trust_remote_code=True, device_map="auto",
                                               attn_implementation="flash_attention_2",
                                               torch_dtype=torch.bfloat16)

In [None]:
# Reload edited modules automatically before each cell execution.
%load_ext autoreload
%autoreload 2

from train_utils import operators_dict, range_description_json
# This import rebinds `SpreadSheetAnalyzer` to the class from `models_testing`.
from models_testing import SpreadSheetAnalyzer
# Create a fresh analyzer instance, replacing any previous object bound to
# `spreadsheeet_ananlyst` (including predictor settings applied to it).
spreadsheeet_ananlyst = SpreadSheetAnalyzer(range_description_json, operators_dict, query_engine=query_engine, num_passages=3)

In [None]:
# Ask the analyzer for ground-truth entries and store (prediction, expected
# answer) pairs in `collection`.
# NOTE(review): the trailing `break` stops after the first entry, so only one
# pair is collected — confirm this is a deliberate smoke test.
collection = []
for value_to_extract, gt_value in gt_collect.items():
    question = f"Extract the value for the variable name '{value_to_extract}'?"
    expected_answer = f"{value_to_extract}: {gt_value}"
    print(question)
    pred = spreadsheeet_ananlyst(question, verbose=True)
    print(pred.answer)
    collection.append((pred, expected_answer))
    break

In [None]:
# Show the answer of the most recent prediction bound to `pred`.
print(pred.answer)

In [None]:
# Print each predicted answer next to its expected answer.
for predicted, expected in collection:
    print(predicted.answer, "---", expected)

In [None]:
# Fraction of collected predictions whose answer equals the expected string exactly.
np.mean([predicted.answer == expected for predicted, expected in collection])

In [None]:
# Ask the fine-tuned program for every ground-truth entry and store
# (prediction, expected answer) pairs in `collection`.
collection = []
for value_to_extract, gt_value in gt_collect.items():
    question = f"Extract the value for the variable name '{value_to_extract}'?"
    expected_answer = f"{value_to_extract}: {gt_value}"
    print(question)
    pred = finetune_program(question, verbose=True)
    print(pred.answer)
    collection.append((pred, expected_answer))


In [None]:
# Print each predicted answer next to its expected answer.
for predicted, expected in collection:
    print(predicted.answer, "---", expected)

In [None]:
# Fraction of collected predictions whose answer equals the expected string exactly.
np.mean([predicted.answer == expected for predicted, expected in collection])

In [None]:

# NOTE(review): this calls the model object with no arguments. If the intent
# was to display the module structure, the bare name `llm.model` would do
# that — confirm what this cell is for.
llm.model()

In [None]:
from dspy.teleprompt.signature_opt_typed import optimize_signature
from dspy.evaluate.metrics import answer_exact_match
from dspy.functional import TypedChainOfThought

# Run dspy's signature optimizer on a typed chain-of-thought
# "question -> answer" program for 50 iterations, scored by exact match, and
# keep the resulting program.
# NOTE(review): `devset` is not defined anywhere in the visible notebook —
# it must be created before this cell runs.
compiled_program = optimize_signature(
    student=TypedChainOfThought("question -> answer"),
    evaluator=Evaluate(devset=devset, metric=answer_exact_match, num_threads=10, display_progress=True),
    n_iterations=50,
).program