# Environment setup for a vast.ai instance. The original note says to enter these in a
# terminal; as written they use the notebook `!` shell escape.
# Reinstall ipykernel for the current user, refresh apt and make sure pip is available.
!python3 -m pip install ipykernel -U --user --force-reinstall && apt update && apt install -y python3-pip

# Python dependencies for the notebook; dspy-ai is pinned to 2.4.11 here.
!pip3 install llama-index llama-parse llama-index-embeddings-huggingface dspy-ai==2.4.11 openpyxl langchain chromadb
# flash-attn is installed separately with build isolation disabled.
!pip3 install flash-attn --no-build-isolation
!pip3 install sentencepiece protobuf evaluate rouge_score absl-py tensorboardX bitsandbytes peft accelerate
# Copy the scoring-model workbook from the repo checkout into /workspace/data, where the notebook reads it.
!cp /workspace/repos/agentic-ai/MASTER\ -\ PYTHON\ -\ SCORING\ MODEL\ -\ MCG\ MADISON\ RIDGE\ DST\ -\ v2.0.xlsx /workspace/data

# NOTE(review): dspy-ai is already installed (pinned) by the command above, so this unpinned
# install looks redundant - confirm whether the pin or the latest release is intended.
!pip3 install dspy-ai

In [1]:
import gc
import os

import pandas as pd
import numpy as np
from transformers import BitsAndBytesConfig

import dspy
from dspy.evaluate import Evaluate
from dspy.datasets.hotpotqa import HotPotQA
from dspy.teleprompt import BootstrapFewShotWithRandomSearch

# from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
# from llama_index.embeddings.huggingface import HuggingFaceEmbedding
# from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
# from llama_index.core.embeddings import resolve_embed_model

import chromadb
from chromadb.utils import embedding_functions
from langchain.text_splitter import SentenceTransformersTokenTextSplitter
from llama_index.readers.file import PandasExcelReader
CHROMA_COLLECTION_NAME = "blockchain_and_ai"
CHROMADB_DIR = "/workspace/data/db/"

from typing import List, Any, Callable, Optional
from pydantic import BaseModel

import torch
from transformers import AutoModelForCausalLM
from dspy.retrieve.chromadb_rm import ChromadbRM

from train_utils import get_csv_string, randomize_row_values, operators_dict, range_description_json
from models import SpreadSheetAnalyzer

from dotenv import load_dotenv
load_dotenv('/workspace/repos/agentic-ai/.env')


True

In [2]:
def print_trainable_parameters(model):
    """
    Print how many of the model's parameters are trainable.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, param)`` pairs, where each ``param`` provides
            ``numel()`` and a ``requires_grad`` attribute.

    Prints one line with the trainable count, the total count and the
    trainable percentage. A model without parameters reports ``0.0`` percent
    instead of raising ``ZeroDivisionError``.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # Guard the ratio: an empty model has nothing to divide by.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

# Function to split DataFrame by empty columns
def split_df_by_empty_columns(df):
    """
    Split ``df`` into side-by-side pieces separated by all-NaN columns.

    Args:
        df: DataFrame in which a column whose cells are all NaN acts as a
            separator.

    Returns:
        A list of DataFrames, one per run of columns ending at a separator
        (plus the final run). The separator columns themselves are dropped.
        A piece made only of separator columns is returned with zero columns;
        callers that need non-empty pieces should check ``.empty``.
    """
    # Work with positions rather than labels so the lookup does not depend on
    # the column labels being unique.
    empty_positions = np.flatnonzero(df.isna().all(axis=0).to_numpy())
    # Cut right after every separator column. Plain iloc slices are used
    # instead of np.split(df, ...), which goes through the deprecated
    # DataFrame.swapaxes.
    bounds = [0, *(empty_positions + 1).tolist(), df.shape[1]]
    pieces = [df.iloc[:, start:stop] for start, stop in zip(bounds, bounds[1:])]
    # Each piece still carries its trailing separator column; drop it.
    return [piece.dropna(axis=1, how='all') for piece in pieces]

# Function to split a DataFrame by empty rows
def split_df_by_empty_rows(df):
    """
    Split ``df`` into stacked pieces separated by all-NaN rows.

    Args:
        df: DataFrame in which a row whose cells are all NaN acts as a
            separator.

    Returns:
        A list of DataFrames, one per run of rows ending at a separator (plus
        the final run). The separator rows themselves are dropped. A piece
        made only of separator rows is returned with zero rows; callers that
        need non-empty pieces should check ``.empty``.
    """
    # Work with positions rather than labels so the lookup does not depend on
    # the index labels being unique.
    empty_positions = np.flatnonzero(df.isna().all(axis=1).to_numpy())
    # Cut right after every separator row. Plain iloc slices are used instead
    # of np.split(df, ...), which goes through the deprecated
    # DataFrame.swapaxes.
    bounds = [0, *(empty_positions + 1).tolist(), len(df)]
    pieces = [df.iloc[start:stop] for start, stop in zip(bounds, bounds[1:])]
    # Each piece still carries its trailing separator row; drop it.
    return [piece.dropna(axis=0, how='all') for piece in pieces]


# Displaying the final split DataFrames
# for i, final_df in enumerate(final_split_dfs, 1):
#     print(f"DataFrame {i}:\n{final_df}\n")

In [3]:
# Workbook that the setup commands copy into /workspace/data.
filepath = "/workspace/data/MASTER - PYTHON - SCORING MODEL - MCG MADISON RIDGE DST - v2.0.xlsx"
# Labels looked up in column 1 of the sheet; the matching rows become the ground truth.
disposition_inputs = [
  "Selling Costs",
  "Disposition Fee",
  "Net Operating Income",
  "Loan Assumption/Payoff",
  "Return of Forecasted Reserves",
  "CF Y 11",
  "Return of Maximum Offering Amount",
  "Projected Terminal Cap Rate",
  "Cash Flows"
]
# header=None: no row is used as a header, so the columns get integer labels.
dfs = pd.read_excel(filepath, sheet_name="5 - Disposition Analysis", header=None)
# Splitting the DataFrame by empty columns
# (done on the raw sheet, before the in-place dropna calls below modify dfs)
sub_dfs_by_columns = split_df_by_empty_columns(dfs)

# Splitting each sub-DataFrame by empty rows
# Every non-empty piece is passed through get_csv_string (from train_utils) and
# collected; these strings are the documents stored for retrieval later.
final_split_dfs = []
for sub_df in sub_dfs_by_columns:
    split_sub_dfs = split_df_by_empty_rows(sub_df)
    final_split_dfs.extend([get_csv_string(x) for x in split_sub_dfs if not x.empty])

# Remove fully empty rows and columns from the sheet itself (mutates dfs).
dfs.dropna(axis=0, how='all', inplace=True)
dfs.dropna(axis=1, how='all', inplace=True)
# NOTE(review): fee_columns is not referenced again in the visible cells.
fee_columns = ['Disposition Fee', 'Selling Costs']
# Integer labels 1..9 that are matched in column 1 alongside disposition_inputs.
cashflow_columns = [1,2,3,4,5,6,7,8,9]
ground_truth = dfs[dfs[1].isin(disposition_inputs+cashflow_columns)].iloc[:, :2] # Get only the necessary columns
# NOTE(review): 16 and 17 are hard-coded index labels, so this depends on the exact
# row layout of this workbook - confirm if the sheet changes.
ground_truth.drop(labels=[16, 17], axis=0, inplace=True) # drop the duplicate Selling and Disposition Costs



  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)


In [4]:
# Hugging Face access token read from the environment (HF_TOKEN).
access_token = os.getenv('HF_TOKEN')
print('first model load...')
# model_name = "EleutherAI/gpt-neo-125m"
# model_name = "microsoft/Phi-3-mini-128k-instruct" # 128K context window
# model_name = "meta-llama/Meta-Llama-3-8B-Instruct" # 8K context window
# model_name = "clibrain/mamba-2.8b-instruct-openhermes" # 8K context window
model_name = "Qwen/Qwen2-1.5B-Instruct"
# model_name = "mistralai/Mistral-7B-Instruct-v0.3" # 32K context window
# First load goes through dspy.HFModel, which provides the `llm` wrapper (kwargs, tokenizer, model).
llm = dspy.HFModel(model=model_name, hf_device_map='auto', token=access_token)
# Generation settings: at most 30 new tokens, sampling switched off,
# temperature and top_k explicitly unset.
llm.kwargs['max_new_tokens']=30
# llm.kwargs['repetition_penalty']=1.1
llm.kwargs['temperature']=None
llm.kwargs['do_sample']=False
llm.kwargs['top_k']=None
# llm.kwargs['typical_p']=0.9

# Drop the model object created by dspy.HFModel and collect garbage before
# loading the weights again with explicit options below.
print('deleting model...')
llm.model=None
gc.collect()
print('reloading model...')

# NOTE(review): this 4-bit NF4 config is built but not used - the
# from_pretrained call below passes quantization_config=None. Confirm whether
# quantization is meant to be disabled.
quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
)

# Reload the same checkpoint in bfloat16 with flash-attention 2 and attach it
# to the existing wrapper.
llm.model=AutoModelForCausalLM.from_pretrained(model_name, quantization_config=None, 
                                               trust_remote_code=True, device_map="auto", 
                                               attn_implementation="flash_attention_2",  
                                               torch_dtype=torch.bfloat16)


# from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# config = LoraConfig(
#     r=16,
#     lora_alpha=32,
#     target_modules=["k_proj", "v_proj", "q_proj", "o_proj"], # Mistral param names
#     lora_dropout=0.05,
#     bias="none", #"none", "all", "lora_only"
#     task_type="CAUSAL_LM", 
    
# )

# llm.model = prepare_model_for_kbit_training(llm.model)
# llm.model = get_peft_model(llm.model, config)
# print_trainable_parameters(llm.model)

# Only for the Mistral checkpoint: reuse the EOS token id as the padding id on
# both the generation config and the tokenizer.
# NOTE(review): presumably because that tokenizer has no pad token - confirm.
if model_name == 'mistralai/Mistral-7B-Instruct-v0.3':
    llm.model.generation_config.pad_token_id = llm.tokenizer.eos_token_id
    llm.tokenizer.pad_token_id = llm.tokenizer.eos_token_id

# dspy.settings.configure(lm=llm)

######## RAG model
# Persistent Chroma store under CHROMADB_DIR; the collection is created on first use.
chroma_client = chromadb.PersistentClient(path=CHROMADB_DIR)
collection = chroma_client.get_or_create_collection(name=CHROMA_COLLECTION_NAME)
# text_splitter = SentenceTransformersTokenTextSplitter(tokens_per_chunk=100)

# One document per CSV string in final_split_dfs; the id is the chunk's position
# as a string, so re-running the cell upserts the same ids again.
ids = []
documents = []
# NOTE(review): metadatas is never filled or passed to upsert (see the commented parts below).
metadatas = []
# dfs_str = get_csv_string(dfs)
# chunks = text_splitter.create_documents([dfs_str], )
for chunk_no, chunk in enumerate(final_split_dfs):
    ids.append(f"{chunk_no}")
    documents.append(chunk)
    # metadatas.append({"title":})
# Skip the upsert entirely when there are no chunks.
if ids:
    collection.upsert(ids=ids, documents=documents)#, metadatas=metadatas)

# default_ef = dspy.ColBERTv2(url='http://20.102.90.50:2017/wiki17_abstracts')
# Retriever over the same collection name and directory, returning k=3 passages per query.
default_ef = embedding_functions.DefaultEmbeddingFunction()
retriever = ChromadbRM(CHROMA_COLLECTION_NAME, CHROMADB_DIR, default_ef, k=3)

# Register both the language model and the retriever as DSPy defaults.
dspy.settings.configure(lm=llm, rm=retriever)

first model load...


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


deleting model...
reloading model...


# Alternative load of the same sheet through llama-index's PandasExcelReader.
# NOTE(review): the import and filepath repeat earlier cells, and `docs` is not
# used in any cell visible here - confirm whether this cell is still needed.
from llama_index.readers.file import PandasExcelReader
filepath = "/workspace/data/MASTER - PYTHON - SCORING MODEL - MCG MADISON RIDGE DST - v2.0.xlsx"
# keep_default_na=False is handed to the reader as part of pandas_config.
docs = PandasExcelReader(sheet_name="5 - Disposition Analysis", pandas_config={'keep_default_na':False}).load_data(filepath)

In [5]:
# Question: Get the value for Return of Maximum Offering Amount.
# Extracted values: Return of Maximum Offering Amount: 44386706.96773932
# Question: What is the return on maximum offering amount? Please provide a floating point number less than zero.
# Extracted values: Return of Maximum Offering Amount: -77670566.54709445

# Fine Tuning

In [6]:
# Map every ground-truth label to its value rendered as a string.
# A label that is an int is stored under the key "Cashflows <n>".
gt_collect = {}
for row, col in ground_truth.iterrows():
    name, value = col.values[0], col.values[1]
    if isinstance(name, int):
        name = f"Cashflows {name}"
    gt_collect[name] = str(value)

In [7]:
# Build one question/answer training example per ground-truth entry,
# repeated num_rounds times; only `question` is marked as an input field.
# dfs_str = get_csv_string(dfs)
num_rounds = 1
train_data = []
for _ in range(num_rounds):
    # TODO: gradually increase n_samples, random fill in of values in range
    # dfs_aug = randomize_row_values(dfs, ground_truth=ground_truth, n_samples=15)
    # dfs_str = get_csv_string(dfs_aug)
    for value_to_extract, expected in gt_collect.items():
        question = f"Extract the value for the variable name '{value_to_extract}'?"
        answer = f"{value_to_extract}: {expected}"
        example = dspy.Example(question=question, answer=answer)
        train_data.append(example.with_inputs('question'))

In [8]:
# Enable IPython's autoreload extension in mode 2 before re-importing the project modules below.
%load_ext autoreload
%autoreload 2

from train_utils import operators_dict, range_description_json
from models import SpreadSheetAnalyzer
# The program under test; num_passages=1 is passed straight to SpreadSheetAnalyzer.
# NOTE(review): the variable name is misspelled but is used under this spelling
# in the other cells, so it is left as is.
spreadsheeet_ananlyst = SpreadSheetAnalyzer(range_description_json, operators_dict, num_passages=1)

In [9]:
from dspy.teleprompt import BootstrapFewShotWithRandomSearch, BootstrapFinetune
# Share of train_data meant for training, and the matching cut-off index.
# NOTE(review): the compile call in this cell uses fixed slices rather than
# num_train; num_train is used by the evaluation cell further down.
perc_train = 0.7
num_train = int(len(train_data) * perc_train)
def validate_answer(pred, example, trace=None):
    """Return True when both ``answer`` attributes match, ignoring case.

    ``trace`` is accepted but not used.
    """
    expected = example.answer.lower()
    produced = pred.answer.lower()
    return expected == produced
# Metric used for bootstrapping and evaluation; it is called as metric(example, prediction).
metric = dspy.evaluate.metrics.answer_exact_match
# metric = validate_answer

#Configure model to finetune
# target is the Hugging Face model id string; the remaining keys are forwarded to compile().
config = dict(target=model_name, epochs=10, bf16=True, bsize=1, accumsteps=3, lr=7e-5) #path_prefix=None

#Compile program on BootstrapFinetune
finetune_optimizer = BootstrapFinetune(metric=metric)
# finetune_program = finetune_optimizer.compile(spreadsheeet_ananlyst, trainset=train_data[:num_train], **config)
# Trains on the first 3 examples and validates on the last 17.
# NOTE(review): when train_data has 17 or fewer entries the two slices overlap,
# so validation includes training examples - confirm this is intended.
# NOTE(review): the recorded run of this cell bootstrapped 0 traces and then failed
# with KeyError on an empty dataset; presumably the metric rejected every
# bootstrapped answer - confirm before re-running.
finetune_program = finetune_optimizer.compile(spreadsheeet_ananlyst, trainset=train_data[:3], valset=train_data[-17:], **config)

# finetune_program = spreadsheeet_ananlyst

# #Load program and activate model's parameters in program before evaluation
# ckpt_path = "saved_checkpoint_path_from_finetuning"
# LM = dspy.HFModel(checkpoint=ckpt_path, model=model_name)

# for p in finetune_program.predictors():
#     p.lm = LM
#     p.activated = False



100%|██████████| 3/3 [00:20<00:00,  6.81s/it]


Bootstrapped 0 full traces after 3 examples in round 0.
all 0
local_cache/compiler/all.d943856c9b1e8f80.jsonl


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# examples skipped due to parsing error: 0 / 0


KeyError: 'Column input_ids not in the dataset. Current columns in the dataset: []'

In [None]:
# Delete the local_cache/compiler directory (the finetune run writes its .jsonl data there).
!rm -rf local_cache/compiler

In [None]:
# List what is currently in local_cache/compiler.
!ls local_cache/compiler

In [None]:
#Load program and activate model's parameters in program before evaluation
# Hard-coded path to a finetuning checkpoint on this machine.
ckpt_path = "/workspace/repos/finetuning_ckpts/5YJOF3IJ3E8KP.all/checkpoint-96"
LM = dspy.HFModel(checkpoint=ckpt_path, model=model_name)

# Point every predictor of the program at the checkpoint-backed LM and set its
# `activated` flag to False.
for p in finetune_program.predictors():
    p.lm = LM
    p.activated = False

In [None]:
# perc_train = 0.7
# num_train = int(len(train_data) * perc_train)
# metric = dspy.evaluate.metrics.answer_exact_match

# Score the program on the examples after the training cut-off: one metric value
# per example, then their mean as the cell output. The slice takes at most 34
# examples and simply yields fewer when train_data is shorter.
scores = []
for x in train_data[num_train:num_train+34]:
    # x.inputs() carries only the fields marked as inputs (the question).
    pred = finetune_program(**x.inputs())
    score = metric(x, pred)
    scores.append(score)
np.mean(scores)

In [None]:
# Swap the weights held by the shared `llm` wrapper for a finetuned checkpoint,
# loaded with the same options as the base-model reload (bfloat16,
# flash-attention 2, no quantization config).
saved_checkpoint_path_from_finetuning = '/workspace/repos/finetuning_ckpts/NFAI903XCHAMQ.all/checkpoint-53'
# Drop the reference to the current model before loading the checkpoint.
llm.model=None
llm.model=AutoModelForCausalLM.from_pretrained(saved_checkpoint_path_from_finetuning, quantization_config=None, 
                                               trust_remote_code=True, device_map="auto", 
                                               attn_implementation="flash_attention_2",  
                                               torch_dtype=torch.bfloat16)

In [None]:
# dfs_str = get_csv_string(dfs)
# Ask the program about every ground-truth variable and keep
# (prediction, expected answer string) pairs for the comparison cells below.
# NOTE(review): `collection` is also the name bound to the Chroma collection in the
# RAG setup cell; this rebinds it to a plain list - confirm nothing later needs
# the Chroma handle under that name.
collection = []
for value_to_extract in gt_collect:
    question = f"Extract the value for the variable name '{value_to_extract}'?"
    print(question)
    pred = spreadsheeet_ananlyst(question, verbose=True)
    collection.append((pred, f"{value_to_extract}: {gt_collect[value_to_extract]}"))


In [None]:
# Print each predicted answer next to its expected answer string.
for i in collection:
    print(i[0].answer,"---", i[1])

In [None]:
# Fraction of predictions whose answer string equals the expected string exactly (case-sensitive).
np.mean([i[0].answer == i[1] for i in collection])

In [None]:
# baseline RAG and extractor only 0.35294117647058826
# baseline RAG, extractor, float and format checks 0.35294117647058826

In [None]:
# Show the program object as the cell output.
finetune_program

In [None]:
from dspy.teleprompt.signature_opt_typed import optimize_signature
from dspy.evaluate.metrics import answer_exact_match
from dspy.functional import TypedChainOfThought

# Signature optimization over a generic "question -> answer" TypedChainOfThought
# student (not the SpreadSheetAnalyzer program), scored with exact match.
# NOTE(review): `devset` is not defined in any cell visible here - confirm where
# it should come from before running this cell.
compiled_program = optimize_signature(
    student=TypedChainOfThought("question -> answer"),
    evaluator=Evaluate(devset=devset, metric=answer_exact_match, num_threads=10, display_progress=True),
    n_iterations=50,
).program

In [None]:
from dspy.teleprompt import BootstrapFewShotWithRandomSearch, BootstrapFinetune
# Split train_data 70/30 by position: the first num_train examples train, the rest validate.
perc_train = 0.7
num_train = int(len(train_data) * perc_train)
metric = dspy.evaluate.metrics.answer_exact_match

#Compile program on current dspy.settings.lm
fewshot_optimizer = BootstrapFewShotWithRandomSearch(metric=metric, max_bootstrapped_demos=2, num_threads=1)
# Fixed: the optimizer is bound to `fewshot_optimizer`; the previous `tp` name was never defined.
your_dspy_program_compiled = fewshot_optimizer.compile(spreadsheeet_ananlyst, trainset=train_data[:num_train], valset=train_data[num_train:])

#Configure model to finetune
# Fixed: `target` is the Hugging Face model id string (as in the earlier finetune
# cell), not the already-loaded model object.
config = dict(target=model_name, epochs=2, bf16=True, bsize=1, accumsteps=2, lr=5e-5)

#Compile program on BootstrapFinetune
finetune_optimizer = BootstrapFinetune(metric=metric)
# Fixed: the placeholder `some_new_dataset_for_finetuning_model` was undefined; the
# training split is used instead.
# NOTE(review): swap in a dedicated finetuning set if one exists.
finetune_program = finetune_optimizer.compile(spreadsheeet_ananlyst, trainset=train_data[:num_train], **config)

# NOTE(review): this rebinding discards the program returned by compile() above and
# continues with the original program, whose predictors are pointed at the
# checkpoint below - confirm that this is the intended flow.
finetune_program = spreadsheeet_ananlyst

#Load program and activate model's parameters in program before evaluation
# NOTE(review): placeholder string - replace with the checkpoint directory written
# by the finetuning run before executing.
ckpt_path = "saved_checkpoint_path_from_finetuning"
# Fixed: pass the model id string, matching the other checkpoint-loading cell.
LM = dspy.HFModel(checkpoint=ckpt_path, model=model_name)

# Point every predictor at the checkpoint-backed LM and set its `activated` flag to False.
for p in finetune_program.predictors():
    p.lm = LM
    p.activated = False