# Environment bootstrap for a vast.ai instance (original note: "enter in terminal").
# The leading `!` is the notebook shell-escape form of each command.
# Reinstall ipykernel for the current user, refresh apt and make sure pip is present.
!python3 -m pip install ipykernel -U --user --force-reinstall && apt update && apt install -y python3-pip

# Python packages: LlamaIndex + parsers/embeddings, accelerate, DSPy, Excel support, LangChain, ChromaDB.
!pip3 install llama-index llama-parse llama-index-embeddings-huggingface accelerate dspy-ai openpyxl langchain chromadb
# flash-attn is installed without build isolation; the model reload below requests "flash_attention_2".
!pip3 install flash-attn --no-build-isolation
!pip3 install sentencepiece protobuf evaluate
# Copy the scoring-model workbook from the repo checkout into /workspace/data, where the notebook reads it.
!cp /workspace/repos/agentic-ai/MASTER\ -\ PYTHON\ -\ SCORING\ MODEL\ -\ MCG\ MADISON\ RIDGE\ DST\ -\ v2.0.xlsx /workspace/data

# Extra packages (presumably for ROUGE scoring via `evaluate` -- TODO confirm).
!pip3 install rouge_score absl-py

In [None]:
import gc
import os

import pandas as pd
import numpy as np

import dspy
from dspy.evaluate import Evaluate
from dspy.datasets.hotpotqa import HotPotQA
from dspy.teleprompt import BootstrapFewShotWithRandomSearch

# from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
# from llama_index.embeddings.huggingface import HuggingFaceEmbedding
# from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
# from llama_index.core.embeddings import resolve_embed_model
# import chromadb
# from chromadb.utils import embedding_functions
# from langchain.text_splitter import SentenceTransformersTokenTextSplitter
# from llama_index.readers.file import PandasExcelReader
# CHROMA_COLLECTION_NAME = "blockchain_and_ai"
# CHROMADB_DIR = "/workspace/data/db/"

from typing import List, Any, Callable, Optional
from pydantic import BaseModel

import torch
from transformers import AutoModelForCausalLM
from dspy.retrieve.chromadb_rm import ChromadbRM

from train_utils import get_csv_string, randomize_row_values, operators_dict, range_description_json
from models import SpreadSheetAnalyzer

from dotenv import load_dotenv
load_dotenv('/workspace/repos/agentic-ai/.env')


In [None]:
# Checkpoints tried previously (kept for reference):
#   "EleutherAI/gpt-neo-125m"
#   "clibrain/mamba-2.8b-instruct-openhermes"   # 8K context window
#   "microsoft/Phi-3-mini-128k-instruct"        # 128K context window
#   "meta-llama/Meta-Llama-3-8B-Instruct"       # 8K context window
#   "mistralai/Mistral-7B-Instruct-v0.3"        # 32K context window
print('first model load...')
model_name = "Qwen/Qwen2-1.5B-Instruct"
access_token = os.environ.get('HF_TOKEN')

# Let DSPy build its HF wrapper (tokenizer + model) first, then set the
# generation options: greedy decoding, short completions, mild repetition penalty.
llm = dspy.HFModel(model=model_name, hf_device_map='auto', token=access_token)
llm.kwargs.update(
    max_new_tokens=100,
    repetition_penalty=1.1,
    do_sample=False,
)
# Disabled experiments: typical_p=0.9, temperature=0.9,
# llm.tokenizer.return_full_text = False

# Drop the weights loaded by the wrapper and collect garbage before loading
# the replacement model.
print('deleting model...')
llm.model = None
gc.collect()

# Reload the same checkpoint directly through transformers, this time in
# float16 with the flash-attention-2 implementation.
print('reloading model...')
reload_options = dict(
    quantization_config=None,
    trust_remote_code=True,
    device_map="auto",
    attn_implementation="flash_attention_2",
    torch_dtype=torch.float16,
)
llm.model = AutoModelForCausalLM.from_pretrained(model_name, **reload_options)

# Disabled: forcing pad_token_id to eos_token_id on the generation config / tokenizer.
# llm.model.generation_config.pad_token_id = llm.tokenizer.eos_token_id
# llm.tokenizer.pad_token_id = llm.tokenizer.eos_token_id

# Make this LM the default for every DSPy module in the notebook.
dspy.settings.configure(lm=llm)

In [None]:
# Workbook and the row labels whose values serve as ground truth.
filepath = "/workspace/data/MASTER - PYTHON - SCORING MODEL - MCG MADISON RIDGE DST - v2.0.xlsx"
disposition_inputs = [
    "Selling Costs",
    "Disposition Fee",
    "Net Operating Income",
    "Loan Assumption/Payoff",
    "Return of Forecasted Reserves",
    "CF Y 11",
    "Return of Maximum Offering Amount",
    "Projected Terminal Cap Rate",
    "Cash Flows",
]
fee_columns = ['Disposition Fee', 'Selling Costs']
cashflow_columns = list(range(1, 10))  # integer labels 1..9

# Read the sheet without a header row, then discard fully-empty rows and columns.
dfs = (
    pd.read_excel(filepath, sheet_name="5 - Disposition Analysis", header=None)
    .dropna(axis=0, how='all')
    .dropna(axis=1, how='all')
)

# Keep rows whose label in column 1 is one of the wanted names or cash-flow
# numbers, restricted to the first two remaining columns.
wanted_labels = disposition_inputs + cashflow_columns
label_mask = dfs[1].isin(wanted_labels)
ground_truth = (
    dfs.loc[label_mask]
    .iloc[:, :2]
    # index labels 16 and 17 are the duplicate Selling / Disposition cost rows
    .drop(labels=[16, 17], axis=0)
)



In [None]:
# Reference lookup: label (first kept column) -> expected value rendered as text.
# Labels are used as-is; an earlier variant that renamed integer labels to
# "Cash Flows <n>" is disabled.
gt_collect = {
    record.values[0]: str(record.values[1])
    for _, record in ground_truth.iterrows()
}

from llama_index.readers.file import PandasExcelReader

# Load the same sheet through LlamaIndex's Excel reader; empty cells are kept
# as empty strings instead of NaN (keep_default_na=False).
filepath = "/workspace/data/MASTER - PYTHON - SCORING MODEL - MCG MADISON RIDGE DST - v2.0.xlsx"
excel_reader = PandasExcelReader(
    sheet_name="5 - Disposition Analysis",
    pandas_config={'keep_default_na': False},
)
docs = excel_reader.load_data(filepath)

In [None]:
# Enable IPython's autoreload so edits to the local `train_utils` / `models`
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

from train_utils import get_csv_string, randomize_row_values, operators_dict, range_description_json
from models import SpreadSheetAnalyzer
# Build the analyzer program from the range descriptions and operator table.
# NOTE: the variable name is misspelled, but later cells reference it under
# exactly this spelling, so it must not be renamed in isolation.
spreadsheeet_ananlyst = SpreadSheetAnalyzer(range_description_json, operators_dict)

In [None]:
# Shuffle/augment the sheet, serialise it to CSV text, then ask the analyzer
# for every entry described in range_description_json.
dfs_aug = randomize_row_values(dfs, ground_truth=ground_truth, n_samples=16)
dfs_str = get_csv_string(dfs_aug)  # swap in get_csv_string(dfs) to test the unmodified sheet

collection = []
for value_to_extract in range_description_json:
    print('Extracting value for:', value_to_extract)
    question = f"Get the value for: {value_to_extract}."

    # The analyzer is expected to hand back exactly (name, values).
    extracted_name, extracted_values = spreadsheeet_ananlyst(dfs_str, question, verbose=True)
    collection.append((extracted_name, extracted_values))


In [None]:
# Exact-match flag per extracted item; names missing from the ground truth
# count as wrong. gt_collect stores values as strings, so an extracted value
# only matches when it equals that string.
acc = [
    label in gt_collect and extracted == gt_collect[label]
    for label, extracted in collection
]
np.mean(acc)

In [None]:
# baseline = 0.35294117647058826
# 0.29411764705882354
# 0.23529411764705882
# 0.17647058823529413
# 0.17647058823529413
# 0.23529411764705882

In [None]:
# Display the collected extraction results for manual inspection.
collection

In [None]:
# Deliberate stop so "Run All" halts before the fine-tuning cells below.
# A bare `raise` with no active exception also raises RuntimeError, but with an
# opaque "No active exception to reraise" message; state the intent explicitly.
raise RuntimeError("Intentional stop: run the fine-tuning cells below manually.")

In [None]:
# start with getting the correct value, then move values around in the spreadsheet

In [None]:
# Question: Get the value for Return of Maximum Offering Amount.
# Extracted values: Return of Maximum Offering Amount: 44386706.96773932
# Question: What is the return on maximum offering amount? Please provide a floating point number less than zero.
# Extracted values: Return of Maximum Offering Amount: -77670566.54709445

# Fine Tuning

In [None]:
# Rebuild the reference lookup for the fine-tuning section:
# label (first kept column) -> expected value rendered as text.
# Labels are used unchanged; the variant that renamed integer labels to
# "Cash Flows <n>" stays disabled.
gt_collect = {
    record.values[0]: str(record.values[1])
    for _, record in ground_truth.iterrows()
}

In [None]:
num_rounds = 10

# Initial augmentation; it is replaced on the first loop iteration below, but
# is kept so the helper is invoked the same number of times as before.
dfs_aug = randomize_row_values(dfs, ground_truth=ground_truth, n_samples=15)
dfs_str = get_csv_string(dfs_aug)  # get_csv_string(dfs) would use the unmodified sheet

# One freshly augmented sheet per round, and one example per ground-truth label
# per sheet. Inputs are `question` and `data`; `answer` is "<label>: <value>".
train_data = []
for _ in range(num_rounds):
    dfs_aug = randomize_row_values(dfs, ground_truth=ground_truth, n_samples=15)
    dfs_str = get_csv_string(dfs_aug)

    train_data.extend(
        dspy.Example(
            question=f"Get the value for: {label}.",
            data=dfs_str,
            answer=f"{label}: {expected}",
        ).with_inputs('question', 'data')
        for label, expected in gt_collect.items()
    )

    



In [None]:
# Enable IPython's autoreload so edits to the local `train_utils` / `models`
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

from train_utils import get_csv_string, randomize_row_values, operators_dict, range_description_json
from models import SpreadSheetAnalyzer
# Re-create the analyzer program from the range descriptions and operator table.
# NOTE: the variable name is misspelled, but other cells reference it under
# exactly this spelling, so it must not be renamed in isolation.
spreadsheeet_ananlyst = SpreadSheetAnalyzer(range_description_json, operators_dict)

In [None]:
# Debug helper: list the attributes of the optimizer's inner teleprompter.
# `finetune_optimizer` is only assigned in a later cell, so that cell must
# have been executed before this one.
dir(finetune_optimizer.teleprompter)

In [None]:
from dspy.teleprompt import BootstrapFewShotWithRandomSearch, BootstrapFinetune

# 70 % of the generated examples go to training; the remainder is held out.
perc_train = 0.7
num_train = int(perc_train * len(train_data))
metric = dspy.evaluate.metrics.answer_exact_match

# Fine-tuning hyper-parameters; `target` is the checkpoint name to fine-tune.
# (path_prefix is left at its default.)
config = {
    "target": model_name,
    "epochs": 2,
    "bf16": True,
    "bsize": 32,
    "accumsteps": 1,
    "lr": 5e-5,
}

# Bootstrap traces from the analyzer and fine-tune on them; no metric is
# passed to the optimizer here.
finetune_optimizer = BootstrapFinetune(metric=None)
train_split = train_data[:num_train]
finetune_program = finetune_optimizer.compile(spreadsheeet_ananlyst, trainset=train_split, **config)

# finetune_program = spreadsheeet_ananlyst

# #Load program and activate model's parameters in program before evaluation
# ckpt_path = "saved_checkpoint_path_from_finetuning"
# LM = dspy.HFModel(checkpoint=ckpt_path, model=llm)

# for p in finetune_program.predictors():
#     p.lm = LM
#     p.activated = False

In [None]:
# Score the analyzer on the first ten held-out examples with the exact-match metric.
holdout = train_data[num_train:num_train + 10]
scores = [
    metric(example, spreadsheeet_ananlyst(**example.inputs()))
    for example in holdout
]
np.mean(scores)

In [None]:
# Fresh augmented sheet, then one query per ground-truth label. Each entry
# pairs the raw prediction object with the expected "<label>: <value>" string.
dfs_aug = randomize_row_values(dfs, ground_truth=ground_truth, n_samples=16)
dfs_str = get_csv_string(dfs_aug)

collection = []
for value_to_extract, expected_value in gt_collect.items():
    print('Extracting value for:', value_to_extract)
    question = f"Get the value for: {value_to_extract}."

    pred = spreadsheeet_ananlyst(dfs_str, question, verbose=True)
    expected_answer = f"{value_to_extract}: {expected_value}"
    collection.append((pred, expected_answer))


In [None]:
# Show each predicted answer next to its expected answer.
for prediction, expected in collection:
    print(prediction.answer, expected)

In [None]:
# Fraction of predictions whose answer string equals the expected string exactly.
np.mean([prediction.answer == expected for prediction, expected in collection])

In [None]:
from dspy.teleprompt.signature_opt_typed import optimize_signature
from dspy.evaluate.metrics import answer_exact_match
from dspy.functional import TypedChainOfThought

# `devset` was never defined anywhere in the notebook (NameError). Use the
# held-out slice of the generated examples, the same split the other cells
# treat as validation data.
devset = train_data[num_train:]

# Search for a better prompt signature, scoring candidates by exact match on devset.
# NOTE(review): the student signature only declares `question`, while the
# examples in train_data also carry a `data` input -- confirm whether the
# signature should include it.
compiled_program = optimize_signature(
    student=TypedChainOfThought("question -> answer"),
    evaluator=Evaluate(devset=devset, metric=answer_exact_match, num_threads=10, display_progress=True),
    n_iterations=50,
).program

In [None]:
from dspy.teleprompt import BootstrapFewShotWithRandomSearch, BootstrapFinetune

# 70 % of the generated examples go to training; the remainder is validation.
perc_train = 0.7
num_train = int(len(train_data) * perc_train)
metric = dspy.evaluate.metrics.answer_exact_match

#Compile program on current dspy.settings.lm
# Fixed: the call previously went through an undefined name `tp`, leaving the
# optimizer created here unused.
fewshot_optimizer = BootstrapFewShotWithRandomSearch(metric=metric, max_bootstrapped_demos=2, num_threads=1)
your_dspy_program_compiled = fewshot_optimizer.compile(
    spreadsheeet_ananlyst,
    trainset=train_data[:num_train],
    valset=train_data[num_train:],
)

#Configure model to finetune
# Fixed: `target` is the checkpoint name, as in the earlier fine-tuning cell,
# not the loaded model object.
config = dict(target=model_name, epochs=2, bf16=True, bsize=1, accumsteps=2, lr=5e-5)

#Compile program on BootstrapFinetune
# Fixed: the undefined placeholder `some_new_dataset_for_finetuning_model` is
# replaced by the training split used by the other fine-tuning cell.
finetune_optimizer = BootstrapFinetune(metric=metric)
finetune_program = finetune_optimizer.compile(spreadsheeet_ananlyst, trainset=train_data[:num_train], **config)

# NOTE(review): this rebinds finetune_program to the base analyzer, so the
# checkpoint LM below is attached to the base program's predictors and the
# program returned by compile() is discarded -- confirm this is intended.
finetune_program = spreadsheeet_ananlyst

#Load program and activate model's parameters in program before evaluation
# ckpt_path is a placeholder; point it at the checkpoint written by fine-tuning.
ckpt_path = "saved_checkpoint_path_from_finetuning"
# Fixed: pass the checkpoint name string rather than the loaded model object.
LM = dspy.HFModel(checkpoint=ckpt_path, model=model_name)

for p in finetune_program.predictors():
    p.lm = LM
    p.activated = False