!pip3 install llama-index llama-parse llama-index-embeddings-huggingface accelerate dspy-ai openpyxl langchain chromadb
!pip3 install flash-attn --no-build-isolation

!pip3 install sentencepiece protobuf

!cp /workspace/repos/agentic-ai/MASTER\ -\ PYTHON\ -\ SCORING\ MODEL\ -\ MCG\ MADISON\ RIDGE\ DST\ -\ v2.0.xlsx /workspace/data

In [None]:
import json
import gc
import io
import os

import pandas as pd
import numpy as np
import operator

import dspy
from dspy.evaluate import Evaluate
from dspy.datasets.hotpotqa import HotPotQA
from dspy.teleprompt import BootstrapFewShotWithRandomSearch

# from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
# from llama_index.embeddings.huggingface import HuggingFaceEmbedding
# from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
# from llama_index.core.embeddings import resolve_embed_model
# import chromadb
# from chromadb.utils import embedding_functions
# from langchain.text_splitter import SentenceTransformersTokenTextSplitter
# from llama_index.readers.file import PandasExcelReader
# CHROMA_COLLECTION_NAME = "blockchain_and_ai"
# CHROMADB_DIR = "/workspace/data/db/"

from typing import List, Any, Callable, Optional
from pydantic import BaseModel

import torch
from transformers import AutoModelForCausalLM
from dspy.retrieve.chromadb_rm import ChromadbRM

from dotenv import load_dotenv
load_dotenv('/workspace/repos/agentic-ai/.env')


In [None]:
def randomize_row_values(dfs: pd.DataFrame, ground_truth: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``dfs`` with a random sample of ground-truth rows relocated.

    A random number of rows is sampled from ``ground_truth``. In the copy, the
    first two columns of the sampled rows are blanked (set to NaN), and each
    sampled row's values are written into a randomly chosen row, in two adjacent
    columns starting at column position 2 or later. ``dfs`` is not modified.

    Parameters:
    - dfs: The spreadsheet frame to perturb. Must have at least four columns
           (two source columns plus two adjacent target columns).
    - ground_truth: Frame with two columns whose index labels are row labels
           of ``dfs``.

    Returns:
    - The perturbed copy. If ``ground_truth`` is empty, an unchanged copy.

    Raises:
    - ValueError: if ``dfs`` has fewer than four columns.
    """
    dfs_copy = dfs.copy()
    if ground_truth.empty:
        return dfs_copy

    columns = dfs_copy.columns
    n_columns = len(columns)
    if n_columns < 4:
        raise ValueError("dfs needs at least 4 columns to relocate ground-truth values")

    # Sample between 1 and len(ground_truth) - 1 rows; the max() keeps a
    # single-row ground truth from producing an empty randint range.
    n_samples = np.random.randint(1, max(len(ground_truth), 2))
    sampled = ground_truth.sample(n=n_samples)

    # Blank the original location once, before placing anything, and select the
    # first two columns by position so this works for any column labels and can
    # never erase a value placed below.
    dfs_copy.loc[sampled.index, columns[:2]] = np.nan

    for _, values in sampled.iterrows():
        target_row = np.random.choice(dfs_copy.index)
        # Left target column position is in [2, n_columns - 2] so that the
        # right-hand neighbour always exists.
        left_pos = np.random.randint(2, n_columns - 1)
        target_columns = [columns[left_pos], columns[left_pos + 1]]
        dfs_copy.loc[target_row, target_columns] = values.to_numpy()
    return dfs_copy


def get_csv_string(dfs: pd.DataFrame) -> str:
    """
    Serialize a DataFrame to CSV text (header row included, index omitted).
    """
    # Write into an in-memory text buffer and hand back its full contents.
    with io.StringIO() as csv_buffer:
        dfs.to_csv(csv_buffer, index=False)
        return csv_buffer.getvalue()



In [None]:
# Model setup: build the DSPy Hugging Face wrapper, then swap the wrapper's
# underlying model for one loaded with flash-attention 2 in float16.
# Alternative checkpoints tried previously are kept below for reference.
# model_name = "EleutherAI/gpt-neo-125m"
# model_name = "clibrain/mamba-2.8b-instruct-openhermes"
# model_name = "microsoft/Phi-3-mini-128k-instruct" # 128K context window
# model_name = "meta-llama/Meta-Llama-3-8B-Instruct" # 8K context window
# model_name = "clibrain/mamba-2.8b-instruct-openhermes" # 8K context window
print('first model load...')
model_name = "Qwen/Qwen2-1.5B-Instruct"
# Hugging Face token read from the environment (presumably provided by the
# .env file loaded earlier — confirm); None if HF_TOKEN is unset.
access_token = os.getenv('HF_TOKEN')
# model_name = "mistralai/Mistral-7B-Instruct-v0.3" # 32K context window
llm = dspy.HFModel(model=model_name, hf_device_map='auto', token=access_token)
# Generation settings stored on the wrapper's kwargs.
llm.kwargs['max_new_tokens']=100
llm.kwargs['repetition_penalty']=1.1
llm.kwargs['do_sample']=False
# llm.kwargs['typical_p']=0.9
# NOTE(review): temperature is set while do_sample is False; with sampling
# disabled the temperature is presumably ignored — confirm which is intended.
llm.kwargs['temperature']=0.9
# llm.tokenizer.return_full_text = False


# Drop the wrapper's reference to the first model and run the garbage
# collector before loading the replacement.
# NOTE(review): presumably done to free memory before the second load; no
# torch.cuda.empty_cache() call is made here — confirm whether one is needed.
print('deleting model...')
llm.model=None
gc.collect()
print('reloading model...')
# Reload the same checkpoint directly through transformers with
# flash-attention 2, float16 weights and automatic device placement.
llm.model=AutoModelForCausalLM.from_pretrained(model_name, quantization_config=None, 
                                               trust_remote_code=True, device_map="auto", 
                                               attn_implementation="flash_attention_2",  
                                               torch_dtype=torch.float16)

# llm.model.generation_config.pad_token_id = llm.tokenizer.eos_token_id
# llm.tokenizer.pad_token_id = llm.tokenizer.eos_token_id

# Make this wrapper the LM used by DSPy modules from here on.
dspy.settings.configure(lm=llm)

In [None]:
# Load the "5 - Disposition Analysis" sheet and derive the ground-truth
# (name, value) rows used to check the extractor.
filepath = "/workspace/data/MASTER - PYTHON - SCORING MODEL - MCG MADISON RIDGE DST - v2.0.xlsx"
# Row labels (as they appear in column 1 of the sheet) whose values are ground truth.
disposition_inputs = [
  "Selling Costs",
  "Disposition Fee",
  "Net Operating Income",
  "Loan Assumption/Payoff",
  "Return of Forecasted Reserves",
  "CF Y 11",
  "Return of Maximum Offering Amount",
  "Projected Terminal Cap Rate",
  "Cash Flows"
]
# header=None keeps every sheet row as data, so columns get integer labels.
dfs = pd.read_excel(filepath, sheet_name="5 - Disposition Analysis", header=None)
# Remove rows and columns that are entirely empty.
dfs.dropna(axis=0, how='all', inplace=True)
dfs.dropna(axis=1, how='all', inplace=True)
# NOTE(review): fee_columns is not referenced anywhere else in the visible code.
fee_columns = ['Disposition Fee', 'Selling Costs']
# Integer labels 1-9 that are matched in column 1 alongside the text labels
# (presumably the per-period cash-flow rows — confirm against the sheet).
cashflow_columns = [1,2,3,4,5,6,7,8,9]
# Keep rows whose column-1 value is one of the labels above, and only the
# first two remaining columns.
ground_truth = dfs[dfs[1].isin(disposition_inputs+cashflow_columns)].iloc[:, :2] # Get only the necessary columns
# NOTE(review): row labels 16 and 17 are hard-coded and depend on the current
# sheet layout; this raises KeyError if those labels are not in ground_truth.
ground_truth.drop(labels=[16, 17], axis=0, inplace=True) # drop the duplicate Selling and Disposition Costs



# Second read of the same sheet through LlamaIndex's Excel reader.
# NOTE(review): filepath is reassigned to the same value as above, and docs is
# not used again in the visible code — confirm whether this is still needed.
from llama_index.readers.file import PandasExcelReader
filepath = "/workspace/data/MASTER - PYTHON - SCORING MODEL - MCG MADISON RIDGE DST - v2.0.xlsx"
docs = PandasExcelReader(sheet_name="5 - Disposition Analysis", pandas_config={'keep_default_na':False}).load_data(filepath)

In [None]:
def is_float(s):
    """
    Report whether the string ``s`` parses as a float once every '$' and '%'
    character has been removed.
    """
    stripped = s.replace('$', '').replace('%', '')
    try:
        float(stripped)
    except ValueError:
        return False
    else:
        return True

def parse_output(output: str, field: str) -> str:
    """
    Pull the text that follows the last '<field>: ' marker in ``output``.

    The result is cut at the first '---' if one is present, otherwise at the
    first newline, and is returned with surrounding whitespace removed. When
    the marker is absent the whole of ``output`` is treated as the tail.
    """
    marker = field + ': '
    # Everything after the last marker (or the full text if there is none).
    tail = output.rpartition(marker)[2]
    terminator = '---' if '---' in tail else '\n'
    return tail.partition(terminator)[0].strip()
    
def is_in_range(value, bounds, ops=(operator.ge, operator.le)):
    """
    Test a value against a lower bound and, optionally, an upper bound.

    Parameters:
    - value: The number to check.
    - bounds: A sequence holding the limits. With exactly two items they are
              (lower, upper); otherwise only the first item is used, as the
              single limit. An upper limit of None is treated as absent.
    - ops: Comparison functions from the operator module. ops[0] is called as
           ops[0](value, lower); ops[1] is called as ops[1](value, upper) and
           only when an upper limit exists. Defaults to (>=, <=).

    Returns:
    - The result of the first comparison when there is no upper limit;
      otherwise the first comparison AND the second.
    """
    if len(bounds) == 2:
        lower, upper = bounds
    else:
        lower, upper = bounds[0], None

    lower_ok = ops[0](value, lower)
    if upper is None:
        return lower_ok
    return lower_ok and ops[1](value, upper)

# DSPy signature for the extraction step: from a question and the spreadsheet
# text, the LM produces an answer shaped like "{variable name}: {extracted value}".
# NOTE: DSPy uses a Signature's docstring as the task instruction given to the
# LM, so the docstring below is prompt text — rewording it changes behavior.
class SpreadsheetValueExtractor(dspy.Signature):
    """Extract the values for variable names contained in the context."""

    # Request naming the variable whose value should be extracted.
    question = dspy.InputField(format=str)
    # Spreadsheet content as text.
    # NOTE(review): desc says "json string", but the visible caller passes the
    # CSV text produced by get_csv_string() — confirm which is intended.
    context = dspy.InputField(format=str, desc="json string representation of a spreadsheet.")
    # Model output; downstream code splits it on ': ' into name and value.
    answer = dspy.OutputField(desc='{variable name}: {extracted value}.')

# DSPy signature used to rewrite a question after the extracted value failed
# its range check; the rewritten question should carry the format description.
# NOTE: DSPy uses a Signature's docstring as the task instruction given to the
# LM, so the docstring below is prompt text — rewording it changes behavior.
class FormatCorrectQuestion(dspy.Signature):
    """The extracted value for the given variable name is in the wrong format range.
    Rephrase the question and include the format description."""

    # The question that produced the out-of-range value.
    question = dspy.InputField(format=str, desc="The original question.")
    # The answer text that failed validation.
    extracted_value = dspy.InputField(format=str)
    # Plain-language statement of the expected range for the variable.
    format_description = dspy.InputField(format=str)
    # Rewritten question; SpreadSheetAnalyzer parses it with the
    # 'Corrected Question' prefix.
    corrected_question = dspy.OutputField(format=str)

# DSPy signature used to rewrite a question after the extracted value could
# not be parsed as a float.
# NOTE: DSPy uses a Signature's docstring as the task instruction given to the
# LM, so the docstring below is prompt text — rewording it changes behavior.
class FloatQuestionCorrector(dspy.Signature):
    """The extracted value for the given variable name cannot be converted to a float. 
    Rephrase the question to focus on extracting a float value for the variable name 
    given in the question."""

    # The question that produced the non-numeric value.
    question = dspy.InputField(format=str, desc="The original question.")
    # The answer text that could not be converted to a float.
    extracted_value = dspy.InputField(format=str)
    # Rewritten question; SpreadSheetAnalyzer parses it with the
    # 'Corrected Float Question' prefix.
    corrected_float_question = dspy.OutputField(format=str)

class SpreadSheetAnalyzer(dspy.Module):
    """
    Extract a named numeric value from spreadsheet text and validate it.

    The module asks the LM for a "{name}: {value}" answer, checks that the
    value parses as a float, then checks it against the range rule registered
    for that name. When a check fails, the question is rewritten by the LM and
    the extraction is retried a limited number of times.

    Parameters:
    - range_description_json: Maps a variable name to a plain-language
      description of its expected range (passed to the LM when rewriting).
    - operators_dict: Maps a variable name to a dict with 'bounds' and
      'operators' entries, in the form accepted by is_in_range().
    """

    def __init__(self, range_description_json, operators_dict):
        super().__init__()
        self.range_description_json = range_description_json
        self.operators_dict = operators_dict
        self.extraction = dspy.Predict(SpreadsheetValueExtractor)
        self.question_rewriter = dspy.Predict(FormatCorrectQuestion)
        self.float_question_corrector = dspy.Predict(FloatQuestionCorrector)

    @staticmethod
    def _split_answer(extracted_value):
        """Split '{name}: {value}' text into (name, value text)."""
        parts = extracted_value.split(': ')
        return parts[0], parts[-1]

    @staticmethod
    def _to_float(value_text):
        """Convert value text to a float, dropping the same '$' and '%' characters that is_float() ignores."""
        return float(value_text.replace('$', '').replace('%', ''))

    def correct_float_question(self, question, extracted_value, data, max_attempts=3, verbose=False):
        """
        Rewrite the question and re-extract until the value parses as a float.

        Parameters:
        - question: The question that produced a non-numeric value.
        - extracted_value: The answer text that failed the float check.
        - data: Spreadsheet text used as extraction context.
        - max_attempts: Maximum number of rewrite-and-retry rounds.
        - verbose: Print each failed and rewritten question.

        Returns:
        - (extracted answer text, question) from the first round whose value
          parses as a float, or from the last round if none does. With
          max_attempts=0 the inputs are returned unchanged.
        """
        for _attempt in range(max_attempts):
            if verbose:
                print('Float Question Failed:', question)
            rewritten_out = self.float_question_corrector(question=question, extracted_value=extracted_value)
            question = parse_output(rewritten_out.corrected_float_question, 'Corrected Float Question')
            if verbose:
                print('Float Question Corrected:', question)
            extracted_out = self.extraction(question=question, context=data)
            extracted_value = parse_output(extracted_out.answer, 'Answer')
            if is_float(self._split_answer(extracted_value)[1]):
                break
        return extracted_value, question

    def correct_format_question(self, question, data, parsed_name, extracted_value, max_attempts=3, verbose=False):
        """
        Rewrite the question and re-extract until the value is within range.

        Parameters:
        - question: The question that produced an out-of-range value.
        - data: Spreadsheet text used as extraction context.
        - parsed_name: Variable name used to look up the range rule and the
          range description.
        - extracted_value: The answer text that failed the range check.
        - max_attempts: Maximum number of rewrite-and-retry rounds.
        - verbose: Print each failed and rewritten question.

        Returns:
        - (float value, question). The value is the first in-range one, or
          otherwise the most recent value that parsed as a float; it is None
          only if no numeric value was available at all.

        Raises:
        - KeyError: if parsed_name has no entry in the range dictionaries.
        """
        # Start from the incoming value so there is always something to return,
        # even when no retry runs or every retry yields non-numeric text.
        value_text = self._split_answer(extracted_value)[1]
        parsed_values = self._to_float(value_text) if is_float(value_text) else None

        for _attempt in range(max_attempts):
            if verbose:
                print('Format Question Failed:', question)
            rewritten_out = self.question_rewriter(question=question, extracted_value=extracted_value, format_description=self.range_description_json[parsed_name])
            question = parse_output(rewritten_out.corrected_question, 'Corrected Question')
            if verbose:
                print('Format Question Corrected:', question)
            extracted_out = self.extraction(question=question, context=data)
            extracted_value = parse_output(extracted_out.answer, 'Answer')
            value_text = self._split_answer(extracted_value)[1]
            if not is_float(value_text):
                # A non-numeric retry cannot be range-checked; try again
                # instead of failing inside float().
                continue
            parsed_values = self._to_float(value_text)
            if is_in_range(parsed_values, bounds=self.operators_dict[parsed_name]['bounds'], ops=self.operators_dict[parsed_name]['operators']):
                return parsed_values, question
        return parsed_values, question

    def forward(self, data, question, verbose=False):
        """
        Extract, validate and, if needed, re-ask for one variable's value.

        Parameters:
        - data: Spreadsheet text used as extraction context.
        - question: Request naming the variable to extract.
        - verbose: Print intermediate parsing and rewrite steps.

        Returns:
        - (parsed_name, parsed_values, valid_format_tf, question) where
          parsed_name is the name from the first extraction, parsed_values is
          the final float (None if no numeric value could be obtained),
          valid_format_tf tells whether the value passed the range check
          before any range-driven retry, and question is the last question
          asked.

        Raises:
        - KeyError: if parsed_name has no entry in the range dictionaries.
        """
        extracted_out = self.extraction(question=question, context=data)
        extracted_value = parse_output(extracted_out.answer, 'Answer')
        parsed_name, value_text = self._split_answer(extracted_value)
        if verbose:
            print(f'Parsed Name: {parsed_name}, Parsed Values: {value_text}')

        # Safeguard - check if the extracted value can be converted to a float
        if not is_float(value_text):
            extracted_value, question = self.correct_float_question(question,
                                                                    extracted_value,
                                                                    data,
                                                                    verbose=verbose)
            # Use the re-extracted text; the original text is known to be
            # non-numeric and must not reach the range check.
            value_text = self._split_answer(extracted_value)[1]
            if not is_float(value_text):
                return parsed_name, None, False, question
        parsed_values = self._to_float(value_text)

        # Safeguard - check if the extracted value falls within the expected range
        range_rule = self.operators_dict[parsed_name]
        valid_format_tf = is_in_range(parsed_values,
                                      bounds=range_rule['bounds'],
                                      ops=range_rule['operators'])
        if not valid_format_tf:
            parsed_values, question = self.correct_format_question(question,
                                                                   data,
                                                                   parsed_name,
                                                                   extracted_value,
                                                                   verbose=verbose)

        return parsed_name, parsed_values, valid_format_tf, question

In [None]:
# Range rule per variable name: 'bounds' holds the limit(s) and 'operators'
# the comparison applied to each limit, in the form is_in_range() accepts.
operators_dict = {
    'Selling Costs': {'operators': (operator.ge, operator.le), 'bounds': (0, 1)},
    'Disposition Fee': {'operators': (operator.ge, operator.le), 'bounds': (0, 1)},
    'Net Operating Income': {'operators': (operator.ge,), 'bounds': (0,)},
    'Loan Assumption/Payoff': {'operators': (operator.le,), 'bounds': (0,)},
    'Return of Forecasted Reserves': {'operators': (operator.le,), 'bounds': (0,)},
    'CF Y 11': {'operators': (operator.ge,), 'bounds': (0,)},
    'Return of Maximum Offering Amount': {'operators': (operator.le,), 'bounds': (0,)},
    'Projected Terminal Cap Rate': {'operators': (operator.ge, operator.le), 'bounds': (0, 1)},
    # 'Cash Flows 1' through 'Cash Flows 9' all share the same rule.
    **{
        f'Cash Flows {idx}': {'operators': (operator.ge,), 'bounds': (1,)}
        for idx in range(1, 10)
    },
}


# Plain-language range description per variable name; this text is handed to
# the LM when a question has to be rewritten after a failed range check.
range_description_json = {
    'Selling Costs': 'float greater than 0 and less than 1',
    'Disposition Fee': 'float greater than 0 and less than 1',
    'Net Operating Income': 'float greater than 0',
    'Loan Assumption/Payoff': 'float less than 0',
    'Return of Forecasted Reserves': 'float less than 0',
    'CF Y 11': 'float greater than 0',
    'Return of Maximum Offering Amount': 'float less than 0',
    'Projected Terminal Cap Rate': 'float greater than 0 and less than 1',
    # 'Cash Flows 1' through 'Cash Flows 9' all share the same description.
    **{f'Cash Flows {idx}': 'float greater than 1' for idx in range(1, 10)},
}


In [None]:
# Analyzer instance shared by the extraction loop; it is given the range
# descriptions (for question rewrites) and the range rules (for validation).
spreadsheeet_ananalyst = SpreadSheetAnalyzer(
    range_description_json=range_description_json,
    operators_dict=operators_dict,
)

In [None]:
# Serialize the sheet once; the same CSV text is the context for every question.
dfs_str = get_csv_string(dfs)
# One (name, value, valid flag, final question) tuple per extracted variable.
collection = []
for value_to_extract in range_description_json:
    # Only the 'Cash Flows' variables are extracted in this run.
    if 'Cash Flows' in value_to_extract:
        print('Extracting value for:', value_to_extract)
        question = f"Get the value for {value_to_extract}."

        parsed_name, parsed_values, valid_value, question = spreadsheeet_ananalyst(
            dfs_str, question, verbose=True
        )
        collection.append((parsed_name, parsed_values, valid_value, question))
        # Show the expected range next to what was actually extracted.
        print(range_description_json[value_to_extract])
        print(parsed_name, parsed_values, valid_value, question, end='\n\n')


In [None]:
# Bare expression as the last line of a notebook cell: displays the collected
# (name, value, valid flag, question) tuples.
collection

In [None]:
# A bare `raise` with no active exception fails with a RuntimeError.
# NOTE(review): presumably a deliberate stop so that running all cells does
# not continue past this point — confirm.
raise

In [None]:
# start with getting the correct value, then move values around in the spreadsheet

In [None]:
# Question: Get the value for Return of Maximum Offering Amount.
# Extracted values: Return of Maximum Offering Amount: 44386706.96773932
# Question: What is the return on maximum offering amount? Please provide a floating point number less than zero.
# Extracted values: Return of Maximum Offering Amount: -77670566.54709445

# Fine Tuning