!pip3 install llama-index llama-parse llama-index-embeddings-huggingface accelerate dspy-ai openpyxl langchain chromadb
!pip3 install flash-attn --no-build-isolation

!pip3 install sentencepiece protobuf

!cp /workspace/repos/agentic-ai/MASTER\ -\ PYTHON\ -\ SCORING\ MODEL\ -\ MCG\ MADISON\ RIDGE\ DST\ -\ v2.0.xlsx /workspace/data

In [None]:
import json
import gc
import io

import pandas as pd
import numpy as np
import operator

import dspy
from dspy.evaluate import Evaluate
from dspy.datasets.hotpotqa import HotPotQA
from dspy.teleprompt import BootstrapFewShotWithRandomSearch

# from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
# from llama_index.embeddings.huggingface import HuggingFaceEmbedding
# from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
# from llama_index.core.embeddings import resolve_embed_model
# import chromadb
# from chromadb.utils import embedding_functions
# from langchain.text_splitter import SentenceTransformersTokenTextSplitter
# from llama_index.readers.file import PandasExcelReader
# CHROMA_COLLECTION_NAME = "blockchain_and_ai"
# CHROMADB_DIR = "/workspace/data/db/"

from typing import List, Any, Callable, Optional
from pydantic import BaseModel

import torch
from transformers import AutoModelForCausalLM
from dspy.retrieve.chromadb_rm import ChromadbRM

from dotenv import load_dotenv
import os
load_dotenv('/workspace/repos/agentic-ai/.env')


In [None]:
def randomize_row_values(dfs: pd.DataFrame, ground_truth: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``dfs`` with a random sample of ground-truth rows relocated.

    A random number of rows is sampled from ``ground_truth``. The original
    cells of those rows are blanked out in the copy, and each sampled row's
    values are then written into two adjacent columns of a randomly chosen
    row. ``dfs`` itself is never modified.

    Parameters:
    - dfs: The spreadsheet data to perturb.
    - ground_truth: Rows to relocate. Its index labels must exist in ``dfs``
      and each row must hold exactly two values (they are written into two
      adjacent columns).

    Returns:
    - The perturbed copy of ``dfs``. If ``ground_truth`` is empty the copy is
      returned unchanged.

    Raises:
    - ValueError: if ``dfs`` has too few columns to host a relocated pair
      (target positions start at column position 2 and need a right-hand
      neighbour).
    """
    dfs_copy = dfs.copy()

    n_rows = len(ground_truth)
    if n_rows == 0:
        return dfs_copy
    # Sample between 1 and n_rows - 1 rows; randint's upper bound is exclusive,
    # so a single-row ground truth needs a special case (randint(1, 1) raises).
    n_samples = np.random.randint(1, n_rows) if n_rows > 1 else 1
    sampled = ground_truth.sample(n=n_samples)

    columns = dfs_copy.columns
    # Candidate left-hand target positions: 2 .. len(columns) - 2.
    candidate_positions = np.arange(len(columns) - 1)[2:]
    if candidate_positions.size == 0:
        raise ValueError("dfs needs at least four columns to relocate values")

    # Blank the source cells once, before placing anything, so that a value
    # relocated in an earlier iteration is never wiped by a later one.
    # NOTE(review): ``:2`` is a *label* slice under .loc (inclusive of label 2);
    # confirm this matches the columns that ground_truth was taken from.
    dfs_copy.loc[sampled.index, :2] = np.nan

    for _, row in sampled.iterrows():
        target_row = np.random.choice(dfs_copy.index)
        position = np.random.choice(candidate_positions)
        target_cols = [columns[position], columns[position + 1]]
        dfs_copy.loc[target_row, target_cols] = row.values
    return dfs_copy


def get_csv_string(dfs: pd.DataFrame) -> str:
    """
    Serialize a DataFrame as CSV text (header row included, index omitted).
    """
    # With no target path/buffer, to_csv hands back the CSV text directly.
    return dfs.to_csv(index=False)



In [None]:
# Language-model setup: build a dspy.HFModel wrapper, then swap its underlying
# model for one loaded explicitly with flash-attention 2 in float16, and
# register the wrapper as the default dspy LM.
#
# Alternative checkpoints that were tried (kept for reference):
# model_name = "EleutherAI/gpt-neo-125m"
# model_name = "clibrain/mamba-2.8b-instruct-openhermes"
# model_name = "microsoft/Phi-3-mini-128k-instruct" # 128K context window
# model_name = "meta-llama/Meta-Llama-3-8B-Instruct" # 8K context window
# model_name = "clibrain/mamba-2.8b-instruct-openhermes" # 8K context window
print('first model load...')
model_name = "Qwen/Qwen2-1.5B-Instruct"
# Hugging Face access token read from the HF_TOKEN environment variable.
access_token = os.getenv('HF_TOKEN')
# model_name = "mistralai/Mistral-7B-Instruct-v0.3" # 32K context window
llm = dspy.HFModel(model=model_name, hf_device_map='auto', token=access_token)
# Generation settings stored on the wrapper's kwargs dict.
llm.kwargs['max_new_tokens']=100
llm.kwargs['repetition_penalty']=1.1
# llm.kwargs['do_sample']=True
# llm.kwargs['typical_p']=0.9
# llm.kwargs['temperature']=0.9
# llm.tokenizer.return_full_text = False


# Drop the reference to the model created by HFModel and run the garbage
# collector before loading the replacement.
# NOTE(review): presumably done to free memory before the second load — confirm.
print('deleting model...')
llm.model=None
gc.collect()
print('reloading model...')
# Reload the same checkpoint with explicit options: no quantization config,
# flash-attention 2, float16 weights, automatic device placement.
llm.model=AutoModelForCausalLM.from_pretrained(model_name, quantization_config=None, 
                                               trust_remote_code=True, device_map="auto", 
                                               attn_implementation="flash_attention_2",  
                                               torch_dtype=torch.float16)

# llm.model.generation_config.pad_token_id = llm.tokenizer.eos_token_id
# llm.tokenizer.pad_token_id = llm.tokenizer.eos_token_id

# Make this wrapper the language model used by dspy modules.
dspy.settings.configure(lm=llm)

In [None]:
# Load the "5 - Disposition Analysis" sheet and derive the ground-truth
# (name, value) rows used to check extraction results.
filepath = "/workspace/data/MASTER - PYTHON - SCORING MODEL - MCG MADISON RIDGE DST - v2.0.xlsx"
# Row labels whose values are to be extracted from the sheet.
disposition_inputs = [
  "Selling Costs",
  "Disposition Fee",
  "Net Operating Income",
  "Loan Assumption/Payoff",
  "Return of Forecasted Reserves",
  "CF Y 11",
  "Return of Maximum Offering Amount",
  "Projected Terminal Cap Rate",
  "Cash Flows"
]
# header=None: the sheet is read without a header row, so columns get integer labels.
dfs = pd.read_excel(filepath, sheet_name="5 - Disposition Analysis", header=None)
# Remove rows and columns that are entirely empty.
dfs.dropna(axis=0, how='all', inplace=True)
dfs.dropna(axis=1, how='all', inplace=True)
# NOTE(review): fee_columns is not referenced anywhere else in the visible source.
fee_columns = ['Disposition Fee', 'Selling Costs']
cashflow_columns = [1,2,3,4,5,6,7,8,9]
# Keep rows whose column-1 entry is one of the wanted labels (or a cash-flow
# number 1-9), restricted to the first two remaining columns.
ground_truth = dfs[dfs[1].isin(disposition_inputs+cashflow_columns)].iloc[:, :2] # Get only the necessary columns
# Index labels 16 and 17 are hard-coded for this particular workbook layout.
ground_truth.drop(labels=[16, 17], axis=0, inplace=True) # drop the duplicate Selling and Disposition Costs



# Second load of the same sheet through llama-index's Excel reader.
# keep_default_na=False is passed through to pandas via pandas_config.
from llama_index.readers.file import PandasExcelReader
filepath = "/workspace/data/MASTER - PYTHON - SCORING MODEL - MCG MADISON RIDGE DST - v2.0.xlsx"
docs = PandasExcelReader(sheet_name="5 - Disposition Analysis", pandas_config={'keep_default_na':False}).load_data(filepath)

In [None]:
import pydantic

# dspy signature for the extraction step: two inputs (question, data) and one
# output (answer).
# NOTE(review): the class docstring and the desc= strings are presumably used by
# dspy as prompt text, so treat them as runtime behaviour, not documentation.
# A class with the same name is defined again later in this file and replaces
# this one once that cell runs.
class SpreadsheetValueExtractor(dspy.Signature):
    """For each of the variable name in the question extract a single value from the data. Exact variable name matches only."""

    # User request naming the variables to extract.
    question = dspy.InputField(desc="the question from the user will specify multiple variable names.")
    # Spreadsheet content to search.
    data = dspy.InputField()
    # Extracted "variable_name: value" text.
    answer = dspy.OutputField(desc="only return the variable name and its value in this format: variable_name: value.")

# dspy signature for the clean-up step: takes the raw extraction output plus
# the format description and yields a reduced list of variables and values.
# NOTE(review): the docstring and desc= strings are presumably prompt text for
# dspy; do not edit them as if they were ordinary documentation.
class ExtractionCleanup(dspy.Signature):
    """Please clean up the extracted values. 
    Extract only the variables and values that are contained in the keys in the format_json."""
    # Format description whose keys name the variables to keep.
    format_json = dspy.InputField()
    # Raw text produced by the extraction step.
    extracted_values = dspy.InputField(desc="Contains too much information.")
    # Cleaned text restricted to the variables named in format_json.
    clean_list_of_variables_and_values = dspy.OutputField(desc="Only return the variables and values that are contained in the keys in the format_json.")


# dspy signature for the verification step: two inputs (format_json,
# extracted_values) and two outputs (verification, verified_values).
# NOTE(review): the docstring and desc= strings are presumably prompt text for
# dspy; do not edit them as if they were ordinary documentation.
class OutputFormatVerification(dspy.Signature):
    # """For each variable name in the extracted values string, compare its format to the format_json and return the variable name and value if the format matches."""
    """Please assess the format of the variable names and values in the extracted values string. 
    If the format matches the variable description in format_json, return True and the variable names and values that match the format. 
    If the format does not match, return False and an empty string."""
    # Expected format of each variable.
    format_json = dspy.InputField(desc="The format of the variable names and values.")
    # Comma-separated "name: value" text to be checked.
    extracted_values = dspy.InputField(desc="The string of variable names and values extracted from the spreadsheet separated by a comma.")
    # Verdict text requested as True or False.
    verification = dspy.OutputField(desc="Only return True or False.")
    # Names and values that passed the check.
    verified_values = dspy.OutputField(desc="Output a string of variable names and values.")

# class VariableNameAndValue(pydantic.BaseModel):
#     variable_names: str
#     variable_names_description: str

# class VariableNameAndValues(pydantic.BaseModel):
#     topics: List[VariableNameAndValue]

# class FormatOutput(dspy.Signature):
#     """For each variable name in the extracted values list, compare its format to the format_json and return the variable name and value if the format matches."""
#     format_json: str = dspy.InputField(desc="The format of the variable names and values.")
#     extracted_values: VariableNameAndValues = dspy.InputField(desc="The list of variable names and values extracted from the spreadsheet.")
#     verified_values: str = dspy.OutputField(desc="The list of variable names and values that match the format_json.")
    


class SpreadSheetAnalyzer(dspy.Module):
    """
    Three-step dspy pipeline: extract values, clean them up, verify their format.

    The clean-up and verification results are only printed for inspection;
    the returned prediction carries the raw extraction output.
    """

    def __init__(self):
        super().__init__()

        self.format_validation = dspy.Predict(OutputFormatVerification)
        self.extraction_cleanup = dspy.Predict(ExtractionCleanup)
        self.generate_answer = dspy.Predict(SpreadsheetValueExtractor)

    def forward(self, data, question, format_json):
        """
        Run extraction, clean-up and format verification for one question.

        Parameters:
        - data: Spreadsheet content handed to the extraction step.
        - question: Request naming the variables to extract.
        - format_json: Format description used by the clean-up and
          verification steps.

        Returns:
        - dspy.Prediction with ``context`` (the input data) and ``answer``
          (the raw text returned by the extraction step).
        """
        # ``predicted_value`` is already the answer *string*, not a Prediction.
        predicted_value = self.generate_answer(data=data, question=question).answer
        cleanup_out = self.extraction_cleanup(format_json=format_json, extracted_values=predicted_value)
        list_of_predicted_values = cleanup_out.clean_list_of_variables_and_values
        print('---------------------------------')
        # Print the cleaned text itself; wrapping a bare string in
        # dspy.Prediction(...) does not store it under any field.
        print(list_of_predicted_values)
        value_verification_output = self.format_validation(format_json=format_json, extracted_values=list_of_predicted_values)
        print('---------------------------------')
        print('$$$$$$ predicted value:\n', predicted_value)
        print('$$$$$$ verification:\n', value_verification_output.verification)
        print('$$$$$$ validated values:\n', value_verification_output.verified_values)
        # Return the string directly: it has no ``.answer`` attribute, so the
        # previous ``predicted_value.answer`` raised AttributeError.
        return dspy.Prediction(context=data, answer=predicted_value)
                               


In [None]:
# class SpreadsheetValueExtractor(dspy.Signature):
#     """For each of the variable name in the question extract a single value from the data. Exact variable name matches only."""

#     question = dspy.InputField(desc="the question from the user will specify multiple variable names.")
#     data = dspy.InputField()
#     answer = dspy.OutputField(desc="only return the variable name and its value in this format: variable_name: value.")

# class CorrectQuestion(dspy.Signature):
#     """Given the original extracted value and format description, 
#     modify the original question so that the ."""
#     question = dspy.InputField(format=str, desc="The original question.")
#     extracted_values = dspy.InputField(format=str)
#     reason_for_failure = dspy.InputField(format=str)
#     corrected_question = dspy.OutputField(format=str)

# class ExtractionCleanup(dspy.Signature):
#     """Please clean up the extracted values. 
#     Extract only the variables and values that are contained in the keys in the format_json."""
#     format_json = dspy.InputField()
#     extracted_values = dspy.InputField(desc="Contains too much information.")
#     clean_list_of_variables_and_values = dspy.OutputField(desc="Only return the variables and values that are contained in the keys in the format_json.")

# class CheckValueFormat(dspy.Signature):
#     """Check that the extracted float value in string format matches its format description."""
#     value = dspy.InputField(format=str, desc="String representation of a float.")
#     format_description = dspy.InputField(format=str, desc="JSON string with format descriptions of the values.")
#     value_fits_description = dspy.OutputField(format=str, desc="Only return True or False.")
#     reason_for_failure = dspy.OutputField(format=str)#, desc="Reason the value did not match the given format description.")


def parse_output(output: str, field: str) -> str:
    """
    Pull the text that follows the last ``"<field>: "`` marker in ``output``.

    The result is cut at the first ``---`` if one follows the marker,
    otherwise at the first newline, and surrounding whitespace is stripped.
    If the marker is absent, the whole of ``output`` is treated the same way.
    """
    marker = field + ': '
    tail = output.split(marker)[-1]
    # Prefer the '---' delimiter when present; fall back to end of line.
    delimiter = '---' if '---' in tail else '\n'
    return tail.partition(delimiter)[0].strip()
    
def is_in_range(value, bounds, ops=(operator.ge, operator.le)):
    """
    Test a value against a lower bound and, optionally, an upper bound.

    Parameters:
    - value: The value to check.
    - bounds: A sequence holding the limits. When it has exactly two items
              they are taken as (lower, upper); otherwise only the first item
              is used, as the lower limit. An upper limit of None disables
              the upper check.
    - ops: Comparison callables from the operator module. ops[0] is called as
           ops[0](value, lower); ops[1] is called as ops[1](value, upper) and
           is only needed when an upper limit is in effect.
           Defaults to (operator.ge, operator.le), i.e. an inclusive range.

    Returns:
    - The outcome of the lower-limit comparison, combined with the
      upper-limit comparison when an upper limit is in effect.
    """
    lower = bounds[0]
    upper = bounds[1] if len(bounds) == 2 else None

    lower_ok = ops[0](value, lower)
    if upper is None:
        return lower_ok
    return lower_ok and ops[1](value, upper)


# dspy signature for extracting a value: inputs are the question and the
# spreadsheet context, output is the answer text. This definition replaces the
# earlier class of the same name in this file.
# NOTE(review): the docstring and desc= strings are presumably prompt text for
# dspy; do not edit them as if they were ordinary documentation.
class SpreadsheetValueExtractor(dspy.Signature):
    # """Extract the values for variable names contained in the context. Only return the variable name and its value in this format: variable name: value."""
    """Extract the values for variable names contained in the context."""

    # Request naming the variable to extract.
    question = dspy.InputField(format=str)
    # Spreadsheet content.
    # NOTE(review): desc says "json string", but the visible caller passes CSV
    # text produced by get_csv_string — confirm which is intended.
    context = dspy.InputField(format=str, desc="json string representation of a spreadsheet.")
    # Expected shape of the reply: "variable name: extracted value."
    answer = dspy.OutputField(desc='variable name: extracted value.')
    # answer = dspy.OutputField(desc="only return the variable name and its value in this format: variable_name: value.")

# dspy signature for rewriting a question after an extracted value failed
# validation: inputs are the original question, the rejected value and the
# expected format; output is the rewritten question.
# NOTE(review): the docstring is presumably prompt text for dspy; do not edit
# it as if it were ordinary documentation.
class CorrectQuestion(dspy.Signature):
    # """Given the original extracted value and its format description, 
    # modify the original question so that a language model will extract the correct value.
    # Include specifics about the format of the value in the question."""
    """Explain the extracted value is wrong. Rewrite question to extract the correct value using the format description."""
    # Question that produced the rejected value.
    question = dspy.InputField(format=str, desc="The original question.")
    # The "name: value" text that failed validation.
    extracted_value = dspy.InputField(format=str)
    # Human-readable description of the expected value format.
    format_description = dspy.InputField(format=str)
    # reason_for_failure = dspy.InputField(format=str)
    # Rewritten question to retry the extraction with.
    corrected_question = dspy.OutputField(format=str)

class SpreadSheetAnalyzer(dspy.Module):
    """
    Extract a single named value from spreadsheet text and validate its range.

    When the extracted value fails its range check, the question is rewritten
    by a second predictor and the extraction is retried with the new question.
    """

    def __init__(self, format_description_json, operators_dict, max_retries=None):
        """
        Parameters:
        - format_description_json: Mapping of variable name to a textual
          description of the expected value format (given to the rewriter).
        - operators_dict: Mapping of variable name to a dict with keys
          'bounds' and 'operators', passed to is_in_range.
        - max_retries: Maximum number of rewrite-and-retry rounds. None (the
          default) retries until a valid value is obtained.
        """
        super().__init__()
        self.format_description_json = format_description_json
        self.operators_dict = operators_dict
        self.max_retries = max_retries
        self.extraction = dspy.Predict(SpreadsheetValueExtractor)
        self.question_rewriter = dspy.Predict(CorrectQuestion)

    def _extract_and_validate(self, data, question):
        """Run one extraction and range-check the parsed "name: value" result."""
        extracted_out = self.extraction(question=question, context=data)
        extracted_value = parse_output(extracted_out.answer, 'Answer')
        parsed_output = extracted_value.split(': ')
        parsed_values = float(parsed_output[-1])
        parsed_name = parsed_output[0]
        # Use the instance's rules, not a module-level global of the same name.
        rule = self.operators_dict[parsed_name]
        valid_value = is_in_range(parsed_values, bounds=rule['bounds'], ops=rule['operators'])
        return extracted_value, parsed_name, parsed_values, valid_value

    def forward(self, data, question):
        """
        Extract the value asked for in ``question`` from ``data``.

        Parameters:
        - data: Spreadsheet content as text.
        - question: Request naming the variable to extract.

        Returns:
        - Tuple (parsed_name, parsed_values, valid_value, question) where
          ``question`` is the last question used, and ``valid_value`` is False
          only if ``max_retries`` was exhausted.

        Raises:
        - ValueError: if the text after the last ': ' is not a float.
        - KeyError: if the parsed name has no entry in the rule mappings.
        """
        extracted_value, parsed_name, parsed_values, valid_value = self._extract_and_validate(data, question)
        retries = 0
        while not valid_value and (self.max_retries is None or retries < self.max_retries):
            print('Question:', question)
            rewritten_question_out = self.question_rewriter(question=question,
                                                            extracted_value=extracted_value,
                                                            format_description=self.format_description_json[parsed_name])
            # Adopt the rewritten question *before* re-extracting, so the retry
            # actually uses it, and refresh extracted_value so the next rewrite
            # sees the latest rejected value rather than the first one.
            question = parse_output(rewritten_question_out.corrected_question, 'Corrected Question')
            extracted_value, parsed_name, parsed_values, valid_value = self._extract_and_validate(data, question)
            print('Extracted values:', extracted_value)
            retries += 1

        return parsed_name, parsed_values, valid_value, question

# Range rules per variable name: 'operators' and 'bounds' are the ops and
# bounds arguments handed to is_in_range. A one-element tuple means a single
# comparison against a single limit.
# Note: operator.ge / operator.le are inclusive comparisons, so 0 (and 1 for
# the two-sided rules) are accepted.
operators_dict = {'Selling Costs': {'operators':(operator.ge, operator.le), 'bounds':(0,1)}, 
                  'Disposition Fee': {'operators':(operator.ge, operator.le), 'bounds':(0,1)}, 
                  'Net Operating Income': {'operators':(operator.ge,), 'bounds':(0,)}, 
                  'Loan Assumption/Payoff': {'operators':(operator.le,), 'bounds':(0,)}, 
                  'Return of Forecasted Reserves': {'operators':(operator.le,), 'bounds':(0,)}, 
                  'CF Y 11': {'operators':(operator.ge,), 'bounds':(0,)}, 
                  'Return of Maximum Offering Amount': {'operators':(operator.le,), 'bounds':(0,)}, 
                  'Projected Terminal Cap Rate': {'operators':(operator.ge, operator.le), 'bounds':(0,1)}}

# Textual format description per variable name, passed to the question
# rewriter when a value fails its range rule. The keys mirror operators_dict.
# NOTE(review): the wording says "greater than" / "less than" while the rules
# above are inclusive — confirm which is intended.
format_description_json = {'Selling Costs': 'float greater than 0 and less than 1', 
                           'Disposition Fee': 'float greater than 0 and less than 1', 
                           'Net Operating Income': 'float greater than 0', 
                           'Loan Assumption/Payoff': 'float less than 0', 
                           'Return of Forecasted Reserves': 'float less than 0', 
                           'CF Y 11': 'float greater than 0', 
                           'Return of Maximum Offering Amount': 'float less than 0', 
                           'Projected Terminal Cap Rate': 'float greater than 0 and less than 1'}

In [None]:
# Build the analyzer with the format descriptions and range rules.
# NOTE(review): the variable name is misspelled ("spreadsheeet_ananalyst") and
# is used under that same spelling in the extraction loop; rename both together.
spreadsheeet_ananalyst = SpreadSheetAnalyzer(format_description_json, operators_dict)

In [None]:
# value_to_extract = 'Disposition Fee'
# The spreadsheet does not change between iterations, so serialize it to CSV
# once instead of once per variable.
dfs_str = get_csv_string(dfs)
# Ask the analyzer for every variable that has a format description and print
# the parsed result next to its expected format.
for value_to_extract in format_description_json:
    print('Extracting value for:', value_to_extract)
    question = f"Get the value for {value_to_extract}."

    parsed_name, parsed_values, valid_value, question = spreadsheeet_ananalyst(dfs_str, question)
    print(parsed_name, parsed_values, valid_value, question, format_description_json[value_to_extract])

In [None]:
# start with getting the correct value, then move values around in the spreadsheet

In [None]:
# Stand-alone predictors for trying the extraction and question-rewrite steps
# by hand, outside SpreadSheetAnalyzer.
extraction = dspy.Predict(SpreadsheetValueExtractor)
question_rewriter = dspy.Predict(CorrectQuestion)

In [None]:

# NOTE(review): at module level `extrated_value` is first assigned in a later
# cell; running the cells top to bottom raises NameError here.
print(extrated_value)

In [None]:
# Manual check of the range validation using a hand-written extraction result.
# value_to_extract is whatever the extraction loop left it at (its last key).
extrated_value = f'{value_to_extract}: -3.025'
# Split "name: value" into its parts, as SpreadSheetAnalyzer.forward does.
parsed_output = extrated_value.split(': ')
parsed_values = float(parsed_output[-1])
parsed_name = parsed_output[0]
valid_value = is_in_range(parsed_values, bounds=operators_dict[parsed_name]['bounds'], ops=operators_dict[parsed_name]['operators'])
print(valid_value)

In [None]:
# Manual run of the question-rewrite step: only triggered when the value from
# the previous cell failed its range check.
if not valid_value:
    rewritten_question_out = question_rewriter(question=question, extracted_value=extrated_value, format_description=format_description_json[parsed_name])
    # Keep only the text following the "Corrected Question: " marker.
    parsed_rewritten_question = parse_output(rewritten_question_out.corrected_question, 'Corrected Question')
    print(parsed_rewritten_question)