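# Setup for Vast.ai instances: enter the commands below in a terminal (omit the leading `!` when not running them from a notebook cell)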
!python3 -m pip install ipykernel -U --user --force-reinstall && apt update && apt install -y python3-pip

!pip3 install llama-index llama-parse llama-index-embeddings-huggingface llama-index-llms-huggingface dspy-ai openpyxl langchain chromadb
!pip3 install sentencepiece protobuf evaluate rouge_score absl-py tensorboardX bitsandbytes peft accelerate
!cp /workspace/repos/agentic-ai/MASTER\ -\ PYTHON\ -\ SCORING\ MODEL\ -\ MCG\ MADISON\ RIDGE\ DST\ -\ v2.0.xlsx /workspace/data
!cp /workspace/repos/agentic-ai/PPM\ -\ MCG\ MADISON\ RIDGE\ DST.pdf /workspace/data

!pip3 uninstall -y torch

!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [1]:
import gc
import os

import pandas as pd
import numpy as np
from transformers import BitsAndBytesConfig

import dspy
from dspy.evaluate import Evaluate
from dspy.datasets.hotpotqa import HotPotQA
from dspy.teleprompt import BootstrapFewShotWithRandomSearch

from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings, StorageContext
from llama_index.core.embeddings import resolve_embed_model
from llama_parse import LlamaParse
from llama_index.llms.huggingface import HuggingFaceLLM

import chromadb
from chromadb.utils import embedding_functions
from langchain.text_splitter import SentenceTransformersTokenTextSplitter
from llama_index.readers.file import PandasExcelReader
CHROMA_COLLECTION_NAME = "blockchain_and_ai"
CHROMADB_DIR = "/workspace/data/db/"

from typing import List, Any, Callable, Optional
from pydantic import BaseModel

import torch
from transformers import AutoModelForCausalLM
from dspy.retrieve.chromadb_rm import ChromadbRM

from train_utils import get_csv_string, randomize_row_values, operators_dict, range_description_json, split_df_by_empty_columns, split_df_by_empty_rows, print_trainable_parameters
from models import SpreadSheetAnalyzer

from dotenv import load_dotenv
load_dotenv('/workspace/repos/agentic-ai/.env')

import nest_asyncio
nest_asyncio.apply()





In [2]:
# Workbook holding the scoring model; only the disposition-analysis sheet is read here.
filepath = "/workspace/data/MASTER - PYTHON - SCORING MODEL - MCG MADISON RIDGE DST - v2.0.xlsx"

# Row labels whose values serve as ground truth for extraction.
disposition_inputs = [
    "Selling Costs",
    "Disposition Fee",
    "Net Operating Income",
    "Loan Assumption/Payoff",
    "Return of Forecasted Reserves",
    "CF Y 11",
    "Return of Maximum Offering Amount",
    "Projected Terminal Cap Rate",
    "Cash Flows",
]

# header=None keeps every cell as data, so columns are addressed by integer position.
dfs = pd.read_excel(filepath, sheet_name="5 - Disposition Analysis", header=None)

# Cut the sheet into sub-tables: first along empty columns, then along empty
# rows, and keep the CSV text of every non-empty piece.
sub_dfs_by_columns = split_df_by_empty_columns(dfs)
final_split_dfs = [
    get_csv_string(piece)
    for column_group in sub_dfs_by_columns
    for piece in split_df_by_empty_rows(column_group)
    if not piece.empty
]

# Remove fully empty rows and columns in place before selecting ground truth.
dfs.dropna(axis=0, how='all', inplace=True)
dfs.dropna(axis=1, how='all', inplace=True)

fee_columns = ['Disposition Fee', 'Selling Costs']
cashflow_columns = list(range(1, 10))

# Keep rows whose label (column 1) is one of the wanted names or a cash-flow
# index, and retain only the first two columns.
wanted_labels = disposition_inputs + cashflow_columns
label_mask = dfs[1].isin(wanted_labels)
ground_truth = dfs[label_mask].iloc[:, :2]  # Get only the necessary columns
ground_truth.drop(labels=[16, 17], axis=0, inplace=True)  # drop the duplicate Selling and Disposition Costs



  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)


In [3]:
# Load the Hugging Face model through dspy, then replace its underlying
# `model` attribute with one loaded directly via transformers.
# Credentials come from environment variables.
access_token = os.getenv('HF_TOKEN')
llama_api_key = os.getenv('LLAMA_API_KEY')

print('first model load...')
# Alternative checkpoints tried previously; exactly one `model_name` is active.
# model_name = "EleutherAI/gpt-neo-125m"
# model_name = "microsoft/Phi-3-mini-128k-instruct" # 128K context window
# model_name = "meta-llama/Meta-Llama-3-8B-Instruct" # 8K context window
# model_name = "clibrain/mamba-2.8b-instruct-openhermes" # 8K context window
model_name = "Qwen/Qwen2-1.5B-Instruct"
# model_name = "mistralai/Mistral-7B-Instruct-v0.3" # 32K context window
llm = dspy.HFModel(model=model_name, hf_device_map='auto', token=access_token)
# Generation settings: short outputs, sampling switched off.
llm.kwargs['max_new_tokens']=30
# llm.kwargs['repetition_penalty']=1.1
llm.kwargs['temperature']=None
llm.kwargs['do_sample']=False
llm.kwargs['top_k']=None
# llm.kwargs['typical_p']=0.9

# Drop the reference to the model loaded by dspy before loading a replacement.
print('deleting model...')
llm.model=None
gc.collect()
print('reloading model...')

# 4-bit NF4 quantization settings.
# NOTE(review): this config is built but not passed to from_pretrained below
# (quantization_config=None is used) — confirm that is intentional.
quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
)

# Reload the same checkpoint directly with transformers, requesting bfloat16
# weights and the flash_attention_2 attention implementation.
llm.model=AutoModelForCausalLM.from_pretrained(model_name, quantization_config=None, 
                                               trust_remote_code=True, device_map="auto", 
                                               attn_implementation="flash_attention_2",  
                                               torch_dtype=torch.bfloat16)


# from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# config = LoraConfig(
#     r=16,
#     lora_alpha=32,
#     target_modules=["k_proj", "v_proj", "q_proj", "o_proj"], # Mistral param names
#     lora_dropout=0.05,
#     bias="none", #"none", "all", "lora_only"
#     task_type="CAUSAL_LM", 
    
# )

# llm.model = prepare_model_for_kbit_training(llm.model)
# llm.model = get_peft_model(llm.model, config)
# print_trainable_parameters(llm.model)

# For the Mistral checkpoint only: use the EOS token id as the padding id on
# both the generation config and the tokenizer.
if model_name == 'mistralai/Mistral-7B-Instruct-v0.3':
    eos_id = llm.tokenizer.eos_token_id
    llm.model.generation_config.pad_token_id = eos_id
    llm.tokenizer.pad_token_id = eos_id

# dspy.settings.configure(lm=llm)

######## RAG model
# chroma_client = chromadb.PersistentClient(path=CHROMADB_DIR)
# collection = chroma_client.get_or_create_collection(name=CHROMA_COLLECTION_NAME)
# # text_splitter = SentenceTransformersTokenTextSplitter(tokens_per_chunk=100)

# ids = []
# documents = []
# metadatas = []
# # dfs_str = get_csv_string(dfs)
# # chunks = text_splitter.create_documents([dfs_str], )
# for chunk_no, chunk in enumerate(final_split_dfs):
#     ids.append(f"{chunk_no}")
#     documents.append(chunk)
#     # metadatas.append({"title":})
# if ids:
#     collection.upsert(ids=ids, documents=documents)#, metadatas=metadatas)

# retriever = dspy.ColBERTv2(url='http://20.102.90.50:2017/wiki17_abstracts')
# default_ef = embedding_functions.HuggingFaceEmbeddingFunction(model_name='colbert-ir/colbertv2.0', api_key=access_token)
# default_ef = embedding_functions.DefaultEmbeddingFunction()
# retriever = ChromadbRM(CHROMA_COLLECTION_NAME, CHROMADB_DIR, default_ef, k=3)

# dspy.settings.configure(lm=llm, rm=retriever)

first model load...


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


deleting model...
reloading model...


In [5]:
# Register `llm` as the language model in dspy's global settings.
dspy.settings.configure(lm=llm)

In [6]:
# Credentials come from environment variables.
access_token = os.getenv('HF_TOKEN')
llama_api_key = os.getenv('LLAMA_API_KEY')

# LlamaParse client configured to return plain English text.
# The keyword was previously misspelled as `varbose`, so the intended
# verbosity setting never reached the parser.
parser = LlamaParse(
    api_key=llama_api_key,
    result_type="text",
    language="en",
    verbose=True,
)

!rm -rf /root/.cache/huggingface/hub/

!pip3 install llama-index-embeddings-text-embeddings-inference

In [7]:
# Try LlamaIndexRM

# model_name = "mistralai/Mistral-7B-Instruct-v0.3" # 32K context window
# model_name = "Qwen/Qwen2-1.5B-Instruct"
# tokenizer_name = model_name
# rm_llm = HuggingFaceLLM(model=dspy_llm.llm, tokenizer_name=tokenizer_name, is_chat_model=True, device_map='auto', max_new_tokens=50, context_window=8000)
# rm_llm = HuggingFaceLLM(model_name=model_name, tokenizer_name=tokenizer_name, is_chat_model=True, device_map='auto', max_new_tokens=50, context_window=8000)
# Build a llama_index vector index over the disposition-analysis sheet and
# expose it as a retriever.
filepath = "/workspace/data/MASTER - PYTHON - SCORING MODEL - MCG MADISON RIDGE DST - v2.0.xlsx"
documents = PandasExcelReader(sheet_name="5 - Disposition Analysis").load_data(filepath)
# documents = parser.load_data("/workspace/data/PPM - MCG MADISON RIDGE DST.pdf")
print("Documents created")

# Global llama_index chunking settings; set before the index is built.
# Settings.llm = rm_llm
Settings.chunk_size = 300
Settings.chunk_overlap = 50

# Embedding model used for the index.
embed_model_name = "BAAI/bge-small-en-v1.5"
# embed_model_name = "Alibaba-NLP/gte-Qwen1.5-7B-instruct"
embed_model = HuggingFaceEmbedding(model_name=embed_model_name)

# embed_model.num_workers = 1

# KeywordTableSimpleRetriever
index = VectorStoreIndex.from_documents(documents, embed_model=embed_model)
# Persist the index to disk.
index.storage_context.persist(persist_dir="/workspace/data/storage/alpha")
# NOTE: despite its name, `query_engine` is the object returned by
# `as_retriever` (top-2 similarity), not by `as_query_engine`.
query_engine = index.as_retriever(similarity_top_k=2)

# Settings.embed_model = embed_model

# index.set_index_id("vector_index")
# index.storage_context.persist("/workspace/data/storage")
# storage_context = StorageContext.from_defaults(persist_dir="/workspace/data/storage")
# index = load_index_from_storage(storage_context, index_id="vector_index")
# query_engine = index.as_query_engine(response_mode="tree_summarize")

Documents created
True
True


# Wrap a llama_index LLM for dspy and register it as the global LM.
# NOTE(review): `rm_llm` is only assigned in commented-out lines in the visible
# part of this notebook, and `DspyLlamaIndexWrapper` is defined further down —
# confirm both exist before this runs.
dspy_llm = DspyLlamaIndexWrapper(rm_llm, model_type='chat', max_new_tokens=30)
dspy.settings.configure(lm=dspy_llm)


In [None]:
# Question: Get the value for Return of Maximum Offering Amount.
# Extracted values: Return of Maximum Offering Amount: 44386706.96773932
# Question: What is the return on maximum offering amount? Please provide a floating point number less than zero.
# Extracted values: Return of Maximum Offering Amount: -77670566.54709445

# Fine Tuning

import sys
# Raise the interpreter recursion limit to 10000.
# NOTE(review): the reason is not shown here; presumably needed by the
# fine-tuning steps that follow — confirm.
sys.setrecursionlimit(10000)

In [8]:
# Map each ground-truth label to its value rendered as a string.
# Integer labels (the cash-flow rows) are renamed "Cashflows <n>".
gt_collect = {}
for _, record in ground_truth.iterrows():
    label = record.values[0]
    key = f"Cashflows {label}" if isinstance(label, int) else label
    gt_collect[key] = str(record.values[1])

In [9]:
import random

# Build the training set: one question/answer Example per ground-truth entry,
# repeated `num_rounds` times, then shuffled.
num_rounds = 10
train_data = []
for _ in range(num_rounds):
    # TODO: gradually increase n_samples, random fill in of values in range
    train_data.extend(
        dspy.Example(
            question=f"Extract the value for the variable name '{variable}'?",
            answer=f"{variable}: {expected}",
        ).with_inputs('question')
        for variable, expected in gt_collect.items()
    )

random.shuffle(train_data)


# Concatenate the CSV text of every sub-table into one string.
input_data = "".join(final_split_dfs)

In [14]:
%load_ext autoreload
%autoreload 2

from train_utils import operators_dict, range_description_json
# This rebinds SpreadSheetAnalyzer, which the first cell imported from `models`.
from models_testing import SpreadSheetAnalyzer
# Build the analyzer program around the retriever created above.
# NOTE(review): num_passages=3 while the retriever was created with
# similarity_top_k=2 — confirm how the analyzer combines the two.
spreadsheeet_ananlyst = SpreadSheetAnalyzer(range_description_json, operators_dict, query_engine=query_engine, num_passages=3)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [17]:
from dspy.teleprompt import BootstrapFewShotWithRandomSearch, BootstrapFinetune, BootstrapFewShot
# Fraction of `train_data` used for training; the remainder is held out.
perc_train = 0.7
# Index at which the training portion of `train_data` ends.
num_train = int(len(train_data) * perc_train)
def validate_answer(pred, example, trace=None):
    """Return True when the two answers match, ignoring letter case.

    Args:
        pred: Object with a string ``answer`` attribute.
        example: Object with a string ``answer`` attribute.
        trace: Accepted but unused.

    Returns:
        bool: Whether the lower-cased ``answer`` strings are equal.
    """
    expected = example.answer.lower()
    produced = pred.answer.lower()
    return expected == produced
# Metric used by the optimizer (dspy's exact-match metric).
metric = dspy.evaluate.metrics.answer_exact_match
# metric = validate_answer

#Configure model to finetune
# Fine-tuning settings passed to compile(); the target is given by model name.
config = dict(target=model_name, epochs=10, bf16=True, bsize=1, accumsteps=3, lr=8e-5) #path_prefix=None

#Compile program on BootstrapFinetune
finetune_optimizer = BootstrapFinetune(metric=metric)
# finetune_optimizer = BootstrapFewShot(metric=metric)

# Compile on the first `num_train` examples only.
finetune_program = finetune_optimizer.compile(spreadsheeet_ananlyst, trainset=train_data[:num_train], **config)
# finetune_program = finetune_optimizer.compile(spreadsheeet_ananlyst, trainset=train_data[:22])

# finetune_program = spreadsheeet_ananlyst

# #Load program and activate model's parameters in program before evaluation
# ckpt_path = "saved_checkpoint_path_from_finetuning"
# LM = dspy.HFModel(checkpoint=ckpt_path, model=model_name)

# for p in finetune_program.predictors():
#     p.lm = LM
#     p.activated = False



  0%|          | 0/118 [00:00<?, ?it/s][2m2024-07-18T08:40:26.213636Z[0m [[31m[1merror    [0m] [1mFailed to run or to evaluate example Example({'question': "Extract the value for the variable name 'Return of Forecasted Reserves'?", 'answer': 'Return of Forecasted Reserves: 0'}) (input_keys={'question'}) with <function answer_exact_match at 0x72e558b6fac0> due to 'HuggingFaceEmbedding' object has no attribute '_model'.[0m [[0m[1m[34mdspy.teleprompt.bootstrap[0m][0m [36mfilename[0m=[35mbootstrap.py[0m [36mlineno[0m=[35m222[0m
[2m2024-07-18T08:40:26.215127Z[0m [[31m[1merror    [0m] [1mFailed to run or to evaluate example Example({'question': "Extract the value for the variable name 'Cashflows 7'?", 'answer': 'Cashflows 7: 3717508.8156313607'}) (input_keys={'question'}) with <function answer_exact_match at 0x72e558b6fac0> due to 'HuggingFaceEmbedding' object has no attribute '_model'.[0m [[0m[1m[34mdspy.teleprompt.bootstrap[0m][0m [36mfilename[0m=[35mboot

ok 1
ok 2
ok 3
ok 4
ok 4.1
ok 4.2
ok 4.3
Example({'question': "Extract the value for the variable name 'Return of Forecasted Reserves'?", 'answer': 'Return of Forecasted Reserves: 0'}) (input_keys={'question'})
False
ok 1
ok 2
ok 3
ok 4
ok 4.1
ok 4.2
ok 4.3
Example({'question': "Extract the value for the variable name 'Cashflows 7'?", 'answer': 'Cashflows 7: 3717508.8156313607'}) (input_keys={'question'})
False
ok 1
ok 2
ok 3
ok 4
ok 4.1
ok 4.2
ok 4.3
Example({'question': "Extract the value for the variable name 'Cashflows 8'?", 'answer': 'Cashflows 8: 3767450.5411334764'}) (input_keys={'question'})
False
ok 1
ok 2
ok 3
ok 4
ok 4.1
ok 4.2
ok 4.3
Example({'question': "Extract the value for the variable name 'Return of Forecasted Reserves'?", 'answer': 'Return of Forecasted Reserves: 0'}) (input_keys={'question'})
False
ok 1
ok 2
ok 3
ok 4
ok 4.1
ok 4.2
ok 4.3
Example({'question': "Extract the value for the variable name 'CF Y 11'?", 'answer': 'CF Y 11: 3870554.3880146043'}) (input_keys=




AttributeError: 'HuggingFaceEmbedding' object has no attribute '_model'

In [None]:
# Load the fine-tuned checkpoint and attach it to every predictor of the
# compiled program before evaluation.
ckpt_path = "/workspace/repos/finetuning_ckpts/4Q6YOISLR4V7G.all/checkpoint-35"
LM = dspy.HFModel(checkpoint=ckpt_path, model=model_name)

for predictor in finetune_program.predictors():
    predictor.lm = LM
    predictor.activated = False

In [None]:
# perc_train = 0.7
# num_train = int(len(train_data) * perc_train)
# metric = dspy.evaluate.metrics.answer_exact_match

# Score the program on up to 34 held-out examples and average the metric.
scores = [
    metric(example, finetune_program(**example.inputs()))
    for example in train_data[num_train:num_train + 34]
]
np.mean(scores)

# Replace the model held by `llm` with weights loaded from a fine-tuning checkpoint.
saved_checkpoint_path_from_finetuning = '/workspace/repos/finetuning_ckpts/NFAI903XCHAMQ.all/checkpoint-53'
# Drop the reference to the current model before loading the checkpoint.
llm.model=None
# Load settings match the base-model load earlier in the notebook (bfloat16,
# flash_attention_2, no quantization config).
llm.model=AutoModelForCausalLM.from_pretrained(saved_checkpoint_path_from_finetuning, quantization_config=None, 
                                               trust_remote_code=True, device_map="auto", 
                                               attn_implementation="flash_attention_2",  
                                               torch_dtype=torch.bfloat16)

In [None]:
%load_ext autoreload
%autoreload 2

from train_utils import operators_dict, range_description_json
from models_testing import SpreadSheetAnalyzer
# Rebuild the analyzer around the retriever.
spreadsheeet_ananlyst = SpreadSheetAnalyzer(
    range_description_json,
    operators_dict,
    query_engine=query_engine,
    num_passages=3,
)

# Concatenate the CSV text of every sub-table into one string.
input_data = "".join(final_split_dfs)

In [None]:
# Ask the analyzer for every ground-truth variable and pair each prediction
# with the expected "<name>: <value>" string.
collection = []
for value_to_extract, expected_value in gt_collect.items():
    question = f"Extract the value for the variable name '{value_to_extract}'?"
    print(question)
    pred = spreadsheeet_ananlyst(question, verbose=True)
    expected_answer = f"{value_to_extract}: {expected_value}"
    collection.append((pred, expected_answer))


In [None]:
# Show each predicted answer next to its expected string.
for prediction, expected_answer in collection:
    print(prediction.answer, "---", expected_answer)

In [None]:
# Fraction of predictions whose answer equals the expected string exactly.
np.mean([prediction.answer == expected_answer for prediction, expected_answer in collection])

In [None]:
# baseline RAG and extractor only 0.35294117647058826
# baseline RAG, extractor, float and format checks 0.35294117647058826

In [None]:
finetune_program

In [None]:
from dspy.teleprompt.signature_opt_typed import optimize_signature
from dspy.evaluate.metrics import answer_exact_match
from dspy.functional import TypedChainOfThought

# Run typed signature optimisation on a "question -> answer" chain-of-thought
# program, scored with exact match, and keep the resulting program.
# NOTE(review): `devset` is not defined anywhere in the visible part of this
# notebook — it must be bound before this cell runs.
compiled_program = optimize_signature(
    student=TypedChainOfThought("question -> answer"),
    evaluator=Evaluate(devset=devset, metric=answer_exact_match, num_threads=10, display_progress=True),
    n_iterations=50,
).program

In [None]:
from dspy.teleprompt import BootstrapFewShotWithRandomSearch, BootstrapFinetune
# Train/validation split over `train_data`.
perc_train = 0.7
num_train = int(len(train_data) * perc_train)
metric = dspy.evaluate.metrics.answer_exact_match

#Compile program on current dspy.settings.lm
fewshot_optimizer = BootstrapFewShotWithRandomSearch(metric=metric, max_bootstrapped_demos=2, num_threads=1)
# Use the optimizer created on the previous line; the former call went through
# `tp`, a name that is never bound in this notebook.
your_dspy_program_compiled = fewshot_optimizer.compile(spreadsheeet_ananlyst, trainset=train_data[:num_train], valset=train_data[num_train:])

#Configure model to finetune
# NOTE(review): this passes the loaded model object as `target`, whereas the
# earlier fine-tuning cell passes the model name string — confirm which is expected.
config = dict(target=llm.model, epochs=2, bf16=True, bsize=1, accumsteps=2, lr=5e-5)

#Compile program on BootstrapFinetune
finetune_optimizer = BootstrapFinetune(metric=metric)
# NOTE(review): `some_new_dataset_for_finetuning_model` is a placeholder that is
# not defined in the visible part of this notebook; bind it before running.
finetune_program = finetune_optimizer.compile(spreadsheeet_ananlyst, trainset=some_new_dataset_for_finetuning_model, **config)

# The compiled result above is replaced by the uncompiled analyzer here.
finetune_program = spreadsheeet_ananlyst

#Load program and activate model's parameters in program before evaluation
# NOTE(review): placeholder string, not a real checkpoint path.
ckpt_path = "saved_checkpoint_path_from_finetuning"
LM = dspy.HFModel(checkpoint=ckpt_path, model=llm.model)

for p in finetune_program.predictors():
    p.lm = LM
    p.activated = False

In [None]:
"""
this code implements a wrapper around the llama_index library to emulate a dspy llm

this allows the llama_index library to be used in the dspy framework since dspy has limited support for LLMs

This code is a slightly modified copy of dspy/dsp/modules/azure_openai.py

The way this works is simply by creating a dummy openai client that wraps around any llama_index LLM object and implements .complete and .chat

tested with python 3.12

dspy==0.1.4
dspy-ai==2.4.9
llama-index==0.10.35
llama-index-llms-openai==0.1.18

"""

import json
import logging
from typing import Any, Literal

from easydict import EasyDict
from llama_index.core.base.llms.types import ChatMessage
from llama_index.core.llms import LLM


def LlamaIndexOpenAIClientWrapper(llm: LLM):
    """Build an OpenAI-client-shaped object backed by a llama_index LLM.

    The returned ``EasyDict`` exposes the attribute paths an OpenAI client is
    called through, each routed to ``llm.chat`` or ``llm.complete``:

    * ``client.chat.completions.create`` and ``client.ChatCompletion.create``
      call ``llm.chat``.
    * ``client.completions.create``, ``client.completion.create`` and
      ``client.Completion.create`` call ``llm.complete``.

    ``completions`` (plural) is provided because the text-completion request
    helper in this file calls ``client.completions.create``; the singular
    ``completion`` entry is kept for backward compatibility.

    Args:
        llm: llama_index LLM that serves the requests.

    Returns:
        EasyDict: The client shim described above.
    """

    def chat(messages: list[dict[str, Any]], **kwargs) -> Any:
        # Each message is a mapping of ChatMessage fields (e.g. role/content).
        return llm.chat([ChatMessage(**message) for message in messages], **kwargs)

    def complete(prompt: str, **kwargs) -> Any:
        return llm.complete(prompt, **kwargs)

    client = EasyDict(
        {
            'chat': EasyDict({'completions': EasyDict({'create': chat})}),
            'completions': EasyDict({'create': complete}),
            'completion': EasyDict({'create': complete}),
            'ChatCompletion': EasyDict({'create': chat}),
            'Completion': EasyDict({'create': complete}),
        }
    )
    return client


# Configure logging
# Root logger: INFO level, message-only format, written to a file named
# 'azure_openai_usage.log' (resolved against the working directory).
logging.basicConfig(
    level=logging.INFO,
    format='%(message)s',
    handlers=[logging.FileHandler('azure_openai_usage.log')],
)

import functools
import json
from typing import Any, Literal

import backoff
import dsp
import openai
from dsp.modules.cache_utils import CacheMemory, NotebookCacheMemory, cache_turn_on
from dsp.modules.lm import LM

# OPENAI_LEGACY is True when the first character of the openai version string
# is '0', or when the version cannot be read at all.
try:
    OPENAI_LEGACY = int(openai.version.__version__[0]) == 0
except Exception:
    OPENAI_LEGACY = True

# Pick the exception classes to retry on. The first branch uses the
# `openai.error` module; if importing it (or `OpenAIObject`) fails, fall back
# to the exception classes exposed at the top level of `openai` and use a
# plain dict in place of OpenAIObject.
try:
    import openai.error
    from openai.openai_object import OpenAIObject

    ERRORS = (
        openai.error.RateLimitError,
        openai.error.ServiceUnavailableError,
        openai.error.APIError,
    )
except Exception:
    ERRORS = (openai.RateLimitError, openai.APIError)
    OpenAIObject = dict


def backoff_hdlr(details):
    """Print a one-line notice describing a retry.

    Handler from https://pypi.org/project/backoff/

    Args:
        details: Mapping that provides the ``wait``, ``tries``, ``target``
            and ``kwargs`` entries used in the message.
    """
    template = 'Backing off {wait:0.1f} seconds after {tries} tries calling function {target} with kwargs {kwargs}'
    message = template.format(**details)
    print(message)


class DspyLlamaIndexWrapper(LM):
    """dspy ``LM`` backed by a llama_index LLM.

    Requests are sent through an OpenAI-style client shim built by
    ``LlamaIndexOpenAIClientWrapper`` around the given LLM, so the
    request/response handling mirrors dspy's OpenAI-based modules.

    Args:
        llm (LLM): llama_index LLM to wrap. Its ``_model`` attribute is passed
            to the ``LM`` base class and used as the default ``model`` kwarg.
        model_type (Literal["chat", "text"], optional): Selects the chat or
            the text-completion request path. Defaults to "chat".
        **kwargs: Extra request arguments; they override the defaults set in
            ``self.kwargs`` (temperature, max_tokens, top_p, ...).
    """

    def __init__(
        self,
        llm: LLM,
        model_type: Literal['chat', 'text'] = 'chat',
        **kwargs,
    ):
        super().__init__(llm._model)
        self.provider = 'openai'

        self.llm = llm
        self.client = LlamaIndexOpenAIClientWrapper(llm)
        model = llm._model
        self.model_type = model_type

        # if not OPENAI_LEGACY and "model" not in kwargs:
        #     if "deployment_id" in kwargs:
        #         kwargs["model"] = kwargs["deployment_id"]
        #         del kwargs["deployment_id"]

        #     if "api_version" in kwargs:
        #         del kwargs["api_version"]

        if 'model' not in kwargs:
            kwargs['model'] = model

        # Default request arguments; caller-supplied kwargs take precedence.
        self.kwargs = {
            'temperature': 0.0,
            'max_tokens': 150,
            'top_p': 1,
            'frequency_penalty': 0,
            'presence_penalty': 0,
            'n': 1,
            **kwargs,
        }  # TODO: add kwargs above for </s>

        self.history: list[dict[str, Any]] = []

    def _openai_client(self):
        """Return the OpenAI-style client shim wrapping the llama_index LLM."""
        # if OPENAI_LEGACY:
        #     return openai

        return self.client

    def log_usage(self, response):
        """Log ``usage.total_tokens`` from a response, when usage data is present."""
        usage_data = response.get('usage')
        if usage_data:
            total_tokens = usage_data.get('total_tokens')
            logging.info(f'{total_tokens}')

    def basic_request(self, prompt: str, **kwargs):
        """Send one request and record it in ``self.history``.

        Args:
            prompt: Prompt text.
            **kwargs: Per-call overrides merged over ``self.kwargs``.

        Returns:
            The response produced by the chat or completion request helper.
        """
        raw_kwargs = kwargs

        kwargs = {**self.kwargs, **kwargs}
        if self.model_type == 'chat':
            # caching mechanism requires hashable kwargs
            kwargs['messages'] = [{'role': 'user', 'content': prompt}]
            kwargs = {'stringify_request': json.dumps(kwargs)}
            # Keep the response instead of returning straight away so that the
            # chat path is recorded in history just like the text path.
            response = v1_chat_request(self.client, **kwargs)
        else:
            kwargs['prompt'] = prompt
            response = self.completions_request(**kwargs)

        history = {
            'prompt': prompt,
            'response': response,
            'kwargs': kwargs,
            'raw_kwargs': raw_kwargs,
        }
        self.history.append(history)

        return response

    @backoff.on_exception(
        backoff.expo,
        ERRORS,
        max_time=1000,
        on_backoff=backoff_hdlr,
    )
    def request(self, prompt: str, **kwargs):
        """Call ``basic_request``, retrying with exponential backoff on ``ERRORS``.

        A ``model_type`` entry in ``kwargs`` is dropped before the request.
        """
        if 'model_type' in kwargs:
            del kwargs['model_type']

        return self.basic_request(prompt, **kwargs)

    def _get_choice_text(self, choice: dict[str, Any]) -> str:
        """Extract the generated text from one choice for the current model type."""
        if self.model_type == 'chat':
            return choice['message']['content']
        return choice['text']

    def __call__(
        self,
        prompt: str,
        only_completed: bool = True,
        return_sorted: bool = False,
        **kwargs,
    ) -> list[dict[str, Any]]:
        """Request completions for ``prompt`` and return their texts.

        Args:
            prompt (str): prompt to send to the model
            only_completed (bool, optional): prefer choices that did not stop
                because of length. Must currently be True. Defaults to True.
            return_sorted (bool, optional): sort choices by average log
                probability. Must currently be False. Defaults to False.

        Returns:
            The text of each selected choice, as produced by
            ``_get_choice_text``.
        """

        assert only_completed, 'for now'
        assert return_sorted is False, 'for now'

        response = self.request(prompt, **kwargs)

        # Usage logging is best-effort and must never break a completion call.
        try:
            if dsp.settings.log_openai_usage:
                self.log_usage(response)
        except Exception:
            pass

        choices = response['choices']

        completed_choices = [c for c in choices if c['finish_reason'] != 'length']

        # Fall back to all choices when every one of them hit the length limit.
        if only_completed and len(completed_choices):
            choices = completed_choices

        completions = [self._get_choice_text(c) for c in choices]
        if return_sorted and kwargs.get('n', 1) > 1:
            scored_completions = []

            for c in choices:
                tokens, logprobs = (
                    c['logprobs']['tokens'],
                    c['logprobs']['token_logprobs'],
                )

                # Score only up to and including the end-of-text token.
                if '<|endoftext|>' in tokens:
                    index = tokens.index('<|endoftext|>') + 1
                    tokens, logprobs = tokens[:index], logprobs[:index]

                avglog = sum(logprobs) / len(logprobs)
                scored_completions.append((avglog, self._get_choice_text(c)))

            scored_completions = sorted(scored_completions, reverse=True)
            completions = [c for _, c in scored_completions]

        return completions

    def completions_request(self, **kwargs):
        """Send a text-completion request through the wrapped client."""
        # if OPENAI_LEGACY:
        #     return cached_gpt3_request_v2_wrapped(**kwargs)
        return v1_completions_request(self.client, **kwargs)


def v1_chat_request(client, **kwargs):
    """Send a chat request through ``client.chat.completions.create`` and return it as a mapping.

    The call is wrapped in ``functools.lru_cache``, ``NotebookCacheMemory.cache``
    and ``CacheMemory.cache``. The wrappers are defined inside this function,
    so they are re-created on every call.

    Args:
        client: Object exposing ``chat.completions.create``.
        **kwargs: Request arguments. When a ``stringify_request`` entry is
            present, its JSON text is decoded and used as the actual arguments.

    Returns:
        ``response.model_dump()`` when that call succeeds; otherwise
        ``response.raw`` with its ``choices`` and ``usage`` entries converted
        via their ``.json()`` output.
    """
    # lru_cache is effectively disabled (maxsize=0) when cache_turn_on is falsy.
    @functools.lru_cache(maxsize=None if cache_turn_on else 0)
    @NotebookCacheMemory.cache
    def v1_cached_gpt3_turbo_request_v2_wrapped(**kwargs):
        @CacheMemory.cache
        def v1_cached_gpt3_turbo_request_v2(**kwargs):
            # The caller serialises the request to one JSON string so that the
            # cache key is hashable; decode it back before the real call.
            if 'stringify_request' in kwargs:
                kwargs = json.loads(kwargs['stringify_request'])
            return client.chat.completions.create(**kwargs)

        return v1_cached_gpt3_turbo_request_v2(**kwargs)

    response = v1_cached_gpt3_turbo_request_v2_wrapped(**kwargs)

    try:
        response = response.model_dump()
    except Exception:
        # Fallback for responses on which model_dump() fails: use the raw
        # payload and convert its nested objects through their .json() text.
        # NOTE(review): this assumes `response.raw` is a mapping with
        # 'choices' and 'usage' entries — confirm for the wrapped LLM.
        response = response.raw
        response['choices'] = [json.loads(x.json()) for x in response['choices']]
        response['usage'] = json.loads(response['usage'].json())
    return response


def v1_completions_request(client, **kwargs):
    """Send a text-completion request through ``client.completions.create``.

    The call is wrapped in ``functools.lru_cache``, ``NotebookCacheMemory.cache``
    and ``CacheMemory.cache``; the wrappers are re-created on every call.

    Args:
        client: Object exposing ``completions.create``.
            NOTE(review): verify that the client passed in actually provides a
            ``completions`` attribute.
        **kwargs: Request arguments forwarded to ``create``.

    Returns:
        The result of calling ``model_dump()`` on the response.
    """
    # lru_cache is effectively disabled (maxsize=0) when cache_turn_on is falsy.
    @functools.lru_cache(maxsize=None if cache_turn_on else 0)
    @NotebookCacheMemory.cache
    def v1_cached_gpt3_request_v2_wrapped(**kwargs):
        @CacheMemory.cache
        def v1_cached_gpt3_request_v2(**kwargs):
            return client.completions.create(**kwargs)

        return v1_cached_gpt3_request_v2(**kwargs)

    return v1_cached_gpt3_request_v2_wrapped(**kwargs).model_dump()


## ======== test =========

# if __name__ == '__main__':
#     print('Testing DspyLlamaIndexWrapper')
#     import os

#     import dspy
#     from dspy.datasets.gsm8k import GSM8K, gsm8k_metric
#     from llama_index.llms.openai import OpenAI

#     llm = OpenAI(api_key=os.environ['OPENAI_API_KEY'], model='gpt-3.5-turbo')
#     dspy_llm = DspyLlamaIndexWrapper(llm)

#     # Load math questions from the GSM8K dataset.
#     gsm8k = GSM8K()
#     gsm8k_trainset, gsm8k_devset = gsm8k.train[:10], gsm8k.dev[:10]

#     class CoT(dspy.Module):
#         def __init__(self):
#             super().__init__()
#             self.prog = dspy.ChainOfThought('question -> answer')

#         def forward(self, question):
#             response = self.prog(question=question)
#             return response

#     ##

#     dspy.settings.configure(lm=dspy_llm)

#     from dspy.teleprompt import BootstrapFewShot

#     # Set up the optimizer: we want to "bootstrap" (i.e., self-generate) 4-shot examples of our CoT program.
#     config = dict(max_bootstrapped_demos=4, max_labeled_demos=4)

#     # Optimize! Use the `gsm8k_metric` here. In general, the metric is going to tell the optimizer how well it's doing.
#     teleprompter = BootstrapFewShot(metric=gsm8k_metric, **config)
#     optimized_cot = teleprompter.compile(CoT(), trainset=gsm8k_trainset)
#     print(f'{optimized_cot=}')

In [None]:
"""
this code implements a wrapper around the llama_index library to emulate a dspy llm

this allows the llama_index library to be used in the dspy framework since dspy has limited support for LLMs

This code is a slightly modified copy of dspy/dsp/modules/azure_openai.py

The way this works is simply by creating a dummy openai client that wraps around any llama_index LLM object and implements .complete and .chat

tested with python 3.12

dspy==0.1.4
dspy-ai==2.4.9
llama-index==0.10.35
llama-index-llms-openai==0.1.18

"""

import json
import logging
from typing import Any, Literal

from easydict import EasyDict
from llama_index.core.base.llms.types import ChatMessage
from llama_index.core.llms import LLM


def LlamaIndexOpenAIClientWrapper(llm: LLM):
    """Build an OpenAI-client-shaped object backed by a llama_index LLM.

    The returned ``EasyDict`` exposes the attribute paths an OpenAI client is
    called through, each routed to ``llm.chat`` or ``llm.complete``:

    * ``client.chat.completions.create`` and ``client.ChatCompletion.create``
      call ``llm.chat``.
    * ``client.completions.create``, ``client.completion.create`` and
      ``client.Completion.create`` call ``llm.complete``.

    ``completions`` (plural) is provided because the text-completion request
    helper in this file calls ``client.completions.create``; the singular
    ``completion`` entry is kept for backward compatibility.

    Args:
        llm: llama_index LLM that serves the requests.

    Returns:
        EasyDict: The client shim described above.
    """

    def chat(messages: list[dict[str, Any]], **kwargs) -> Any:
        # Each message is a mapping of ChatMessage fields (e.g. role/content).
        return llm.chat([ChatMessage(**message) for message in messages], **kwargs)

    def complete(prompt: str, **kwargs) -> Any:
        return llm.complete(prompt, **kwargs)

    client = EasyDict(
        {
            'chat': EasyDict({'completions': EasyDict({'create': chat})}),
            'completions': EasyDict({'create': complete}),
            'completion': EasyDict({'create': complete}),
            'ChatCompletion': EasyDict({'create': chat}),
            'Completion': EasyDict({'create': complete}),
        }
    )
    return client


# Configure logging
# Root logger: INFO level, message-only format, written to a file named
# 'azure_openai_usage.log' (resolved against the working directory).
logging.basicConfig(
    level=logging.INFO,
    format='%(message)s',
    handlers=[logging.FileHandler('azure_openai_usage.log')],
)

import functools
import json
from typing import Any, Literal

import backoff
import dsp
import openai
from dsp.modules.cache_utils import CacheMemory, NotebookCacheMemory, cache_turn_on
from dsp.modules.lm import LM

# OPENAI_LEGACY is True when the first character of the openai version string
# is '0', or when the version cannot be read at all.
try:
    OPENAI_LEGACY = int(openai.version.__version__[0]) == 0
except Exception:
    OPENAI_LEGACY = True

# Pick the exception classes to retry on. The first branch uses the
# `openai.error` module; if importing it (or `OpenAIObject`) fails, fall back
# to the exception classes exposed at the top level of `openai` and use a
# plain dict in place of OpenAIObject.
try:
    import openai.error
    from openai.openai_object import OpenAIObject

    ERRORS = (
        openai.error.RateLimitError,
        openai.error.ServiceUnavailableError,
        openai.error.APIError,
    )
except Exception:
    ERRORS = (openai.RateLimitError, openai.APIError)
    OpenAIObject = dict


def backoff_hdlr(details):
    """Report a pending retry on stdout.

    Handler from https://pypi.org/project/backoff/

    Args:
        details: Mapping supplying ``wait``, ``tries``, ``target`` and
            ``kwargs`` for the message.
    """
    notice = 'Backing off {wait:0.1f} seconds after {tries} tries calling function {target} with kwargs {kwargs}'
    print(notice.format(**details))


class DspyLlamaIndexWrapper(LM):
    """DSPy language-model adapter that sends every request to a llama_index LLM.

    The llama_index LLM is wrapped in an OpenAI-client look-alike
    (``LlamaIndexOpenAIClientWrapper``) so that the request helpers
    ``v1_chat_request`` / ``v1_completions_request`` can drive it.

    Args:
        llm (LLM): llama_index LLM that serves the requests.
        model_type (Literal["chat", "text"], optional): ``"chat"`` sends the prompt
            as a single user message; ``"text"`` sends it as a plain completion
            prompt. Defaults to "chat".
        **kwargs: Extra request parameters; they override the defaults stored
            in ``self.kwargs``.
    """

    def __init__(
        self,
        llm: LLM,
        model_type: Literal['chat', 'text'] = 'chat',
        **kwargs,
    ):
        # Use `llm.model` when it exists and is non-empty; otherwise fall back
        # to the model name reported by the LLM's metadata so that LLM classes
        # without a `model` attribute can still be wrapped.
        model = getattr(llm, 'model', None) or llm.metadata.model_name

        super().__init__(model)
        self.provider = 'openai'

        self.llm = llm
        self.client = LlamaIndexOpenAIClientWrapper(llm)
        self.model_type = model_type

        if 'model' not in kwargs:
            kwargs['model'] = model

        # Default request parameters; caller-supplied kwargs take precedence.
        self.kwargs = {
            'temperature': 0.0,
            'max_tokens': 150,
            'top_p': 1,
            'frequency_penalty': 0,
            'presence_penalty': 0,
            'n': 1,
            **kwargs,
        }  # TODO: add kwargs above for </s>

        self.history: list[dict[str, Any]] = []

    def _openai_client(self):
        """Return the OpenAI-style client that wraps the llama_index LLM."""
        return self.client

    def log_usage(self, response):
        """Log the ``total_tokens`` value of ``response['usage']`` when usage data is present."""
        usage_data = response.get('usage')
        if usage_data:
            total_tokens = usage_data.get('total_tokens')
            logging.info('%s', total_tokens)

    def basic_request(self, prompt: str, **kwargs):
        """Send one request and record it in ``self.history``.

        Args:
            prompt: Prompt text to send.
            **kwargs: Per-call parameters that override ``self.kwargs``.

        Returns:
            The response dict produced by the chat or completions request helper.
        """
        raw_kwargs = kwargs

        kwargs = {**self.kwargs, **kwargs}
        if self.model_type == 'chat':
            kwargs['messages'] = [{'role': 'user', 'content': prompt}]
            # caching mechanism requires hashable kwargs
            kwargs = {'stringify_request': json.dumps(kwargs)}
            # Assign instead of returning early so chat requests are recorded
            # in the history just like text requests.
            response = v1_chat_request(self.client, **kwargs)
        else:
            kwargs['prompt'] = prompt
            response = self.completions_request(**kwargs)

        self.history.append(
            {
                'prompt': prompt,
                'response': response,
                'kwargs': kwargs,
                'raw_kwargs': raw_kwargs,
            }
        )

        return response

    @backoff.on_exception(
        backoff.expo,
        ERRORS,
        max_time=1000,
        on_backoff=backoff_hdlr,
    )
    def request(self, prompt: str, **kwargs):
        """Run ``basic_request``, retrying with exponential backoff on ``ERRORS``.

        A ``model_type`` entry in ``kwargs`` is dropped before the request is made.
        """
        if 'model_type' in kwargs:
            del kwargs['model_type']

        return self.basic_request(prompt, **kwargs)

    def _get_choice_text(self, choice: dict[str, Any]) -> str:
        """Extract the generated text from one response choice for the active model type."""
        if self.model_type == 'chat':
            return choice['message']['content']
        return choice['text']

    def __call__(
        self,
        prompt: str,
        only_completed: bool = True,
        return_sorted: bool = False,
        **kwargs,
    ) -> list[str]:
        """Retrieve completions for ``prompt`` from the wrapped LLM.

        Args:
            prompt (str): prompt to send to the model
            only_completed (bool, optional): return only completed responses and ignore
                completions cut off due to length. Must be True for now. Defaults to True.
            return_sorted (bool, optional): sort the completion choices using the returned
                probabilities. Must be False for now. Defaults to False.

        Returns:
            list[str]: text of each selected completion choice

        Raises:
            AssertionError: if ``only_completed`` is falsy or ``return_sorted`` is not False.
        """

        assert only_completed, 'for now'
        assert return_sorted is False, 'for now'

        response = self.request(prompt, **kwargs)

        # Usage logging is best-effort: a missing setting or an unexpected
        # response shape must not fail the completion call itself.
        try:
            if dsp.settings.log_openai_usage:
                self.log_usage(response)
        except Exception:
            pass

        choices = response['choices']

        completed_choices = [c for c in choices if c['finish_reason'] != 'length']

        # Prefer choices that were not truncated; if every choice was
        # truncated, keep them all rather than returning nothing.
        if only_completed and len(completed_choices):
            choices = completed_choices

        completions = [self._get_choice_text(c) for c in choices]
        if return_sorted and kwargs.get('n', 1) > 1:
            scored_completions = []

            for c in choices:
                tokens, logprobs = (
                    c['logprobs']['tokens'],
                    c['logprobs']['token_logprobs'],
                )

                # Score only up to and including the end-of-text token.
                if '<|endoftext|>' in tokens:
                    index = tokens.index('<|endoftext|>') + 1
                    tokens, logprobs = tokens[:index], logprobs[:index]

                avglog = sum(logprobs) / len(logprobs)
                scored_completions.append((avglog, self._get_choice_text(c)))

            scored_completions = sorted(scored_completions, reverse=True)
            completions = [c for _, c in scored_completions]

        return completions

    def completions_request(self, **kwargs):
        """Send a text-completion request through the wrapped client."""
        return v1_completions_request(self.client, **kwargs)


def v1_chat_request(client, **kwargs):
    """Send a chat request through ``client`` and return the response as a plain dict.

    Args:
        client: Object exposing ``chat.completions.create(**kwargs)``.
        **kwargs: Request parameters. When a ``stringify_request`` entry is
            present, it is JSON-decoded and the decoded dict replaces the
            kwargs before the call.

    Returns:
        ``response.model_dump()`` when that call succeeds; otherwise a shallow
        copy of ``response.raw`` whose ``choices`` (and ``usage``, when present)
        are converted to plain dicts via their ``.json()`` output.
    """

    @functools.lru_cache(maxsize=None if cache_turn_on else 0)
    @NotebookCacheMemory.cache
    def v1_cached_gpt3_turbo_request_v2_wrapped(**kwargs):
        @CacheMemory.cache
        def v1_cached_gpt3_turbo_request_v2(**kwargs):
            if 'stringify_request' in kwargs:
                kwargs = json.loads(kwargs['stringify_request'])
            return client.chat.completions.create(**kwargs)

        return v1_cached_gpt3_turbo_request_v2(**kwargs)

    response = v1_cached_gpt3_turbo_request_v2_wrapped(**kwargs)

    try:
        return response.model_dump()
    except Exception:
        # The response object has no usable `model_dump()`; fall back to its
        # raw payload. Work on a shallow copy so the response object's own
        # `raw` mapping is not modified.
        raw = dict(response.raw)
        raw['choices'] = [json.loads(choice.json()) for choice in raw['choices']]
        # Usage data may be absent or None; only convert it when provided.
        usage = raw.get('usage')
        if usage is not None:
            raw['usage'] = json.loads(usage.json())
        return raw


def v1_completions_request(client, **kwargs):
    """Send a text-completion request through ``client`` and return a plain dict.

    Args:
        client: Object exposing ``completions.create(**kwargs)``. A client that
            only exposes the singular ``completion.create`` is accepted too.
        **kwargs: Request parameters forwarded unchanged to ``create``.

    Returns:
        ``response.model_dump()`` when that call succeeds; otherwise a shallow
        copy of ``response.raw`` whose ``choices`` (and ``usage``, when present)
        are converted to plain dicts, mirroring ``v1_chat_request``.
    """

    @functools.lru_cache(maxsize=None if cache_turn_on else 0)
    @NotebookCacheMemory.cache
    def v1_cached_gpt3_request_v2_wrapped(**kwargs):
        @CacheMemory.cache
        def v1_cached_gpt3_request_v2(**kwargs):
            # Prefer the OpenAI-style plural endpoint; fall back to the
            # singular spelling for clients that only define `completion`.
            try:
                endpoint = client.completions
            except (AttributeError, KeyError):
                endpoint = client.completion
            return endpoint.create(**kwargs)

        return v1_cached_gpt3_request_v2(**kwargs)

    response = v1_cached_gpt3_request_v2_wrapped(**kwargs)

    try:
        return response.model_dump()
    except Exception:
        # Same fallback as the chat path: use the raw payload when the
        # response object cannot be dumped directly.
        raw = dict(response.raw)
        raw['choices'] = [json.loads(choice.json()) for choice in raw['choices']]
        usage = raw.get('usage')
        if usage is not None:
            raw['usage'] = json.loads(usage.json())
        return raw


## ======== test =========

# if __name__ == '__main__':
#     print('Testing DspyLlamaIndexWrapper')
#     import os

#     import dspy
#     from dspy.datasets.gsm8k import GSM8K, gsm8k_metric
#     from llama_index.llms.openai import OpenAI

#     llm = OpenAI(api_key=os.environ['OPENAI_API_KEY'], model='gpt-3.5-turbo')
#     dspy_llm = DspyLlamaIndexWrapper(llm)

#     # Load math questions from the GSM8K dataset.
#     gsm8k = GSM8K()
#     gsm8k_trainset, gsm8k_devset = gsm8k.train[:10], gsm8k.dev[:10]

#     class CoT(dspy.Module):
#         def __init__(self):
#             super().__init__()
#             self.prog = dspy.ChainOfThought('question -> answer')

#         def forward(self, question):
#             response = self.prog(question=question)
#             return response

#     ##

#     dspy.settings.configure(lm=dspy_llm)

#     from dspy.teleprompt import BootstrapFewShot

#     # Set up the optimizer: we want to "bootstrap" (i.e., self-generate) 4-shot examples of our CoT program.
#     config = dict(max_bootstrapped_demos=4, max_labeled_demos=4)

#     # Optimize! Use the `gsm8k_metric` here. In general, the metric is going to tell the optimizer how well it's doing.
#     teleprompter = BootstrapFewShot(metric=gsm8k_metric, **config)
#     optimized_cot = teleprompter.compile(CoT(), trainset=gsm8k_trainset)
#     print(f'{optimized_cot=}')