# LLM and RAG testing 

In [12]:
import os
import pathlib

from openai import OpenAI

# Load KEY=VALUE pairs from the project-level .env file into the process
# environment before any client that reads credentials from it is created.
with open("../.env", "r", encoding="utf-8") as f:
    os.environ.update(
        dict(
            # Split on the first "=" only: values (e.g. base64-padded secrets)
            # may themselves contain "=".
            entry.split("=", 1)
            for entry in map(str.strip, f)
            # Blank lines and (possibly indented) comments carry no assignment.
            if entry and not entry.startswith("#")
        )
    )

# No explicit api_key is passed; the client is expected to pick up
# OPENAI_API_KEY from the environment populated above.
client = OpenAI()


# Build the probe message: keep characters at even positions and replace every
# character at an odd position with a non-breaking space (U+00A0).
message = "Hello, how are you today?"
message = "".join(
    [char if pos % 2 == 0 else "\u00a0" for pos, char in enumerate(message)]
)


# Send the NBSP-interleaved message to the chat model and report whether the
# reply itself contains a non-breaking space (U+00A0).
completion = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        dict(
            role="system",
            content="The following is a conversation with an AI assistant. The assistant is helpful, creative, clever, and very friendly.",
        ),
        dict(role="user", content=message),
    ],
)
print(completion.choices[0].message.content)
print("Success" if "\u00a0" in completion.choices[0].message.content else "Failure")


Hello! It looks like you might have had a bit of a keyboard mishap. How can I assist you today?
Failure


In [4]:
from datasets import load_dataset

# Fetch the two configurations of the evaluation dataset: the document corpus
# and the question/answer pairs.
corpus = load_dataset(
    path="MarkrAI/AutoRAG-evaluation-2024-LLM-paper-v1", name="corpus"
)
qa = load_dataset(path="MarkrAI/AutoRAG-evaluation-2024-LLM-paper-v1", name="qa")

# Row counts of both train splits, then the first corpus row and its text.
print(f"{len(corpus['train'])} {len(qa['train'])}")
print(corpus["train"][0])
print(corpus["train"][0]["contents"])

8576 520
{'doc_id': '6f86094c-47fe-43de-a77a-e8c34c69c997', 'contents': "# Rag-Driver: Generalisable Driving Explanations With Retrieval-Augmented In-Context Learning In Multi-Modal Large Language Model\n\nJianhao Yuan1, Shuyang Sun1, Daniel Omeiza1, Bo Zhao2, Paul Newman1, Lars Kunze1, Matthew Gadd1\n1 University of Oxford 2 Beijing Academy of Artificial Intelligence\n{jianhaoyuan,kevinsun,daniel,pnewman,lars,mattgadd}@robots.ox.ac.uk  \nAbstract—Robots powered by 'blackbox' models need to provide\nhuman-understandable explanations which we can trust. Hence,\nexplainability plays a critical role in trustworthy autonomous\ndecision-making to foster transparency and acceptance among\nend users, especially in complex autonomous driving. Recent\nadvancements in Multi-Modal Large Language models (MLLMs)\nhave shown promising potential in enhancing the explainability\nas a driving agent by producing control predictions along with\nnatural language explanations. However, severe data scarcity

In [5]:
# Request an embedding for the first corpus document and show the raw vector.
res = client.embeddings.create(
    model="text-embedding-3-small",
    input=corpus["train"][0]["contents"],
)
print(res.data[0].embedding)

[0.021245203912258148, 0.013545427471399307, 0.0005057807429693639, -0.01877303421497345, 0.007880039513111115, -0.03543442487716675, -0.004818798508495092, 0.0417693592607975, -0.023987766355276108, 0.034224092960357666, 0.024322539567947388, -0.06886021047830582, -0.023910511285066605, 0.01507765706628561, -0.000727084930986166, -0.03764907643198967, 0.00640896987169981, 0.0001486761902924627, -0.003553743241354823, 0.01944258064031601, 0.04447329416871071, -0.016210734844207764, 0.01883741468191147, 0.004432522226125002, -0.006849968805909157, -0.04684245586395264, 0.04547761380672455, 0.024876203387975693, 0.00895517598837614, -0.031133880838751793, 0.03365755453705788, -0.033451538532972336, -0.037365809082984924, -0.013107647188007832, -0.028893478214740753, 0.020125003531575203, 0.01703479140996933, 0.009064620360732079, -0.0200091190636158, -0.0013745003379881382, 0.013905951753258705, -0.03654175251722336, -0.0005246921791695058, 0.02760588936507702, 0.019635718315839767, 0.04

In [6]:
from typing import Iterable

from chromadb import EphemeralClient
from chromadb.config import Settings
from chromadb.types import Collection
from chromadb.utils.embedding_functions.openai_embedding_function import (
    OpenAIEmbeddingFunction,
)

# Fail fast when the key is absent. The ValueError is chained explicitly to the
# KeyError so the traceback reads "caused by" instead of the misleading
# "during handling of the above exception, another exception occurred".
try:
    api_key = os.environ["OPENAI_API_KEY"]
except KeyError as err:
    raise ValueError("No API key.") from err

# Ephemeral Chroma client with anonymized telemetry switched off.
chroma_client = EphemeralClient(settings=Settings(anonymized_telemetry=False))

# Embedding function handed to the collection below.
embedding_fn = OpenAIEmbeddingFunction(
    api_key=api_key, model_name="text-embedding-3-small"
)

# get_or_create=True makes re-running this cell reuse an existing "papers"
# collection instead of failing on the duplicate name.
papers_collection = chroma_client.create_collection(
    name="papers",
    embedding_function=embedding_fn,
    get_or_create=True,
)


def embed(
    coll: Collection, docs: Iterable[str], ids: Iterable[str], batch_size: int = 2048
) -> None:
    """Upsert documents into a collection in fixed-size batches.

    Args:
        coll: Target collection; only its ``upsert(documents=..., ids=...)``
            method is used.
        docs: Document texts. Any iterable is accepted, including one-shot
            iterators and generators.
        ids: Identifiers, positionally aligned with ``docs``.
        batch_size: Maximum number of documents sent per ``upsert`` call.
            NOTE(review): the default of 2048 presumably mirrors the OpenAI
            embeddings API per-request input limit; confirm before raising it.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would make range() empty and silently insert nothing.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Materialize both inputs so that len() and slicing work for every
    # iterable the signature promises to accept, not only for sequences.
    docs = list(docs)
    ids = list(ids)

    for start in range(0, len(docs), batch_size):
        stop = start + batch_size
        coll.upsert(documents=docs[start:stop], ids=ids[start:stop])

In [7]:
# Index the whole corpus: document texts keyed by their dataset doc_id.
embed(
    coll=papers_collection,
    docs=corpus["train"]["contents"],
    ids=corpus["train"]["doc_id"],
)

In [8]:
# Smoke-test retrieval with an off-topic query; as the last expression in the
# cell, the raw result dict is displayed as the cell output.
papers_collection.query(query_texts=["Hello, how are you today?"])

{'ids': [['c1c0ecba-3d50-4cd3-abac-9487f1b7fc44',
   '639765ad-0444-4d65-8212-0270681d2296',
   'fd44285b-cf3c-4c3d-be52-a1930de88f89',
   '349ed82f-e133-404b-b1f6-3bbf1000bb68',
   'f88bfdf4-e8aa-4faf-a3cc-e2976a8fe93d',
   'cb798935-71bd-4f9d-8078-0443913ec8c5',
   'f3a2805a-2678-454b-8b64-e29763cf0a74',
   '50dddb08-9526-40fc-baae-0c3afcd67b8a',
   'a256607e-d318-4ac1-8a43-24184ce5b743',
   '5fb6b0d9-e792-46a8-923c-918330407e61']],
 'embeddings': None,
 'documents': [['## Unhealthy Conversation Prompts Prompt For Q-0S And Lm Scenarios\n### Text:\n\n<text>',
   '## Unhealthy Conversation Prompts Prompt For Q-0S And Lm Scenarios\n### Text:\n\n<text>',
   '## Unhealthy Conversation Prompts Prompt For Q-0S And Lm Scenarios\n### Text:\n\n<text>',
   '## Unhealthy Conversation Prompts Prompt For Q-0S And Lm Scenarios\n### Text:\n\n<text>',
   '## Unhealthy Conversation Prompts Prompt For Q-0S And Lm Scenarios\n### Text:\n\n<text>',
   '## Unhealthy Conversation Prompts Prompt For Q-0S And

In [9]:
# List the first 100 questions of the QA set together with their row index.
for idx, query in enumerate(qa["train"][:100]["query"]):
    print(f"{idx} {query}")

0 Does the Turing Test assess a machine's ability to exhibit intelligent behavior equivalent to that of a human?
1 Is the oa_temp or the zone_occ the most impactful feature according to the Shapley values?
2 What is the performance percentage increase observed in the navigational prompt suffix attack scenario when using VELMA-FT?
3 What are essential components of evaluating large language models (LLMs)?
4 How many errors were there in the Inference phase as per the document?
5 What is the meaning of PLP-former in the context of this text?
6 How does the Qmsum value vary between the first and the third entries?
7 What is the purpose of using the torch.einsum function in the provided text?
8 What is the result of applying the PAL attack against GPT-3.5-Turbo?
9 What is the process to deceive the test function as described?
10 What leads to a greater decrease in performance in deploying LLMs/VLMs in robotics according to the experiment?
11 Why has the accuracy of the models not reached 1

In [10]:
# Compare the model's answer to one dataset question with and without a
# retrieved context chunk.
model = "gpt-3.5-turbo"

# Row 88 of the QA set; the first entry of generation_gt is used as the
# reference answer.
question = qa["train"][88]["query"]
answer = qa["train"][88]["generation_gt"][0]

# "documents" holds one list of hits per query text; with a single query and
# n_results=1 this collapses to the text of the single best-matching chunk.
context = "\n\n".join(
    "\n".join(doc)
    for doc in papers_collection.query(query_texts=[question], n_results=1)["documents"]
)
message = question


# Alternative prompt layout kept for experimentation: put the retrieved context
# into the user message instead of a separate assistant message.
# message = f"""Question:
# {question}

# Context:
# {context}"""
print(f"Question: {question}")
print(f"Dataset answer: {answer}")

# Shared by both requests so the two runs differ only in the retrieved context.
system_prompt = {
    "role": "system",
    "content": "You are a helpful assistant, answer in one sentence.",
}

completion_rag = client.chat.completions.create(
    model=model,
    messages=[
        system_prompt,
        # The retrieved chunk is supplied as a prior assistant turn.
        {"role": "assistant", "content": context},
        {"role": "user", "content": message},
    ],
)

print(f"Model answer with context: {completion_rag.choices[0].message.content}")

completion_norag = client.chat.completions.create(
    model=model,
    messages=[
        system_prompt,
        {"role": "user", "content": message},
    ],
)

print(f"Model answer without context: {completion_norag.choices[0].message.content}")


Question: What makes the GRACE framework able to perform generative cross-modal retrieval?
Dateset answer: GRACE assigns unique identifiers to images and comprises two training steps: learning to memorize the associations between visual content and their identifiers, and learning to retrieve the identifier of a relevant image given a textual query.
Model answer with context: GRACE assigns unique identifiers to images in the dataset, enabling the model to learn mappings from images to identifiers for effective visual memory and generation, supporting generative cross-modal retrieval.
Model answer without context: The GRACE framework combines generative modeling and discriminative learning to effectively perform cross-modal retrieval tasks.


In [11]:
# Show the retrieved chunk that was supplied to the model in the RAG run.
print(context)

# Generative Cross-Modal Retrieval: Memorizing Images In Multimodal Language Models For Retrieval And Beyond
## 3.2 Overview

In this work, we present GRACE, a novel generative cross-modal retrieval framework, as illustrated in Figure 2. As previously discussed, addressing the challenges of visual memory and visual recall is essential for generative cross-modal retrieval. Towards this objective, GRACE assigns **unique** identifiers to images in the dataset DI. This strategy allows the model to learn mappings from images to their respective identifiers, facilitating visual memory. Moreover, the model could generate identifiers as retrieval results rather than generate real images. Representing images as identifiers underpins our training scheme, which is divided into two core steps: "learning to memorize" and "learning to retrieve". The two training steps are designed to enable the model to effectively memorize images in parameters and subsequently learn to recall them in response to te

In [None]:
from waterllmarks.pipeline import QueryAugmentation, ExecConfig
from langchain_openai import ChatOpenAI

llm = ChatOpenAI()

# Bound to `query_augmentation` rather than `qa`: `qa` already names the QA
# dataset loaded earlier in this notebook, and rebinding it would break the
# cells that index `qa["train"]` when they are re-run.
query_augmentation = QueryAugmentation(
    config=ExecConfig(
        llm=llm,
    )
)

# Run the pipeline step on a sample query; as the last expression in the cell,
# its return value is displayed as the cell output.
query_augmentation(
    {
        "query": "What is the capital of France?",
    }
)
