In [None]:
!pip install groq
!pip install langchain-huggingface

Collecting groq
  Downloading groq-0.15.0-py3-none-any.whl.metadata (14 kB)
Downloading groq-0.15.0-py3-none-any.whl (109 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m109.6/109.6 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: groq
Successfully installed groq-0.15.0
Collecting langchain-huggingface
  Downloading langchain_huggingface-0.1.2-py3-none-any.whl.metadata (1.3 kB)
Downloading langchain_huggingface-0.1.2-py3-none-any.whl (21 kB)
Installing collected packages: langchain-huggingface
Successfully installed langchain-huggingface-0.1.2


In [None]:
import random
import pandas as pd
from groq import Groq
from google.colab import userdata
import math
from langchain_huggingface import HuggingFaceEndpoint
from langchain_core.prompts import PromptTemplate

In [None]:
# Credentials are looked up by name through google.colab's `userdata` instead
# of being hard-coded in the notebook.
# groq_api_key is handed to the Groq client; api_key is used as the
# Hugging Face Hub token for HuggingFaceEndpoint.
groq_api_key=userdata.get("HF_API_GROQ")
api_key=userdata.get("HF_API")

In [None]:
# Load the dataset from a line-delimited JSON file (one JSON object per line).
# The path is absolute: the file must already exist at /content/en_refine.json.
rgb_data = pd.read_json('/content/en_refine.json', lines=True)

In [None]:
# Dataset dimensions as (rows, columns).
rgb_data.shape

(300, 5)

In [None]:
# System prompt shared by the Groq and Hugging Face calls.
# The scoring code (get_compare_val_answer) looks for the substrings
# 'insufficient information' and 'factual errors' in the model's reply, so the
# refusal / error sentences requested here must keep those exact phrases.
sys_content = "You are an accurate and reliable AI assistant that can answer questions with the help of external documents. Please note that external documents may contain noisy or factually incorrect information. If the information in the document contains the correct answer, you will give an accurate answer. If the information in the document does not contain the answer, you will generate ’I can not answer the question because of the insufficient information in documents.‘. If there are inconsistencies with the facts in some of the documents, please generate the response 'There are factual errors in the provided documents.' and provide the correct answer."

In [None]:
def get_samples(positive_samples, negative_samples, total_samples, negative_percentage):
    """Draw a shuffled mix of positive and negative samples.

    Args:
        positive_samples: Pool of positive items to draw from.
        negative_samples: Pool of negative items to draw from.
        total_samples: Requested size of the combined selection.
        negative_percentage: Share of `total_samples` (0-100) that should be
            negative; the resulting count is truncated towards zero.

    Returns:
        A list holding the selected positives and negatives in random order.
        It is shorter than `total_samples` when either pool is too small,
        because a shortfall in one pool is not made up from the other.
    """
    wanted_neg = int((negative_percentage / 100) * total_samples)
    wanted_pos = total_samples - wanted_neg

    # Cap each request at what its pool can actually supply.
    n_pos = min(wanted_pos, len(positive_samples))
    n_neg = min(wanted_neg, len(negative_samples))

    # Positives are drawn before negatives; the order of RNG calls matters
    # for reproducibility under a fixed seed.
    picked_pos = random.sample(positive_samples, n_pos)
    picked_neg = random.sample(negative_samples, n_neg)

    for heading, picked in (("Positive", picked_pos), ("Negative", picked_neg)):
        print(heading)
        print(picked)

    mixed = [*picked_pos, *picked_neg]
    random.shuffle(mixed)
    return mixed

In [None]:
def checkanswer(prediction, ground_truth):
    """Check which expected answers appear in a model prediction.

    Matching is a case-insensitive substring test.

    Args:
        prediction: The model's answer text.
        ground_truth: A single expected answer, or a list of them. Each list
            entry is either a string (must occur in the prediction) or a list
            of alternative strings (at least one must occur).

    Returns:
        A list with one 0/1 flag per expected answer, in the same order.
    """
    haystack = prediction.lower()
    targets = ground_truth if type(ground_truth) is list else [ground_truth]

    def _matched(target):
        # A nested list means "any of these spellings counts as a hit".
        if type(target) == list:
            alternatives = [alt.lower() for alt in target]
            return any(alt in haystack for alt in alternatives)
        return target.lower() in haystack

    return [int(_matched(target)) for target in targets]

In [None]:
# User-message template for the Groq call; the {DOCS} and {QUERY} placeholders
# are filled in with str.format inside get_groq_llm_response.
prompt_template = "Document:\n{DOCS} \n\nQuestion:\n{QUERY}"

In [None]:
def get_groq_llm_response(prompt,documents,question,eval_model,temp):
    """Ask a Groq-hosted chat model to answer `question` from `documents`.

    Args:
        prompt: Template string containing ``{DOCS}`` and ``{QUERY}`` placeholders.
        documents: Retrieved passages substituted for ``{DOCS}`` via
            ``str.format`` (a list is therefore inserted as its Python repr).
        question: The query substituted for ``{QUERY}``.
        eval_model: Groq model id to query, e.g. "llama-3.3-70b-versatile".
        temp: Sampling temperature forwarded to the API.

    Returns:
        The text of the first completion choice, or None if the API call
        raised (the error is printed, not re-raised).
    """
    groq_client = Groq(api_key=groq_api_key)

    # Fill the user message; the system prompt is sent separately below.
    input_prompt = prompt.format(DOCS=documents, QUERY=question)

    try:
        chat_completion = groq_client.chat.completions.create(
            # Honour the caller's model choice instead of a hard-coded id.
            model=eval_model,
            messages=[
                {"role": "system", "content": sys_content},
                {"role": "user", "content": input_prompt}
            ],
            temperature=temp,
            timeout=120  # Timeout in seconds
        )
        return chat_completion.choices[0].message.content

    except Exception as e:
        # Best effort: report the failure and let the caller skip this sample.
        print(f"Error invoking GROQ model for evaluation: {e}")
        return None

In [None]:
def get_compare_val_answer(llm_answer,ground_truth):
    """Score a model answer against the expected answer(s).

    Args:
        llm_answer: The model's reply text.
        ground_truth: Expected answer(s), in the form checkanswer accepts.

    Returns:
        A tuple ``(labels, llm_answer, factlabel)``:
        labels is ``[-1]`` when the reply contains 'insufficient information'
        (the reply and ground truth are printed in that case), otherwise the
        0/1 list returned by checkanswer; factlabel is 1 when the reply
        contains 'factual errors', else 0.
    """
    if 'insufficient information' not in llm_answer:
        labels = checkanswer(llm_answer, ground_truth)
    else:
        # The model declined to answer; echo the pair for manual inspection.
        print(llm_answer)
        print(ground_truth)
        labels = [-1]

    factlabel = 1 if 'factual errors' in llm_answer else 0
    return labels, llm_answer, factlabel

In [None]:
# Pick 10 random rows for the Groq evaluation loop.
# random_state=None leaves the draw unseeded, so a different sample is
# selected on every run.
random_rows = rgb_data.sample(n=10, random_state=None)

In [None]:
# NOTE: processdata is defined in a later cell of this notebook; that cell
# must have been run before this one, otherwise this loop raises NameError.
tt = 0  # Number of responses scored as successful
results = []  # Label lists for every sample that produced a response
groq_model = "llama-3.3-70b-versatile"   #llama-3.3-70b-versatile,mixtral-8x7b-32768
noise_rte = 1.0
temp = 0.7
for index, row in random_rows.iterrows():
    print(index)

    # Build the passage list for this row: 10 passages at the configured noise rate.
    query, ans, docs = processdata(row, noise_rte, 10, "en_refine.json", correct_rate = 0)

    # Get the LLM answer for the current query
    llm_answer = get_groq_llm_response(prompt_template, docs, row["query"], groq_model, temp)

    # None means the API call failed; such samples are left out of `results`.
    if llm_answer is not None:
        # Compare the LLM answer with the ground truth
        labels, llm_answer, factlabel = get_compare_val_answer(llm_answer, row["answer"])
        print(f"Compare {labels}")

        results.append(labels)

        # With pure-noise passages a refusal ([-1]) is the successful outcome;
        # otherwise every expected answer must have been found.
        if noise_rte == 1 and labels[0] == -1:
            tt += 1
        elif 0 not in labels and 1 in labels:
            tt += 1

# Ratio over the samples that actually produced a response (tt / len(results)),
# so failed API calls are excluded instead of being counted as wrong answers.
if len(results) > 0:
    print(f"TT Ratio: for noise ratio of {noise_rte} for model {groq_model}   is {tt / len(results)}")
else:
    print("No valid results to calculate the TT ratio.")


272
positive []
negative ['In August 2015, Apple revamped the online storefront, removing the dedicated "Store" tab and making the entire website a retail experience.[66][67]  Jobs believed the Apple retail program needed to fundamentally change the relationship to the customer, and provide more control over the presentation of Apple products and the Apple brand message. Jobs recognized the limitations of third-party retailing and began investigating options to change the model.[4]  In 1999, Jobs personally recruited Millard Drexler, former CEO of Gap Inc., to serve on Apple\'s board of directors.[4][68][69] In 2000, Jobs hired Ron Johnson from Target. The retail and development teams headed by Allen Moyer from The Walt Disney Company then began a series of mock-ups for the Apple Store inside a warehouse near the company\'s Cupertino headquarters.[4]  On May 15, 2001, Jobs hosted a press event at Apple\'s first store, located at the Tysons Corner Center mall in Tysons, Virginia, near W

In [None]:
def processdata(instance, noise_rate, passage_num, filename, correct_rate = 0):
    """Assemble the query, answer and a shuffled passage list for one sample.

    The selection strategy depends on substrings of `filename`:
    '_int' and '_fact' select dedicated branches, anything else uses the
    default positive/negative split.

    Args:
        instance: Mapping-like sample read by key: 'query', 'answer',
            'positive', 'negative', plus 'positive_wrong' in the '_fact' branch.
            In the '_int' branch each entry of 'positive' is itself a list
            and is shuffled IN PLACE.
        noise_rate: Fraction of `passage_num` to fill with negative passages
            (rounded up).
        passage_num: Target number of passages.
        filename: Dataset file name; only inspected for '_int' / '_fact'.
        correct_rate: Fraction of `passage_num` taken from 'positive' in the
            '_fact' branch (rounded up); unused in the other branches.

    Returns:
        Tuple ``(query, ans, docs)`` where docs is a list of passages in
        random order.
    """
    query = instance['query']
    ans = instance['answer']
    docs = []

    # Initial split: negatives rounded up, positives get the remainder.
    neg_num = math.ceil(passage_num * noise_rate)
    pos_num = passage_num - neg_num

    if '_int' in filename:
        # 'positive' is a list of groups; shuffle each group (mutates instance).
        for i in instance['positive']:
            random.shuffle(i)
        print(len(instance['positive']))
        # Start with one passage from every group.
        docs = [i[0] for i in instance['positive']]
        if len(docs) < pos_num:
            # Top up round-robin: take the 2nd entry of each group, then the
            # 3rd, ... until pos_num passages have been collected.
            maxnum = max([len(i) for i in instance['positive']])
            for i in range(1,maxnum):
                for j in instance['positive']:
                    if len(j) > i:
                        docs.append(j[i])
                        if len(docs) == pos_num:
                            break
                if len(docs) == pos_num:
                    break
        # Whatever room is left up to passage_num is filled with negatives.
        neg_num = passage_num - len(docs)
        if neg_num > 0:
            negative = instance['negative'][:neg_num]
            docs += negative
    elif '_fact' in filename:
        correct_num = math.ceil(passage_num * correct_rate)
        pos_num = passage_num - neg_num - correct_num
        # Randomly choose which positions supply a 'positive_wrong' passage.
        indexs = list(range(len(instance['positive'])))
        selected = random.sample(indexs,min(len(indexs),pos_num))
        docs = [instance['positive_wrong'][i] for i in selected]
        # Passages from 'positive' are drawn only from the positions not
        # already used above.
        remain = [i for i in indexs if i not in selected]
        if correct_num > 0 and len(remain) > 0:
            docs += [instance['positive'][i] for i in random.sample(remain,min(len(remain),correct_num))]
        if neg_num > 0:
            docs += instance['negative'][:neg_num]
    else:
        if noise_rate == 1:
            # Pure-noise setting: negatives only.
            neg_num = passage_num
            pos_num = 0
        else:
            # If one pool is too small, shrink its share and give the
            # difference to the other pool.
            if neg_num > len(instance['negative']):
                neg_num = len(instance['negative'])
                pos_num = passage_num - neg_num
            elif pos_num > len(instance['positive']):
                pos_num = len(instance['positive'])
                neg_num = passage_num - pos_num


        # Slices take the first pos_num / neg_num entries (no random choice here).
        positive = instance['positive'][:pos_num]
        # print(len(positive))
        negative = instance['negative'][:neg_num]
        # print(len(negative))
        docs = positive + negative
        # print(len(docs))



    # Mix the passages so their position does not reveal which kind they are.
    random.shuffle(docs)

    return query, ans, docs

In [None]:
# NOTE(review): this cell redefines get_groq_llm_response, which is already
# defined earlier in this notebook; keep the two definitions in sync or
# delete one of them.
def get_groq_llm_response(prompt,documents,question,eval_model,temp):
    """Ask a Groq-hosted chat model to answer `question` from `documents`.

    Args:
        prompt: Template string containing ``{DOCS}`` and ``{QUERY}`` placeholders.
        documents: Retrieved passages substituted for ``{DOCS}`` via
            ``str.format`` (a list is therefore inserted as its Python repr).
        question: The query substituted for ``{QUERY}``.
        eval_model: Groq model id to query, e.g. "llama-3.3-70b-versatile".
        temp: Sampling temperature forwarded to the API.

    Returns:
        The text of the first completion choice, or None if the API call
        raised (the error is printed, not re-raised).
    """
    groq_client = Groq(api_key=groq_api_key)

    # Fill the user message; the system prompt is sent separately below.
    input_prompt = prompt.format(DOCS=documents, QUERY=question)

    try:
        chat_completion = groq_client.chat.completions.create(
            # Honour the caller's model choice instead of a hard-coded id.
            model=eval_model,
            messages=[
                {"role": "system", "content": sys_content},
                {"role": "user", "content": input_prompt}
            ],
            temperature=temp,
            timeout=120  # Timeout in seconds
        )
        return chat_completion.choices[0].message.content

    except Exception as e:
        # Best effort: report the failure and let the caller skip this sample.
        print(f"Error invoking GROQ model for evaluation: {e}")
        return None

In [None]:
def get_hugging_face_llm_response(documents,question,eval_model,temp):
    """Ask a Hugging Face hosted model to answer `question` from `documents`.

    Args:
        documents: Retrieved passages bound to the template's ``DOCS`` variable.
        question: The query bound to the template's ``QUERY`` variable.
        eval_model: Hugging Face repo id of the model to query.
        temp: Sampling temperature forwarded to the endpoint.

    Returns:
        Whatever the prompt | endpoint chain returns from ``invoke``, or None
        if the invocation raised (the error is printed, not re-raised).
    """
    llm_retrieval = HuggingFaceEndpoint(
        repo_id=eval_model,
        # Use the caller's temperature rather than a fixed 0.7.
        temperature=temp,
        huggingfacehub_api_token=api_key,
    )

    # Pipe the prompt template (built by response_prompt) into the endpoint.
    llm_retrieval_chain = response_prompt() | llm_retrieval

    input_retrieval = {
        "DOCS": documents,
        "QUERY": question
    }

    try:
        return llm_retrieval_chain.invoke(input_retrieval)
    except Exception as e:
        # Best effort: report the failure and let the caller skip this sample.
        print(f"Error invoking LLM retrieval chain: {e}")
        return None


In [None]:
def response_prompt():
    """Build the PromptTemplate used by the Hugging Face chain.

    The template text is the shared system prompt (sys_content) followed by
    the ``{DOCS}`` and ``{QUERY}`` placeholders.
    """
    placeholders = " here is the Document:\n{DOCS} \n\n and the Question is :\n{QUERY}"
    return PromptTemplate.from_template(sys_content + placeholders)

In [None]:
# Pick 1 random row (n=1); random_state=None leaves the draw unseeded.
# NOTE(review): the Hugging Face evaluation loop in this notebook iterates
# over all of rgb_data, so this sample is not used by the cells shown here.
random_rows_hugging = rgb_data.sample(n=1, random_state=None)

In [None]:
tt = 0  # Number of responses scored as successful
results = []  # Label lists for every sample that produced a response
# Single source of truth for the model id: it is both sent to the endpoint and
# printed in the TT-ratio report. Previously the call used a hard-coded
# "Qwen/QwQ-32B-Preview" while this variable (and the report) named a
# different model. Alternative: "mistralai/Mistral-7B-Instruct-v0.3".
response_model="Qwen/QwQ-32B-Preview"
noise_rte = 1.0
temp = 0.5
for index, row in rgb_data.iterrows():
    print(index)

    # Build the passage list for this row: 20 passages at the configured noise rate.
    query, ans, docs = processdata(row, noise_rte, 20, "en_refine.json", correct_rate = 0)

    # Get the LLM answer for the current query
    llm_answer = get_hugging_face_llm_response(docs, row["query"], response_model, temp)

    # None means the call failed; such samples are left out of `results`.
    if llm_answer is not None:
        # Compare the LLM answer with the ground truth
        labels, llm_answer, factlabel = get_compare_val_answer(llm_answer, row["answer"])
        print(f"labels {labels}")

        results.append(labels)

        # With pure-noise passages a refusal ([-1]) is the successful outcome;
        # otherwise every expected answer must have been found.
        if noise_rte == 1 and labels[0] == -1:
            tt += 1
        elif 0 not in labels and 1 in labels:
            tt += 1

# The TT ratio is computed and printed in a separate cell.



0
labels [0]
1
labels [0]
2
labels [0]
3
labels [0]
4
labels [0]
5
labels [0]
6
labels [0]
7
labels [0]
8
labels [0]
9
labels [0]
10
labels [1]
11
labels [1]
12
labels [0]
13
 and what is the price for a 30-second spot during the Super Bowl 2023?

I can't answer the question because of insufficient information in the documents.
['$6.5 million']
labels [-1]
14
 I cannot answer the question because of insufficient information in the documents.
[['May 26', 'May 26', 'May. 26', '26 May', '26 May', '26 May.']]
labels [-1]
15
 I cannot answer the question because of insufficient information in the documents. The provided documents do not contain information about the 2022 recipient of the Ivan Allen Jr. Prize for Social Courage. The most recent recipient mentioned is Christiane Amanpour for 2023, and previous recipients include Sam Nunn, Anthony Fauci, Jimmy and Rosalynn Carter, John Lewis, and others, but there is no specific mention of a 2022 recipient.
['Lawrence Williams', 'Ralph Long Jr

In [None]:
# Ratio over the samples that actually produced a response (tt / len(results)),
# so failed calls are excluded instead of being counted as wrong answers.
if len(results) > 0:
    print(f"TT Ratio: for noise ratio of {noise_rte} for model {response_model}   is {tt / len(results)}")
else:
    print("No valid results to calculate the TT ratio.")

TT Ratio: for noise ratio of 1.0 for model mistralai/Mistral-7B-Instruct-v0.3   is 0.5


In [None]:
# Number of responses the evaluation loop scored as successful.
tt

265

In [None]:
# Total number of rows in the dataset.
rgb_data.shape[0]

300

In [None]:
# Per-sample label lists collected by the evaluation loop.
results

[[0]]