#### This file is used to generate German-language test data and to run the tests for Noise Robustness, Negative Rejection and Information Integration.

# Vorbereitung:
## RAG Objekt initialisieren, mit dem Daten für die Tests generiert werden:
Es wird ein RAG-Objekt initialisiert, das benötigt wird, um die Dokumente, auf denen getestet werden soll, als Vektor-Embeddings zu speichern.

In [1]:
# !pip uninstall -y tensorboard tensorboard-data-server tensorboard-plugin-wit tensorflow tensorflow-estimator tensorflow-io-gcs-filesystem
# !pip uninstall -y nvidia-cublas-cu11 nvidia-cublas-cu12 nvidia-cuda-cupti-cu11 nvidia-cuda-cupti-cu12 nvidia-cuda-nvrtc-cu11 nvidia-cuda-nvrtc-cu12 nvidia-cuda-runtime-cu11 nvidia-cuda-runtime-cu12 nvidia-cudnn-cu11 nvidia-cudnn-cu12 nvidia-cufft-cu11 nvidia-cufft-cu12 nvidia-curand-cu11 nvidia-curand-cu12 nvidia-cusolver-cu11 nvidia-cusolver-cu12 nvidia-cusparse-cu11 nvidia-cusparse-cu12 nvidia-nccl-cu11 nvidia-nccl-cu12 nvidia-nvjitlink-cu12 nvidia-nvtx-cu11 nvidia-nvtx-cu12  
# !pip install --user -r "retrieval_augmented_generation/requirements.txt"
# !pip install -U torch~=2.2.1
# !pip install -U bitsandbytes~=0.43

In [2]:
from retrieval_augmented_generation.rag import RAG
from retrieval_augmented_generation.configs import RetrievalConfig, TextGenerationConfig

from langchain.docstore.document import Document

from langchain_core.prompts import ChatPromptTemplate
import os
import json
from langchain_openai import ChatOpenAI
from langchain.chains import LLMChain
from IPython.display import clear_output

import copy

# Keep asking until the user picks one of the two supported modes.
generate_anything_new = None
while generate_anything_new not in ("1", "2"):
    generate_anything_new = input("Neue oder teilweise neue Test-Daten generieren (1), oder bereits generierte Test-Daten einlesen (2)?")
clear_output()

# From here on the flag is a plain boolean: True means "generate (some) new data".
generate_anything_new = generate_anything_new == "1"

# A RAG object over the Wikipedia directory is created in both modes.
rag = RAG(files_directory="../Wikipedia")

if generate_anything_new:
    # Only the generation mode needs the retrieval side initialised (no text model).
    retrieval_config = RetrievalConfig(collection_name="wikipedia")
    rag._init_nomodel(retrieval_config=retrieval_config)

    # Optionally refresh the stored embeddings; only Y/y/N/n are accepted.
    do_update = ""
    while do_update not in ("Y", "y", "N", "n"):
        do_update = input("Vektor-Embeddings aktualisieren? (Y/N)")
    if do_update in ("Y", "y"):
        rag.update_files()

## Funktionen zum Speichern der Testdaten definieren: 
Damit die Tests vergleichbar sind, können die verwendeten Fragen, Antworten und Kontexte gespeichert werden. Dafür werden zwei Funktionen definiert.

In [3]:
def get_llm_test_data():
    """Load the persisted test data from ``llm_test_data.json``.

    A missing file is treated as an empty data set. The four question lists
    used by the tests are guaranteed to be present in the result, and every
    stored context entry is turned back into a ``Document``.

    Returns:
        dict mapping each list name to its list of question/answer entries.
    """
    try:
        with open('llm_test_data.json', 'r') as handle:
            data = json.load(handle)
    except FileNotFoundError:
        data = {}

    # Make sure every list the notebook works with exists, even if empty.
    for list_name in ('qas', 'qas_with_noise', 'qas_two_answers', 'qas_with_wrong_context'):
        data.setdefault(list_name, [])

    # On disk each context is a plain dict; rebuild the Document objects.
    for entries in data.values():
        for entry in entries:
            entry['context'] = [
                Document(page_content=stored['page_content'], metadata=stored['metadata'])
                for stored in entry['context']
            ]

    return data
    
def persist_llm_test_data(llm_test_data):
    """Write the test data to ``llm_test_data.json``.

    Context entries that are not already plain dicts are serialised as
    ``{'page_content': ..., 'metadata': ...}``. The conversion happens on a
    copy, so the structure passed in still holds its original document
    objects afterwards and can keep being used after saving.

    Args:
        llm_test_data: dict mapping a list name to a list of entries, each
            entry being a dict with a ``'context'`` list.
    """
    serializable = {}
    for qas_name, qa_pairs in llm_test_data.items():
        converted_pairs = []
        for qa_pair in qa_pairs:
            # Shallow copy keeps key order and leaves the caller's entry untouched.
            converted = dict(qa_pair)
            converted['context'] = [
                doc if isinstance(doc, dict)
                else {'page_content': doc.page_content, 'metadata': doc.metadata}
                for doc in qa_pair['context']
            ]
            converted_pairs.append(converted)
        serializable[qas_name] = converted_pairs

    with open('llm_test_data.json', 'w') as file:
        json.dump(serializable, file)

## Fragen als Ausgangspunkt für die Tests generieren:

In [4]:
llm_test_data = get_llm_test_data()

# NOTE(review): placeholder credential; a real key must be supplied here for
# the ChatOpenAI calls below to work.
os.environ["OPENAI_API_KEY"] = "XXXX"

if generate_anything_new:
    # Ask whether the base questions should be regenerated or reused.
    generate_new_questions = ""
    while generate_new_questions not in ("1", "2"):
        generate_new_questions = input("Neue Fragen generieren (1), oder bereits generierte Fragen einlesen (2)?")
    clear_output()
    generate_new_questions = generate_new_questions == "1"

if generate_anything_new and generate_new_questions:

    # an even number of questions is optimal
    n_questions = 100

    # Fetch more documents than needed, since many are rejected along the way.
    random_docs = rag._get_random_docs(int(n_questions*2))

    llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
    prompt = ChatPromptTemplate.from_messages([
        ("system", "Sie sind ein hilfreicher Assistent, der dabei hilft, Fragen für eine Open-Book-Prüfung zu erstellen. Sie erhalten einen Textausschnitt, zu dem Sie eine Frage generieren, die sich mithilfe des Textausschnitts beantworten lassen muss. Es muss auch möglich sein, dass die beantwortende Person die Antwort aus einer anderen Quelle mit den gleichen Informationen recherchiert. In der Frage darf deshalb nicht davon ausgegangen werden, dass die beantwortende Person den gleichen Textausscnitt vorliegen hat. Beispiele für gute Themen für Fragen sind zum Beispiel Namen, Abkürzungen, Zahlen, Adressen oder andere nachprüfbare Fakten. Die Beispiele sind nicht zu eng zu sehen, die Antwort sollte aber maximal vier Wort lang sein. Geben Sie das Ergebnis wie in diesem Beispiel an:\n*Frage*Welche Farbe haben Bananen?*Antwort*Gelb\nEs ist möglich, dass der gegebene Textausschnitt keine zufriedenstellende Grundlage bietet, um ein Frage-Antwort-Paar zu generieren, etwa weil der Text zu unspezifisch ist, keinen sinnvollen Inhalt hat, die beantwortende Person nicht wissen könnte, worauf sich die Frage bezieht, oder andere Gründe. Es ist für Sie besser, bei Unsicherheit kein Frage-Antwort-Paar zu generieren. Antworten Sie in diesem Fall folgendermaßen:\n*Textausschnitt ungenügend*"),
        ("human", "{document}"),
    ])
    llm_chain = LLMChain(prompt=prompt, llm=llm)

    qas = []
    while len(qas) < n_questions:
        # Refill the pool whenever it runs dry.
        if len(random_docs) == 0:
            random_docs = rag._get_random_docs(int(n_questions*2))

        doc = random_docs.pop()
        gpt_reply = llm_chain.invoke({'document':doc.page_content})['text']
        if "*Textausschnitt ungenügend*" in gpt_reply:
            print("Kontext ungenügend\n\nKontext:\n", doc.page_content, "\n")
            print("-----------------------------------------------------------\n")
            continue

        reply_parts = gpt_reply.replace("*Frage*","").split("*Antwort*")
        if len(reply_parts) < 2:
            # The model ignored the requested "*Frage*...*Antwort*..." format.
            # Skip this document instead of crashing, which would discard
            # every question accepted so far (they are only saved at the end).
            print("Antwort nicht im erwarteten Format, wird übersprungen:\n", gpt_reply, "\n")
            print("-----------------------------------------------------------\n")
            continue

        qa_pair = {'question': reply_parts[0], 'answer': reply_parts[1], 'context':[doc]}
        print("Frage:\n", qa_pair['question'], "\n\nAntwort:\n", qa_pair['answer'], "\n\nKontext:\n", qa_pair['context'][0].page_content, "\n")

        # Manual quality gate: only pairs confirmed by the user are kept.
        is_good = ""
        while is_good not in ("Y", "y", "N", "n"):
            is_good = input("Zufriedenstellend? (Y/N)")
        if is_good in ("Y", "y"):
            qas.append(qa_pair)
        clear_output()

    print("Erfolgreich", len(qas), "neue Frage-Antwort-Paare generiert")
    llm_test_data['qas'] = qas
    persist_llm_test_data(llm_test_data)


### Noise Robustness Test:
Fragen mit relevantem Kontext und zufälligem Kontext kombinieren.

In [5]:
# Largest number of context documents handed to the model per question
# (the relevant document plus random ones).
n_max_contexts = 5

llm_test_data = get_llm_test_data()

if generate_anything_new:
    # Ask whether the noise documents should be regenerated or reused.
    generate_new_noise = None
    while generate_new_noise not in ("1", "2"):
        generate_new_noise = input("Neue Noise generieren (1), oder bereits generierte Fragen und Noise einlesen (2)?")
    clear_output()
    generate_new_noise = generate_new_noise == "1"

if generate_anything_new and generate_new_noise:

    # Work on a copy so the base questions keep their single context document.
    qas_with_noise = copy.deepcopy(llm_test_data['qas'])

    n_noise_docs = n_max_contexts - 1
    for qa_pair in qas_with_noise:
        # Random documents are appended after the document already present.
        qa_pair['context'].extend(rag._get_random_docs(n_noise_docs))

    print("Erfolgreich", len(qas_with_noise), "Frage-Antwort-Paare mit zufälligem Kontext kombiniert")
    llm_test_data['qas_with_noise'] = qas_with_noise
    persist_llm_test_data(llm_test_data)

### Information Integration Test:

Zwei Frage-Antwort-Paare zu einem Frage-Antwort-Paar kombinieren.

In [6]:
llm_test_data = get_llm_test_data()

if generate_anything_new:
    # Ask whether the combined questions should be regenerated or reused.
    generate_new_question_combinations = None
    while generate_new_question_combinations not in ("1", "2"):
        generate_new_question_combinations = input("Neue Kombinationen aus Zwei Frage-Antwort-Paaren generieren (1), oder bereits generierte einlesen (2)?")
    clear_output()
    generate_new_question_combinations = generate_new_question_combinations == "1"

if generate_anything_new and generate_new_question_combinations:

    qas_two_answers_tmp = copy.deepcopy(llm_test_data['qas'])

    # Pairs are needed, so a trailing unpaired question is dropped.
    if len(qas_two_answers_tmp) % 2 == 1:
        qas_two_answers_tmp = qas_two_answers_tmp[:-1]

    # Entries at odd positions become the combined entries; the entry at the
    # preceding even position supplies the second question/answer/context.
    qas_two_answers = qas_two_answers_tmp[1::2]
    qas_two_answers_2ndhalf = qas_two_answers_tmp[0::2]

    llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.5)
    prompt = ChatPromptTemplate.from_messages([
        ("system", "Sie sind ein hilfreicher Assistent, der dabei hilft, zwei in zwei Sätzen formulierte Fragen, in einem Satz zu formulieren. Es muss unbedingt der Inhalt beider Fragen vollständig vorhanden bleiben. Antworten Sie nur mit dem resultierenden Satz."),
        ("human", "{question1}\n{question2}"),
    ])
    llm_chain = LLMChain(prompt=prompt, llm=llm)

    for combined, partner in zip(qas_two_answers, qas_two_answers_2ndhalf):
        question_1 = combined['question']
        question_2 = partner['question']

        # Replace the single answer by the two expected answers and merge
        # the partner's context document into the combined entry.
        answer_1 = combined.pop('answer')
        combined['answer1'] = answer_1
        combined['answer2'] = partner['answer']
        combined['context'].append(partner['context'].pop())

        # Regenerate the merged question until the user accepts one.
        accepted = False
        while not accepted:
            gpt_reply = llm_chain.invoke({'question1': question_1, 'question2': question_2})['text']
            print("Frage 1:\n", question_1, "\nFrage 2:\n", question_2, "\n\nkombinierte Frage:\n", gpt_reply, "\n")

            is_good = ""
            while is_good not in ("Y", "y", "N", "n"):
                is_good = input("Zufriedenstellend? (Y/N)")
            if is_good in ("Y", "y"):
                combined['question'] = gpt_reply
                accepted = True
            clear_output()

    print("Erfolgreich", len(qas_two_answers), "Mal zwei Frage-Antwort-Paare kombiniert")
    llm_test_data['qas_two_answers'] = qas_two_answers
    persist_llm_test_data(llm_test_data)

### Negative Rejection Test:
Fragen mit zufälligem Kontext kombinieren.

In [7]:
llm_test_data = get_llm_test_data()

if generate_anything_new:
    # Ask whether the wrong-context data should be regenerated or reused.
    generate_new_context = None
    while generate_new_context not in ("1", "2"):
        generate_new_context = input("Neuen Kontext zu Fragen generieren (1), oder bereits generierte Fragen und Kontext einlesen (2)?")
    clear_output()
    generate_new_context = generate_new_context == "1"

if generate_anything_new and generate_new_context:

    qas_with_wrong_context = copy.deepcopy(llm_test_data['qas'])

    for qa_pair in qas_with_wrong_context:
        # Swap the last context document for a randomly drawn one.
        contexts = qa_pair['context']
        contexts.pop()
        contexts.append(rag._get_random_docs(1)[0])

    print("Erfolgreich", len(qas_with_wrong_context), "Frage-Antwort-Paare mit zufälligem Kontext kombiniert")
    llm_test_data['qas_with_wrong_context'] = qas_with_wrong_context
    persist_llm_test_data(llm_test_data)

# Durchführung der Tests
## Initialisierung eines RAG Objekts mit dem zu verwendenden Modell


In [None]:
## rag = RAG(files_directory="../Dokumente")

# Choose the backend: "1" = OpenAI, "2" = Huggingface.
openai_or_huggingface = ""
while openai_or_huggingface not in ("1", "2"):
    openai_or_huggingface = input("Mit OpenAI-Modell initialisieren (1), oder mit Huggingface-Modell initialisieren (2)?")
clear_output(wait=True)

# Maps the label under which results are stored to the generation config.
# Several labels may point at the same underlying model (e.g. "_new-template").
models = {}

if openai_or_huggingface == "1":
    models = {
        "gpt-3.5-turbo-0125": TextGenerationConfig(text_generation_model_name="gpt-3.5-turbo-0125"),
        "gpt-4o-2024-05-13": TextGenerationConfig(text_generation_model_name="gpt-4o-2024-05-13"),

        "gpt-4o-2024-05-13_new-template": TextGenerationConfig(text_generation_model_name="gpt-4o-2024-05-13")
    }
elif openai_or_huggingface == "2":
    models = {
        "mistralai/Mistral-7B-Instruct-v0.3": TextGenerationConfig(text_generation_model_name="mistralai/Mistral-7B-Instruct-v0.3"),
        "LeoLM/leo-mistral-hessianai-7b-chat": TextGenerationConfig(text_generation_model_name="LeoLM/leo-mistral-hessianai-7b-chat"),
        # can't get chat template to work:
        "flozi00/Mistral-7B-german-assistant-v4 - custom template": TextGenerationConfig(text_generation_model_name="flozi00/Mistral-7B-german-assistant-v4", custom_chat_template="{system-message} </s>### User: {user-message} </s>### Assistant:"),
        # can't get chat template to work:
        "OpenBuddy/openbuddy-mistral2-7b-v20.3-32k": TextGenerationConfig(text_generation_model_name="OpenBuddy/openbuddy-mistral2-7b-v20.3-32k", custom_chat_template="{system-message}\n\n{user-message}\nAntwort:"),

        "mistralai/Mixtral-8x7B-Instruct-v0.1": TextGenerationConfig(text_generation_model_name="mistralai/Mixtral-8x7B-Instruct-v0.1"),
        "VAGOsolutions/SauerkrautLM-Mixtral-8x7B-Instruct": TextGenerationConfig(text_generation_model_name="VAGOsolutions/SauerkrautLM-Mixtral-8x7B-Instruct"),
        "OpenBuddy/openbuddy-mixtral-7bx8-v18.1-32k": TextGenerationConfig(text_generation_model_name="OpenBuddy/openbuddy-mixtral-7bx8-v18.1-32k", custom_chat_template="{system-message}\n\n{user-message}\nAssistant:"),

        "meta-llama/Llama-2-7b-chat-hf": TextGenerationConfig(text_generation_model_name="meta-llama/Llama-2-7b-chat-hf"),
        "flozi00/Llama-2-7b-german-assistant-v3": TextGenerationConfig(text_generation_model_name="flozi00/Llama-2-7b-german-assistant-v3"),

        # repeated aborts with more than 3 context documents:
        "meta-llama/Llama-2-13b-chat-hf": TextGenerationConfig(text_generation_model_name="meta-llama/Llama-2-13b-chat-hf"),
        "flozi00/Llama-2-13b-german-assistant-v6": TextGenerationConfig(text_generation_model_name="flozi00/Llama-2-13b-german-assistant-v6"),
        "ass-a2s/Llama-2-13b-chat-german": TextGenerationConfig(text_generation_model_name="ass-a2s/Llama-2-13b-chat-german"),

        "meta-llama/Meta-Llama-3-8B-Instruct": TextGenerationConfig(text_generation_model_name="meta-llama/Meta-Llama-3-8B-Instruct"),
        "DiscoResearch/Llama3-DiscoLeo-Instruct-8B-v0.1": TextGenerationConfig(text_generation_model_name="DiscoResearch/Llama3-DiscoLeo-Instruct-8B-v0.1"),
        "VAGOsolutions/Llama-3-SauerkrautLM-8b-Instruct": TextGenerationConfig(text_generation_model_name="VAGOsolutions/Llama-3-SauerkrautLM-8b-Instruct"),
        "OpenBuddy/openbuddy-llama3-8b-v21.1-8k": TextGenerationConfig(text_generation_model_name="OpenBuddy/openbuddy-llama3-8b-v21.1-8k"),

        "google/gemma-1.1-7b-it": TextGenerationConfig(text_generation_model_name="google/gemma-1.1-7b-it"),
        # can't get chat template to work:
        "OpenBuddy/openbuddy-gemma-7b-v19.1-4k": TextGenerationConfig(text_generation_model_name="OpenBuddy/openbuddy-gemma-7b-v19.1-4k", custom_chat_template="{system-message}\n\n{user-message}\nAssistant:"),
        # can't get chat template to work and stopping string would need to be manually added:
        "VAGOsolutions/SauerkrautLM-Gemma-7b": TextGenerationConfig(text_generation_model_name="VAGOsolutions/SauerkrautLM-Gemma-7b", custom_chat_template="{system-message}\n\n{user-message}\nASSISTANT:"),

        "VAGOsolutions/SauerkrautLM-Mixtral-8x7B-Instruct_new-template": TextGenerationConfig(text_generation_model_name="VAGOsolutions/SauerkrautLM-Mixtral-8x7B-Instruct"),
        "VAGOsolutions/Llama-3-SauerkrautLM-8b-Instruct_new-template": TextGenerationConfig(text_generation_model_name="VAGOsolutions/Llama-3-SauerkrautLM-8b-Instruct"),
    }

# Show the numbered menu until a valid entry number has been typed.
model_nr = 0
while not (1 <= model_nr <= len(models)):
    for (i, model_name) in enumerate(models):
        print(i+1, ": ", model_name)
    try:
        model_nr = int(input("Nr des gewünschten Text-Generation-Models:"))
    except ValueError:
        # Non-numeric input: treat it as invalid and show the menu again
        # instead of aborting the cell.
        model_nr = 0
model_name, text_generation_config = list(models.items())[model_nr-1]

def initialize_model():
    """Create a fresh RAG object and initialise it for the selected backend.

    Reads the module-level ``openai_or_huggingface`` choice ("1" = OpenAI,
    "2" = Huggingface) and the module-level ``text_generation_config``.
    For any other choice the RAG object is returned without a model being
    initialised.

    Returns:
        The newly created RAG object.
    """
    new_rag = RAG(files_directory="../Wikipedia")

    if openai_or_huggingface == "1":
        new_rag.init_openai(
            open_ai_key="XXXX",
            embedding_model_name="text-embedding-ada-002",
            text_generation_config=text_generation_config
        )
        return new_rag

    if openai_or_huggingface == "2":
        hf_retrieval_config = RetrievalConfig(
            embedding_model_name="intfloat/multilingual-e5-base",
            embedding_query_template="{text}",
            retrieval_query_template="query:{question}"
        )
        new_rag.init_huggingface(
            hf_transformers_cache_dir="./../../hf_transformers_cache",
            hf_hub_api_key="hf_XXXX",
            retrieval_config=hf_retrieval_config,
            text_generation_config=text_generation_config
        )

    return new_rag
        
# Build the RAG object for the backend and model selected above; this rebinds
# the module-level `rag` used by the test loop.
rag = initialize_model()

1 :  mistralai/Mistral-7B-Instruct-v0.3
2 :  LeoLM/leo-mistral-hessianai-7b-chat
3 :  flozi00/Mistral-7B-german-assistant-v4 - custom template
4 :  OpenBuddy/openbuddy-mistral2-7b-v20.3-32k
5 :  mistralai/Mixtral-8x7B-Instruct-v0.1
6 :  VAGOsolutions/SauerkrautLM-Mixtral-8x7B-Instruct
7 :  OpenBuddy/openbuddy-mixtral-7bx8-v18.1-32k
8 :  meta-llama/Llama-2-7b-chat-hf
9 :  flozi00/Llama-2-7b-german-assistant-v3
10 :  meta-llama/Llama-2-13b-chat-hf
11 :  flozi00/Llama-2-13b-german-assistant-v6
12 :  ass-a2s/Llama-2-13b-chat-german
13 :  meta-llama/Meta-Llama-3-8B-Instruct
14 :  DiscoResearch/Llama3-DiscoLeo-Instruct-8B-v0.1
15 :  VAGOsolutions/Llama-3-SauerkrautLM-8b-Instruct
16 :  OpenBuddy/openbuddy-llama3-8b-v21.1-8k
17 :  google/gemma-1.1-7b-it
18 :  OpenBuddy/openbuddy-gemma-7b-v19.1-4k
19 :  VAGOsolutions/SauerkrautLM-Gemma-7b
20 :  VAGOsolutions/SauerkrautLM-Mixtral-8x7B-Instruct_new-template
21 :  VAGOsolutions/Llama-3-SauerkrautLM-8b-Instruct_new-template


Nr des gewünschten Text-Generation-Models: 20




init db 0


Loading checkpoint shards:   0%|          | 0/19 [00:00<?, ?it/s]

## Ergebnisse generieren und manuell bewerten

In [None]:

# All derived data sets must stem from the same question list; the two-answer
# set holds one entry per pair of questions, hence the factor 2.
if not (len(llm_test_data['qas']) == len(llm_test_data['qas_with_noise']) == len(llm_test_data['qas_two_answers'])*2 == len(llm_test_data['qas_with_wrong_context'])):
    raise Exception("Fragen in unterschiedlicher Anzahl liegen vor. Test kann nicht durchgeführt werden.")


def _generate_answer(question, context):
    """Ask the model; on failure rebuild the RAG object and return a failure marker."""
    global rag
    try:
        return rag._ask_with_custom_context(question=question, context=context)['answer']
    except Exception as e:
        # Deliberately broad: any generation failure is recorded as a failed
        # answer so that the remaining questions can still be processed.
        print(e)
        # Drop the old object before creating its replacement
        # (presumably to release the previous model first).
        del rag
        print("initializing new object")
        rag = initialize_model()
        return "!!!TEXT GENERATION FAILED!!!"


def _ask_yes_no(prompt_text):
    """Prompt until the user answers Y/y or N/n; return True for yes."""
    while True:
        reply = input(prompt_text)
        if reply in ("Y", "y"):
            return True
        if reply in ("N", "n"):
            return False


# Answers are produced in the first pass and consumed, in the same order,
# by the manual rating in the second pass.
all_generated_answers = []

for generate_or_rate in range(2):
    # Pass 0 generates all answers, pass 1 lets the user rate them.

    # Number of correct answers per context count (1 .. n_max_contexts).
    qas_with_noise_correct = {n: 0 for n in range(1, n_max_contexts + 1)}
    qas_two_answers_correct = 0
    qas_with_wrong_context_rejected = 0

    for i_qa in range(len(llm_test_data['qas'])):
        print((i_qa+1), "/", len(llm_test_data['qas']), "\n")

        # --- Noise robustness: same question with 1 .. n_max_contexts documents ---
        noise_qa = llm_test_data['qas_with_noise'][i_qa]
        for n_contexts in range(1, n_max_contexts + 1):
            if generate_or_rate == 0:
                to_be_rated = {
                    "question": noise_qa['question'],
                    "correct_answer": noise_qa['answer'],
                    "given_answer": _generate_answer(noise_qa['question'], noise_qa['context'][:n_contexts])
                }
                all_generated_answers.append(to_be_rated)
            else:
                to_be_rated = all_generated_answers.pop(0)
            print("Frage:\n" , to_be_rated['question'], "\n\nKorrekte Antwort:\n", to_be_rated['correct_answer'], "\n\ngegebene Antwort:\n", to_be_rated['given_answer'], "\n")
            if generate_or_rate == 1:
                if _ask_yes_no("Ist die gegebene Antwort KORREKT? (Y/N)"):
                    qas_with_noise_correct[n_contexts] = qas_with_noise_correct[n_contexts] + 1
            print("\n-----------------------------------------------------------\n")

        # --- Information integration: one combined question per two base questions ---
        if i_qa % 2 == 1:
            if generate_or_rate == 0:
                two_answer_qa = llm_test_data['qas_two_answers'][i_qa // 2]
                # The record always carries the combined question, also when
                # generation fails, so the rater sees what was actually asked.
                to_be_rated = {
                    "question": two_answer_qa['question'],
                    "correct_answer1": two_answer_qa['answer1'],
                    "correct_answer2": two_answer_qa['answer2'],
                    "given_answer": _generate_answer(two_answer_qa['question'], two_answer_qa['context'])
                }
                all_generated_answers.append(to_be_rated)
            else:
                to_be_rated = all_generated_answers.pop(0)
            print("Frage:\n" , to_be_rated['question'], "\n\nKorrekte Antwort, erste Hälfte:\n", to_be_rated['correct_answer1'], "\n\nKorrekte Antwort, zweite Hälfte:\n", to_be_rated['correct_answer2'], "\n\ngegebene Antwort:\n", to_be_rated['given_answer'], "\n")
            if generate_or_rate == 1:
                if _ask_yes_no("Liefert die Antwort BEIDE Antworten KORREKT? (Y/N)"):
                    qas_two_answers_correct = qas_two_answers_correct + 1
            print("\n-----------------------------------------------------------\n")

        # --- Negative rejection: question paired with an unrelated document ---
        if generate_or_rate == 0:
            wrong_context_qa = llm_test_data['qas_with_wrong_context'][i_qa]
            to_be_rated = {
                "question": wrong_context_qa['question'],
                "given_answer": _generate_answer(wrong_context_qa['question'], wrong_context_qa['context'])
            }
            all_generated_answers.append(to_be_rated)
        else:
            to_be_rated = all_generated_answers.pop(0)
        print("Frage:\n" , to_be_rated['question'], "\n\ngegebene Antwort:\n", to_be_rated['given_answer'], "\n")
        if generate_or_rate == 1:
            if _ask_yes_no("Wird darauf hingewiesen, dass die Frage NICHT mit dem Kontext beantwortet werden kann? (Y/N)"):
                qas_with_wrong_context_rejected = qas_with_wrong_context_rejected + 1
        clear_output()
        # TODO: temporary debugging aid, meant to be removed again.
        # NOTE(review): only the most recent record is written, overwriting
        # the file on every iteration - confirm whether the full list of
        # generated answers was meant to be saved here.
        with open('mixtral_to_be_rated.json', 'w') as file:
            json.dump(to_be_rated, file)
            print("Daten gespeichert.")


# Load the results of earlier runs so that other models' entries are kept.
try:
    with open('llm_test_results.json', 'r') as file:
        llm_test_results = json.load(file)
except FileNotFoundError:
    llm_test_results = {}

total_rated_questions = len(llm_test_data['qas'])

# Convert the manual rating counts into shares of all questions; the
# integration test has one item per two questions.
model_results = {
    'noise_robustness': {
        n_contexts: qas_with_noise_correct[n_contexts] / total_rated_questions
        for n_contexts in range(1, n_max_contexts + 1)
    },
    'information_integration': qas_two_answers_correct / (total_rated_questions / 2),
    'negative_rejection': qas_with_wrong_context_rejected / total_rated_questions,
}
# Any previous entry for this model label is replaced.
llm_test_results[model_name] = model_results

# With k context documents, k-1 of them are noise.
noise_robustness_string = "".join(
    f"{n_contexts - 1}x Noise: {share} "
    for n_contexts, share in model_results['noise_robustness'].items()
)
print("Noise Robustness: ", noise_robustness_string)
print("Information Integration:", str(model_results['information_integration']))
print("Negative Rejection:", str(model_results['negative_rejection']))

with open('llm_test_results.json', 'w') as file:
    json.dump(llm_test_results, file)