# Choose Model Llama 3.2 with Fine-Tuning and Lora Layers

In [1]:
import torch

# Report GPU availability. The device name is only queried when CUDA is present,
# because torch.cuda.get_device_name raises on a machine without a CUDA device.
print("CUDA Available: ", torch.cuda.is_available())
if torch.cuda.is_available():
    print("CUDA Device Name: ", torch.cuda.get_device_name(0))
    # Release cached GPU memory left over from earlier runs in this kernel.
    torch.cuda.empty_cache()

# Use CUDA to speed up processing when available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Usando dispositivo: {device}")

CUDA Available:  True
CUDA Device Name:  NVIDIA GeForce RTX 3050 Ti Laptop GPU
Usando dispositivo: cuda


## Llama 3.2 LoRA (Fine-Tuned)

In [2]:
from unsloth import FastLanguageModel
import torch

2024-11-02 14:16:27.228102: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-02 14:16:27.239803: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-02 14:16:27.255598: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-02 14:16:27.260176: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-02 14:16:27.271828: I tensorflow/core/platform/cpu_feature_guar

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [3]:
# Loading options forwarded to FastLanguageModel.from_pretrained in the next cell.
max_seq_length = 8192 # Maximum sequence length requested from the loader. Choose any! RoPE Scaling is auto-supported internally.
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

In [4]:
# Local directory of the checkpoint to load (relative to the notebook's working directory).
model_path = "../Models/llama_3.2_FT_lora_4000_questions"
# model_path = "model_3.2_lora_4bits"

# Load the model and the tokenizer; from_pretrained returns both as a pair.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_path,
    max_seq_length=max_seq_length,
    dtype = dtype,
    load_in_4bit=load_in_4bit
)

==((====))==  Unsloth 2024.10.6: Fast Llama patching. Transformers = 4.46.0.
   \\   /|    GPU: NVIDIA GeForce RTX 3050 Ti Laptop GPU. Max memory: 3.712 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.0+cu124. CUDA = 8.6. CUDA Toolkit = 12.4.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.27.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Unsloth 2024.10.6 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [5]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 3072)
        (layers): ModuleList(
          (0-27): 28 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3072, out_features=3072, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=3072, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear4bit(
      

# Dataset TeleQnA for Inference

In [8]:
import json

# Path to the JSON file holding the processed TeleQnA questions
rel17_100_questions_path = r"../Files/rel17_100_questions.json"

# Load the TeleQnA questions (Release 17 subset, per the file name) into memory
with open(rel17_100_questions_path, "r", encoding="utf-8") as file:
    rel17_100_questions = json.load(file)
# Sanity check: number of question records loaded
print(len(rel17_100_questions))

100


In [7]:
rel17_100_questions[0]

{'question': 'Which NGAP procedure is used for inter-system load balancing? [3GPP Release 17]',
 'option 1': 'eNB Configuration Transfer',
 'option 2': 'Downlink RAN Configuration Transfer',
 'option 3': 'Uplink RAN Configuration Transfer',
 'option 4': 'MME Configuration Transfer',
 'answer': 'option 3: Uplink RAN Configuration Transfer',
 'explanation': 'The NGAP procedure used for inter-system load balancing is Uplink RAN Configuration Transfer.',
 'category': 'Standards overview'}

# Create prompt and Ask function for Llama 3.2 lora (With Fine-Tuning)

In [8]:
from unsloth.chat_templates import get_chat_template

def ask_llama_3_2_lora(model, tokenizer, question_data,
                       max_new_tokens=2048, temperature=1.5, min_p=0.1):
    """
    Ask the model a multiple-choice question and return the decoded transcript.

    Parameters:
    - model: The language model loaded for inference (must expose `generate`).
    - tokenizer: The tokenizer configured with `get_chat_template`.
    - question_data: Dictionary with a 'question' entry and one entry per option
      (every key containing 'option' is listed in the prompt; other keys are ignored).
    - max_new_tokens: Upper bound on generated tokens, forwarded to `generate`.
    - temperature: Sampling temperature, forwarded to `generate`.
    - min_p: Min-p sampling threshold, forwarded to `generate`.

    Returns:
    - String: The first decoded sequence returned by `generate`, with special
      tokens removed. In this notebook that text contains the prompt followed by
      the reply, which is why downstream helpers split on the 'assistant' marker.
    """

    # Extract question and options
    question = question_data['question']
    options = [f"{key}: {value}" for key, value in question_data.items() if 'option' in key]

    # Create the prompt with the question and options
    prompt = (
        f"Question: {question}\n"
        f"Options:\n" + "\n".join(options) + "\n"
        "Think step by step before answering and respond with the correct option in the format 'correct option: <X>'.\n"
    )

    # Send the inputs to wherever the model lives instead of assuming "cuda".
    target_device = getattr(model, "device", None)
    if target_device is None:
        target_device = "cuda" if torch.cuda.is_available() else "cpu"

    # Create the input for the model
    messages = [{"role": "user", "content": prompt}]
    input_ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(target_device)

    # A single, unpadded prompt: every position is a real token, so the mask is
    # all ones. Passing it explicitly avoids the "attention mask is not set"
    # warning raised when the pad token equals the eos token.
    attention_mask = torch.ones_like(input_ids)

    # Generate the response
    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        use_cache=True,
        temperature=temperature,
        min_p=min_p,
    )

    # Decode and return the model's output
    response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    return response


In [9]:
# Example usage
model = FastLanguageModel.for_inference(model)  # Enable faster inference
tokenizer = get_chat_template(tokenizer, chat_template="llama-3.1")

question_data = {
    'question': 'Which physical channel informs the UE and the RN about the number of OFDM symbols used for the PDCCHs? [3GPP Release 17]',
    'option 1': 'PBCH',
    'option 2': 'PCFICH',
    'option 3': 'PDSCH',
    'option 4': 'PHICH',
    'answer': 'option 2: PCFICH',
    'explanation': 'The physical control format indicator channel (PCFICH) informs the UE and the RN about the number of OFDM symbols used for the PDCCHs.',
    'category': 'Standards specifications'
}

llama_3_2_lora_response = ask_llama_3_2_lora(model, tokenizer, question_data)
print(llama_3_2_lora_response)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


system

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

user

Question: Which physical channel informs the UE and the RN about the number of OFDM symbols used for the PDCCHs? [3GPP Release 17]
Options:
option 1: PBCH
option 2: PCFICH
option 3: PDSCH
option 4: PHICH
Think step by step before answering and respond with the correct option in the format 'correct option: <X>'.
assistant

Answer: option 2: PCFICH
Explanation: The PDCCH format indicator channel informs the UE and the RN about the number of OFDM symbols used for the PDCCHs, which is indicated by the PCFICH.


# Accuracy evaluation

## Evaluate Question 

In [10]:
import re

def extract_option(answer):
    """
    Pull the first 'option N' token out of an answer string.

    The text is lowercased and stripped of punctuation before searching, so
    'Option 2: PCFICH' and 'correct option: 2' both yield 'option 2'.

    Parameters:
    - answer: A string expected to contain the answer in the form 'option X: ...'.

    Returns:
    - String: The matched option (e.g., 'option 2'), or None when nothing matches.
    """
    lowered = answer.lower()
    # Drop every character that is neither a word character nor whitespace
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    # Look for the literal word 'option', one space, then digits
    found = re.search(r'option \d+', without_punctuation)
    if found is None:
        return None
    return found.group(0).strip()

In [11]:
def extract_response_after_assistant(response):
    """
    Return the text that follows the first 'assistant' marker in a transcript.

    Parameters:
    - response: The complete response from the model.

    Returns:
    - String: Everything after the first 'assistant' occurrence, stripped of
      surrounding whitespace; the whole stripped response when the marker is absent.
    """
    _before, marker, after = response.partition('assistant')
    if marker:
        # Marker found: keep only what comes after it
        return after.strip()
    # No marker: fall back to the full text
    return response.strip()

In [12]:
def evaluate_model_response(model_response, question_data):
    """
    Compare the model's response with the correct answer from the question data.

    Parameters:
    - model_response: The response string generated by the model.
    - question_data: Dictionary containing at least the correct answer under 'answer'.

    Returns:
    - 1 if the model chose the correct option; otherwise the option extracted
      from the model's reply, which is None when the reply contains no option.
    """
    correct_option = extract_option(question_data['answer'])  # Extract correct option
    relevant_response = extract_response_after_assistant(model_response)  # Get relevant part of response
    model_option = extract_option(relevant_response)  # Extract model's option

    # A reply without any option can never be correct. Without this guard a
    # None/None comparison (no option in the key either) would count as a hit.
    if model_option is not None and model_option == correct_option:
        return 1
    return model_option


In [13]:
question_data = {
    'question': 'Which physical channel informs the UE and the RN about the number of OFDM symbols used for the PDCCHs? [3GPP Release 17]',
    'option 1': 'PBCH',
    'option 2': 'PCFICH',
    'option 3': 'PDSCH',
    'option 4': 'PHICH',
    'answer': 'option 2: PCFICH',
    'explanation': 'The physical control format indicator channel (PCFICH) informs the UE and the RN about the number of OFDM symbols used for the PDCCHs.',
    'category': 'Standards specifications'
}

In [14]:
evaluation_result = evaluate_model_response(llama_3_2_lora_response, question_data)
print(evaluation_result)

1


## Ask the Llama 3.2 LoRA model the 100 TeleQnA questions

In [15]:
def evaluate_questions(model, tokenizer, questions):
    """
    Ask the model every question (with its options) and collect the replies.

    Parameters:
    - model: The language model loaded for inference.
    - tokenizer: The tokenizer configured with `get_chat_template`.
    - questions: List of question dictionaries. Each one is passed unchanged to
      `ask_llama_3_2_lora` and must provide:
        - 'question': The question text.
        - 'answer': The correct answer (e.g., 'option 2: PCFICH').

    Returns:
    - List: One dictionary per input question, in input order, with:
        - 'question': The question as a string.
        - 'answer': The correct answer as a string.
        - 'response': The model's generated response for that question.
    """
    total_questions = len(questions)
    responses = []

    for position, question_data in enumerate(questions, start=1):
        # Query the model first, then record the reply next to the reference answer
        generated = ask_llama_3_2_lora(model, tokenizer, question_data)
        record = {
            "question": question_data['question'],
            "answer": question_data['answer'],
            "response": generated,
        }
        responses.append(record)

        # Print progress
        print(f"Responded {position} of {total_questions} questions...")

    return responses

In [16]:
model = FastLanguageModel.for_inference(model)  # Enable faster inference
tokenizer = get_chat_template(tokenizer, chat_template="llama-3.1")

# Process all questions and get responses
responses_llama_3_2_lora = evaluate_questions(model, tokenizer, rel17_100_questions)

Responded 1 of 100 questions...
Responded 2 of 100 questions...
Responded 3 of 100 questions...
Responded 4 of 100 questions...
Responded 5 of 100 questions...
Responded 6 of 100 questions...
Responded 7 of 100 questions...
Responded 8 of 100 questions...
Responded 9 of 100 questions...
Responded 10 of 100 questions...
Responded 11 of 100 questions...
Responded 12 of 100 questions...
Responded 13 of 100 questions...
Responded 14 of 100 questions...
Responded 15 of 100 questions...
Responded 16 of 100 questions...
Responded 17 of 100 questions...
Responded 18 of 100 questions...
Responded 19 of 100 questions...
Responded 20 of 100 questions...
Responded 21 of 100 questions...
Responded 22 of 100 questions...
Responded 23 of 100 questions...
Responded 24 of 100 questions...
Responded 25 of 100 questions...
Responded 26 of 100 questions...
Responded 27 of 100 questions...
Responded 28 of 100 questions...
Responded 29 of 100 questions...
Responded 30 of 100 questions...
Responded 31 of 100

In [17]:
print(responses_llama_3_2_lora[0]['response'])

system

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

user

Question: Which NGAP procedure is used for inter-system load balancing? [3GPP Release 17]
Options:
option 1: eNB Configuration Transfer
option 2: Downlink RAN Configuration Transfer
option 3: Uplink RAN Configuration Transfer
option 4: MME Configuration Transfer
Think step by step before answering and respond with the correct option in the format 'correct option: <X>'.
assistant

Answer: option 3: Uplink RAN Configuration Transfer
Explanation: The Uplink RAN Configuration Transfer procedure is used for inter-system load balancing.


## Save accuracy responses

In [18]:

def save_responses_to_json(responses, filename):
    """
    Save the model responses to a JSON file.

    The parent directory is created when it does not exist yet, so a fresh
    checkout does not lose a long generation run to a missing output folder.

    Parameters:
    - responses: List of responses to save (must be JSON-serialisable).
    - filename: Path of the JSON file to write; an existing file is overwritten.
    """
    import os  # local import: `os` is only imported much later in this notebook

    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Explicit encoding keeps the file independent of the platform default.
    with open(filename, "w", encoding="utf-8") as json_file:
        json.dump(responses, json_file, indent=4)

In [19]:
save_responses_to_json(responses_llama_3_2_lora,"../Models_responses/Accuracy/llama_3.2_lora_responses_2.json")

## Evaluate responses from Llama 3.2 lora

In [20]:
import json

# Reload the responses from the JSON file written by save_responses_to_json above,
# so the evaluation below works from the saved file rather than the in-memory list.
with open("../Models_responses/Accuracy/llama_3.2_lora_responses_2.json", "r") as file:
    responses_llama_3_2_lora = json.load(file)

# Confirm the load; uncomment the loop to dump every response for inspection
print("Responses loaded:")
# for response in responses_llama_3_2_lora:
#     print(response)


Responses loaded:
{'question': 'Which NGAP procedure is used for inter-system load balancing? [3GPP Release 17]', 'answer': 'option 3: Uplink RAN Configuration Transfer', 'response': "system\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\nuser\n\nQuestion: Which NGAP procedure is used for inter-system load balancing? [3GPP Release 17]\nOptions:\noption 1: eNB Configuration Transfer\noption 2: Downlink RAN Configuration Transfer\noption 3: Uplink RAN Configuration Transfer\noption 4: MME Configuration Transfer\nThink step by step before answering and respond with the correct option in the format 'correct option: <X>'.\nassistant\n\nAnswer: option 3: Uplink RAN Configuration Transfer\nExplanation: The Uplink RAN Configuration Transfer procedure is used for inter-system load balancing."}
{'question': 'What is covered by enhanced application layer support for V2X services? [3GPP Release 17]', 'answer': 'option 2: Advanced V2X services', 'response': "system\n\nCutting 

In [21]:
def evaluate_accuracy(responses_llama_3_2_lora, questions=None):
    """
    Evaluate the model's responses, print every miss, and print the accuracy.

    Parameters:
    - responses_llama_3_2_lora: List of dictionaries with 'question', 'answer'
      and 'response' keys, as produced by `evaluate_questions`.
    - questions: Optional list of the original question dictionaries, aligned
      by position with the responses; used only to print the options of a
      missed question. Defaults to the notebook-level `rel17_100_questions`.

    Returns:
    - None. Results are printed.
    """
    if questions is None:
        # Keep the original behaviour of reading the notebook-level question list.
        questions = globals().get('rel17_100_questions', [])

    total = len(responses_llama_3_2_lora)
    correct_count = 0  # Track the number of correct responses
    none_count = 0  # Track the number of 'None' responses

    for index, question_data in enumerate(responses_llama_3_2_lora):
        evaluation_result = evaluate_model_response(question_data['response'], question_data)

        if evaluation_result == 1:
            correct_count += 1  # Increment for correct response
            continue

        # Options come from the aligned question list; tolerate a shorter list
        # instead of failing with IndexError.
        source_question = questions[index] if index < len(questions) else {}
        options = [f"{key}: {value}" for key, value in source_question.items() if 'option' in key]

        print("\nWrong Answer")
        print(f"Question {index + 1}: {question_data['question']}")
        print("Options:\n" + "\n".join(options) + "\n")
        print(f"Correct response: {question_data['answer']}")
        if evaluation_result is None:
            # No option could be extracted: show the whole reply for inspection
            print(f"Full model response:\n{question_data['response']}")
            none_count += 1  # Increment for None response
        else:
            print(f"Model response: {evaluation_result}")
        print("----------------------------------------------------------------------------------------")

    # Calculate and print accuracy (0 for an empty list instead of dividing by zero)
    accuracy = correct_count / total * 100 if total else 0.0
    print(f"\nAccuracy: {accuracy:.2f}%")
    print(f"Total 'None' responses: {none_count}")
    print("'None' responses means that the model did not give an option")


In [22]:
evaluate_accuracy(responses_llama_3_2_lora)


Wrong Answer
Question 2: What is covered by enhanced application layer support for V2X services? [3GPP Release 17]
Options:
option 1: PC5 radio resource control
option 2: Advanced V2X services
option 3: SDAP layer enhancements
option 4: V2X communication over NR PC5 reference point
option 5: Tele-Operated Driving

Correct response: option 2: Advanced V2X services
Model response: option 3
----------------------------------------------------------------------------------------

Wrong Answer
Question 3: What does the Load-Balancing steering mode do? [3GPP Release 17]
Options:
option 1: Splits the traffic of a data flow across 3GPP and non-3GPP accesses
option 2: Balances the load on the core network nodes
option 3: Restricts the UE to a single access type
option 4: Improves network performance measurements

Correct response: option 1: Splits the traffic of a data flow across 3GPP and non-3GPP accesses
Model response: option 2
--------------------------------------------------------------

# RAGAS evaluation

## Create prompt with no option and Ask function for Llama 3.2 lora

In [23]:
from unsloth.chat_templates import get_chat_template

def ask_llama_3_2_no_options(model, tokenizer, question_data,
                             max_new_tokens=2048, temperature=1.5, min_p=0.1):
    """
    Ask the model an open question (no answer options) and return the decoded transcript.

    Parameters:
    - model: The language model loaded for inference (must expose `generate`).
    - tokenizer: The tokenizer configured with `get_chat_template`.
    - question_data: Dictionary containing the question under 'question'.
      Any option entries are deliberately NOT sent to the model.
    - max_new_tokens: Upper bound on generated tokens, forwarded to `generate`.
    - temperature: Sampling temperature, forwarded to `generate`.
    - min_p: Min-p sampling threshold, forwarded to `generate`.

    Returns:
    - String: The first decoded sequence returned by `generate`, with special
      tokens removed. In this notebook that text contains the prompt followed by
      the reply; `extract_answer` relies on the 'assistant' marker in it.
    """

    # Only the question text goes into the prompt
    question = question_data['question']

    # Create the prompt with the question alone
    prompt = (
        f"Question: {question}\n"
        "Think step by step before answering and respond with a final answer in the format 'answer: <XXXXX>'."
    )

    # Send the inputs to wherever the model lives instead of assuming "cuda".
    target_device = getattr(model, "device", None)
    if target_device is None:
        target_device = "cuda" if torch.cuda.is_available() else "cpu"

    # Create the input for the model
    messages = [{"role": "user", "content": prompt}]
    input_ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(target_device)

    # A single, unpadded prompt: every position is a real token, so the mask is
    # all ones. Passing it explicitly avoids the "attention mask is not set"
    # warning raised when the pad token equals the eos token.
    attention_mask = torch.ones_like(input_ids)

    # Generate the response
    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        use_cache=True,
        temperature=temperature,
        min_p=min_p,
    )

    # Decode and return the model's output
    response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    return response


In [24]:
# Example usage
model = FastLanguageModel.for_inference(model)  # Enable faster inference
tokenizer = get_chat_template(tokenizer, chat_template="llama-3.1")

question_data = {
    'question': 'Which physical channel informs the UE and the RN about the number of OFDM symbols used for the PDCCHs? [3GPP Release 17]',
    'option 1': 'PBCH',
    'option 2': 'PCFICH',
    'option 3': 'PDSCH',
    'option 4': 'PHICH',
    'answer': 'option 2: PCFICH',
    'explanation': 'The physical control format indicator channel (PCFICH) informs the UE and the RN about the number of OFDM symbols used for the PDCCHs.',
    'category': 'Standards specifications'
}

llama_3_2_response_text = ask_llama_3_2_no_options(model, tokenizer, question_data)
print(llama_3_2_response_text)

system

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

user

Question: Which physical channel informs the UE and the RN about the number of OFDM symbols used for the PDCCHs? [3GPP Release 17]
Think step by step before answering and respond with a final answer in the format 'answer: <XXXXX>'.assistant

The PDCCH format 2_4 can inform the UE and the RN about the number of OFDM symbols used for the PDCCHs via the MPDU-PM symbol information.


In [18]:
def format_answer(answer):
    """
    Normalise an answer for comparison: strip ASCII punctuation, then lowercase.

    Parameters:
    - answer: The answer text to normalise.

    Returns:
    - String: The text without `string.punctuation` characters, in lowercase.
    """
    # Translation table that deletes every ASCII punctuation character
    punctuation_remover = str.maketrans('', '', string.punctuation)
    return answer.translate(punctuation_remover).lower()

In [19]:
import re
import string

def extract_answer(text):
    """
    Extract the model's final answer from a decoded chat transcript.

    Parameters:
    - text: The decoded transcript, expected to contain an 'assistant' marker
      before the model's reply.

    Returns:
    - String: The text after the LAST 'answer:' (case-insensitive) in the
      assistant part; the whole assistant part when no 'answer:' is present.
    - None when the transcript has no 'assistant' marker.
    """
    # Everything after the first 'assistant' marker is the model's reply
    assistant_match = re.search(r'assistant\s*(.*)', text, re.IGNORECASE | re.DOTALL)
    if not assistant_match:
        return None

    assistant_text = assistant_match.group(1).strip()

    # Split on every 'answer:' marker and keep the final piece. A greedy
    # `answer:\s*(.*)` with DOTALL swallows the rest of the text on its first
    # match and therefore always returns what follows the FIRST marker.
    pieces = re.split(r'answer:\s*', assistant_text, flags=re.IGNORECASE)
    if len(pieces) > 1:
        return pieces[-1].strip()
    return assistant_text


In [20]:
def extract_option(text):
    """
    Return the text that follows the last 'option X:' label in `text`.

    NOTE(review): an earlier cell of this notebook defines a different
    `extract_option` (it returns the label itself, e.g. 'option 2') and
    `evaluate_model_response` calls it by name. Running this cell replaces that
    definition for the rest of the kernel session.

    Parameters:
    - text: Text that may contain labels of the form 'option X:' (X is a number).

    Returns:
    - String: The stripped text after the last such label, or None when there is none.
    """
    # Split on every label and keep the final piece. A greedy `(.*)` with DOTALL
    # consumes the rest of the text on its first match and would return what
    # follows the FIRST label instead.
    pieces = re.split(r'option\s*\d+:\s*', text, flags=re.IGNORECASE)
    if len(pieces) > 1:
        return pieces[-1].strip()
    return None

In [27]:
extracted_answer = extract_answer(llama_3_2_response_text)
print(extracted_answer)

The PDCCH format 2_4 can inform the UE and the RN about the number of OFDM symbols used for the PDCCHs via the MPDU-PM symbol information.


In [28]:
model_response = format_answer(extracted_answer)
print(model_response)

the pdcch format 24 can inform the ue and the rn about the number of ofdm symbols used for the pdcchs via the mpdupm symbol information


In [29]:
correct_answer = format_answer(question_data['explanation'])
print(correct_answer)

the physical control format indicator channel pcfich informs the ue and the rn about the number of ofdm symbols used for the pdcchs


## Model Groq for RAGAS evaluation

In [14]:
import getpass
import os

# Prompt for the Groq API key only when the environment does not already
# provide one, so the key never has to be written into the notebook.
if "GROQ_API_KEY" not in os.environ:
    os.environ["GROQ_API_KEY"] = getpass.getpass("Enter your Groq API key: ")

In [15]:
from langchain_groq import ChatGroq

# Groq-hosted chat model; it is passed to ragas `evaluate` as `llm=` further down.
# The commented-out `model=` lines are alternatives that were tried.
# NOTE(review): a non-zero temperature presumably makes the judge's scores vary
# between runs — confirm whether 0 would be preferable for evaluation.
llm = ChatGroq(
    # model="llama-3.1-70b-versatile",
    model="llama3-70b-8192",
    # model="llama3-groq-70b-8192-tool-use-preview",
    temperature=0.7,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

In [16]:
from langchain.embeddings import HuggingFaceEmbeddings
# from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
2024-11-06 19:34:34.113804: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-06 19:34:34.123850: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-06 19:34:34.139489: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-06 19:34:34.144228: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has alrea

## Ask the Llama 3.2 LoRA model the 100 TeleQnA questions without options

In [33]:
def evaluate_questions_no_options(model, tokenizer, questions):
    """
    Ask the model every question without showing the options and collect the replies.

    Parameters:
    - model: The language model loaded for inference.
    - tokenizer: The tokenizer configured with `get_chat_template`.
    - questions: List of question dictionaries. Each one is passed unchanged to
      `ask_llama_3_2_no_options` and must provide:
        - 'question': The question text.
        - 'explanation': The reference explanation, stored as the expected answer.

    Returns:
    - List: One dictionary per input question, in input order, with:
        - 'question': The question as a string.
        - 'answer': The reference explanation (NOT the 'option N: ...' string).
        - 'response': The model's generated response for that question.
    """
    total_questions = len(questions)
    responses = []

    for position, question_data in enumerate(questions, start=1):
        # Query the model first, then record the reply next to the reference text
        generated = ask_llama_3_2_no_options(model, tokenizer, question_data)
        record = {
            "question": question_data['question'],
            "answer": question_data['explanation'],
            "response": generated,
        }
        responses.append(record)

        # Print progress
        print(f"Responded {position} of {total_questions} questions...")

    return responses

In [45]:
model = FastLanguageModel.for_inference(model)  # Enable faster inference
tokenizer = get_chat_template(tokenizer, chat_template="llama-3.1")

# Process all questions and get responses
llama_3_2_responses_RAGAS = evaluate_questions_no_options(model, tokenizer, rel17_100_questions)

Responded 1 of 100 questions...
Responded 2 of 100 questions...
Responded 3 of 100 questions...
Responded 4 of 100 questions...
Responded 5 of 100 questions...
Responded 6 of 100 questions...
Responded 7 of 100 questions...
Responded 8 of 100 questions...
Responded 9 of 100 questions...
Responded 10 of 100 questions...
Responded 11 of 100 questions...
Responded 12 of 100 questions...
Responded 13 of 100 questions...
Responded 14 of 100 questions...
Responded 15 of 100 questions...
Responded 16 of 100 questions...
Responded 17 of 100 questions...
Responded 18 of 100 questions...
Responded 19 of 100 questions...
Responded 20 of 100 questions...
Responded 21 of 100 questions...
Responded 22 of 100 questions...
Responded 23 of 100 questions...
Responded 24 of 100 questions...
Responded 25 of 100 questions...
Responded 26 of 100 questions...
Responded 27 of 100 questions...
Responded 28 of 100 questions...
Responded 29 of 100 questions...
Responded 30 of 100 questions...
Responded 31 of 100

In [46]:
print(llama_3_2_responses_RAGAS[0]['question'])
print(extract_answer(llama_3_2_responses_RAGAS[0]['response']))
print(llama_3_2_responses_RAGAS[0]['answer'])

Which NGAP procedure is used for inter-system load balancing? [3GPP Release 17]
The inter-system load balancing procedure is used for load balancing between the E-UTRAN and UTRA.
The NGAP procedure used for inter-system load balancing is Uplink RAN Configuration Transfer.


In [47]:
# save_responses_to_json(llama_3_2_responses_RAGAS,"../Models_responses/RAGAS/llama_3.2_lora_responses_RAGAS_2.json")

## Build Dataset for Evaluation with RAGAS

In [49]:
import json

# Path to the JSON file with the saved no-options model responses.
# NOTE(review): the commented-out save call in the previous cell targets
# "..._RAGAS_2.json", a different file from the one loaded here — confirm
# which run is meant to be evaluated.
llama_3_2_responses_RAGAS_path = r"../Models_responses/RAGAS/llama_3.2_lora_responses_RAGAS.json"

# Load the saved responses, replacing the in-memory list from the generation cell
with open(llama_3_2_responses_RAGAS_path, "r", encoding="utf-8") as file:
    llama_3_2_responses_RAGAS = json.load(file)
# Sanity check: number of response records loaded
print(len(llama_3_2_responses_RAGAS))

100


In [50]:
from datasets import Dataset 

In [51]:
def transform_dataset(data):
    """
    Transform saved responses into the column layout used to build the evaluation dataset.

    Parameters:
    - data: List of dictionaries with 'question', 'answer' and 'response' keys.

    Returns:
    - Dictionary of three equally long lists:
        - 'user_input': The questions, unchanged.
        - 'response': The model's extracted answer, normalised with `format_answer`.
        - 'reference': The expected answer, normalised with `format_answer`.
    """
    transformed_data = {
        'user_input': [],
        'response': [],
        'reference': []
    }

    for item in data:
        question = item['question']

        extracted = extract_answer(item['response'])
        if extracted is None:
            # extract_answer returns None when the transcript has no 'assistant'
            # marker; score the raw response rather than crash in format_answer.
            extracted = item['response']

        model_response = format_answer(extracted)
        correct_answer = format_answer(item['answer'])

        transformed_data['user_input'].append(question)
        transformed_data['response'].append(model_response)
        transformed_data['reference'].append(correct_answer)

    return transformed_data

In [52]:
# Transform the first 20 saved responses into the column layout built by transform_dataset.
# NOTE(review): the saved output of this cell reports num_rows: 100, which does
# not match the 20-item slice below — the output appears to come from a run of
# the commented-out full-dataset line. Re-run before relying on it.
data_samples = transform_dataset(llama_3_2_responses_RAGAS[:20])
# data_samples = transform_dataset(llama_3_2_responses_RAGAS)

# Create the dataset object
dataset = Dataset.from_dict(data_samples)

# Print to verify the structure
print(dataset)

Dataset({
    features: ['user_input', 'response', 'reference'],
    num_rows: 100
})


In [53]:
dataset[0]

{'user_input': 'Which NGAP procedure is used for inter-system load balancing? [3GPP Release 17]',
 'response': 'the intersystem load balancing procedure is used for load balancing between the eutran and utra',
 'reference': 'the ngap procedure used for intersystem load balancing is uplink ran configuration transfer'}

## Evaluate Llama 3.2 lora with RAGAS Metrics

### Using LLM to evaluate (Factual Correctness, Semantic similarity and Rubrics based criteria scoring)

In [54]:
from ragas import evaluate
from ragas.run_config import RunConfig
from ragas.metrics._factual_correctness import FactualCorrectness
from ragas.metrics import SemanticSimilarity
from ragas.metrics import RubricsScoreWithReference

In [55]:
# Descriptions for each level of the 1-5 rubric scale given to the
# reference-based rubric metric.
rubrics = dict(
    score1_description="The response is incorrect, irrelevant, or does not align with the ground truth.",
    score2_description="The response partially matches the ground truth but includes significant errors, omissions, or irrelevant information.",
    score3_description="The response generally aligns with the ground truth but may lack detail, clarity, or have minor inaccuracies.",
    score4_description="The response is mostly accurate and aligns well with the ground truth, with only minor issues or missing details.",
    score5_description="The response is fully accurate, aligns completely with the ground truth, and is clear and detailed.",
)

# One instance per metric evaluated in this section.
factualCorrectness = FactualCorrectness()
semantiSimilarity = SemanticSimilarity()
rubricsScoreWithReference = RubricsScoreWithReference(rubrics=rubrics)

In [57]:
# Run the three judge-based metrics over every row of `dataset`.
# The recorded run of this cell logged a long list of TimeoutError jobs, and
# the per-sample table shows empty scores for many rows. Capping max_workers
# keeps fewer requests in flight against the judge model, so each job has a
# better chance of finishing within `timeout`; tune the value to what the
# backing LLM can actually serve concurrently.
score = evaluate(
    dataset,
    metrics=[
        factualCorrectness,
        semantiSimilarity,
        rubricsScoreWithReference,
    ],
    llm=llm,
    embeddings=embeddings,
    run_config=RunConfig(
        timeout=400,
        max_retries=20,
        max_wait=120,
        max_workers=4,
        log_tenacity=False,
    ),
)
# Per-sample scores as a DataFrame (displayed as the cell output).
score.to_pandas()

Evaluating:   0%|          | 0/300 [00:00<?, ?it/s]

Exception raised in Job[30]: TimeoutError()
Exception raised in Job[51]: TimeoutError()
Exception raised in Job[240]: TimeoutError()
Exception raised in Job[9]: TimeoutError()
Exception raised in Job[147]: TimeoutError()
Exception raised in Job[162]: TimeoutError()
Exception raised in Job[228]: TimeoutError()
Exception raised in Job[216]: TimeoutError()
Exception raised in Job[42]: TimeoutError()
Exception raised in Job[69]: TimeoutError()
Exception raised in Job[96]: TimeoutError()
Exception raised in Job[90]: TimeoutError()
Exception raised in Job[15]: TimeoutError()
Exception raised in Job[114]: TimeoutError()
Exception raised in Job[285]: TimeoutError()
Exception raised in Job[213]: TimeoutError()
Exception raised in Job[249]: TimeoutError()
Exception raised in Job[264]: TimeoutError()
Exception raised in Job[159]: TimeoutError()
Exception raised in Job[258]: TimeoutError()
Exception raised in Job[93]: TimeoutError()
Exception raised in Job[54]: TimeoutError()
Exception raised in J

Unnamed: 0,user_input,response,reference,factual_correctness,semantic_similarity,rubrics_score_with_reference
0,Which NGAP procedure is used for inter-system ...,the intersystem load balancing procedure is us...,the ngap procedure used for intersystem load b...,,0.506876,2
1,What is covered by enhanced application layer ...,enhanced application layer support for v2x ser...,enhanced application layer support for v2x ser...,,0.621993,2
2,What does the Load-Balancing steering mode do?...,it directs the uep flow to another enb by upda...,the loadbalancing steering mode splits the tra...,,0.408990,2
3,What is the main objective of intent driven ma...,intent driven management is the main objective...,the intent driven management aims to reduce th...,,0.785138,3
4,What does MINT stand for? [3GPP Release 17],multiip network testbed\nexplanation mint stan...,mint stands for minimization of service interr...,,0.439424,1
...,...,...,...,...,...,...
95,Which RRC state is the UE in when no RRC conne...,cs mode is the rrc state of the ue when no rrc...,when no rrc connection is established the ue i...,,,1
96,How are the antenna elements placed on each an...,the antenna elements for a small cell site are...,the document states that the antenna elements ...,0.00,0.579674,2
97,What information may be provided to an emergen...,lcs positioningrelated information\ninformatio...,for emergency services the geographic location...,0.86,0.823607,2
98,What is the purpose of cross-network slice coo...,crossnetwork slice coordination involves estab...,crossnetwork slice coordination enables the co...,0.67,0.803510,3


In [58]:
# Display the evaluation result; the recorded output lists one value per metric.
score

{'factual_correctness': 0.1620, 'semantic_similarity': 0.6839, 'rubrics_score_with_reference': 2.1700}

In [59]:
# Keep the per-sample scores of the LLM-judged run as a DataFrame.
llama_3_2_evaluation_RAGAS_LLM = score.to_pandas()
# The CSV export below is disabled; uncomment it to write the results to disk.
# llama_3_2_evaluation_RAGAS_LLM.to_csv("../Evaluations/RAGAS/llama_3_2_lora_evaluation_RAGAS_LLM.csv", index=False)

In [60]:
import pandas as pd
# Load the LLM-judged evaluation results stored on disk.
# NOTE(review): the export in the preceding cell is commented out, so this
# file presumably comes from an earlier run — confirm it exists before running.
result = pd.read_csv("../Evaluations/RAGAS/llama_3_2_lora_evaluation_RAGAS_LLM.csv")

In [61]:
# Show the evaluation table loaded from the CSV file.
result

Unnamed: 0,user_input,response,reference,factual_correctness,semantic_similarity,rubrics_score_with_reference
0,Which NGAP procedure is used for inter-system ...,the intersystem load balancing procedure is us...,the ngap procedure used for intersystem load b...,,0.506876,2
1,What is covered by enhanced application layer ...,enhanced application layer support for v2x ser...,enhanced application layer support for v2x ser...,,0.621993,2
2,What does the Load-Balancing steering mode do?...,it directs the uep flow to another enb by upda...,the loadbalancing steering mode splits the tra...,,0.408990,2
3,What is the main objective of intent driven ma...,intent driven management is the main objective...,the intent driven management aims to reduce th...,,0.785138,3
4,What does MINT stand for? [3GPP Release 17],multiip network testbed\nexplanation mint stan...,mint stands for minimization of service interr...,,0.439424,1
...,...,...,...,...,...,...
95,Which RRC state is the UE in when no RRC conne...,cs mode is the rrc state of the ue when no rrc...,when no rrc connection is established the ue i...,,,1
96,How are the antenna elements placed on each an...,the antenna elements for a small cell site are...,the document states that the antenna elements ...,0.00,0.579674,2
97,What information may be provided to an emergen...,lcs positioningrelated information\ninformatio...,for emergency services the geographic location...,0.86,0.823607,2
98,What is the purpose of cross-network slice coo...,crossnetwork slice coordination involves estab...,crossnetwork slice coordination enables the co...,0.67,0.803510,3


### Metrics that need no LLM (BleuScore, RougeScore, ExactMatch and StringPresence)

In [62]:
from ragas.metrics import BleuScore, RougeScore, ExactMatch, StringPresence
import nltk
# Fetch the NLTK 'punkt' data package; the recorded run reports it as already up to date.
# NOTE(review): presumably required by the text-overlap metrics imported above — confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/arimatea/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [63]:
# One instance of each string-comparison metric, created in the same order
# in which they are later passed to evaluate().
bleuScore, rougeScore, exactMatch, stringPresence = (
    BleuScore(),
    RougeScore(),
    ExactMatch(),
    StringPresence(),
)

In [64]:
# Score `dataset` with the four string-comparison metrics.
# NOTE(review): llm and embeddings are still passed although the section
# heading says these metrics need no LLM — presumably harmless; confirm.
# The recorded run logged BLEU zero-overlap warnings and AssertionError jobs;
# these presumably account for the empty bleu_score cells in the table.
score = evaluate(
    dataset,
    metrics=[
        bleuScore,
        rougeScore,
        exactMatch,
        stringPresence
    ],
    llm=llm,
    embeddings=embeddings
)
# Per-sample scores as a DataFrame (displayed as the cell output).
score.to_pandas()

Evaluating:   0%|          | 0/400 [00:00<?, ?it/s]

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
ERROR:ragas.executor:Exception raised in Job[316]: AssertionError(Expecting a float)
ERROR:ragas.executor:Exception raised in Job[140]: AssertionError(The number of hypotheses and their reference(s) should be the same )
ERROR:ragas.executor:Exception raised in Job[388]: AssertionError(The number of hypotheses and their refe

Unnamed: 0,user_input,response,reference,bleu_score,rouge_score,exact_match,string_present
0,Which NGAP procedure is used for inter-system ...,the intersystem load balancing procedure is us...,the ngap procedure used for intersystem load b...,3.739799e-78,0.428571,0.0,0.0
1,What is covered by enhanced application layer ...,enhanced application layer support for v2x ser...,enhanced application layer support for v2x ser...,1.423916e-01,0.347826,0.0,0.0
2,What does the Load-Balancing steering mode do?...,it directs the uep flow to another enb by upda...,the loadbalancing steering mode splits the tra...,1.108790e-231,0.139535,0.0,0.0
3,What is the main objective of intent driven ma...,intent driven management is the main objective...,the intent driven management aims to reduce th...,2.326889e-78,0.311111,0.0,0.0
4,What does MINT stand for? [3GPP Release 17],multiip network testbed\nexplanation mint stan...,mint stands for minimization of service interr...,,0.352941,0.0,0.0
...,...,...,...,...,...,...,...
95,Which RRC state is the UE in when no RRC conne...,cs mode is the rrc state of the ue when no rrc...,when no rrc connection is established the ue i...,3.850323e-01,0.444444,0.0,0.0
96,How are the antenna elements placed on each an...,the antenna elements for a small cell site are...,the document states that the antenna elements ...,3.843139e-78,0.352941,0.0,0.0
97,What information may be provided to an emergen...,lcs positioningrelated information\ninformatio...,for emergency services the geographic location...,,0.648649,0.0,0.0
98,What is the purpose of cross-network slice coo...,crossnetwork slice coordination involves estab...,crossnetwork slice coordination enables the co...,3.773769e-78,0.372093,0.0,0.0


In [65]:
# Display the evaluation result; the recorded output lists one value per metric.
score

{'bleu_score': 0.1570, 'rouge_score': 0.3724, 'exact_match': 0.0000, 'string_present': 0.0000}

In [66]:
# Keep the per-sample scores of the string-comparison run as a DataFrame.
llama_3_2_evaluation_RAGAS_no_LLM = score.to_pandas()
# The CSV export below is disabled; uncomment it to write the results to disk.
# llama_3_2_evaluation_RAGAS_no_LLM.to_csv("../Evaluations/RAGAS/llama_3_2_lora_evaluation_RAGAS_no_LLM.csv", index=False)

In [67]:
import pandas as pd
# Load the string-comparison evaluation results stored on disk.
# NOTE(review): the export in the preceding cell is commented out, so this
# file presumably comes from an earlier run — confirm it exists before running.
result = pd.read_csv("../Evaluations/RAGAS/llama_3_2_lora_evaluation_RAGAS_no_LLM.csv")

In [68]:
# Show the evaluation table loaded from the CSV file.
result

Unnamed: 0,user_input,response,reference,bleu_score,rouge_score,exact_match,string_present
0,Which NGAP procedure is used for inter-system ...,the intersystem load balancing procedure is us...,the ngap procedure used for intersystem load b...,3.739799e-78,0.428571,0.0,0.0
1,What is covered by enhanced application layer ...,enhanced application layer support for v2x ser...,enhanced application layer support for v2x ser...,1.423916e-01,0.347826,0.0,0.0
2,What does the Load-Balancing steering mode do?...,it directs the uep flow to another enb by upda...,the loadbalancing steering mode splits the tra...,1.108790e-231,0.139535,0.0,0.0
3,What is the main objective of intent driven ma...,intent driven management is the main objective...,the intent driven management aims to reduce th...,2.326889e-78,0.311111,0.0,0.0
4,What does MINT stand for? [3GPP Release 17],multiip network testbed\nexplanation mint stan...,mint stands for minimization of service interr...,,0.352941,0.0,0.0
...,...,...,...,...,...,...,...
95,Which RRC state is the UE in when no RRC conne...,cs mode is the rrc state of the ue when no rrc...,when no rrc connection is established the ue i...,3.850323e-01,0.444444,0.0,0.0
96,How are the antenna elements placed on each an...,the antenna elements for a small cell site are...,the document states that the antenna elements ...,3.843139e-78,0.352941,0.0,0.0
97,What information may be provided to an emergen...,lcs positioningrelated information\ninformatio...,for emergency services the geographic location...,,0.648649,0.0,0.0
98,What is the purpose of cross-network slice coo...,crossnetwork slice coordination involves estab...,crossnetwork slice coordination enables the co...,3.773769e-78,0.372093,0.0,0.0


## Build Dataset for Evaluation with RAGAS - Reference is the 'Correct Option Text' from rel17_100_questions

In [21]:
import json

# Path to the JSON file with the Llama 3.2 LoRA responses prepared for RAGAS
llama_3_2_responses_RAGAS_path = r"../Models_responses/RAGAS/llama_3.2_lora_responses_RAGAS.json"

# Load the saved responses and print how many records the file holds
with open(llama_3_2_responses_RAGAS_path, "r", encoding="utf-8") as file:
    llama_3_2_responses_RAGAS = json.load(file)
print(len(llama_3_2_responses_RAGAS))

100


In [22]:
from datasets import Dataset

In [23]:
def transform_dataset(data):
    """Build evaluation columns whose reference is the correct option text.

    Each record of ``data`` is paired by position with the entry at the same
    index of the notebook-level ``rel17_100_questions``; that entry's 'answer'
    field is passed through ``extract_option`` and ``format_answer`` to obtain
    the reference text.

    Args:
        data: Iterable of records indexable by the keys 'question' and
            'response'. It must not be longer than ``rel17_100_questions``.

    Returns:
        dict: Three parallel lists keyed by 'user_input', 'response' and
        'reference'. Response and reference are normalised so that each ends
        with exactly one '.'.
    """
    user_inputs, responses, references = [], [], []

    for position, record in enumerate(data):
        question = record['question']
        predicted = format_answer(extract_answer(record['response']))
        expected = format_answer(extract_option(rel17_100_questions[position]['answer']))

        # Strip any trailing periods, then append a single one, so both
        # texts end the same way.
        predicted = predicted.rstrip('.') + '.'
        expected = expected.rstrip('.') + '.'

        user_inputs.append(question)
        responses.append(predicted)
        references.append(expected)

    return {
        'user_input': user_inputs,
        'response': responses,
        'reference': references,
    }

In [24]:
# Transform the llama_3_2_responses_RAGAS dataset
# Only the first 20 responses are transformed here (matches num_rows: 20 below)
data_samples = transform_dataset(llama_3_2_responses_RAGAS[:20])
# Full-set variant, kept disabled:
# data_samples = transform_dataset(llama_3_2_responses_RAGAS)

# Create the dataset object
dataset = Dataset.from_dict(data_samples)

# Print to verify the structure
print(dataset)

Dataset({
    features: ['user_input', 'response', 'reference'],
    num_rows: 20
})


In [25]:
# Inspect the first row to check the three text columns.
dataset[0]

{'user_input': 'Which NGAP procedure is used for inter-system load balancing? [3GPP Release 17]',
 'response': 'ranintnagapranintrrcnextprocedureintersystem loadbalancingricnotificationrequestprocedure is used for intersystem load balancing.',
 'reference': 'uplink ran configuration transfer.'}

## Evaluate Llama 3.2 with RAGAS Metrics - Comparing with 'Correct Option Text' in rel17_100_questions

### Using an LLM to evaluate (Factual Correctness, Semantic Similarity and Rubrics-based Criteria Scoring)

In [26]:
from ragas import evaluate
from ragas.run_config import RunConfig
from ragas.metrics._factual_correctness import FactualCorrectness
from ragas.metrics import SemanticSimilarity
from ragas.metrics import RubricsScoreWithReference

In [27]:
# Descriptions for each level of the 1-5 rubric scale given to the
# reference-based rubric metric.
rubrics = dict(
    score1_description="The response is incorrect, irrelevant, or does not align with the ground truth.",
    score2_description="The response partially matches the ground truth but includes significant errors, omissions, or irrelevant information.",
    score3_description="The response generally aligns with the ground truth but may lack detail, clarity, or have minor inaccuracies.",
    score4_description="The response is mostly accurate and aligns well with the ground truth, with only minor issues or missing details.",
    score5_description="The response is fully accurate, aligns completely with the ground truth, and is clear and detailed.",
)

# One instance per metric evaluated in this section.
factualCorrectness = FactualCorrectness()
semantiSimilarity = SemanticSimilarity()
rubricsScoreWithReference = RubricsScoreWithReference(rubrics=rubrics)

In [28]:
# Run the three judge-based metrics over every row of `dataset`.
# The recorded run of this cell logged TimeoutError for a quarter of its 60
# jobs, and the per-sample table shows empty factual_correctness scores.
# Capping max_workers keeps fewer requests in flight against the judge model,
# so each job has a better chance of finishing within `timeout`; tune the
# value to what the backing LLM can actually serve concurrently.
score = evaluate(
    dataset,
    metrics=[
        factualCorrectness,
        semantiSimilarity,
        rubricsScoreWithReference,
    ],
    llm=llm,
    embeddings=embeddings,
    run_config=RunConfig(
        timeout=400,
        max_retries=20,
        max_wait=120,
        max_workers=4,
        log_tenacity=False,
    ),
)
# Per-sample scores as a DataFrame (displayed as the cell output).
score.to_pandas()

Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]

Exception raised in Job[51]: TimeoutError()
Exception raised in Job[48]: TimeoutError()
Exception raised in Job[6]: TimeoutError()
Exception raised in Job[30]: TimeoutError()
Exception raised in Job[12]: TimeoutError()
Exception raised in Job[33]: TimeoutError()
Exception raised in Job[15]: TimeoutError()
Exception raised in Job[36]: TimeoutError()
Exception raised in Job[18]: TimeoutError()
Exception raised in Job[0]: TimeoutError()
Exception raised in Job[57]: TimeoutError()
Exception raised in Job[39]: TimeoutError()
Exception raised in Job[21]: TimeoutError()
Exception raised in Job[3]: TimeoutError()
Exception raised in Job[42]: TimeoutError()


Unnamed: 0,user_input,response,reference,factual_correctness,semantic_similarity,rubrics_score_with_reference
0,Which NGAP procedure is used for inter-system ...,ranintnagapranintrrcnextprocedureintersystem l...,uplink ran configuration transfer.,,0.205584,1
1,What is covered by enhanced application layer ...,enhanced application layer support for v2x ser...,advanced v2x services.,,0.705915,3
2,What does the Load-Balancing steering mode do?...,the loadbalancing steering mode shifts data to...,splits the traffic of a data flow across 3gpp ...,,0.286293,2
3,What is the main objective of intent driven ma...,the main objective of intent driven management...,to reduce the complexity of management for net...,0.33,0.513481,3
4,What does MINT stand for? [3GPP Release 17],mint stands for mobility internet services ter...,minimization of service interruption.,,0.13187,1
5,What is the purpose of the Media Streaming AF ...,the purpose of the media streaming af event ex...,to support data collection and event exposure ...,,0.649886,2
6,What is the purpose of load-balancing steering...,loadbalancing steering mode enhancements impro...,to enable the ue and upf to freely select spli...,,0.245727,2
7,What is a capability added in the V2X Applicat...,the v2x application enabler vae adds network c...,v2x service discovery.,,0.490463,1
8,What is the purpose of the Edge Data Network (...,the purpose of the edge data network edn is to...,to host the edge application servers and edge ...,0.0,0.573311,2
9,What are the three features specified in TS 23...,the three features specified in ts 23304 for 5...,broadcast mode groupcast mode unicast mode.,0.0,0.326036,1


In [33]:
# Display the evaluation result; the recorded output lists one value per metric.
score

{'factual_correctness': 0.1320, 'semantic_similarity': 0.3655, 'rubrics_score_with_reference': 2.0500}

In [34]:
# Keep the per-sample scores of the LLM-judged run as a DataFrame.
llama_3_2_evaluation_RAGAS_LLM = score.to_pandas()
# The CSV export below is disabled; uncomment it to write the results to disk.
# llama_3_2_evaluation_RAGAS_LLM.to_csv("../Evaluations/RAGAS/llama_3_2_lora_evaluation_RAGAS_LLM_correct_option.csv", index=False)

In [35]:
import pandas as pd
# Load the LLM-judged results (correct-option reference) stored on disk.
# NOTE(review): the export in the preceding cell is commented out, so this
# file presumably comes from an earlier run — confirm it exists before running.
result = pd.read_csv("../Evaluations/RAGAS/llama_3_2_lora_evaluation_RAGAS_LLM_correct_option.csv")

In [36]:
# Show up to the first 20 rows of the table loaded from the CSV file.
result.head(20)

Unnamed: 0,user_input,response,reference,factual_correctness,semantic_similarity,rubrics_score_with_reference
0,Which NGAP procedure is used for inter-system ...,ranintnagapranintrrcnextprocedureintersystem l...,uplink ran configuration transfer.,,0.205584,1
1,What is covered by enhanced application layer ...,enhanced application layer support for v2x ser...,advanced v2x services.,,0.705915,3
2,What does the Load-Balancing steering mode do?...,the loadbalancing steering mode shifts data to...,splits the traffic of a data flow across 3gpp ...,,0.286293,2
3,What is the main objective of intent driven ma...,the main objective of intent driven management...,to reduce the complexity of management for net...,0.33,0.513481,3
4,What does MINT stand for? [3GPP Release 17],mint stands for mobility internet services ter...,minimization of service interruption.,,0.13187,1
5,What is the purpose of the Media Streaming AF ...,the purpose of the media streaming af event ex...,to support data collection and event exposure ...,,0.649886,2
6,What is the purpose of load-balancing steering...,loadbalancing steering mode enhancements impro...,to enable the ue and upf to freely select spli...,,0.245727,2
7,What is a capability added in the V2X Applicat...,the v2x application enabler vae adds network c...,v2x service discovery.,,0.490463,1
8,What is the purpose of the Edge Data Network (...,the purpose of the edge data network edn is to...,to host the edge application servers and edge ...,0.0,0.573311,2
9,What are the three features specified in TS 23...,the three features specified in ts 23304 for 5...,broadcast mode groupcast mode unicast mode.,0.0,0.326036,1


### Metrics that need no LLM (BleuScore, RougeScore, ExactMatch and StringPresence)

In [37]:
from ragas.metrics import BleuScore, RougeScore, ExactMatch, StringPresence
import nltk
# Fetch the NLTK 'punkt' data package; the recorded run reports it as already up to date.
# NOTE(review): presumably required by the text-overlap metrics imported above — confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/arimatea/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [38]:
# One instance of each string-comparison metric, created in the same order
# in which they are later passed to evaluate().
bleuScore, rougeScore, exactMatch, stringPresence = (
    BleuScore(),
    RougeScore(),
    ExactMatch(),
    StringPresence(),
)

In [39]:
# Score `dataset` with the four string-comparison metrics.
# NOTE(review): llm and embeddings are still passed although the section
# heading says these metrics need no LLM — presumably harmless; confirm.
# The recorded run logged BLEU zero-overlap warnings and AssertionError jobs;
# these presumably account for the empty bleu_score cell in the table.
score = evaluate(
    dataset,
    metrics=[
        bleuScore,
        rougeScore,
        exactMatch,
        stringPresence
    ],
    llm=llm,
    embeddings=embeddings
)
# Per-sample scores as a DataFrame (displayed as the cell output).
score.to_pandas()

Evaluating:   0%|          | 0/80 [00:00<?, ?it/s]

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
ERROR:ragas.executor:Exception raised in Job[52]: AssertionError(The number of hypotheses and their reference(s) should be the same )
ERROR:ragas.executor:Exception raised in Job[16]: AssertionError(The number of hypotheses and their reference(s) should be the same )
ERROR:ragas.executor:Exception raised in Job[56]: Asserti

Unnamed: 0,user_input,response,reference,bleu_score,rouge_score,exact_match,string_present
0,Which NGAP procedure is used for inter-system ...,ranintnagapranintrrcnextprocedureintersystem l...,uplink ran configuration transfer.,1.051835e-231,0.0,0.0,0.0
1,What is covered by enhanced application layer ...,enhanced application layer support for v2x ser...,advanced v2x services.,4.234402e-155,0.166667,0.0,0.0
2,What does the Load-Balancing steering mode do?...,the loadbalancing steering mode shifts data to...,splits the traffic of a data flow across 3gpp ...,4.927225e-155,0.1875,0.0,0.0
3,What is the main objective of intent driven ma...,the main objective of intent driven management...,to reduce the complexity of management for net...,1.3319600000000002e-231,0.275862,0.0,0.0
4,What does MINT stand for? [3GPP Release 17],mint stands for mobility internet services ter...,minimization of service interruption.,,0.046512,0.0,0.0
5,What is the purpose of the Media Streaming AF ...,the purpose of the media streaming af event ex...,to support data collection and event exposure ...,3.991752e-155,0.115385,0.0,0.0
6,What is the purpose of load-balancing steering...,loadbalancing steering mode enhancements impro...,to enable the ue and upf to freely select spli...,7.013716e-232,0.0,0.0,0.0
7,What is a capability added in the V2X Applicat...,the v2x application enabler vae adds network c...,v2x service discovery.,1.008094e-231,0.117647,0.0,0.0
8,What is the purpose of the Edge Data Network (...,the purpose of the edge data network edn is to...,to host the edge application servers and edge ...,3.9129e-155,0.186047,0.0,0.0
9,What are the three features specified in TS 23...,the three features specified in ts 23304 for 5...,broadcast mode groupcast mode unicast mode.,7.919883999999999e-232,0.0,0.0,0.0


In [40]:
# Display the evaluation result; the recorded output lists one value per metric.
score

{'bleu_score': 0.0000, 'rouge_score': 0.1196, 'exact_match': 0.0000, 'string_present': 0.0000}

In [41]:
# Keep the per-sample scores of the string-comparison run as a DataFrame.
llama_3_2_evaluation_RAGAS_no_LLM = score.to_pandas()
# The CSV export below is disabled; uncomment it to write the results to disk.
# llama_3_2_evaluation_RAGAS_no_LLM.to_csv("../Evaluations/RAGAS/llama_3_2_lora_evaluation_RAGAS_no_LLM_correct_option.csv", index=False)

In [42]:
import pandas as pd
# Load the string-comparison results (correct-option reference) stored on disk.
# NOTE(review): the export in the preceding cell is commented out, so this
# file presumably comes from an earlier run — confirm it exists before running.
result = pd.read_csv("../Evaluations/RAGAS/llama_3_2_lora_evaluation_RAGAS_no_LLM_correct_option.csv")

In [43]:
# Show the evaluation table loaded from the CSV file.
result

Unnamed: 0,user_input,response,reference,bleu_score,rouge_score,exact_match,string_present
0,Which NGAP procedure is used for inter-system ...,ranintnagapranintrrcnextprocedureintersystem l...,uplink ran configuration transfer.,1.051835e-231,0.0,0.0,0.0
1,What is covered by enhanced application layer ...,enhanced application layer support for v2x ser...,advanced v2x services.,4.234402e-155,0.166667,0.0,0.0
2,What does the Load-Balancing steering mode do?...,the loadbalancing steering mode shifts data to...,splits the traffic of a data flow across 3gpp ...,4.927225e-155,0.1875,0.0,0.0
3,What is the main objective of intent driven ma...,the main objective of intent driven management...,to reduce the complexity of management for net...,1.3319600000000002e-231,0.275862,0.0,0.0
4,What does MINT stand for? [3GPP Release 17],mint stands for mobility internet services ter...,minimization of service interruption.,,0.046512,0.0,0.0
5,What is the purpose of the Media Streaming AF ...,the purpose of the media streaming af event ex...,to support data collection and event exposure ...,3.991752e-155,0.115385,0.0,0.0
6,What is the purpose of load-balancing steering...,loadbalancing steering mode enhancements impro...,to enable the ue and upf to freely select spli...,7.013716e-232,0.0,0.0,0.0
7,What is a capability added in the V2X Applicat...,the v2x application enabler vae adds network c...,v2x service discovery.,1.008094e-231,0.117647,0.0,0.0
8,What is the purpose of the Edge Data Network (...,the purpose of the edge data network edn is to...,to host the edge application servers and edge ...,3.9129e-155,0.186047,0.0,0.0
9,What are the three features specified in TS 23...,the three features specified in ts 23304 for 5...,broadcast mode groupcast mode unicast mode.,7.919883999999999e-232,0.0,0.0,0.0
