# Choose Model: GPT-4o-mini

In [1]:
import torch
# Report GPU availability. The device name is only queried when CUDA is
# present, because torch.cuda.get_device_name(0) raises on CPU-only machines.
cuda_available = torch.cuda.is_available()
print("CUDA Available: ", cuda_available)
if cuda_available:
    print("CUDA Device Name: ", torch.cuda.get_device_name(0))
    torch.cuda.empty_cache()

# Use CUDA when it is available to speed up processing; otherwise fall back to CPU
device = 'cuda' if cuda_available else 'cpu'
print(f"Usando dispositivo: {device}")

CUDA Available:  True
CUDA Device Name:  NVIDIA GeForce RTX 3050 Ti Laptop GPU
Usando dispositivo: cuda


## GPT-4o-mini

In [2]:
from openai import OpenAI

In [4]:
# completion = client.chat.completions.create(
#     model="gpt-4o-mini",
#     messages=[
#         {"role": "user", "content": "hello?"}
#     ]
# )


In [13]:
# # Test gpt-4o-mini
# response = completion.choices[0].message.content
# print(response)

Hello! How can I assist you today?


# Dataset TeleQnA for Inference

In [3]:
import json

# Location of the pre-processed TeleQnA question file (JSON).
rel17_100_questions_path = r"../Files/rel17_100_questions.json"

# Read the question list from disk and report how many entries it holds.
with open(rel17_100_questions_path, mode="r", encoding="utf-8") as questions_file:
    rel17_100_questions = json.loads(questions_file.read())
print(len(rel17_100_questions))

100


In [4]:
rel17_100_questions[0]

{'question': 'Which NGAP procedure is used for inter-system load balancing? [3GPP Release 17]',
 'option 1': 'eNB Configuration Transfer',
 'option 2': 'Downlink RAN Configuration Transfer',
 'option 3': 'Uplink RAN Configuration Transfer',
 'option 4': 'MME Configuration Transfer',
 'answer': 'option 3: Uplink RAN Configuration Transfer',
 'explanation': 'The NGAP procedure used for inter-system load balancing is Uplink RAN Configuration Transfer.',
 'category': 'Standards overview'}

# Import RAG Functions

In [7]:
from rag_functions import load_faiss_index, search_faiss_index, search_RAG, load_chunks

  from tqdm.autonotebook import tqdm, trange
2024-11-06 14:58:07.523001: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-06 14:58:07.659232: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-06 14:58:07.716824: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-06 14:58:07.734456: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-06 14:58:07.844357

In [8]:
# Test RAG: inputs for a manual smoke test of the retrieval helper (search_RAG).
query_text = "reception of a transparent l3 message in unacknowledged mode"  # free-text query
index_file_path = "../Files/faiss_index.bin"  # FAISS index file handed to search_RAG
chunks_path = "../Files/tspec_chunks_markdown.pkl"  # chunk file handed to search_RAG (pickle, judging by the extension)
top_k = 5  # number of results requested from the search

In [5]:
result = search_RAG(query_text, index_file_path, chunks_path, top_k)
print(result)

Information 1:
BSC.  
Collision cases are treated as specified in 3GPPTS44.006.  
If BTS has repeated the DISC frame N200 times, BTS sends a RELease
INDication and an ERRor INDication message to BSC (cf. 3GPPTS44.006).  
![](media/image7.png){width="3.65625in" height="1.2083333333333333in"}  
3.5 Transmission of a transparent L3-Message in acknowledged mode
-----------------------------------------------------------------  
This procedure is used by BSC to request the sending of a L3 message to
MS in acknowledged mode.  
BSC sends a DATA REQuest message to BTS. The message contains the
complete L3 message to be sent in acknowledged mode.  
![](media/image8.png){width="3.6979166666666665in" height="1.0625in"}  
3.6 Reception of a transparent L3-Message in acknowledged mode
--------------------------------------------------------------  
This procedure is used by BTS to indicate the reception of a L3 message
in acknowledged mode.  
BTS sends a DATA INDication message to BSC. The message 

# Accuracy Evaluation

## Create prompt and ask function for GPT-4o-mini with RAG

In [15]:

def ask_gpt4_RAG(question_data, top_k=5, index_file_path="../Files/faiss_index.bin", chunks_path="../Files/tspec_chunks_markdown.pkl", client=None):
    """
    Answer a multiple-choice question with GPT-4o-mini, using retrieved context (RAG).

    The question text plus all options are used as the retrieval query; the
    retrieved text, the question and the options are then sent to the model
    in a single user message.

    Parameters:
    - question_data: Dictionary containing the question under 'question' and the
      options under keys that contain the substring 'option' (e.g. 'option 1').
    - top_k: Number of relevant chunks to retrieve from the search.
    - index_file_path: Path to the FAISS index file.
    - chunks_path: Path to the chunks file.
    - client: Optional, already constructed OpenAI client to reuse across calls.
      When omitted, a new `OpenAI()` client is created for this call.

    Returns:
    - String: Model's generated response with surrounding whitespace removed,
      or an empty string if the API returned no text content.
    """
    # Reuse the caller's client when given; otherwise build one for this call
    if client is None:
        client = OpenAI()

    # Extract question and options
    question = question_data['question']
    options = [f"{key}: {value}" for key, value in question_data.items() if 'option' in key]

    # Retrieval query: question on the first line, then the options separated by spaces
    question_search = (
        f"{question}\n" +
        " ".join(options) + " "
    )

    # Perform RAG search using the question to retrieve relevant information
    rag_results = search_RAG(question_search, index_file_path=index_file_path, chunks_path=chunks_path, top_k=top_k)

    # The options are numbered ('option 1', 'option 2', ...), so the requested
    # answer format asks for the option NUMBER rather than a letter.
    prompt = (
        f"Relevant Information:\n{rag_results}\n"
        f"Question: {question}\n"
        f"Options:\n" + "\n".join(options) + "\n"
        "Think step by step and choose the correct option.\n"
        "You must respond in the format 'correct option: <X>', where <X> is the correct number for the option."
    )

    # Generate the response using GPT-4o-mini
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.7,      # Sampling temperature (controls randomness)
        max_tokens=512,       # Upper bound on the length of the generated reply
        top_p=0.9,            # Nucleus sampling
        frequency_penalty=0,  # 0 = no penalty for repeating tokens
        presence_penalty=0    # 0 = no penalty for re-using already mentioned tokens
    )

    # The message content may be None; normalise that to an empty string
    content = completion.choices[0].message.content
    return content.strip() if content else ""

In [14]:
# Sample TeleQnA-style record used to try ask_gpt4_RAG on a single question.
question_data = {
    'question': 'Which physical channel informs the UE and the RN about the number of OFDM symbols used for the PDCCHs? [3GPP Release 17]',
    'option 1': 'PBCH',
    'option 2': 'PCFICH',
    'option 3': 'PDSCH',
    'option 4': 'PHICH',
    'answer': 'option 2: PCFICH',
    'explanation': 'The physical control format indicator channel (PCFICH) informs the UE and the RN about the number of OFDM symbols used for the PDCCHs.',
    'category': 'Standards specifications'
}

# Query the model once and show its raw text answer.
gpt4_response = ask_gpt4_RAG(question_data)
print(gpt4_response)

Relevant Information:
Information 1:
of OFDM symbols of the PUSCH, including all OFDM symbols used for DMRS;  
\- for any OFDM symbol that carries DMRS of the PUSCH,
$M_{\text{sc}}^{\text{UCI}}\left( l \right) = 0$;  
\- for any OFDM symbol that does not carry DMRS of the PUSCH,
$M_{\text{sc}}^{\text{UCI}}\left( l \right) = M_{\text{sc}}^{\text{PUSCH}} - \ M_{\text{sc}}^{PT - RS}\left( l \right)$;  
\- $\alpha$ is configured by higher layer parameter *scaling*;  
\- $l_{0}$ is the symbol index of the first OFDM symbol that does not
carry DMRS of the PUSCH, after the first DMRS symbol(s), in the PUSCH
transmission.  
For CG-UCI transmission on PUSCH with UL-SCH, and if
*numberOfSlotsTBoMS* is present in the resource allocation table and the
value of *numberOfSlotsTBoMS* in the row indicated by the Time domain
resource assignment field in DCI is larger than 1, the number of coded
modulation symbols per layer for CG-UCI transmission, denoted as
$Q_{CG - UCI}^{'}$, is determined as follows

## Create Function to Evaluate Question 

In [16]:
import re

def extract_option(answer):
    """
    Extract the last 'option N' mention from an answer string.

    The text is lowercased and stripped of punctuation before matching, so
    'Option 2:', 'correct option: 2' and '**option 2**' all yield 'option 2'.

    Parameters:
    - answer: A string containing the answer, e.g. 'option X: ...' or
      'correct option: X'. None or non-string values are accepted.

    Returns:
    - String: Normalised option (e.g., 'option 2'), or None if no option is
      found or the input is not a string.
    """
    # A missing / non-text response cannot contain an option
    if not isinstance(answer, str):
        return None

    # Remove all punctuation and convert to lowercase
    cleaned_answer = re.sub(r'[^\w\s]', '', answer.lower())
    # Allow any whitespace (several spaces, a newline) between the word and the number
    matches = re.findall(r'option\s+(\d+)', cleaned_answer)
    # The last mention is taken as the final answer of a step-by-step response
    return f"option {matches[-1]}" if matches else None

In [17]:
def evaluate_model_response(model_response, question_data):
    """
    Compare the model's response with the correct answer from the question data.

    Parameters:
    - model_response: The response string generated by the model.
    - question_data: Dictionary whose 'answer' entry holds the correct answer.

    Returns:
    - 1 if the response is correct, otherwise whatever extract_option returned
      for the model response (None when no option could be extracted).
    """
    correct_option = extract_option(question_data['answer'])  # Extract correct option
    model_option = extract_option(model_response)  # Extract model's option

    # A response without a recognisable option must never count as correct,
    # even if the reference answer could not be parsed either (None == None).
    if model_option is not None and model_option == correct_option:
        return 1
    return model_option


In [18]:
question_data = {
    'question': 'Which physical channel informs the UE and the RN about the number of OFDM symbols used for the PDCCHs? [3GPP Release 17]',
    'option 1': 'PBCH',
    'option 2': 'PCFICH',
    'option 3': 'PDSCH',
    'option 4': 'PHICH',
    'answer': 'option 2: PCFICH',
    'explanation': 'The physical control format indicator channel (PCFICH) informs the UE and the RN about the number of OFDM symbols used for the PDCCHs.',
    'category': 'Standards specifications'
}

In [41]:
evaluation_result = evaluate_model_response(gpt4_response, question_data)
print(evaluation_result)

1


## Ask GPT-4o-mini the 100 TeleQnA questions

In [19]:
def gpt4_evaluate_questions(questions):
    """
    Send every question to ask_gpt4_RAG and collect the raw replies.

    Parameters:
    - questions: List of question dictionaries. Each one is passed whole to
      ask_gpt4_RAG and must provide:
        - 'question': The question text.
        - 'answer': The correct answer (e.g., 'option 2: PCFICH').

    Returns:
    - List: One dictionary per input question, in input order, with:
        - 'question': The question as a string.
        - 'answer': The correct answer as a string.
        - 'response': The reply returned by ask_gpt4_RAG for that question.
    """
    total_questions = len(questions)
    responses = []

    for position, question_data in enumerate(questions, start=1):
        # Query the model first, then bundle its reply with the reference data
        model_reply = ask_gpt4_RAG(question_data)
        record = {
            "question": question_data['question'],
            "answer": question_data['answer'],
            "response": model_reply,
        }
        responses.append(record)

        # Progress report after each answered question
        print(f"Responded {position} of {total_questions} questions...")

    return responses

In [20]:
# Process all questions and get responses
gpt4_responses = gpt4_evaluate_questions(rel17_100_questions)

Responded 1 of 100 questions...
Responded 2 of 100 questions...
Responded 3 of 100 questions...
Responded 4 of 100 questions...
Responded 5 of 100 questions...
Responded 6 of 100 questions...
Responded 7 of 100 questions...
Responded 8 of 100 questions...
Responded 9 of 100 questions...
Responded 10 of 100 questions...
Responded 11 of 100 questions...
Responded 12 of 100 questions...
Responded 13 of 100 questions...
Responded 14 of 100 questions...
Responded 15 of 100 questions...
Responded 16 of 100 questions...
Responded 17 of 100 questions...
Responded 18 of 100 questions...
Responded 19 of 100 questions...
Responded 20 of 100 questions...
Responded 21 of 100 questions...
Responded 22 of 100 questions...
Responded 23 of 100 questions...
Responded 24 of 100 questions...
Responded 25 of 100 questions...
Responded 26 of 100 questions...
Responded 27 of 100 questions...
Responded 28 of 100 questions...
Responded 29 of 100 questions...
Responded 30 of 100 questions...
Responded 31 of 100

In [25]:
print(rel17_100_questions[1]['question'])
print(rel17_100_questions[1]['answer'])
print(gpt4_responses[1]['response'])

What is covered by enhanced application layer support for V2X services? [3GPP Release 17]
option 2: Advanced V2X services
To determine what is covered by enhanced application layer support for V2X services in 3GPP Release 17, let's analyze the provided information.

1. **Key Issues and Solutions**: Information 1 highlights several key issues and corresponding solutions, focusing on enhancements to V2X group management, support for HD maps, and application layer functionalities.

2. **Capabilities Enhanced**: Information 2 specifies enhancements in the VAE (V2X Application Enabler) layer, including service API exposure, file distribution, and network monitoring, which are aimed at improving the overall functionality of V2X services.

3. **Requirements**: Information 3 outlines the requirement for the V2X application enabler layer to support application QoS requirements, which is critical for V2X services.

4. **Architectural Enhancements**: Information 4 mentions the study's focus on ex

## Save accuracy responses

In [4]:
def save_responses_to_json(responses, filename):
    """
    Save the model responses to a JSON file.

    Parameters:
    - responses: List of responses to save (must be JSON-serialisable).
    - filename: Name (path) of the JSON file; it is overwritten if it exists.
    """
    # Write explicitly as UTF-8, matching how the notebook reads these files
    # back; ensure_ascii=False keeps non-ASCII characters readable in the file.
    with open(filename, "w", encoding="utf-8") as json_file:
        json.dump(responses, json_file, indent=4, ensure_ascii=False)

In [28]:
# save_responses_to_json(gpt4_responses,"../Models_responses/Accuracy/gpt4_RAG_responses.json")

## Evaluate responses from GPT-4o-mini

In [35]:
# Location of the saved model responses for the accuracy run (JSON).
gpt4_responses_path = r"../Models_responses/Accuracy/gpt4_RAG_responses.json"

# Reload the stored responses and report how many were found.
with open(gpt4_responses_path, mode="r", encoding="utf-8") as responses_file:
    gpt4_responses = json.loads(responses_file.read())
print(len(gpt4_responses))

100


In [36]:
def evaluate_accuracy(model_responses, questions=None):
    """
    Evaluate the model's responses, print every wrong answer and the accuracy.

    Parameters:
    - model_responses: List of dictionaries with 'question', 'answer' and
      'response' keys.
    - questions: Optional list of the original question dictionaries, matched
      to `model_responses` by position and used only to print the options of
      wrongly answered questions. Defaults to the notebook-level
      `rel17_100_questions`.

    Returns:
    - None. All results are printed.
    """
    if questions is None:
        questions = rel17_100_questions

    # Nothing to score: avoid dividing by zero below
    if not model_responses:
        print("No responses to evaluate.")
        return

    correct_count = 0  # Track the number of correct responses
    none_count = 0  # Track the number of 'None' responses

    for index, question_data in enumerate(model_responses):
        evaluation_result = evaluate_model_response(question_data['response'], question_data)

        if evaluation_result == 1:
            correct_count += 1  # Increment for correct response
            continue

        # Options are only needed for the wrong-answer report; tolerate a
        # question list that is shorter than the response list.
        source_question = questions[index] if index < len(questions) else {}
        options = [f"{key}: {value}" for key, value in source_question.items() if 'option' in key]

        print("\nWrong Answer")
        print(f"Question {index + 1}: {question_data['question']}")
        print(f"Options:\n" + "\n".join(options) + "\n")
        if evaluation_result is None:
            # No option could be extracted: show the full text for inspection
            print(f"Full model response:\n{question_data['response']}")
            none_count += 1  # Increment for None response
        else:
            print(f"Model response: {evaluation_result}")
        print(f"Correct response: {question_data['answer']}")
        print("----------------------------------------------------------------------------------------")

    # Calculate and print accuracy
    accuracy = correct_count / len(model_responses) * 100
    print(f"\nAccuracy: {accuracy:.2f}%")
    print(f"Total 'None' responses: {none_count}")
    print(f"'None' responses means that the model did not give an option")


In [37]:
evaluate_accuracy(gpt4_responses)


Wrong Answer
Question 2: What is covered by enhanced application layer support for V2X services? [3GPP Release 17]
Options:
option 1: PC5 radio resource control
option 2: Advanced V2X services
option 3: SDAP layer enhancements
option 4: V2X communication over NR PC5 reference point
option 5: Tele-Operated Driving

Model response: option 5
Correct response: option 2: Advanced V2X services
----------------------------------------------------------------------------------------

Wrong Answer
Question 5: What does MINT stand for? [3GPP Release 17]
Options:
option 1: Maximum Internet Network Transmission
option 2: Minimization of Interruption Network Technology
option 3: Maximization of Internet Network Traffic
option 4: Minimization of Service Interruption
option 5: Maximum Internet Network Technology

Model response: option 2
Correct response: option 4: Minimization of Service Interruption
----------------------------------------------------------------------------------------

Wrong Ans

# RAGAS evaluation

## Create prompt without options and ask function for GPT-4o-mini with RAG

In [38]:
def ask_gpt4_RAG_no_options(question_data, top_k=5, index_file_path="../Files/faiss_index.bin", chunks_path="../Files/tspec_chunks_markdown.pkl", client=None):
    """
    Answer an open question (no options shown) with GPT-4o-mini, using retrieved context (RAG).

    Only the question text is used, both as the retrieval query and in the
    prompt; any options present in `question_data` are ignored.

    Parameters:
    - question_data: Dictionary containing the question under 'question'.
    - top_k: Number of relevant chunks to retrieve from the search.
    - index_file_path: Path to the FAISS index file.
    - chunks_path: Path to the chunks file.
    - client: Optional, already constructed OpenAI client to reuse across calls.
      When omitted, a new `OpenAI()` client is created for this call.

    Returns:
    - String: Model's generated response with surrounding whitespace removed,
      or an empty string if the API returned no text content.
    """
    # Reuse the caller's client when given; otherwise build one for this call
    if client is None:
        client = OpenAI()

    question = question_data['question']

    # Retrieval query: the question followed by a newline (options are left out on purpose)
    question_search = f"{question}\n"

    # The retrieval settings are explicit parameters, so this function does
    # not rely on notebook-level variables having been defined in another cell.
    rag_results = search_RAG(question_search, index_file_path=index_file_path, chunks_path=chunks_path, top_k=top_k)

    # Create the prompt with the retrieved context and the question only
    prompt = (
        f"Relevant Information:\n{rag_results}\n"
        f"Question: {question}\n"
        "Think step by step before answering and analyze the relevant information carefully, then respond with a final answer in the format 'answer: <XXXXX>'."
    )

    # Generate the response using GPT-4o-mini
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.7,      # Sampling temperature (controls randomness)
        max_tokens=512,       # Upper bound on the length of the generated reply
        top_p=0.9,            # Nucleus sampling
        frequency_penalty=0,  # 0 = no penalty for repeating tokens
        presence_penalty=0    # 0 = no penalty for re-using already mentioned tokens
    )

    # The message content may be None; normalise that to an empty string
    content = completion.choices[0].message.content
    return content.strip() if content else ""

In [37]:
question_data = {
    'question': 'Which physical channel informs the UE and the RN about the number of OFDM symbols used for the PDCCHs? [3GPP Release 17]',
    'option 1': 'PBCH',
    'option 2': 'PCFICH',
    'option 3': 'PDSCH',
    'option 4': 'PHICH',
    'answer': 'option 2: PCFICH',
    'explanation': 'The physical control format indicator channel (PCFICH) informs the UE and the RN about the number of OFDM symbols used for the PDCCHs.',
    'category': 'Standards specifications'
}

gpt4_response_text = ask_gpt4_RAG_no_options(question_data)
print(gpt4_response_text)

Relevant Information:
Information 1:
| UE Category                                 |      | 11-12    |   |
+---------------------------------------------+------+----------+---+
| UE DL Category                              |      | ≥ 11     |   |
+---------------------------------------------+------+----------+---+
| Note 1: 2 symbols allocated to PDCCH for 20 |      |          |   |
| MHz, 15 MHz and 10 MHz channel BW; 3        |      |          |   |
| symbols allocated to PDCCH for 5 MHz and 3  |      |          |   |
| MHz; 4 symbols allocated to PDCCH for 1.4   |      |          |   |
| MHz. For subframe 1&6, only 2 OFDM symbols  |      |          |   |
| are allocated to PDCCH. For 256QAM          |      |          |   |
| reference channel 1 symbol is allocated.    |      |          |   |
|                                             |      |          |   |
| Note 2: Reference signal, synchronization   |      |          |   |
| signals and PBCH allocated as per TS 36.211 |      

In [10]:
def format_answer(answer):
    """
    Normalise an answer string for comparison.

    Parameters:
    - answer: The text to normalise.

    Returns:
    - String: `answer` with every character of string.punctuation removed and
      the remainder converted to lowercase.
    """
    # Imported locally so this function does not depend on an `import string`
    # executed by a different (later) notebook cell.
    import string

    # Remove punctuation and convert to lowercase
    answer_no_punctuation = answer.translate(str.maketrans('', '', string.punctuation))
    return answer_no_punctuation.lower()

In [11]:
import re
import string

def extract_answer(response):
    """
    Pull the final answer out of a model reply.

    If the reply contains 'answer:' (matched case-insensitively), the text after
    its last occurrence is returned, lowercased and stripped of surrounding
    whitespace. Otherwise the whole reply is returned stripped, keeping its
    original casing. No punctuation is removed here.

    Parameters:
    - response: String containing the model's generated response.

    Returns:
    - String: The extracted answer, or the stripped full response when the
      keyword is absent.
    """
    marker = "answer:"
    lowered = response.lower()

    # No marker at all: hand back the reply as-is (apart from outer whitespace)
    if marker not in lowered:
        return response.strip()

    # rpartition splits on the last occurrence; keep what follows it
    _, _, tail = lowered.rpartition(marker)
    return tail.strip()

In [12]:
def extract_option(text):
    """
    Return the text that follows the last 'option N:' prefix in `text`.

    NOTE(review): this reuses the name of the earlier `extract_option` helper
    in this notebook (which returns 'option N' instead of the option text).
    Once this cell has run, every caller of `extract_option` gets this version.

    Parameters:
    - text: String such as 'option 2: PCFICH'. None or an empty string is accepted.

    Returns:
    - String: The stripped text after the last 'option N:' (matched
      case-insensitively), or None if no such prefix exists.
    """
    if not text:
        return None

    # Match only the prefixes; a greedy '(.*)' with DOTALL would swallow every
    # later prefix and effectively return the text after the FIRST one.
    prefixes = list(re.finditer(r'option\s*\d+:\s*', text, re.IGNORECASE))
    if not prefixes:
        return None

    # Everything after the last prefix is the option text
    return text[prefixes[-1].end():].strip()


In [13]:
extracted_answer = extract_answer(gpt4_response_text)
print(extracted_answer)

pdcch


In [14]:
# Normalise the extracted answer (punctuation removed, lowercased) and show
# the normalised value rather than the raw extracted one.
model_answer = format_answer(extracted_answer)
print(model_answer)

pdcch


In [15]:
question_data['answer']

'option 2: PCFICH'

In [16]:
correct_answer = format_answer(extract_option(question_data['answer']))
print(correct_answer)

pcfich


## Model Groq for RAGAS evaluation

In [17]:
import os

# Ask for the Groq key interactively only when it is not already set in the
# environment, so the key never has to be written into the notebook.
if "GROQ_API_KEY" not in os.environ:
    # getpass is not imported in the cells above, so bring it into scope here
    import getpass

    os.environ["GROQ_API_KEY"] = getpass.getpass("Enter your Groq API key: ")

In [18]:
from langchain_groq import ChatGroq

# Chat model served through Groq; this `llm` object is what the RAGAS
# `evaluate` call later in the notebook receives as its judge model.
llm = ChatGroq(
    # model="llama-3.1-70b-versatile",
    model="llama3-70b-8192",
    # model="llama3-groq-70b-8192-tool-use-preview",
    temperature=0.7,  # NOTE(review): non-zero temperature — scores may differ between runs; confirm this is intended
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

In [42]:
# from langchain_ollama import ChatOllama

# llm = ChatOllama(
#     model = "llama3.1",
#     temperature = 0.8,
#     num_predict = 256,
#     # other params ...
# )

In [11]:
llm.invoke("Hello")

AIMessage(content='Hello. How can I assist you today?', additional_kwargs={}, response_metadata={'token_usage': {'completion_tokens': 10, 'prompt_tokens': 36, 'total_tokens': 46, 'completion_time': 0.04, 'prompt_time': 0.009800944, 'queue_time': 0.005168585999999999, 'total_time': 0.049800944}, 'model_name': 'llama-3.1-70b-versatile', 'system_fingerprint': 'fp_b3ae7e594e', 'finish_reason': 'stop', 'logprobs': None}, id='run-78d21cf9-110d-4491-9477-6f4d0302e143-0', usage_metadata={'input_tokens': 36, 'output_tokens': 10, 'total_tokens': 46})

In [19]:
from langchain.embeddings import HuggingFaceEmbeddings
# from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


## Ask GPT-4o-mini the TeleQnA questions with no options

In [52]:
def evaluate_questions_no_options(questions):
    """
    Send every question to ask_gpt4_RAG_no_options and collect the raw replies.

    Parameters:
    - questions: List of question dictionaries. Each one is passed whole to
      ask_gpt4_RAG_no_options and must provide:
        - 'question': The question text.
        - 'answer': The correct answer (e.g., 'option 2: PCFICH').

    Returns:
    - List: One dictionary per input question, in input order, with:
        - 'question': The question as a string.
        - 'answer': The correct answer as a string.
        - 'response': The reply returned by ask_gpt4_RAG_no_options.
    """
    total_questions = len(questions)
    responses = []

    for position, question_data in enumerate(questions, start=1):
        # Query the model first, then bundle its reply with the reference data
        model_reply = ask_gpt4_RAG_no_options(question_data)
        record = {
            "question": question_data['question'],
            "answer": question_data['answer'],
            "response": model_reply,
        }
        responses.append(record)

        # Progress report after each answered question
        print(f"Responded {position} of {total_questions} questions...")

    return responses

In [53]:
# Process all questions and get responses
gpt4_responses_RAGAS = evaluate_questions_no_options(rel17_100_questions[:20])

Responded 1 of 20 questions...
Responded 2 of 20 questions...
Responded 3 of 20 questions...
Responded 4 of 20 questions...
Responded 5 of 20 questions...
Responded 6 of 20 questions...
Responded 7 of 20 questions...
Responded 8 of 20 questions...
Responded 9 of 20 questions...
Responded 10 of 20 questions...
Responded 11 of 20 questions...
Responded 12 of 20 questions...
Responded 13 of 20 questions...
Responded 14 of 20 questions...
Responded 15 of 20 questions...
Responded 16 of 20 questions...
Responded 17 of 20 questions...
Responded 18 of 20 questions...
Responded 19 of 20 questions...
Responded 20 of 20 questions...


In [57]:
gpt4_responses_RAGAS[0]['response']

'To determine which NGAP procedure is used for inter-system load balancing as per 3GPP Release 17, we need to consider the relevant information provided.\n\n1. Information regarding the 3GPP AAA server behavior indicates that it reports its load using Load AVPs and factors such as traffic over various interfaces and internal resource usage.\n2. Trusted non-3GPP access network behavior also mentions using Load AVPs to select Diameter agents based on load values.\n3. The S6b interface mentions that the Diameter load control mechanism is optional and involves the 3GPP AAA server and the PDN-GW (Packet Data Network Gateway).\n4. Information 4 and 5 mention the Load Control Information in the context of GTP-C signaling, indicating that it plays a role in load balancing across networks.\n\nGiven these insights, the NGAP procedure used for inter-system load balancing would likely involve the signaling mechanisms that utilize Load Control Information to manage the traffic across different syst

In [58]:
print(gpt4_responses_RAGAS[0]['question'])
print(extract_answer(gpt4_responses_RAGAS[0]['response']))
print(gpt4_responses_RAGAS[0]['answer'])

Which NGAP procedure is used for inter-system load balancing? [3GPP Release 17]
load control information in ngap signaling.
option 3: Uplink RAN Configuration Transfer


In [59]:
# save_responses_to_json(gpt4_responses_RAGAS,"../Models_responses/RAGAS/gpt4_RAG_responses_RAGAS_20_questions.json")

## Build Dataset for Evaluation with RAGAS

In [60]:
# Location of the saved no-options model responses used for the RAGAS evaluation (JSON).
gpt4_responses_RAGAS_path = r"../Models_responses/RAGAS/gpt4_RAG_responses_RAGAS_20_questions.json"

# Reload the stored responses and report how many were found.
with open(gpt4_responses_RAGAS_path, mode="r", encoding="utf-8") as ragas_file:
    gpt4_responses_RAGAS = json.loads(ragas_file.read())
print(len(gpt4_responses_RAGAS))

20


In [61]:
from datasets import Dataset 

In [62]:
def transform_dataset(data):
    """
    Convert stored responses into the column layout used for the evaluation dataset.

    For each item, the model reply is reduced with extract_answer and the
    reference with extract_option; both are then normalised with format_answer
    and given exactly one trailing period.

    Parameters:
    - data: Iterable of dictionaries with 'question', 'answer' and 'response' keys.

    Returns:
    - Dictionary with three parallel lists: 'user_input' (questions),
      'response' (normalised model answers) and 'reference' (normalised
      correct answers).
    """
    def with_trailing_period(text):
        # Drop any periods already at the end, then add a single one
        return text.rstrip('.') + '.'

    user_inputs, model_answers, references = [], [], []

    for entry in data:
        question = entry['question']
        model_answer = with_trailing_period(format_answer(extract_answer(entry['response'])))
        reference = with_trailing_period(format_answer(extract_option(entry['answer'])))

        user_inputs.append(question)
        model_answers.append(model_answer)
        references.append(reference)

    return {
        'user_input': user_inputs,
        'response': model_answers,
        'reference': references,
    }

In [63]:
# Convert (at most) the first 20 stored responses into the column layout
# produced by transform_dataset: 'user_input', 'response', 'reference'.
data_samples = transform_dataset(gpt4_responses_RAGAS[:20])
# data_samples = transform_dataset(gpt4_responses_RAGAS)

# Wrap the column dictionary in a `datasets.Dataset` object
dataset = Dataset.from_dict(data_samples)

# Print to verify the structure (column names and row count)
print(dataset)

Dataset({
    features: ['user_input', 'response', 'reference'],
    num_rows: 20
})


In [65]:
dataset[0]

{'user_input': 'Which NGAP procedure is used for inter-system load balancing? [3GPP Release 17]',
 'response': 'load control information in ngap signaling.',
 'reference': 'uplink ran configuration transfer.'}

## Evaluate GPT-4o-mini with RAGAS Metrics - Comparing with 'Correct Option Text' in rel17_100_questions

### Using LLM to evaluate (Factual Correctness, Semantic similarity and Rubrics based criteria scoring)

In [66]:
from ragas import evaluate
from ragas.run_config import RunConfig
from ragas.metrics._factual_correctness import FactualCorrectness
from ragas.metrics import SemanticSimilarity
from ragas.metrics import RubricsScoreWithReference

In [67]:
# Metric objects handed to the RAGAS `evaluate` call in the next cell.
factualCorrectness = FactualCorrectness()
# NOTE: the variable name is spelled 'semantiSimilarity' (no 'c'); the evaluate
# cell refers to it by exactly this name.
semantiSimilarity = SemanticSimilarity()
# Textual description of each score level (1 = worst, 5 = best) for the
# rubric-based metric.
rubrics = {
    "score1_description": "The response is incorrect, irrelevant, or does not align with the ground truth.",
    "score2_description": "The response partially matches the ground truth but includes significant errors, omissions, or irrelevant information.",
    "score3_description": "The response generally aligns with the ground truth but may lack detail, clarity, or have minor inaccuracies.",
    "score4_description": "The response is mostly accurate and aligns well with the ground truth, with only minor issues or missing details.",
    "score5_description": "The response is fully accurate, aligns completely with the ground truth, and is clear and detailed.",
}
rubricsScoreWithReference =  RubricsScoreWithReference(rubrics=rubrics)

In [85]:
# Run the three configured metrics over the dataset, using `llm` as the judge
# model and `embeddings` for the embedding-based metric.
# NOTE(review): the recorded run logged TimeoutError for several jobs and the
# result table has empty factual_correctness cells — presumably related; confirm.
score = evaluate(
    dataset,
    metrics=[
        factualCorrectness,
        semantiSimilarity,
        rubricsScoreWithReference,
    ],
    llm=llm,
    embeddings=embeddings,
    # Timeout / retry settings for the evaluation jobs
    run_config = RunConfig(timeout=600, max_retries=20, max_wait=180,log_tenacity=False),
)
# Show the per-question scores as a DataFrame
score.to_pandas()

Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]

ERROR:ragas.executor:Exception raised in Job[6]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[57]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[39]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[9]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[42]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[27]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[45]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[30]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[15]: TimeoutError()


Unnamed: 0,user_input,response,reference,factual_correctness,semantic_similarity,rubrics_score_with_reference
0,Which NGAP procedure is used for inter-system ...,load control information in ngap signaling.,uplink ran configuration transfer.,0.0,0.169793,2
1,What is covered by enhanced application layer ...,to determine what is covered by enhanced appli...,advanced v2x services.,0.0,0.616583,5
2,What does the Load-Balancing steering mode do?...,the loadbalancing steering mode distributes tr...,splits the traffic of a data flow across 3gpp ...,,0.589284,4
3,What is the main objective of intent driven ma...,the main objective is to enable consumers to e...,to reduce the complexity of management for net...,,0.478847,2
4,What does MINT stand for? [3GPP Release 17],minimization of service interruption.,minimization of service interruption.,1.0,1.0,5
5,What is the purpose of the Media Streaming AF ...,the purpose of the media streaming af event ex...,to support data collection and event exposure ...,,0.624823,3
6,What is the purpose of load-balancing steering...,the purpose of loadbalancing steering mode enh...,to enable the ue and upf to freely select spli...,0.29,0.37095,3
7,What is a capability added in the V2X Applicat...,support for v2p applications through dedicated...,v2x service discovery.,0.0,0.516248,2
8,What is the purpose of the Edge Data Network (...,the edge data network edn enables edge applica...,to host the edge application servers and edge ...,0.29,0.688947,3
9,What are the three features specified in TS 23...,support for broadcast groupcast and unicast mo...,broadcast mode groupcast mode unicast mode.,,0.674552,2


In [92]:
score

{'factual_correctness': 0.2864, 'semantic_similarity': 0.4951, 'rubrics_score_with_reference': 2.7000}

In [93]:
gpt4_evaluation_RAGAS_LLM = score.to_pandas()
# gpt4_evaluation_RAGAS_LLM.to_csv("../Evaluations/RAGAS/gpt4_RAG_evaluation_RAGAS_LLM_20_questions.csv", index=False)

In [94]:
# NOTE(review): pandas is imported mid-notebook here (and again further down);
# consider moving it to a single import cell at the top.
import pandas as pd
# Load the stored LLM-based evaluation scores from CSV.
# NOTE(review): the statement that writes this file is commented out in the
# previous cell, so this read relies on a file produced by an earlier run.
result = pd.read_csv("../Evaluations/RAGAS/gpt4_RAG_evaluation_RAGAS_LLM_20_questions.csv")

In [95]:
# Show the LLM-based scores that were just loaded from the CSV file.
result

Unnamed: 0,user_input,response,reference,factual_correctness,semantic_similarity,rubrics_score_with_reference
0,Which NGAP procedure is used for inter-system ...,load control information in ngap signaling.,uplink ran configuration transfer.,0.0,0.169793,2
1,What is covered by enhanced application layer ...,to determine what is covered by enhanced appli...,advanced v2x services.,0.0,0.616583,5
2,What does the Load-Balancing steering mode do?...,the loadbalancing steering mode distributes tr...,splits the traffic of a data flow across 3gpp ...,,0.589284,4
3,What is the main objective of intent driven ma...,the main objective is to enable consumers to e...,to reduce the complexity of management for net...,,0.478847,2
4,What does MINT stand for? [3GPP Release 17],minimization of service interruption.,minimization of service interruption.,1.0,1.0,5
5,What is the purpose of the Media Streaming AF ...,the purpose of the media streaming af event ex...,to support data collection and event exposure ...,,0.624823,3
6,What is the purpose of load-balancing steering...,the purpose of loadbalancing steering mode enh...,to enable the ue and upf to freely select spli...,0.29,0.37095,3
7,What is a capability added in the V2X Applicat...,support for v2p applications through dedicated...,v2x service discovery.,0.0,0.516248,2
8,What is the purpose of the Edge Data Network (...,the edge data network edn enables edge applica...,to host the edge application servers and edge ...,0.29,0.688947,3
9,What are the three features specified in TS 23...,support for broadcast groupcast and unicast mo...,broadcast mode groupcast mode unicast mode.,,0.674552,2


### Metrics that do not need an LLM (BleuScore, RougeScore, ExactMatch and StringPresence)

In [73]:
from ragas.metrics import BleuScore, RougeScore, ExactMatch, StringPresence
import nltk
# Fetch the NLTK "punkt" tokenizer data; when the package is already installed
# the call only reports that it is up to date.
# NOTE(review): presumably needed for tokenization inside the text-overlap
# metrics — confirm, and consider pinning this to a setup cell.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/arimatea/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [74]:
# Instantiate the four string-based metrics with their default settings.
# The names are kept as-is because the evaluation cell below refers to them.
bleuScore, rougeScore = BleuScore(), RougeScore()
exactMatch, stringPresence = ExactMatch(), StringPresence()

In [75]:
# Score `dataset` with the four string-based metrics. `llm` and `embeddings`
# are forwarded exactly as before.
# NOTE(review): the run log for this cell shows a job failing with an
# AssertionError, and one `bleu_score` entry is NaN in the table — presumably
# the same sample; confirm how the aggregate treats missing values.
score = evaluate(
    dataset,
    llm=llm,
    embeddings=embeddings,
    metrics=[bleuScore, rougeScore, exactMatch, stringPresence],
)
# Per-question scores, shown as the cell output.
score.to_pandas()

Evaluating:   0%|          | 0/80 [00:00<?, ?it/s]

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
ERROR:ragas.executor:Exception raised in Job[4]: AssertionError(The number of hypotheses and their reference(s) should be the same )


Unnamed: 0,user_input,response,reference,bleu_score,rouge_score,exact_match,string_present
0,Which NGAP procedure is used for inter-system ...,load control information in ngap signaling.,uplink ran configuration transfer.,1.1200410000000001e-231,0.0,0.0,0.0
1,What is covered by enhanced application layer ...,to determine what is covered by enhanced appli...,advanced v2x services.,,0.010811,0.0,0.0
2,What does the Load-Balancing steering mode do?...,the loadbalancing steering mode distributes tr...,splits the traffic of a data flow across 3gpp ...,0.1733706,0.411765,0.0,0.0
3,What is the main objective of intent driven ma...,the main objective is to enable consumers to e...,to reduce the complexity of management for net...,1.2627080000000001e-231,0.176471,0.0,0.0
4,What does MINT stand for? [3GPP Release 17],minimization of service interruption.,minimization of service interruption.,1.0,1.0,1.0,1.0
5,What is the purpose of the Media Streaming AF ...,the purpose of the media streaming af event ex...,to support data collection and event exposure ...,3.766504e-155,0.148148,0.0,0.0
6,What is the purpose of load-balancing steering...,the purpose of loadbalancing steering mode enh...,to enable the ue and upf to freely select spli...,0.09930868,0.301887,0.0,0.0
7,What is a capability added in the V2X Applicat...,support for v2p applications through dedicated...,v2x service discovery.,9.788429e-232,0.0,0.0,0.0
8,What is the purpose of the Edge Data Network (...,the edge data network edn enables edge applica...,to host the edge application servers and edge ...,4.159893e-155,0.190476,0.0,0.0
9,What are the three features specified in TS 23...,support for broadcast groupcast and unicast mo...,broadcast mode groupcast mode unicast mode.,1.2340560000000001e-231,0.333333,0.0,0.0


In [76]:
# Display the result of the string-based evaluation; its repr is a dict with
# one aggregate value per metric.
score

{'bleu_score': 0.0779, 'rouge_score': 0.2181, 'exact_match': 0.0500, 'string_present': 0.0500}

In [77]:
# Convert the string-based evaluation result into a DataFrame of per-question scores.
gpt4_evaluation_RAGAS_no_LLM = score.to_pandas()
# The export below is disabled. NOTE(review): the following cell reads this same
# CSV path, so the file must already exist on disk for the notebook to run.
# gpt4_evaluation_RAGAS_no_LLM.to_csv("../Evaluations/RAGAS/gpt4_RAG_evaluation_RAGAS_no_LLM_20_questions.csv", index=False)

In [78]:
# NOTE(review): repeated mid-notebook import of pandas; a single import cell at
# the top of the notebook would be enough.
import pandas as pd
# Load the stored string-based evaluation scores from CSV.
# NOTE(review): `result` is the same name used earlier for the LLM-based scores,
# so this assignment replaces that DataFrame; the file itself comes from an
# earlier run because the export in the previous cell is commented out.
result = pd.read_csv("../Evaluations/RAGAS/gpt4_RAG_evaluation_RAGAS_no_LLM_20_questions.csv")

In [79]:
# Show the string-based scores that were just loaded from the CSV file.
result

Unnamed: 0,user_input,response,reference,bleu_score,rouge_score,exact_match,string_present
0,Which NGAP procedure is used for inter-system ...,load control information in ngap signaling.,uplink ran configuration transfer.,1.1200410000000001e-231,0.0,0.0,0.0
1,What is covered by enhanced application layer ...,to determine what is covered by enhanced appli...,advanced v2x services.,,0.010811,0.0,0.0
2,What does the Load-Balancing steering mode do?...,the loadbalancing steering mode distributes tr...,splits the traffic of a data flow across 3gpp ...,0.1733706,0.411765,0.0,0.0
3,What is the main objective of intent driven ma...,the main objective is to enable consumers to e...,to reduce the complexity of management for net...,1.2627080000000001e-231,0.176471,0.0,0.0
4,What does MINT stand for? [3GPP Release 17],minimization of service interruption.,minimization of service interruption.,1.0,1.0,1.0,1.0
5,What is the purpose of the Media Streaming AF ...,the purpose of the media streaming af event ex...,to support data collection and event exposure ...,3.766504e-155,0.148148,0.0,0.0
6,What is the purpose of load-balancing steering...,the purpose of loadbalancing steering mode enh...,to enable the ue and upf to freely select spli...,0.09930868,0.301887,0.0,0.0
7,What is a capability added in the V2X Applicat...,support for v2p applications through dedicated...,v2x service discovery.,9.788429e-232,0.0,0.0,0.0
8,What is the purpose of the Edge Data Network (...,the edge data network edn enables edge applica...,to host the edge application servers and edge ...,4.159893e-155,0.190476,0.0,0.0
9,What are the three features specified in TS 23...,support for broadcast groupcast and unicast mo...,broadcast mode groupcast mode unicast mode.,1.2340560000000001e-231,0.333333,0.0,0.0


##