# Choose Model Llama 3.2 with no Fine-Tuning

In [1]:
import torch

print("CUDA Available: ", torch.cuda.is_available())
if torch.cuda.is_available():
    # get_device_name(0) raises on a machine without a CUDA device, so it is
    # only queried (and the cache only cleared) when CUDA is actually usable.
    print("CUDA Device Name: ", torch.cuda.get_device_name(0))
    torch.cuda.empty_cache()

# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Usando dispositivo: {device}")

CUDA Available:  True
CUDA Device Name:  NVIDIA GeForce RTX 3050 Ti Laptop GPU
Usando dispositivo: cuda


## Llama 3.2 Standard

In [2]:
from unsloth import FastLanguageModel
import torch

2024-10-30 07:33:43.786345: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-30 07:33:43.918733: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-30 07:33:43.977752: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-30 07:33:43.995303: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-30 07:33:44.095336: I tensorflow/core/platform/cpu_feature_guar

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [3]:
# Settings consumed by FastLanguageModel.from_pretrained in the next cell.
max_seq_length = 8192 # Maximum sequence length handed to the loader (Unsloth handles RoPE scaling internally).
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

In [4]:
# Load the Llama 3.2 3B Instruct checkpoint and its tokenizer through Unsloth,
# using the max_seq_length / dtype / load_in_4bit settings defined earlier.
model, tokenizer = FastLanguageModel.from_pretrained(
    # model_name="unsloth/Llama-3.2-3B-bnb-4bit",
    model_name = "unsloth/Llama-3.2-3B-Instruct", # or choose "unsloth/Llama-3.2-1B-Instruct"
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # device_map="auto"
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

==((====))==  Unsloth 2024.10.6: Fast Llama patching. Transformers = 4.46.0.
   \\   /|    GPU: NVIDIA GeForce RTX 3050 Ti Laptop GPU. Max memory: 3.712 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.0+cu124. CUDA = 8.6. CUDA Toolkit = 12.4.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.27.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


# Dataset TeleQnA for Inference

In [5]:
import json

# Path to the JSON file holding the pre-processed TeleQnA questions
rel17_100_questions_path = r"../Files/rel17_100_questions.json"

# Load the question records (each one is a dict; see the sample printed in the next cell)
with open(rel17_100_questions_path, "r", encoding="utf-8") as file:
    rel17_100_questions = json.load(file)
# Sanity check: number of questions loaded
print(len(rel17_100_questions))

100


In [6]:
# Display the first record to show the schema of a question entry.
rel17_100_questions[0]

{'question': 'Which NGAP procedure is used for inter-system load balancing? [3GPP Release 17]',
 'option 1': 'eNB Configuration Transfer',
 'option 2': 'Downlink RAN Configuration Transfer',
 'option 3': 'Uplink RAN Configuration Transfer',
 'option 4': 'MME Configuration Transfer',
 'answer': 'option 3: Uplink RAN Configuration Transfer',
 'explanation': 'The NGAP procedure used for inter-system load balancing is Uplink RAN Configuration Transfer.',
 'category': 'Standards overview'}

# Accuracy Evaluation

## Create prompt and Ask function for Llama 3.2 with no Fine-Tuning

In [10]:
from unsloth.chat_templates import get_chat_template

def ask_llama_3_2(model, tokenizer, question_data):
    """
    Ask the model a multiple-choice question and return the decoded generation.

    Parameters:
    - model: The language model loaded for inference.
    - tokenizer: The tokenizer configured with `get_chat_template`.
    - question_data: Dictionary with a 'question' entry plus one entry per
      choice whose key contains 'option' (e.g. 'option 1'). Other keys
      ('answer', 'explanation', 'category') are ignored.

    Returns:
    - String: The decoded output sequence. It contains the rendered prompt
      followed by the model's reply, so callers must strip the prompt part
      themselves.
    """

    # Extract question and options
    question = question_data['question']
    options = [f"{key}: {value}" for key, value in question_data.items() if 'option' in key]

    # Create the prompt with the question and options
    prompt = (
        f"Question: {question}\n"
        f"Options:\n" + "\n".join(options) + "\n"
        "Think step by step before answering and respond with the correct option in the format 'correct option: <X>'."
    )

    # Put the inputs on whatever device the model lives on instead of
    # hard-coding "cuda"; fall back to "cuda" if the model exposes no device.
    target_device = getattr(model, "device", "cuda")

    # Create the input for the model
    messages = [{"role": "user", "content": prompt}]
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(target_device)

    # A single, unpadded prompt is encoded, so every position is a real token.
    # Passing the mask explicitly avoids the "attention mask is not set"
    # warning that appears because the pad token equals the eos token.
    attention_mask = torch.ones_like(inputs)

    # Generate the response
    outputs = model.generate(
        input_ids=inputs,
        attention_mask=attention_mask,
        max_new_tokens=2048,
        use_cache=True,
        temperature=1.5,
        min_p=0.1
    )

    # Decode and return the model's output
    response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    return response


In [11]:
# Example usage
# Switch the model to Unsloth's inference mode. The call changes `model` in
# place, so its return value is deliberately not assigned back: rebinding
# would replace the model with None on versions where nothing is returned.
FastLanguageModel.for_inference(model)
tokenizer = get_chat_template(tokenizer, chat_template="llama-3.1")

# Sample question record with the same schema as the loaded TeleQnA entries.
question_data = {
    'question': 'Which physical channel informs the UE and the RN about the number of OFDM symbols used for the PDCCHs? [3GPP Release 17]',
    'option 1': 'PBCH',
    'option 2': 'PCFICH',
    'option 3': 'PDSCH',
    'option 4': 'PHICH',
    'answer': 'option 2: PCFICH',
    'explanation': 'The physical control format indicator channel (PCFICH) informs the UE and the RN about the number of OFDM symbols used for the PDCCHs.',
    'category': 'Standards specifications'
}

llama_3_2_response = ask_llama_3_2(model, tokenizer, question_data)
print(llama_3_2_response)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


system

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

user

Question: Which physical channel informs the UE and the RN about the number of OFDM symbols used for the PDCCHs? [3GPP Release 17]
Options:
option 1: PBCH
option 2: PCFICH
option 3: PDSCH
option 4: PHICH
Think step by step before answering and respond with the correct option in the format 'correct option: <X>'.assistant

correct option: option 2


## Create Function to Evaluate Question 

In [2]:
import re

def extract_option(answer):
    """
    Extract the chosen option from an answer string as a normalized 'option N'.

    Punctuation is removed and the text lower-cased before matching. An
    explicitly declared answer ('correct option: <X>', the format requested in
    the prompt) takes precedence; the last such declaration wins, so options
    merely mentioned while reasoning are not mistaken for the final answer.
    Without a declaration, the first 'option N' in the text is used.

    Parameters:
    - answer: A string such as 'option 2: PCFICH' or a model reply ending in
      'correct option: option 2'. Non-string input (e.g. None) is accepted.

    Returns:
    - String: Extracted option (e.g., 'option 2'), or None if no match is found.
    """
    if not isinstance(answer, str):
        return None

    # Remove all punctuation and convert to lowercase
    cleaned_answer = re.sub(r'[^\w\s]', '', answer.lower())

    # Prefer the explicit declaration: "correct option 2", "correct option option 2",
    # "correct option is 2" (all after punctuation removal).
    declared = re.findall(r'correct option(?:\s+is)?\s+(?:option\s+)?(\d+)', cleaned_answer)
    if declared:
        return f"option {int(declared[-1])}"

    # Fallback: first "option N"; \s+ also tolerates newlines or repeated spaces.
    match = re.search(r'option\s+(\d+)', cleaned_answer)
    return f"option {int(match.group(1))}" if match else None

In [13]:
def extract_response_after_assistant(response):
    """
    Return the text that follows the first 'assistant' marker in a response.

    Parameters:
    - response: The complete response from the model.

    Returns:
    - String: Everything after the first 'assistant' occurrence, stripped of
      surrounding whitespace; the whole stripped response when the marker is
      absent.
    """
    # partition() splits on the first occurrence only; an empty separator
    # slot means the marker was not present at all.
    _, marker, tail = response.partition('assistant')
    if not marker:
        return response.strip()
    return tail.strip()

In [3]:
def evaluate_model_response(model_response, question_data):
    """
    Compare the model's response with the correct answer from the question data.

    Parameters:
    - model_response: The response string generated by the model.
    - question_data: Dictionary containing the question, options, and the correct answer.

    Returns:
    - 1 if the response is correct; otherwise the option extracted from the
      model's response, which is None when no option could be extracted.
    """
    correct_option = extract_option(question_data['answer'])  # Extract correct option
    relevant_response = extract_response_after_assistant(model_response)  # Get relevant part of response
    model_option = extract_option(relevant_response)  # Extract model's option

    # A response without an option can never be correct. Without this guard an
    # unparsable answer key (None) would compare equal to a missing model
    # option (None) and be counted as a correct answer.
    if model_option is not None and model_option == correct_option:
        return 1
    return model_option


In [15]:
# Sample question record (question, options, answer key, explanation, category)
# used below to exercise evaluate_model_response.
question_data = {
    'question': 'Which physical channel informs the UE and the RN about the number of OFDM symbols used for the PDCCHs? [3GPP Release 17]',
    'option 1': 'PBCH',
    'option 2': 'PCFICH',
    'option 3': 'PDSCH',
    'option 4': 'PHICH',
    'answer': 'option 2: PCFICH',
    'explanation': 'The physical control format indicator channel (PCFICH) informs the UE and the RN about the number of OFDM symbols used for the PDCCHs.',
    'category': 'Standards specifications'
}

In [16]:
# Score the example response: prints 1 when it matches the answer key,
# otherwise the option extracted from the model's reply.
evaluation_result = evaluate_model_response(llama_3_2_response, question_data)
print(evaluation_result)

1


## Ask Llama 3.2 the 100 TeleQnA questions

In [17]:
def evaluate_questions(model, tokenizer, questions):
    """
    Ask the model every question (with its options) and collect the replies.

    Parameters:
    - model: The language model loaded for inference.
    - tokenizer: The tokenizer configured with `get_chat_template`.
    - questions: List of question dictionaries. Each one must provide:
        - 'question': The question text.
        - 'answer': The correct answer (e.g., 'option 2: PCFICH').
      The whole dictionary is forwarded to `ask_llama_3_2`.

    Returns:
    - List: One dictionary per question, in input order, containing:
        - 'question': The question text.
        - 'answer': The correct answer, copied from the input.
        - 'response': The string returned by `ask_llama_3_2` for that question.
    """
    total_questions = len(questions)
    collected = []

    for position, entry in enumerate(questions, start=1):
        generated = ask_llama_3_2(model, tokenizer, entry)
        record = {
            "question": entry['question'],
            "answer": entry['answer'],
            "response": generated,
        }
        collected.append(record)

        # Report progress after each answered question
        print(f"Responded {position} of {total_questions} questions...")

    return collected

In [None]:
# Switch the model to Unsloth's inference mode. The call changes `model` in
# place, so its return value is deliberately not assigned back: rebinding
# would replace the model with None on versions where nothing is returned.
FastLanguageModel.for_inference(model)
tokenizer = get_chat_template(tokenizer, chat_template="llama-3.1")

# Process all questions and get responses
llama_3_2_responses = evaluate_questions(model, tokenizer, rel17_100_questions)

Responded 1 of 100 questions...
Responded 2 of 100 questions...
Responded 3 of 100 questions...
Responded 4 of 100 questions...
Responded 5 of 100 questions...
Responded 6 of 100 questions...
Responded 7 of 100 questions...
Responded 8 of 100 questions...
Responded 9 of 100 questions...
Responded 10 of 100 questions...
Responded 11 of 100 questions...
Responded 12 of 100 questions...
Responded 13 of 100 questions...
Responded 14 of 100 questions...
Responded 15 of 100 questions...
Responded 16 of 100 questions...
Responded 17 of 100 questions...
Responded 18 of 100 questions...
Responded 19 of 100 questions...
Responded 20 of 100 questions...
Responded 21 of 100 questions...
Responded 22 of 100 questions...
Responded 23 of 100 questions...
Responded 24 of 100 questions...
Responded 25 of 100 questions...
Responded 26 of 100 questions...
Responded 27 of 100 questions...
Responded 28 of 100 questions...
Responded 29 of 100 questions...
Responded 30 of 100 questions...
Responded 31 of 100

In [28]:
# Inspect the raw text stored for the first question (prompt and reply together).
print(llama_3_2_responses[0]['response'])

system

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

user

Question: Which NGAP procedure is used for inter-system load balancing? [3GPP Release 17]
Options:
option 1: eNB Configuration Transfer
option 2: Downlink RAN Configuration Transfer
option 3: Uplink RAN Configuration Transfer
option 4: MME Configuration Transfer
Think step by step before answering and respond with the correct option in the format 'correct option: <X>'.assistant

correct option: option 2


## Save accuracy responses

In [39]:
def save_responses_to_json(responses, filename):
    """
    Save the model responses to a JSON file.

    The file is written as UTF-8 (matching how the notebook reads its JSON
    files) with non-ASCII characters kept readable, and the target directory
    is created when it does not exist yet.

    Parameters:
    - responses: List of responses to save (must be JSON-serializable).
    - filename: Path of the JSON file to write.
    """
    import os

    # Create the destination folder on first use instead of failing with
    # FileNotFoundError; a bare file name has no parent to create.
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(filename, "w", encoding="utf-8") as json_file:
        json.dump(responses, json_file, indent=4, ensure_ascii=False)

In [19]:
# save_responses_to_json(llama_3_2_responses, "../Models_responses/Accuracy/llama_3.2_responses.json")

## Evaluate responses from Llama 3.2

In [20]:
# Path to the JSON file with the previously saved Llama 3.2 responses (accuracy run)
llama_3_2_responses_path = r"../Models_responses/Accuracy/llama_3.2_responses.json"

# Load the saved responses instead of regenerating them with the model
with open(llama_3_2_responses_path, "r", encoding="utf-8") as file:
    llama_3_2_responses = json.load(file)
# Sanity check: number of responses loaded
print(len(llama_3_2_responses))

100


In [24]:
def evaluate_accuracy(responses_llama_3_2):
    """
    Evaluate the model's responses, print every miss and the overall accuracy.

    Each entry is scored with `evaluate_model_response`. For every entry that
    is not correct a report is printed; when no option could be extracted
    from the reply (a 'None' response) the full model response is shown
    instead of the extracted option.

    The saved responses do not carry the answer options, so they are looked
    up by position in the global `rel17_100_questions`; both lists are
    expected to be in the same order.

    Parameters:
    - responses_llama_3_2: List of dictionaries with 'question', 'answer'
      and 'response' entries.

    Returns:
    - None. Results are printed.
    """
    total_responses = len(responses_llama_3_2)
    if total_responses == 0:
        # Nothing to score; avoids dividing by zero in the accuracy below.
        print("No responses to evaluate.")
        return

    separator = "----------------------------------------------------------------------------------------"
    correct_count = 0  # Track the number of correct responses
    none_count = 0  # Track the number of 'None' responses

    for index, question_data in enumerate(responses_llama_3_2):
        evaluation_result = evaluate_model_response(question_data['response'], question_data)

        if evaluation_result == 1:
            correct_count += 1  # Increment for correct response
            continue

        # Options are only needed for the report; tolerate a response list
        # longer than the question list instead of raising IndexError.
        source_question = rel17_100_questions[index] if index < len(rel17_100_questions) else {}
        options = [f"{key}: {value}" for key, value in source_question.items() if 'option' in key]

        print("\nWrong Answer")
        print(f"Question {index + 1}: {question_data['question']}")
        print("Options:\n" + "\n".join(options) + "\n")
        print(f"Correct response: {question_data['answer']}")
        if evaluation_result is None:
            # No option could be extracted: show the whole reply for inspection
            print(f"Full model response:\n{question_data['response']}")
            none_count += 1  # Increment for None response
        else:
            print(f"Model response: {evaluation_result}")
        print(separator)

    # Calculate and print accuracy
    accuracy = correct_count / total_responses * 100
    print(f"\nAccuracy: {accuracy:.2f}%")
    print(f"Total 'None' responses: {none_count}")
    print("'None' responses means that the model did not give an option")


In [25]:
# Score the loaded responses and print the misses followed by the accuracy summary.
evaluate_accuracy(llama_3_2_responses)


Wrong Answer
Question 1: Which NGAP procedure is used for inter-system load balancing? [3GPP Release 17]
Options:
option 1: eNB Configuration Transfer
option 2: Downlink RAN Configuration Transfer
option 3: Uplink RAN Configuration Transfer
option 4: MME Configuration Transfer

Correct response: option 3: Uplink RAN Configuration Transfer
Model response: option 2
----------------------------------------------------------------------------------------

Wrong Answer
Question 2: What is covered by enhanced application layer support for V2X services? [3GPP Release 17]
Options:
option 1: PC5 radio resource control
option 2: Advanced V2X services
option 3: SDAP layer enhancements
option 4: V2X communication over NR PC5 reference point
option 5: Tele-Operated Driving

Correct response: option 2: Advanced V2X services
Model response: option 1
----------------------------------------------------------------------------------------

Wrong Answer
Question 3: What does the Load-Balancing steering

# RAGAS evaluation

## Create prompt with no option and Ask function for Llama 3.2 with no Fine-Tuning

In [4]:
from unsloth.chat_templates import get_chat_template

def ask_llama_3_2_no_options(model, tokenizer, question_data):
    """
    Ask the model an open question (no answer options) and return the decoded generation.

    Parameters:
    - model: The language model loaded for inference.
    - tokenizer: The tokenizer configured with `get_chat_template`.
    - question_data: Dictionary containing at least the 'question' text.
      Any option entries are ignored: only the question is sent to the model.

    Returns:
    - String: The decoded output sequence. It contains the rendered prompt
      followed by the model's reply, so callers must strip the prompt part
      themselves.
    """

    # Only the question text goes into the prompt
    question = question_data['question']

    # Create the prompt with the question only
    prompt = (
        f"Question: {question}\n"
        "Think step by step before answering and respond with a final answer in the format 'answer: <XXXXX>'."
    )

    # Put the inputs on whatever device the model lives on instead of
    # hard-coding "cuda"; fall back to "cuda" if the model exposes no device.
    target_device = getattr(model, "device", "cuda")

    # Create the input for the model
    messages = [{"role": "user", "content": prompt}]
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(target_device)

    # A single, unpadded prompt is encoded, so every position is a real token.
    # Passing the mask explicitly avoids the "attention mask is not set"
    # warning that appears because the pad token equals the eos token.
    attention_mask = torch.ones_like(inputs)

    # Generate the response
    outputs = model.generate(
        input_ids=inputs,
        attention_mask=attention_mask,
        max_new_tokens=2048,
        use_cache=True,
        temperature=1.5,
        min_p=0.1
    )

    # Decode and return the model's output
    response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    return response


2024-10-30 17:02:44.290688: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-30 17:02:44.303116: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-30 17:02:44.318755: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-30 17:02:44.323337: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-30 17:02:44.335539: I tensorflow/core/platform/cpu_feature_guar

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [8]:
# Example usage
# Switch the model to Unsloth's inference mode. The call changes `model` in
# place, so its return value is deliberately not assigned back: rebinding
# would replace the model with None on versions where nothing is returned.
FastLanguageModel.for_inference(model)
tokenizer = get_chat_template(tokenizer, chat_template="llama-3.1")

# Sample question record; only its 'question' text is sent to the model here.
question_data = {
    'question': 'Which physical channel informs the UE and the RN about the number of OFDM symbols used for the PDCCHs? [3GPP Release 17]',
    'option 1': 'PBCH',
    'option 2': 'PCFICH',
    'option 3': 'PDSCH',
    'option 4': 'PHICH',
    'answer': 'option 2: PCFICH',
    'explanation': 'The physical control format indicator channel (PCFICH) informs the UE and the RN about the number of OFDM symbols used for the PDCCHs.',
    'category': 'Standards specifications'
}

llama_3_2_response_text = ask_llama_3_2_no_options(model, tokenizer, question_data)
print(llama_3_2_response_text)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


system

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

user

Question: Which physical channel informs the UE and the RN about the number of OFDM symbols used for the PDCCHs? [3GPP Release 17]
Think step by step before answering and respond with a final answer in the format 'answer: <XXXXX>'.assistant

To determine which physical channel informs the UE and the RN about the number of OFDM symbols used for the PDCCHs, we should consider the PDCCH format and the way it interacts with the physical channels in the cell.

The Physical Downlink Control Channel (PDCCH) is a channel that carries control and feedback signals between the Evolved Node B (eNB) (the base station) and the Evolved UTRA (E-UTRA), which is now known as the User Equipment (UE). The PDCCH is a time-multiplexed channel, where different users share the same physical channel by assigning them different OFDM symbols, depending on the channel format used (e.g., 4k, 8k, 12.5k, or 16.5k). These channel formats de

In [5]:
def format_answer(answer):
    """
    Normalize an answer for comparison: drop ASCII punctuation and lowercase.

    Parameters:
    - answer: The answer text, or None when no answer could be extracted
      (`extract_answer` returns None in that case).

    Returns:
    - String: The normalized text; an empty string for None.
    """
    if answer is None:
        # A missing answer becomes an empty string rather than an AttributeError.
        return ""

    # Remove punctuation and convert to lowercase
    answer_no_punctuation = answer.translate(str.maketrans('', '', string.punctuation))
    return answer_no_punctuation.lower()

In [6]:
import re
import string

def extract_answer(text):
    """
    Pull the model's final answer out of a decoded response.

    The text after the first 'assistant' marker is taken as the reply. If the
    reply contains 'answer:', everything after its first occurrence is
    returned; otherwise the whole reply is returned. Both markers are matched
    case-insensitively.

    Parameters:
    - text: The complete decoded response.

    Returns:
    - String: The extracted answer (or the whole reply), stripped of
      surrounding whitespace; None when 'assistant' does not occur.
    """
    search_flags = re.IGNORECASE | re.DOTALL

    # Without the 'assistant' marker there is no reply to extract
    reply_match = re.search(r'assistant\s*(.*)', text, search_flags)
    if reply_match is None:
        return None
    reply = reply_match.group(1).strip()

    # Prefer what follows 'answer:'; fall back to the full reply
    tagged_match = re.search(r'answer:\s*(.*)', reply, search_flags)
    if tagged_match is None:
        return reply
    return tagged_match.group(1).strip()


In [11]:
# Extract the final answer from the example response generated earlier.
extracted_answer = extract_answer(llama_3_2_response_text)
print(extracted_answer)

PDCCH.


In [12]:
# Normalize the extracted answer (punctuation removed, lowercased).
model_response = format_answer(extracted_answer)
print(model_response)

pdcch


In [13]:
# The question's explanation, normalized the same way, serves as the reference text.
correct_answer = format_answer(question_data['explanation'])
print(correct_answer)

the physical control format indicator channel pcfich informs the ue and the rn about the number of ofdm symbols used for the pdcchs


## Model Groq for RAGAS evaluation

In [22]:
import getpass
import os

# Ask for the key only when it is not already set in the environment, so it
# never has to be written into the notebook. `getpass` must be imported here:
# without it this branch fails with NameError on a fresh kernel.
if "GROQ_API_KEY" not in os.environ:
    os.environ["GROQ_API_KEY"] = getpass.getpass("Enter your Groq API key: ")

In [23]:
from langchain_groq import ChatGroq

# Groq-hosted chat model that is passed as `llm` to the RAGAS `evaluate` calls below.
# NOTE(review): temperature=0.7 presumably makes the LLM-based scores vary between
# runs — confirm that this is intended for an evaluator model.
llm = ChatGroq(
    # model="llama-3.1-70b-versatile",
    # model="llama-3.2-90b-text-preview",
    model="llama3-70b-8192",
    # model="llama3-groq-70b-8192-tool-use-preview",
    temperature=0.7,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

In [26]:
# from langchain_ollama import ChatOllama

# llm = ChatOllama(
#     model = "llama3.1",
#     temperature = 0.8,
#     num_predict = 256,
#     # other params ...
# )

In [47]:
# Smoke test: send a trivial prompt to check that the configured chat model responds.
llm.invoke("Hello")

AIMessage(content='Hello. How can I assist you today?', additional_kwargs={}, response_metadata={'token_usage': {'completion_tokens': 10, 'prompt_tokens': 36, 'total_tokens': 46, 'completion_time': 0.04, 'prompt_time': 0.00976772, 'queue_time': 0.005032718, 'total_time': 0.04976772}, 'model_name': 'llama-3.2-90b-text-preview', 'system_fingerprint': 'fp_b3ae7e594e', 'finish_reason': 'stop', 'logprobs': None}, id='run-5ae4b547-d8cd-4ba5-973f-acb8dc10a34d-0', usage_metadata={'input_tokens': 36, 'output_tokens': 10, 'total_tokens': 46})

In [10]:
from langchain.embeddings import HuggingFaceEmbeddings
# from langchain_huggingface import HuggingFaceEmbeddings

# Sentence-embedding model passed as `embeddings` to the RAGAS `evaluate` calls below.
# NOTE(review): the commented-out import above suggests `langchain_huggingface` as an
# alternative source for this class — confirm which one the environment should use.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


## Test Evaluate metrics with RAGAS

In [88]:
from datasets import Dataset 
from ragas import evaluate

In [104]:
# One-row sample in the column layout used for evaluation:
# the question, the normalized model answer and the normalized reference text.
data_samples = {
    'user_input': [question_data['question']],
    'response': [model_response],
    'reference' : [correct_answer],
}

# Create dataset
dataset = Dataset.from_dict(data_samples)

### Using LLM to evaluate (Factual Correctness, Semantic similarity and Rubrics based criteria scoring)

In [105]:
from ragas.metrics._factual_correctness import FactualCorrectness
from ragas.metrics import SemanticSimilarity
from ragas.metrics import RubricsScoreWithReference

In [106]:
# Instantiate the three metrics of this section.
factualCorrectness = FactualCorrectness()
semantiSimilarity = SemanticSimilarity()
# Descriptions for scores 1-5, from "incorrect or irrelevant" up to "fully accurate".
rubrics = {
    "score1_description": "The response is incorrect, irrelevant, or does not align with the ground truth.",
    "score2_description": "The response partially matches the ground truth but includes significant errors, omissions, or irrelevant information.",
    "score3_description": "The response generally aligns with the ground truth but may lack detail, clarity, or have minor inaccuracies.",
    "score4_description": "The response is mostly accurate and aligns well with the ground truth, with only minor issues or missing details.",
    "score5_description": "The response is fully accurate, aligns completely with the ground truth, and is clear and detailed.",
}
rubricsScoreWithReference =  RubricsScoreWithReference(rubrics=rubrics)

In [107]:
# Score the one-row dataset with the three metrics, using the Groq chat model
# and the sentence-embedding model configured earlier.
score = evaluate(
    dataset,
    metrics=[
        factualCorrectness,
        semantiSimilarity,
        rubricsScoreWithReference,
    ],
    llm=llm,
    embeddings=embeddings
)
# Show the per-sample scores as a table
score.to_pandas()

Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,user_input,response,reference,factual_correctness,semantic_similarity,rubrics_score_with_reference
0,Which physical channel informs the UE and the ...,the physical channel used for this purpose is ...,the physical control format indicator channel ...,0.0,0.591012,1


### Metrics that do not need an LLM (BleuScore, RougeScore, ExactMatch and StringPresence)

In [108]:
from ragas.metrics import BleuScore, RougeScore, ExactMatch, StringPresence
import nltk
# Fetch the NLTK 'punkt' package (a no-op when it is already up to date).
# NOTE(review): presumably required for tokenization by one of the metrics above — confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/arimatea/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [109]:
# Instantiate the four metrics of this section with their default settings.
bleuScore = BleuScore()
rougeScore = RougeScore()
exactMatch = ExactMatch()
stringPresence = StringPresence()

In [110]:
# Score the one-row dataset with the four metrics of this section.
# NOTE(review): `llm` and `embeddings` are still passed although this section is
# introduced as not needing an LLM — presumably unnecessary; confirm against the RAGAS docs.
score = evaluate(
    dataset,
    metrics=[
        bleuScore,
        rougeScore,
        exactMatch,
        stringPresence
    ],
    llm=llm,
    embeddings=embeddings
)
# Show the per-sample scores as a table
score.to_pandas()

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

ERROR:ragas.executor:Exception raised in Job[0]: AssertionError(The number of hypotheses and their reference(s) should be the same )


Unnamed: 0,user_input,response,reference,bleu_score,rouge_score,exact_match,string_present
0,Which physical channel informs the UE and the ...,the physical channel used for this purpose is ...,the physical control format indicator channel ...,,0.212766,0.0,0.0


## Ask Llama 3.2 the 100 TeleQnA questions without options

In [32]:
def evaluate_questions_no_options(model, tokenizer, questions):
    """
    Ask the model every question without answer options and collect the replies.

    Parameters:
    - model: The language model loaded for inference.
    - tokenizer: The tokenizer configured with `get_chat_template`.
    - questions: List of question dictionaries. Each one must provide:
        - 'question': The question text.
        - 'explanation': The reference explanation for the question.
      The whole dictionary is forwarded to `ask_llama_3_2_no_options`.

    Returns:
    - List: One dictionary per question, in input order, containing:
        - 'question': The question text.
        - 'answer': The input's 'explanation' text, used as the reference answer.
        - 'response': The string returned by `ask_llama_3_2_no_options`.
    """
    total_questions = len(questions)
    collected = []

    for position, entry in enumerate(questions, start=1):
        generated = ask_llama_3_2_no_options(model, tokenizer, entry)
        record = {
            "question": entry['question'],
            "answer": entry['explanation'],
            "response": generated,
        }
        collected.append(record)

        # Report progress after each answered question
        print(f"Responded {position} of {total_questions} questions...")

    return collected

In [42]:
# Switch the model to Unsloth's inference mode. The call changes `model` in
# place, so its return value is deliberately not assigned back: rebinding
# would replace the model with None on versions where nothing is returned.
FastLanguageModel.for_inference(model)
tokenizer = get_chat_template(tokenizer, chat_template="llama-3.1")

# Process all questions and get responses
llama_3_2_responses_RAGAS = evaluate_questions_no_options(model, tokenizer, rel17_100_questions)

Responded 1 of 100 questions...
Responded 2 of 100 questions...
Responded 3 of 100 questions...
Responded 4 of 100 questions...
Responded 5 of 100 questions...
Responded 6 of 100 questions...
Responded 7 of 100 questions...
Responded 8 of 100 questions...
Responded 9 of 100 questions...
Responded 10 of 100 questions...
Responded 11 of 100 questions...
Responded 12 of 100 questions...
Responded 13 of 100 questions...
Responded 14 of 100 questions...
Responded 15 of 100 questions...
Responded 16 of 100 questions...
Responded 17 of 100 questions...
Responded 18 of 100 questions...
Responded 19 of 100 questions...
Responded 20 of 100 questions...
Responded 21 of 100 questions...
Responded 22 of 100 questions...
Responded 23 of 100 questions...
Responded 24 of 100 questions...
Responded 25 of 100 questions...
Responded 26 of 100 questions...
Responded 27 of 100 questions...
Responded 28 of 100 questions...
Responded 29 of 100 questions...
Responded 30 of 100 questions...
Responded 31 of 100

In [36]:
# Spot-check the first entry: question, extracted model answer, reference answer.
print(llama_3_2_responses_RAGAS[0]['question'])
print(extract_answer(llama_3_2_responses_RAGAS[0]['response']))
print(llama_3_2_responses_RAGAS[0]['answer'])

Which NGAP procedure is used for inter-system load balancing? [3GPP Release 17]
NGAP (Next-Generation Applicability Protocol) procedure 1-13-3
The NGAP procedure used for inter-system load balancing is Uplink RAN Configuration Transfer.


In [45]:
# save_responses_to_json(llama_3_2_responses_RAGAS,"../Models_responses/RAGAS/llama_3.2_responses_RAGAS.json")

## Build Dataset for Evaluation with RAGAS

In [12]:
import json

# Path to the JSON file with the previously saved Llama 3.2 responses (no-options run)
llama_3_2_responses_RAGAS_path = r"../Models_responses/RAGAS/llama_3.2_responses_RAGAS.json"

# Load the saved responses instead of regenerating them with the model
with open(llama_3_2_responses_RAGAS_path, "r", encoding="utf-8") as file:
    llama_3_2_responses_RAGAS = json.load(file)
# Sanity check: number of responses loaded
print(len(llama_3_2_responses_RAGAS))

100


In [13]:
from datasets import Dataset 

In [58]:
def transform_dataset(data):
    """
    Convert saved responses into the column-oriented dict used to build the dataset.

    Parameters:
    - data: Iterable of dictionaries with 'question', 'answer' and 'response'
      entries.

    Returns:
    - Dict with three equally long lists:
        - 'user_input': The question texts, unchanged.
        - 'response': The model answers, extracted with `extract_answer` and
          normalized with `format_answer`; an empty string when no answer
          could be extracted.
        - 'reference': The reference answers, normalized with `format_answer`.
    """
    transformed_data = {
        'user_input': [],
        'response': [],
        'reference': []
    }

    for item in data:
        question = item['question']

        # extract_answer returns None when the response has no 'assistant'
        # marker; keep the row with an empty answer instead of crashing, so
        # the three columns stay aligned.
        extracted = extract_answer(item['response'])
        model_response = format_answer(extracted) if extracted is not None else ""
        correct_answer = format_answer(item['answer'])

        transformed_data['user_input'].append(question)
        transformed_data['response'].append(model_response)
        transformed_data['reference'].append(correct_answer)

    return transformed_data

In [59]:
# Transform the llama_3_2_responses_RAGAS dataset
# data_samples = transform_dataset(llama_3_2_responses_RAGAS[:20])
# Convert all saved responses into the user_input / response / reference columns
data_samples = transform_dataset(llama_3_2_responses_RAGAS)

# Create the dataset object
dataset = Dataset.from_dict(data_samples)

# Print to verify the structure
print(dataset)

Dataset({
    features: ['user_input', 'response', 'reference'],
    num_rows: 100
})


In [60]:
# Display the first row to check the normalized response and reference.
dataset[0]

{'user_input': 'Which NGAP procedure is used for inter-system load balancing? [3GPP Release 17]',
 'response': 'ngap nextgeneration applicability protocol procedure 1133',
 'reference': 'the ngap procedure used for intersystem load balancing is uplink ran configuration transfer'}

## Evaluate Llama 3.2 with RAGAS Metrics

### Using LLM to evaluate (Factual Correctness, Semantic similarity and Rubrics based criteria scoring)

In [27]:
from ragas import evaluate
from ragas.run_config import RunConfig
from ragas.metrics._factual_correctness import FactualCorrectness
from ragas.metrics import SemanticSimilarity
from ragas.metrics import RubricsScoreWithReference

In [28]:
# Metric instances for the LLM-assisted evaluation. FactualCorrectness and
# SemanticSimilarity use their default settings.
factualCorrectness = FactualCorrectness()
# NOTE(review): `semantiSimilarity` is misspelled (missing "c"); renaming it also
# requires updating the evaluate() call that references this name.
semantiSimilarity = SemanticSimilarity()
# Scoring guide handed to the rubric metric: one description per score,
# from 1 (incorrect / irrelevant) up to 5 (fully accurate, clear and detailed).
rubrics = {
    "score1_description": "The response is incorrect, irrelevant, or does not align with the ground truth.",
    "score2_description": "The response partially matches the ground truth but includes significant errors, omissions, or irrelevant information.",
    "score3_description": "The response generally aligns with the ground truth but may lack detail, clarity, or have minor inaccuracies.",
    "score4_description": "The response is mostly accurate and aligns well with the ground truth, with only minor issues or missing details.",
    "score5_description": "The response is fully accurate, aligns completely with the ground truth, and is clear and detailed.",
}
rubricsScoreWithReference =  RubricsScoreWithReference(rubrics=rubrics)

In [30]:
# Run the three LLM-assisted metrics over `dataset`.
# `llm` and `embeddings` are not created in this cell; they are expected to be
# defined earlier in the notebook.
# NOTE(review): the recorded run of this cell logged TimeoutError for many jobs,
# and most factual_correctness values in the output are NaN. If that recurs,
# consider a longer timeout or fewer concurrent workers in RunConfig
# (confirm the available options in the ragas documentation).
score = evaluate(
    dataset,
    metrics=[
        factualCorrectness,
        semantiSimilarity,
        rubricsScoreWithReference,
    ],
    llm=llm,
    embeddings=embeddings,
    run_config = RunConfig(timeout=400, max_retries=20, max_wait=120,log_tenacity=False),
)
# Show the per-sample scores as a DataFrame.
score.to_pandas()

Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]

Exception raised in Job[32]: TimeoutError()
Exception raised in Job[12]: TimeoutError()
Exception raised in Job[36]: TimeoutError()
Exception raised in Job[39]: TimeoutError()
Exception raised in Job[11]: TimeoutError()
Exception raised in Job[45]: TimeoutError()
Exception raised in Job[51]: TimeoutError()
Exception raised in Job[6]: TimeoutError()
Exception raised in Job[54]: TimeoutError()
Exception raised in Job[42]: TimeoutError()
Exception raised in Job[35]: TimeoutError()
Exception raised in Job[15]: TimeoutError()
Exception raised in Job[21]: TimeoutError()
Exception raised in Job[48]: TimeoutError()
Exception raised in Job[24]: TimeoutError()
Exception raised in Job[30]: TimeoutError()
Exception raised in Job[18]: TimeoutError()
Exception raised in Job[33]: TimeoutError()
Exception raised in Job[9]: TimeoutError()
Exception raised in Job[57]: TimeoutError()
Exception raised in Job[3]: TimeoutError()
Exception raised in Job[27]: TimeoutError()


Unnamed: 0,user_input,response,reference,factual_correctness,semantic_similarity,rubrics_score_with_reference
0,Which NGAP procedure is used for inter-system ...,NGAP (Next-Generation Applicability Protocol) ...,The NGAP procedure used for inter-system load ...,0.0,0.578238,1.0
1,What is covered by enhanced application layer ...,"Data models, application layer protocols, and ...",Enhanced application layer support for V2X ser...,,0.44724,3.0
2,What does the Load-Balancing steering mode do?...,Load-Balancing steering mode dynamically direc...,The Load-Balancing steering mode splits the tr...,,0.67086,2.0
3,What is the main objective of intent driven ma...,"To answer this question, we need to understand...",The intent driven management aims to reduce th...,,0.79454,
4,What does MINT stand for? [3GPP Release 17],"To answer this question, we need to consider t...",MINT stands for Minimization of Service Interr...,,0.579898,2.0
5,What is the purpose of the Media Streaming AF ...,To enhance support for Multiple Subnet Masks i...,The work item relates to the support of generi...,,0.257958,1.0
6,What is the purpose of load-balancing steering...,To improve the efficiency and fairness of load...,"In Rel-17, load-balancing steering mode enhanc...",,0.648568,4.0
7,What is a capability added in the V2X Applicat...,Dynamic Light Information Reporting.,V2X service discovery across multiple V2X serv...,,0.096546,1.0
8,What is the purpose of the Edge Data Network (...,The Edge Data Network (EDN) in enabling edge a...,The Edge Data Network (EDN) hosts the Edge App...,,0.705318,2.0
9,What are the three features specified in TS 23...,"Cooperative Scheduling (Co-Sched), Dynamic Res...",The three features specified in TS 23.304 for ...,,0.316375,1.0


In [35]:
# Show the aggregate score per metric held by the evaluation result.
score

{'factual_correctness': 0.0000, 'semantic_similarity': 0.5535, 'rubrics_score_with_reference': 2.2941}

In [36]:
# Convert the LLM-assisted evaluation result to a DataFrame and save it as CSV
# (index=False keeps the row index out of the file).
llama_3_2_evaluation_RAGAS_LLM = score.to_pandas()
llama_3_2_evaluation_RAGAS_LLM.to_csv("../Evaluations/RAGAS/llama_3_2_evaluation_RAGAS_LLM_rubricsScoreWithReference.csv", index=False)

In [39]:
import pandas as pd

# Reload the saved LLM-assisted evaluation from disk.
# The next cell reads `result`; with this line commented out the notebook only
# worked when `result` was left over from an earlier kernel session, so a
# fresh "Restart & Run All" failed (or silently showed a different table).
result = pd.read_csv("../Evaluations/RAGAS/llama_3_2_evaluation_RAGAS_LLM_rubricsScoreWithReference.csv")

In [40]:
# Preview up to the first 20 rows of the evaluation table.
result.head(20)

Unnamed: 0,user_input,response,reference,factual_correctness,semantic_similarity,rubrics_score_with_reference
0,Which NGAP procedure is used for inter-system ...,NGAP (Next-Generation Applicability Protocol) ...,The NGAP procedure used for inter-system load ...,0.0,0.578238,1.0
1,What is covered by enhanced application layer ...,"Data models, application layer protocols, and ...",Enhanced application layer support for V2X ser...,,0.44724,3.0
2,What does the Load-Balancing steering mode do?...,Load-Balancing steering mode dynamically direc...,The Load-Balancing steering mode splits the tr...,,0.67086,2.0
3,What is the main objective of intent driven ma...,"To answer this question, we need to understand...",The intent driven management aims to reduce th...,,0.79454,
4,What does MINT stand for? [3GPP Release 17],"To answer this question, we need to consider t...",MINT stands for Minimization of Service Interr...,,0.579898,2.0
5,What is the purpose of the Media Streaming AF ...,To enhance support for Multiple Subnet Masks i...,The work item relates to the support of generi...,,0.257958,1.0
6,What is the purpose of load-balancing steering...,To improve the efficiency and fairness of load...,"In Rel-17, load-balancing steering mode enhanc...",,0.648568,4.0
7,What is a capability added in the V2X Applicat...,Dynamic Light Information Reporting.,V2X service discovery across multiple V2X serv...,,0.096546,1.0
8,What is the purpose of the Edge Data Network (...,The Edge Data Network (EDN) in enabling edge a...,The Edge Data Network (EDN) hosts the Edge App...,,0.705318,2.0
9,What are the three features specified in TS 23...,"Cooperative Scheduling (Co-Sched), Dynamic Res...",The three features specified in TS 23.304 for ...,,0.316375,1.0


### Evaluating without an LLM (BleuScore, RougeScore, ExactMatch and StringPresence)

In [61]:
from ragas.metrics import BleuScore, RougeScore, ExactMatch, StringPresence
import nltk
# Make sure the NLTK 'punkt' package is available locally
# (presumably required by the text-overlap metrics used below; TODO confirm).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/arimatea/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [62]:
# Instantiate the four string-comparison metrics with their default settings.
bleuScore = BleuScore()
rougeScore = RougeScore()
exactMatch = ExactMatch()
stringPresence = StringPresence()

In [63]:
# Run the four string-comparison metrics over `dataset`.
# NOTE: this rebinds `score`, replacing the LLM-assisted result from the earlier section.
# NOTE(review): `llm` and `embeddings` are still passed although this section is
# meant to need no LLM; they are presumably unused by these metrics. Confirm
# before removing them.
# NOTE(review): the recorded run logged "AssertionError(Expecting a float)" for
# several jobs, and some bleu_score values in the output are NaN.
score = evaluate(
    dataset,
    metrics=[
        bleuScore,
        rougeScore,
        exactMatch,
        stringPresence
    ],
    llm=llm,
    embeddings=embeddings
)
# Show the per-sample scores as a DataFrame.
score.to_pandas()

Evaluating:   0%|          | 0/400 [00:00<?, ?it/s]

ERROR:ragas.executor:Exception raised in Job[316]: AssertionError(Expecting a float)
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
ERROR:ragas.executor:Exception raised in Job[236]: AssertionError(Expecting a float)
ERROR:ragas.executor:Exception raised in Job[200]: AssertionError(Expecting a float)
ERROR:ragas.executor:Exception raised in Job[232]: AssertionError(The 

Unnamed: 0,user_input,response,reference,bleu_score,rouge_score,exact_match,string_present
0,Which NGAP procedure is used for inter-system ...,ngap nextgeneration applicability protocol pro...,the ngap procedure used for intersystem load b...,4.310733e-232,0.210526,0.0,0.0
1,What is covered by enhanced application layer ...,data models application layer protocols and se...,enhanced application layer support for v2x ser...,2.687604e-156,0.157895,0.0,0.0
2,What does the Load-Balancing steering mode do?...,loadbalancing steering mode dynamically direct...,the loadbalancing steering mode splits the tra...,2.491127e-78,0.326531,0.0,0.0
3,What is the main objective of intent driven ma...,to answer this question we need to understand ...,the intent driven management aims to reduce th...,,0.129032,0.0,0.0
4,What does MINT stand for? [3GPP Release 17],to answer this question we need to consider th...,mint stands for minimization of service interr...,,0.040000,0.0,0.0
...,...,...,...,...,...,...,...
95,Which RRC state is the UE in when no RRC conne...,0,when no rrc connection is established the ue i...,,0.000000,0.0,0.0
96,How are the antenna elements placed on each an...,the 3gpp release 17 doesnt explicitly state ho...,the document states that the antenna elements ...,,0.129870,0.0,0.0
97,What information may be provided to an emergen...,the lcs lawful compliance and security client ...,for emergency services the geographic location...,1.195116e-231,0.125000,0.0,0.0
98,What is the purpose of cross-network slice coo...,interoperability efficient network resource al...,crossnetwork slice coordination enables the co...,6.260628e-232,0.052632,0.0,0.0


In [64]:
# Show the aggregate score per metric for the non-LLM evaluation.
score

{'bleu_score': 0.0560, 'rouge_score': 0.1735, 'exact_match': 0.0000, 'string_present': 0.0000}

In [65]:
# Convert the non-LLM evaluation result to a DataFrame and save it as CSV
# (index=False keeps the row index out of the file).
llama_3_2_evaluation_RAGAS_no_LLM = score.to_pandas()
llama_3_2_evaluation_RAGAS_no_LLM.to_csv("../Evaluations/RAGAS/llama_3_2_evaluation_RAGAS_no_LLM.csv", index=False)

In [66]:
import pandas as pd
# Reload the saved non-LLM evaluation from disk.
# NOTE: this rebinds `result`, which an earlier cell uses for the LLM-assisted table.
result = pd.read_csv("../Evaluations/RAGAS/llama_3_2_evaluation_RAGAS_no_LLM.csv")

In [67]:
# Display the reloaded non-LLM evaluation table.
result

Unnamed: 0,user_input,response,reference,bleu_score,rouge_score,exact_match,string_present
0,Which NGAP procedure is used for inter-system ...,ngap nextgeneration applicability protocol pro...,the ngap procedure used for intersystem load b...,4.310733e-232,0.210526,0.0,0.0
1,What is covered by enhanced application layer ...,data models application layer protocols and se...,enhanced application layer support for v2x ser...,2.687604e-156,0.157895,0.0,0.0
2,What does the Load-Balancing steering mode do?...,loadbalancing steering mode dynamically direct...,the loadbalancing steering mode splits the tra...,2.491127e-78,0.326531,0.0,0.0
3,What is the main objective of intent driven ma...,to answer this question we need to understand ...,the intent driven management aims to reduce th...,,0.129032,0.0,0.0
4,What does MINT stand for? [3GPP Release 17],to answer this question we need to consider th...,mint stands for minimization of service interr...,,0.040000,0.0,0.0
...,...,...,...,...,...,...,...
95,Which RRC state is the UE in when no RRC conne...,0,when no rrc connection is established the ue i...,,0.000000,0.0,0.0
96,How are the antenna elements placed on each an...,the 3gpp release 17 doesnt explicitly state ho...,the document states that the antenna elements ...,,0.129870,0.0,0.0
97,What information may be provided to an emergen...,the lcs lawful compliance and security client ...,for emergency services the geographic location...,1.195116e-231,0.125000,0.0,0.0
98,What is the purpose of cross-network slice coo...,interoperability efficient network resource al...,crossnetwork slice coordination enables the co...,6.260628e-232,0.052632,0.0,0.0
