# Choose Model Llama 3.2 with Fine-Tuning and Lora Layers

In [1]:
import torch
print("CUDA Available: ", torch.cuda.is_available())
print("CUDA Device Name: ", torch.cuda.get_device_name(0))
torch.cuda.empty_cache()

# Verificar se CUDA está disponível para acelerar o processamento
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Usando dispositivo: {device}")

CUDA Available:  True
CUDA Device Name:  NVIDIA GeForce RTX 3050 Ti Laptop GPU
Usando dispositivo: cuda


## Llama 3.2 lora (Fine-Tunned)

In [1]:
from unsloth import FastLanguageModel
import torch

2024-12-23 15:38:32.760340: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-23 15:38:32.929361: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-23 15:38:32.991200: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-23 15:38:33.009341: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-23 15:38:33.125242: I tensorflow/core/platform/cpu_feature_guar

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [2]:
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

In [3]:
model_path = "../../Models/llama_3.2_FT_lora_4000_questions_short_answer_labels"
# model_path = "model_3.2_lora_4bits"

# Carregar o modelo e o tokenizador separadamente
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_path,
    max_seq_length=max_seq_length,
    dtype = dtype,
    load_in_4bit=load_in_4bit
)

==((====))==  Unsloth 2024.10.6: Fast Llama patching. Transformers = 4.46.0.
   \\   /|    GPU: NVIDIA GeForce RTX 3050 Ti Laptop GPU. Max memory: 3.712 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.0+cu124. CUDA = 8.6. CUDA Toolkit = 12.4.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.27.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Unsloth 2024.10.6 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [4]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 3072)
        (layers): ModuleList(
          (0-27): 28 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3072, out_features=3072, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=3072, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear4bit(
      

# Dataset TeleQnA for Inference

In [5]:
import json

# Path to the TeleQnA processed question in JSON file
rel17_200_questions_path = r"../../Files/rel17_200_questions.json"

# Load the TeleQnA data just release 17
with open(rel17_200_questions_path, "r", encoding="utf-8") as file:
    rel17_200_questions = json.load(file)
print(len(rel17_200_questions))

200


In [26]:
rel17_200_questions[0]

{'question': 'Which NGAP procedure is used for inter-system load balancing? [3GPP Release 17]',
 'option 1': 'eNB Configuration Transfer',
 'option 2': 'Downlink RAN Configuration Transfer',
 'option 3': 'Uplink RAN Configuration Transfer',
 'option 4': 'MME Configuration Transfer',
 'answer': 'option 3: Uplink RAN Configuration Transfer',
 'explanation': 'The NGAP procedure used for inter-system load balancing is Uplink RAN Configuration Transfer.',
 'category': 'Standards overview'}

# Import RAG Functions

In [5]:
import sys
import os

project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(project_root)

In [6]:
from utils.rag_functions import load_faiss_index, search_faiss_index, search_RAG, load_chunks

In [7]:
# Test RAG
query_text = "reception of a transparent l3 message in unacknowledged mode"
index_file_path = "../../Files/faiss_index.bin"
chunks_path = "../../Files/tspec_chunks_markdown.pkl"
top_k = 5

In [19]:
result = search_RAG(query_text, index_file_path, chunks_path, top_k)
print(result)


Information 1:
BSC.  
Collision cases are treated as specified in 3GPPTS44.006.  
If BTS has repeated the DISC frame N200 times, BTS sends a RELease
INDication and an ERRor INDication message to BSC (cf. 3GPPTS44.006).  
![](media/image7.png){width="3.65625in" height="1.2083333333333333in"}  
3.5 Transmission of a transparent L3-Message in acknowledged mode
-----------------------------------------------------------------  
This procedure is used by BSC to request the sending of a L3 message to
MS in acknowledged mode.  
BSC sends a DATA REQuest message to BTS. The message contains the
complete L3 message to be sent in acknowledged mode.  
![](media/image8.png){width="3.6979166666666665in" height="1.0625in"}  
3.6 Reception of a transparent L3-Message in acknowledged mode
--------------------------------------------------------------  
This procedure is used by BTS to indicate the reception of a L3 message
in acknowledged mode.  
BTS sends a DATA INDication message to BSC. The message 

# Accuracy evaluation

## Create prompt and Ask function for Llama 3.2 lora (With Fine-Tuning)

In [7]:
import torch
from unsloth.chat_templates import get_chat_template

def ask_llama_3_2_lora_RAG(model, tokenizer, question_data, top_k=5, index_file_path="../../Files/faiss_index.bin", chunks_path="../../Files/tspec_chunks_markdown.pkl"):
    """
    Function to generate an answer using the model based on the given question and options, 
    including relevant information from a RAG search and prompting a chain of thought.
    
    Parameters:
    - model: The language model loaded for inference.
    - tokenizer: The tokenizer configured with `get_chat_template`.
    - question_data: Dictionary containing the question and options.
    - top_k: Number of relevant chunks to retrieve from the search.
    - index_file_path: Path to the FAISS index file.
    - chunks_path: Path to the chunks file.

    Returns:
    - String: Model's generated response.
    """

    # Extract question and options
    question = question_data['question']
    options = [f"{key}: {value}" for key, value in question_data.items() if 'option' in key]
    
    question_search = (
        f"{question}\n" +
        " ".join(options) + " "
    )

    # Perform RAG search using the question to retrieve relevant information
    rag_results = search_RAG(question_search, index_file_path=index_file_path, chunks_path=chunks_path, top_k=top_k)

    # Create the prompt with Chain of Thought (CoT) instructions
    prompt = (
        f"Relevant Information:\n{rag_results}\n"
        f"Question: {question}\n"
        f"Options:\n" + "\n".join(options) + "\n"
        # "Think step by step and analyze the relevant information carefully, then choose the correct option.\n"
        "Think step by step and choose the correct option.\n"
        "You must respond in the format 'correct option: <X>', where <X> is the correct letter for the option."
    )

    # Create the input for the model
    messages = [{"role": "user", "content": prompt}]
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to("cuda")

    # Generate the response
    outputs = model.generate(
        input_ids=inputs,
        max_new_tokens=2048,
        temperature=0.7,
        min_p=0.9,
        use_cache=True
    )

    # Clear memory
    del inputs
    torch.cuda.empty_cache()

    # Decode and return the model's output
    response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    return response


In [9]:
# Example usage
model = FastLanguageModel.for_inference(model)  # Enable faster inference
tokenizer = get_chat_template(tokenizer, chat_template="llama-3.1")

question_data = {
    'question': 'Which physical channel informs the UE and the RN about the number of OFDM symbols used for the PDCCHs? [3GPP Release 17]',
    'option 1': 'PBCH',
    'option 2': 'PCFICH',
    'option 3': 'PDSCH',
    'option 4': 'PHICH',
    'answer': 'option 2: PCFICH',
    'explanation': 'The physical control format indicator channel (PCFICH) informs the UE and the RN about the number of OFDM symbols used for the PDCCHs.',
    'category': 'Standards specifications'
}

llama_3_2_lora_response = ask_llama_3_2_lora_RAG(model, tokenizer, question_data)
print(llama_3_2_lora_response)

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x71602fbd5390>>
Traceback (most recent call last):
  File "/home/arimatea/.local/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


system

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

user

Relevant Information:
Information 1:
of OFDM symbols of the PUSCH, including all OFDM symbols used for DMRS;  
\- for any OFDM symbol that carries DMRS of the PUSCH,
$M_{\text{sc}}^{\text{UCI}}\left( l \right) = 0$;  
\- for any OFDM symbol that does not carry DMRS of the PUSCH,
$M_{\text{sc}}^{\text{UCI}}\left( l \right) = M_{\text{sc}}^{\text{PUSCH}} - \ M_{\text{sc}}^{PT - RS}\left( l \right)$;  
\- $\alpha$ is configured by higher layer parameter *scaling*;  
\- $l_{0}$ is the symbol index of the first OFDM symbol that does not
carry DMRS of the PUSCH, after the first DMRS symbol(s), in the PUSCH
transmission.  
For CG-UCI transmission on PUSCH with UL-SCH, and if
*numberOfSlotsTBoMS* is present in the resource allocation table and the
value of *numberOfSlotsTBoMS* in the row indicated by the Time domain
resource assignment field in DCI is larger than 1, the number of coded
modulation symbols per layer fo

## Evaluate Question 

In [8]:
import re

def extract_option(answer):
    """
    Extract the option part from the answer string, removing all punctuation and converting to lowercase.
    
    Parameters:
    - answer: A string containing the answer in the format 'option X: ...'.

    Returns:
    - String: Extracted option (e.g., 'option 2'), or None if no match is found.
    """
    # Remove all punctuation and convert to lowercase
    cleaned_answer = re.sub(r'[^\w\s]', '', answer.lower())
    # Find all matches for the format "option X"
    matches = re.findall(r'option \d+', cleaned_answer)
    # Return the last match with stripped whitespace if any found, otherwise None
    return matches[-1].strip() if matches else None

In [9]:
def extract_response_after_assistant(response):
    """
    Extract the part of the response that comes after the 'assistant' marker.

    Parameters:
    - response: The complete response from the model.

    Returns:
    - String: The extracted relevant part of the response.
    """
    # Split the response based on the 'assistant' marker
    parts = response.split('assistant', 1)
    # Return the part after 'assistant' or the entire response if 'assistant' is not found
    return parts[1].strip() if len(parts) > 1 else response.strip()

In [10]:
def evaluate_model_response(model_response, question_data):
    """
    Compare the model's response with the correct answer from the question data.
    
    Parameters:
    - model_response: The response string generated by the model.
    - question_data: Dictionary containing the question, options, and the correct answer.

    Returns:
    - 1 if the response is correct, otherwise the extracted model option.
    """
    correct_option = extract_option(question_data['answer'])  # Extract correct option
    relevant_response = extract_response_after_assistant(model_response)  # Get relevant part of response
    model_option = extract_option(relevant_response)  # Extract model's option

    return 1 if model_option == correct_option else model_option  # Return 1 if correct, else model's option


In [31]:
question_data = {
    'question': 'Which physical channel informs the UE and the RN about the number of OFDM symbols used for the PDCCHs? [3GPP Release 17]',
    'option 1': 'PBCH',
    'option 2': 'PCFICH',
    'option 3': 'PDSCH',
    'option 4': 'PHICH',
    'answer': 'option 2: PCFICH',
    'explanation': 'The physical control format indicator channel (PCFICH) informs the UE and the RN about the number of OFDM symbols used for the PDCCHs.',
    'category': 'Standards specifications'
}

In [None]:
evaluation_result = evaluate_model_response(llama_3_2_lora_response, question_data)
print(evaluation_result)

## Ask to model Llama 3.2 lora TeleQnA questions 

### Release 17 200 questions

In [11]:
def evaluate_questions(model, tokenizer, questions):
    """
    Process all questions and return the model responses.
    
    Parameters:
    - model: The language model loaded for inference.
    - tokenizer: The tokenizer configured with `get_chat_template`.
    - questions: List of dictionaries containing question data, where each dictionary has:
        - 'question': A string representing the question to be asked to the model.
        - 'answer': A string representing the correct answer format (e.g., 'option 2: PCFICH').
        - 'response': A string that will contain the model's generated response to the question.
    
    Returns:
    - List: A list of dictionaries where each dictionary contains:
        - 'question': The question as a string.
        - 'answer': The correct answer as a string.
        - 'response': The model's generated response for that question.
    """
    
    responses = []
    total_questions = len(questions)
    
    for idx, question_data in enumerate(questions):
        response = ask_llama_3_2_lora_RAG(model, tokenizer, question_data)
        responses.append({
            "question": question_data['question'],
            "answer": question_data['answer'],
            "response": response
        })
        
        # Print progress
        print(f"Responded {idx + 1} of {total_questions} questions...")

    return responses

In [14]:
model = FastLanguageModel.for_inference(model)  # Enable faster inference
tokenizer = get_chat_template(tokenizer, chat_template="llama-3.1")

# Process all questions and get responses
responses_llama_3_2_lora = evaluate_questions(model, tokenizer, rel17_200_questions)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Responded 1 of 200 questions...
Responded 2 of 200 questions...
Responded 3 of 200 questions...
Responded 4 of 200 questions...
Responded 5 of 200 questions...
Responded 6 of 200 questions...
Responded 7 of 200 questions...
Responded 8 of 200 questions...
Responded 9 of 200 questions...
Responded 10 of 200 questions...
Responded 11 of 200 questions...
Responded 12 of 200 questions...
Responded 13 of 200 questions...
Responded 14 of 200 questions...
Responded 15 of 200 questions...
Responded 16 of 200 questions...
Responded 17 of 200 questions...
Responded 18 of 200 questions...
Responded 19 of 200 questions...
Responded 20 of 200 questions...
Responded 21 of 200 questions...
Responded 22 of 200 questions...
Responded 23 of 200 questions...
Responded 24 of 200 questions...
Responded 25 of 200 questions...
Responded 26 of 200 questions...
Responded 27 of 200 questions...
Responded 28 of 200 questions...
Responded 29 of 200 questions...
Responded 30 of 200 questions...
Responded 31 of 200

In [15]:
print(responses_llama_3_2_lora[1]['response'])

system

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

user

Relevant Information:
Information 1:
| Key issue 15   | Solution \#18: | 7.18.2         | \-             |
| -- Supporting  | Support for HD |                |                |
| dynamic        | map dynamic    |                |                |
| information    | information    |                |                |
| for HD maps    |                |                |                |
+----------------+----------------+----------------+----------------+
| Key issue 16   | Solution \#14: | 7.14.2         | \-             |
| --             | Support for    |                |                |
| Enhancements   | enhancements   |                |                |
| to V2X group   | to V2X group   |                |                |
| management and | management and |                |                |
| group          | group          |                |                |
| communication  | communication  |          

#### Save accuracy responses

In [12]:
def save_responses_to_json(responses, filename):
    """
    Save the model responses to a JSON file.
    
    Parameters:
    - responses: List of responses to save.
    - filename: Name of the JSON file.
    """
    
    with open(filename, "w") as json_file:
        json.dump(responses, json_file, indent=4)

In [18]:
# save_responses_to_json(responses_llama_3_2_lora,"../../Models_responses/Accuracy_larger_test/llama_3_2_lora_release_17_200_questions.json")

#### Evaluate responses from Llama 3.2 lora

In [22]:
import json

# Load responses from the JSON file
with open("../../Models_responses/Accuracy_larger_test/llama_3_2_lora_release_17_200_questions.json", "r") as file:
    responses_llama_3_2_lora = json.load(file)

# Print the loaded responses to verify
print("Responses loaded")
# for response in responses_llama_3_2_lora:
#     print(response)


Responses loaded


In [13]:
def evaluate_accuracy(responses_llama_3_2_lora, rel_questions):
    """
    Evaluate the model's responses and calculate accuracy.
    """
    correct_count = 0  # Track the number of correct responses
    none_count = 0  # Track the number of 'None' responses

    for index, question_data in enumerate(responses_llama_3_2_lora):
        evaluation_result = evaluate_model_response(question_data['response'], question_data)
        options = [f"{key}: {value}" for key, value in rel_questions[index].items() if 'option' in key]

        if evaluation_result == 1:
            correct_count += 1  # Increment for correct response
        elif evaluation_result is None:
            # Print only responses that are None
            print("\nWrong Answer")
            print(f"Question {index + 1}: {question_data['question']}")
            print(f"Options:\n" + "\n".join(options) + "\n")
            print(f"Full model response:\n{question_data['response']}")
            print(f"Correct response: {question_data['answer']}")
            print("----------------------------------------------------------------------------------------")
            none_count += 1  # Increment for None response
        else:
            print("\nWrong Answer")
            print(f"Question {index + 1}: {question_data['question']}")
            print(f"Options:\n" + "\n".join(options) + "\n")
            print(f"Model response: {evaluation_result}")
            print(f"Correct response: {question_data['answer']}")
            print("----------------------------------------------------------------------------------------")

    # Calculate and print accuracy
    accuracy = correct_count / len(responses_llama_3_2_lora) * 100
    print(f"\nAccuracy: {accuracy:.2f}%")
    print(f"Total 'None' responses: {none_count}")
    print(f"'None' responses means that the model did not give an option")


In [21]:
evaluate_accuracy(responses_llama_3_2_lora, rel17_questions)


Wrong Answer
Question 5: What does MINT stand for? [3GPP Release 17]
Options:
option 1: Maximum Internet Network Transmission
option 2: Minimization of Interruption Network Technology
option 3: Maximization of Internet Network Traffic
option 4: Minimization of Service Interruption
option 5: Maximum Internet Network Technology

Model response: option 2
Correct response: option 4: Minimization of Service Interruption
----------------------------------------------------------------------------------------

Wrong Answer
Question 8: What is a capability added in the V2X Application Enabler (VAE) layer? [3GPP Release 17]
Options:
option 1: V2X UE group management
option 2: V2X service discovery
option 3: Session-oriented services
option 4: Network monitoring by the V2X UE
option 5: Support for PC5 provisioning

Model response: option 5
Correct response: option 2: V2X service discovery
----------------------------------------------------------------------------------------

Wrong Answer
Ques

### Release 18 200 questions

In [15]:
import json

# Path to the TeleQnA processed question in JSON file
rel18_questions_path = r"../../Files/rel18_questions.json"

# Load the TeleQnA data just release 17
with open(rel18_questions_path, "r", encoding="utf-8") as file:
    rel18_questions = json.load(file)
print(len(rel18_questions))

780


In [16]:
rel18_test_size = 200
rel18_questions = rel18_questions[:rel18_test_size]
print(len(rel18_questions))

200


In [17]:
model = FastLanguageModel.for_inference(model)  # Enable faster inference
tokenizer = get_chat_template(tokenizer, chat_template="llama-3.1")

# Process all questions and get responses
responses_llama_3_2_lora = evaluate_questions(model, tokenizer, rel18_questions)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Responded 1 of 200 questions...
Responded 2 of 200 questions...
Responded 3 of 200 questions...
Responded 4 of 200 questions...
Responded 5 of 200 questions...
Responded 6 of 200 questions...
Responded 7 of 200 questions...
Responded 8 of 200 questions...
Responded 9 of 200 questions...
Responded 10 of 200 questions...
Responded 11 of 200 questions...
Responded 12 of 200 questions...
Responded 13 of 200 questions...
Responded 14 of 200 questions...
Responded 15 of 200 questions...
Responded 16 of 200 questions...
Responded 17 of 200 questions...
Responded 18 of 200 questions...
Responded 19 of 200 questions...
Responded 20 of 200 questions...
Responded 21 of 200 questions...
Responded 22 of 200 questions...
Responded 23 of 200 questions...
Responded 24 of 200 questions...
Responded 25 of 200 questions...
Responded 26 of 200 questions...
Responded 27 of 200 questions...
Responded 28 of 200 questions...
Responded 29 of 200 questions...
Responded 30 of 200 questions...
Responded 31 of 200

#### Save accuracy responses

In [18]:
# save_responses_to_json(responses_llama_3_2_lora,"../../Models_responses/Accuracy_larger_test/llama_3_2_lora_release_18_200_questions.json")

#### Evaluate responses from Llama 3.2 lora

In [24]:
import json

# Load responses from the JSON file
with open("../../Models_responses/Accuracy_larger_test/llama_3_2_lora_release_18_200_questions.json", "r") as file:
    responses_llama_3_2_lora = json.load(file)

# Print the loaded responses to verify
print("Responses loaded")
# for response in responses_llama_3_2_lora:
#     print(response)


Responses loaded


In [25]:
evaluate_accuracy(responses_llama_3_2_lora, rel18_questions)


Wrong Answer
Question 8: What is the purpose of the TRP Measurement Grid? [3GPP Release 18]
Options:
option 1: To calculate the CDF of the EIRP/EIS distribution in 3D
option 2: To determine the total power radiated by the DUT in the TX beam peak direction
option 3: To determine the TX and RX beam peak direction
option 4: To perform TRP measurements taken on the sampling grid
option 5: To perform 3D Throughput/RSRP/EIS scans for RX beam peak direction

Model response: option 2
Correct response: option 4: To perform TRP measurements taken on the sampling grid
----------------------------------------------------------------------------------------

Wrong Answer
Question 11: When can the V-SMF delete the DNS context from the selected V-EASDF? [3GPP Release 18]
Options:
option 1: When the request for HR-SBO is not authorized
option 2: When the V-SMF selects a new V-EASDF
option 3: When the UE initiates a Mobility Registration Update procedure
option 4: When the AF triggers EAS rediscovery


### Other Releases 200 questions

In [14]:
import json

# Path to the TeleQnA processed question in JSON file
other_rel_questions_path = r"../../Files/other_rel_questions.json"

# Load the TeleQnA data just release 17
with open(other_rel_questions_path, "r", encoding="utf-8") as file:
    other_rel_questions = json.load(file)
print(len(other_rel_questions))

4987


In [15]:
other_rel_test_size = 200
other_rel_questions = other_rel_questions[:other_rel_test_size]
print(len(other_rel_questions))

200


In [16]:
model = FastLanguageModel.for_inference(model)  # Enable faster inference
tokenizer = get_chat_template(tokenizer, chat_template="llama-3.1")

# Process all questions and get responses
responses_llama_3_2_lora = evaluate_questions(model, tokenizer, other_rel_questions)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Responded 1 of 200 questions...
Responded 2 of 200 questions...
Responded 3 of 200 questions...
Responded 4 of 200 questions...
Responded 5 of 200 questions...
Responded 6 of 200 questions...
Responded 7 of 200 questions...
Responded 8 of 200 questions...
Responded 9 of 200 questions...
Responded 10 of 200 questions...
Responded 11 of 200 questions...
Responded 12 of 200 questions...
Responded 13 of 200 questions...
Responded 14 of 200 questions...
Responded 15 of 200 questions...
Responded 16 of 200 questions...
Responded 17 of 200 questions...
Responded 18 of 200 questions...
Responded 19 of 200 questions...
Responded 20 of 200 questions...
Responded 21 of 200 questions...
Responded 22 of 200 questions...
Responded 23 of 200 questions...
Responded 24 of 200 questions...
Responded 25 of 200 questions...
Responded 26 of 200 questions...
Responded 27 of 200 questions...
Responded 28 of 200 questions...
Responded 29 of 200 questions...
Responded 30 of 200 questions...
Responded 31 of 200

#### Save accuracy responses

In [17]:
# save_responses_to_json(responses_llama_3_2_lora,"../../Models_responses/Accuracy_larger_test/llama_3_2_lora_other_rel_200_questions.json")

#### Evaluate responses from Llama 3.2 lora

In [21]:
import json

# Load responses from the JSON file
with open("../../Models_responses/Accuracy_larger_test/llama_3_2_lora_other_rel_200_questions.json", "r") as file:
    responses_llama_3_2_lora = json.load(file)

# Print the loaded responses to verify
print("Responses loaded")
# for response in responses_llama_3_2_lora:
#     print(response)


Responses loaded


In [22]:
evaluate_accuracy(responses_llama_3_2_lora, other_rel_questions)


Wrong Answer
Question 9: What is the relation between Energy efficiency (EE) and Spectral efficiency (SE) when circuit power consumption is ignored?
Options:
option 1: EE increases monotonically with SE
option 2: EE decreases monotonically with SE
option 3: EE remains constant with increasing SE
option 4: EE increases until a threshold and then decreases with increasing SE
option 5: There is no relation between EE and SE when circuit power consumption is ignored

Model response: option 4
Correct response: option 2: EE decreases monotonically with SE
----------------------------------------------------------------------------------------

Wrong Answer
Question 15: Which feature was introduced in Rel-15 to improve transmission reliability in Carrier Aggregation (CA) for mode-4? [3GPP Release 15]
Options:
option 1: Support for 64-QAM
option 2: Reduction of the maximum time between packet arrival and resource selection
option 3: Transmit diversity
option 4: Radio resource pool sharing
opt