In [1]:
import os 
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


### Follow-Up Prompt Format 3.2 3b-instruct model chat template

To render the following as a block of text, use triple backticks and write it as code:

```python
follow_up_prompt = (
    "<|begin_of_text|>"                              # start of prompt
    "<|start_header_id|>user<|end_header_id|>"        # past  
    f"{question}"                                     # past
    "<|eot_id|>"                                      # past
    "<|start_header_id|>assistant<|end_header_id|>"   # past
    f"{response}"                                     # past
    "<|eot_id|>"                                      # past
    "<|start_header_id|>user<|end_header_id|>"       # new
    f"{follow_up_question}"                          # new
    "<|eot_id|>"                                     # new
    "<|start_header_id|>assistant<|end_header_id|>"  # new
)


In [2]:
df = pd.read_csv('qa_data.csv')
df.head()

Unnamed: 0,questions,answers
0,What are the four main variants of the A320 fa...,"The four main variants are A318, A319, A320, a..."
1,What is the common fuselage design feature of ...,They share a standard six-abreast economy clas...
2,How many seats can the A320 family accommodate?,The A320 family can accommodate between 107 an...
3,How does the A321’s seating capacity compare t...,The A321 has five more seats than the 737-900ER.
4,What is the width of the seats in the A320 fam...,The A320 family offers seats that are 1 inch w...


In [3]:
model_name = '/home/hamna/llama/3.2_3b_instruct'

new_model = 'llama_32_3b_instruct_finetune_40_EPOCHS_1'

output_dir = '/media/hamna/New Volume/pdf_fine_tune_llama_work/model_fine_tune_qa_hamna_dataset'

lora_r = 64

lora_alpha = 16

lora_dropout = 0.1

use_4bit = True

bnb_4bit_compute_dtype = 'float16'

bnb_4bit_quant_type = 'nf4'

use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

output = './results_40_epochs'

num_train_epochs = 40

fp16 = False
bf16 = False

per_device_train_batch_size = 4

per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

gradient_checkpointing = True

max_grad_norm = 0.3 

learning_rate = 2e-4

weight_decay = 0.001

optim = 'paged_adamw_32bit'

lr_scheduler_type = "cosine"

max_steps =  -1

warmup_ratio = 0.03

group_by_length = True

save_steps = 0

logging_steps = 25

################################################################################
# SFT parameters
################################################################################

max_seq_length = None

packing = False 

device_map = {"": 0}



In [60]:
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
bnb_config = BitsAndBytesConfig(
    load_in_4bit = use_4bit,
    bnb_4bit_quant_type = bnb_4bit_quant_type,
    bnb_4bit_computer_dtype = compute_dtype,
    bnb_4bit_use_double_quant = use_nested_quant
)

if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 8)
        print(" Your GPU supports bfloat16: accelearate training with bf16=True")
        print("=" * 80)

#Loading the base Model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config  =  bnb_config,
    device_map = device_map
)

model.config.use_cache = False
model.config.pretraining_tp = 1

#Now Loading the Llama Model architecture Tokenizier
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

 Your GPU supports bfloat16: accelearate training with bf16=True


Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.10s/it]


In [12]:
# Load the training dataset
dataset = load_dataset("csv", data_files="qa_data.csv", split="train")
# dataset = load_dataset(dataset, split="train")

# Define a function to apply the chat template
def apply_chat_template(example):
    messages = [
        {"role": "user", "content": example['questions']},
        {"role": "assistant", "content": example['answers']}
    ]
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    return {"prompt": prompt}

# Apply the chat template function to the dataset
new_dataset = dataset.map(apply_chat_template)
new_dataset = new_dataset.train_test_split(0.1, seed=42)

Map: 100%|██████████| 225/225 [00:00<00:00, 6343.47 examples/s]


In [13]:
new_dataset

DatasetDict({
    train: Dataset({
        features: ['questions', 'answers', 'prompt'],
        num_rows: 202
    })
    test: Dataset({
        features: ['questions', 'answers', 'prompt'],
        num_rows: 23
    })
})

In [14]:
train_dataset = new_dataset['train']
test_dataset = new_dataset['test']

In [64]:
train_dataset

Dataset({
    features: ['questions', 'answers', 'prompt'],
    num_rows: 202
})

In [65]:
train_dataset['questions']

['How are A checks structured under Revision 28 of the A320’s MPD?',
 'What is the estimated current market value (CMV) of a 1988 A320-200 powered by CFM56-5A?',
 'Which engine variant has historically had higher sales in the A320 family?',
 'Which airlines have selected the common CFM56-5B series engine?',
 'How many A319 aircraft does United Airlines operate?',
 'What is the purpose of the Tech Insertion package for the CFM56-5B?',
 'What is the main purpose of the structural modifications in ATA chapter 53?',
 'How many total pre-flight checks are assumed to be performed annually?',
 'How does improving fault detection affect flight delays?',
 'What engine variant powers the majority of the A319 fleet?',
 'How are heavy components categorized in A320 maintenance?',
 'What engine was developed by International Aero Engines (IAE) for the A320-200?',
 'What thrust rating was first offered on the A320 in the mid-1990s?',
 'What percentage of the A320 family fleet is made up of the A320 

In [66]:
train_dataset['answers']

['A checks are grouped as a generic check and typically performed every 450 FH.',
 'The CMV is approximately $13.5 million.',
 'CFM56-powered A320s have historically outsold those powered by V.2500 engines.',
 'Air France and Iberia use the CFM56-5B across all four A320 family variants.',
 'United Airlines operates 78 A319 aircraft.',
 'To improve fuel burn, increase durability, and enhance exhaust gas temperature (EGT) margin.',
 'To address cracks and fatigue issues around rivets and fittings in the fuselage and landing gear areas.',
 'Approximately 355 pre-flight checks.',
 'It allows line mechanics to prepare in advance, reducing scheduled gate time and minimizing delays.',
 'The -5B5 variant powers the majority of the A319 fleet.',
 'Heavy components are categorized into wheels, tyres, brakes, landing gears, thrust reversers, and the APU.',
 'The V.2500-A1 engine was developed for the A320-200.',
 'The CFM56-5B4 engine rated at 27,000 lbs thrust.',
 'The A320 and A319 account for 

In [67]:
#Loading the Lora Configration
# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

#Setting the Training Parameters for model Training
training_arguments = TrainingArguments(
    output_dir = output_dir,
    num_train_epochs = num_train_epochs,
    per_device_train_batch_size = per_device_train_batch_size,
    gradient_accumulation_steps = gradient_accumulation_steps, 
    optim = optim,
    save_steps = save_steps,
    logging_steps = logging_steps,
    learning_rate = learning_rate,
    weight_decay = weight_decay,
    save_total_limit=5,
    fp16 = fp16,
    bf16=bf16,
    max_grad_norm = max_grad_norm,
    max_steps = max_steps,
    warmup_ratio = warmup_ratio,
    group_by_length = group_by_length,
    lr_scheduler_type = lr_scheduler_type,
    report_to = 'tensorboard'
)

trainer = SFTTrainer(
    model = model,
    train_dataset = train_dataset,
    peft_config = peft_config,
    dataset_text_field = "prompt",
    max_seq_length = max_seq_length,
    tokenizer = tokenizer,
    args = training_arguments,
    packing = packing
)

trainer.train()



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


{'loss': 3.9094, 'grad_norm': 0.5839182138442993, 'learning_rate': 8.064516129032258e-05, 'epoch': 0.49019607843137253}
{'loss': 2.3989, 'grad_norm': 0.5574783086776733, 'learning_rate': 0.00016129032258064516, 'epoch': 0.9803921568627451}
{'loss': 1.4803, 'grad_norm': 0.7125798463821411, 'learning_rate': 0.00019997868484717502, 'epoch': 1.4705882352941178}
{'loss': 1.2866, 'grad_norm': 0.5978853702545166, 'learning_rate': 0.0001998179240714399, 'epoch': 1.9607843137254903}
{'loss': 1.2407, 'grad_norm': 0.7319638133049011, 'learning_rate': 0.0001994998089790829, 'epoch': 2.450980392156863}
{'loss': 1.1452, 'grad_norm': 0.7047911286354065, 'learning_rate': 0.00019902484105100974, 'epoch': 2.9411764705882355}
{'loss': 1.0632, 'grad_norm': 0.6869220733642578, 'learning_rate': 0.0001983937690330437, 'epoch': 3.431372549019608}
{'loss': 1.0632, 'grad_norm': 0.700506329536438, 'learning_rate': 0.00019760758775559274, 'epoch': 3.9215686274509802}
{'loss': 0.978, 'grad_norm': 0.650979042053222

TrainOutput(global_step=2040, training_loss=0.4236034050876019, metrics={'train_runtime': 823.9019, 'train_samples_per_second': 9.807, 'train_steps_per_second': 2.476, 'train_loss': 0.4236034050876019, 'epoch': 40.0})

In [68]:
trainer.model.save_pretrained(new_model)

'''python
<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 22 Nov 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHow are A checks structured under Revision 28 of the A320’s MPD?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nA checks are grouped as a generic check and typically performed every 450 FH.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

In [33]:
question = "What is the range of seating capacities for the A320 family?"

prompt = (
    "<|begin_of_text|>"                              # start of prompt
    "<|start_header_id|>user<|end_header_id|>"       # user header
    f"{question}"                                    # user input
    "<|eot_id|>"                                     #end of turn
    "<|start_header_id|>assistant<|end_header_id|>"  #assistant header
)

print(prompt)

<|begin_of_text|><|start_header_id|>user<|end_header_id|>What is the range of seating capacities for the A320 family?<|eot_id|><|start_header_id|>assistant<|end_header_id|>


In [34]:

# Ignore warnings
logging.set_verbosity(logging.CRITICAL)

# Run text generation pipeline with our next model
# prompt = "What is a large language model?"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(prompt)
print(result[0]['generated_text'])

<|begin_of_text|><|start_header_id|>user<|end_header_id|>What is the range of seating capacities for the A320 family?<|eot_id|><|start_header_id|>assistant<|end_header_id|>How many seats can the A320 family accommodate?


In [35]:

# Ignore warnings
logging.set_verbosity(logging.CRITICAL)

# Run text generation pipeline with our next model
# prompt = "What is a large language model?"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(prompt)
print(result[0]['generated_text'])

<|begin_of_text|><|start_header_id|>user<|end_header_id|>What is the range of seating capacities for the A320 family?<|eot_id|><|start_header_id|>assistant<|end_header_id|>How many options are there for range?


In [85]:
import torch

torch.cuda.empty_cache()


In [76]:
# del model
del tokenizer
torch.cuda.empty_cache()


In [86]:
print(torch.cuda.memory_summary())


|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 6            |        cudaMalloc retries: 6         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |  17895 MiB |  17895 MiB | 296269 GiB | 296251 GiB |
|       from large pool |  17655 MiB |  17655 MiB | 294385 GiB | 294368 GiB |
|       from small pool |    240 MiB |    240 MiB |   1883 GiB |   1882 GiB |
|---------------------------------------------------------------------------|
| Active memory         |  17895 MiB |  17895 MiB | 296269 GiB | 296251 GiB |
|       from large pool |  17655 MiB |  17655 MiB | 294385 GiB | 294368 GiB |
|       from small pool |    240 MiB |    240 MiB |   1883 GiB |   1882 GiB |
|---------------------------------------------------------------

In [81]:
torch.cuda.reset_peak_memory_stats()
torch.cuda.empty_cache()


In [83]:
torch.cuda.reset_peak_memory_stats()
print("CUDA peak memory stats reset.")


CUDA peak memory stats reset.


In [87]:
!nvidia-smi


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Thu Nov 28 11:18:27 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.120                Driver Version: 550.120        CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3090        Off |   00000000:65:00.0  On |                  N/A |
| 53%   36C    P2            155W /  390W |   24028MiB /  24576MiB |     15%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [4]:
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.11s/it]


In [None]:
# trainer.model.save_pretrained('/media/hamna/New Volume/pdf_fine_tune_llama_work/fine_tune_llama_32_3b_pdf/fine_tune_model_weights_60epochs')

In [16]:

import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [None]:
# 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

## Local wegiths saving

In [5]:
# Define the directory where you want to save the model and tokenizer
local_dir = "fine_tune_32_3b_model_40_qa_custom_1"

# Save the model locally
model.save_pretrained(local_dir)

# Save the tokenizer locally
tokenizer.save_pretrained(local_dir)

print(f"Model and tokenizer saved locally in {local_dir}")


Model and tokenizer saved locally in fine_tune_32_3b_model_40_qa_custom_1


In [None]:
# model.push_to_hub("Hassan883/llama_32_3b_fine_tune_model_50epoch_1", check_pr=True)

# tokenizer.push_to_hub("Hassan883/llama_32_3b_fine_tune_model_50epoch_1",check_pr=True)

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]
model-00002-of-00002.safetensors: 100%|██████████| 1.46G/1.46G [10:15<00:00, 2.37MB/s] 

model-00001-of-00002.safetensors: 100%|██████████| 4.97G/4.97G [25:05<00:00, 3.30MB/s]

Upload 2 LFS files: 100%|██████████| 2/2 [25:05<00:00, 752.94s/it]
tokenizer.json: 100%|██████████| 17.2M/17.2M [00:08<00:00, 1.99MB/s]


CommitInfo(commit_url='https://huggingface.co/Hassan883/llama_32_3b_fine_tune_model_50epoch_1/commit/c24181ea42d796f37fde14e9b1398698ea149e40', commit_message='Upload tokenizer', commit_description='', oid='c24181ea42d796f37fde14e9b1398698ea149e40', pr_url=None, pr_revision=None, pr_num=None)

# Inferencing on finetuned model

In [6]:
model_name = 'fine_tune_32_3b_model_40_qa_custom_1'
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
bnb_config = BitsAndBytesConfig(
    load_in_4bit = use_4bit,
    bnb_4bit_quant_type = bnb_4bit_quant_type,
    bnb_4bit_computer_dtype = compute_dtype,
    bnb_4bit_use_double_quant = use_nested_quant
)

if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 8)
        print(" Your GPU supports bfloat16: accelearate training with bf16=True")
        print("=" * 80)

#Loading the base Model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config  =  bnb_config,
    device_map = device_map
)

model.config.use_cache = False
model.config.pretraining_tp = 1

#Now Loading the Llama Model architecture Tokenizier
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Unused kwargs: ['bnb_4bit_computer_dtype']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


 Your GPU supports bfloat16: accelearate training with bf16=True


Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.34it/s]


In [7]:
question = "What is the range of seating capacities for the A320 family?"

prompt = (
    "<|begin_of_text|>"                              # start of prompt
    "<|start_header_id|>user<|end_header_id|>"       # user header
    f"{question}"                                    # user input
    "<|eot_id|>"                                     #end of turn
    "<|start_header_id|>assistant<|end_header_id|>"  #assistant header
)

print(prompt)

<|begin_of_text|><|start_header_id|>user<|end_header_id|>What is the range of seating capacities for the A320 family?<|eot_id|><|start_header_id|>assistant<|end_header_id|>


In [8]:

# Ignore warnings
logging.set_verbosity(logging.CRITICAL)

# Run text generation pipeline with our next model
# prompt = "What is a large language model?"
def inference_func(model, tokenizer,prompt, max_seq_length=200):
    pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=max_seq_length)
    result = pipe(prompt)
    print(result[0]['generated_text'])

In [9]:
def prompt_template_func(question):
    question = question

    prompt = (
        "<|begin_of_text|>"                              # start of prompt
        "<|start_header_id|>user<|end_header_id|>"       # user header
        f"{question}"                                    # user input
        "<|eot_id|>"                                     #end of turn
        "<|start_header_id|>assistant<|end_header_id|>"  #assistant header
    )

    print(prompt)
    return prompt

In [10]:
inference_func(model=model,tokenizer=tokenizer, prompt=prompt)



<|begin_of_text|><|start_header_id|>user<|end_header_id|>What is the range of seating capacities for the A320 family?<|eot_id|><|start_header_id|>assistant<|end_header_id|>The range is from 107 to 185 seats.


In [15]:
test_dataset

Dataset({
    features: ['questions', 'answers', 'prompt'],
    num_rows: 23
})

In [44]:
test_dataset['questions']

['What thrust ratings are available for the CFM56-5B engines on the A321?',
 'How are post-flight technical logs maintained?',
 'How many MH are typically consumed during a C8 check?',
 'How does the A321’s seating capacity compare to the 737-900ER?',
 'When is the detailed engineering work for future freighter conversions expected to begin?',
 'What problem was detected in the A320 elevators related to water ingress?',
 'What thrust ratings are available for the V.2500-A5 engines on the A319?',
 'What are the fuel capacities available for the A321 with supplementary tanks?',
 'What flight control system is featured in the A320 family?',
 'What was the route used to analyze fuel burn performance?',
 'What is the role of the full authority digital engine control (FADEC) system?',
 'What is the first step in the process of clearing technical defects?',
 'What is the annual cost for materials during daily checks?',
 'What are the three categories of modification programs available for the

In [45]:
test_dataset['answers']

['The thrust ratings include -5B4 (27,000 lbs), -5B1 (30,000 lbs), -5B2 (31,000 lbs), and -5B3 (33,000 lbs).',
 'Flight crew log ECAM messages in the post-flight technical log.',
 'A C8 check can consume around 20,000 MH, including various tasks.',
 'The A321 has five more seats than the 737-900ER.',
 'In 2008.',
 'Cracks in the honeycomb panels due to fatigue.',
 'The thrust ratings are V.2522-A5 (22,000 lbs), V.2524-A5 (23,500 lbs), and V.2527-A5 (26,500 lbs).',
 'The total capacities are 7,040 USG and 7,800 USG.',
 'The A320 family features a fly-by-wire (FBW) flight control system.',
 'The London Heathrow to Munich route.',
 'It allows for easy changes to thrust ratings, with each having a different list price.',
 'Logging and troubleshooting the defects.',
 'About $125,000.',
 'Engine upgrades, avionics, and future passenger-to-freighter conversions.',
 'The A320 family received 4,283 orders by the end of 2005.',
 'It provides high commonality in flightcrew and maintenance-related

In [46]:
question = 'How often are TR checks typically performed for the A320?'

prompt = prompt_template_func(question=question)

<|begin_of_text|><|start_header_id|>user<|end_header_id|>How often are TR checks typically performed for the A320?<|eot_id|><|start_header_id|>assistant<|end_header_id|>


In [47]:
inference_func(model=model, tokenizer=tokenizer, prompt=prompt)

<|begin_of_text|><|start_header_id|>user<|end_header_id|>How often are TR checks typically performed for the A320?<|eot_id|><|start_header_id|>assistant<|end_header_id|>How often are inspection cycles performed for the A320?


In [48]:
from datasets import Dataset
from transformers import pipeline
import logging
import re
# from datasets import load_metric
# Function for inference
def inference_func(model, tokenizer, prompt, max_seq_length=200):
    pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=max_seq_length)
    result = pipe(prompt)
    return result[0]['generated_text']

# Function to preprocess test dataset
def preprocess_prompt(question):
    prompt = (
        "<|begin_of_text|>"                               # Start of prompt
        "<|start_header_id|>system<|end_header_id|>\n\n"  # System role
        "Please answer the question factually and concisely.\n\n"  # Instruction
        "<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"  # User header
        f"{question}\n\n"                                 # User input
        "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"  # Assistant header
    )
    return prompt


# Load test dataset (Replace 'your_test_dataset' with your actual dataset)
# test_dataset = Dataset.from_dict({
#     "questions": [
#         "What engines are available for the A318?",
#         "What is the first step in the process of clearing technical defects?",
#         # Add other test questions here...
#     ],
#     "answers": [
#         "The A318 utilizes the CFM56-5B series and the PW6000 series.",
#         "Logging and troubleshooting the defects.",
#         # Add corresponding ground truth answers here...
#     ]
# })

# Evaluate the model on the test dataset
# Load F1 metric



In [51]:
from datasets import Dataset, Value
from transformers import pipeline, AutoTokenizer
import logging
import re
import evaluate

# Function for inference
def inference_func(model, tokenizer, prompt, max_seq_length=200):
    pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=max_seq_length)
    result = pipe(prompt)
    return result[0]['generated_text']

# Function to preprocess test dataset
def preprocess_prompt(question):
    prompt = (
        "<|begin_of_text|>"                               # Start of prompt
        "<|start_header_id|>system<|end_header_id|>\n\n"  # System role
        "Please answer the question factually and concisely.\n\n"  # Instruction
        "<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"  # User header
        f"{question}\n\n"                                 # User input
        "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"  # Assistant header
    )
    return prompt

# F1 metric
f1_metric = evaluate.load("f1")

# Normalize function for consistent string comparison
def normalize_answer(answer):
    """Normalize text for fair comparison."""
    if not isinstance(answer, str):
        answer = str(answer)  # Convert non-string types to strings
    answer = answer.lower().strip()
    answer = re.sub(r"[^a-z0-9\s]", "", answer)  # Remove special characters
    answer = re.sub(r"\s+", " ", answer)        # Normalize spaces
    return answer

# Function to clean and validate the dataset
def clean_dataset(dataset):
    """Ensure all entries in the dataset are strings."""
    cleaned_questions = []
    cleaned_answers = []
    for question, answer in zip(dataset["questions"], dataset["answers"]):
        if not isinstance(question, str):
            question = str(question)  # Convert to string if not already
        if not isinstance(answer, str):
            answer = str(answer)  # Convert to string if not already
        cleaned_questions.append(question.strip())
        cleaned_answers.append(answer.strip())
    return Dataset.from_dict({"questions": cleaned_questions, "answers": cleaned_answers})

# Updated evaluation function
def evaluate_model_with_f1(model, tokenizer, test_dataset):
    predictions = []
    references = []
    
    for example in test_dataset:
        # Prepare the prompt
        prompt = preprocess_prompt(example["questions"])

        # Generate the model's prediction
        generated_text = inference_func(model, tokenizer, prompt, max_seq_length=200)

        # Extract and clean prediction
        response_start = "<|start_header_id|>assistant<|end_header_id|>"
        if response_start in generated_text:
            prediction = generated_text.split(response_start, 1)[1].strip()
        else:
            prediction = generated_text.strip()

        predictions.append(prediction)
        references.append(example["answers"])

    # Validate data types
    for pred, ref in zip(predictions, references):
        print(f"Prediction: {pred} ({type(pred)}), Reference: {ref} ({type(ref)})")

test_dataset = clean_dataset(test_dataset)

# Ensure dataset columns are properly typed
test_dataset_t = test_dataset.cast_column("questions", Value("string"))
test_dataset_t = test_dataset.cast_column("answers", Value("string"))

# Example usage (Replace `model` and `tokenizer` with your fine-tuned LLaMA model and tokenizer)
# from transformers import AutoModelForCausalLM
# model = AutoModelForCausalLM.from_pretrained("your_model_path")
# tokenizer = AutoTokenizer.from_pretrained("your_model_path")

# Run evaluation
evaluate_model_with_f1(model, tokenizer, test_dataset_t)


Casting the dataset: 100%|██████████| 23/23 [00:00<00:00, 5714.31 examples/s]
Casting the dataset: 100%|██████████| 23/23 [00:00<00:00, 5503.39 examples/s]


Prediction: TR checks are typically performed every 5-6 months. (<class 'str'>), Reference: TR checks are performed about 1,480 times a year. (<class 'str'>)
Prediction: There are 97 A320s powered by CFM56-5B4 engines. (<class 'str'>), Reference: 766 aircraft are powered by CFM56-5B4 engines. (<class 'str'>)
Prediction: The A320 family has delivered over 1,700 aircraft until now. (<class 'str'>), Reference: More than 2,600 aircraft have been delivered. (<class 'str'>)
Prediction: The A318 received 95 orders, while the 737-600 received 127. (<class 'str'>), Reference: The A318's performance, while disappointing, still exceeds that of the 737-600. (<class 'str'>)
Prediction: The supplementary fuel tanks increase the total capacity to 7,000 USG. (<class 'str'>), Reference: The total capacity can be increased to 7,066 USG. (<class 'str'>)
Prediction: Approximately 180 MH annually. (<class 'str'>), Reference: Around 520 MH per year. (<class 'str'>)
Prediction: The CFM56-5B and V.2500-A5 ser

In [16]:
from datasets import Dataset, Value
from transformers import pipeline, AutoTokenizer
import re
import evaluate

# Function for inference
def inference_func(model, tokenizer, prompt, max_seq_length=200):
    """Generate text using the model and tokenizer."""
    pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=max_seq_length)
    result = pipe(prompt)
    return result[0]['generated_text']

# Function to preprocess a question into the desired prompt format
def preprocess_prompt(question):
    """Format a question into the desired prompt template."""
    prompt = (
        "<|begin_of_text|>"                               # Start of prompt
        "<|start_header_id|>system<|end_header_id|>\n\n"  # System role
        "Please answer the question factually and concisely.\n\n"  # Instruction
        "<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"  # User header
        f"{question}\n\n"                                 # User input
        "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"  # Assistant header
    )
    return prompt

# Normalize function for consistent string comparison
def normalize_answer(answer):
    """Normalize text for fair comparison."""
    if not isinstance(answer, str):
        answer = str(answer)  # Convert non-string types to strings
    answer = answer.lower().strip()
    answer = re.sub(r"[^a-z0-9\s]", "", answer)  # Remove special characters
    answer = re.sub(r"\s+", " ", answer)        # Normalize spaces
    return answer

# Function to clean and validate the dataset
def clean_dataset(dataset):
    """Ensure all entries in the dataset are strings."""
    cleaned_questions = []
    cleaned_answers = []
    for question, answer in zip(dataset["questions"], dataset["answers"]):
        if not isinstance(question, str):
            question = str(question)  # Convert to string if not already
        if not isinstance(answer, str):
            answer = str(answer)  # Convert to string if not already
        cleaned_questions.append(question.strip())
        cleaned_answers.append(answer.strip())
    return Dataset.from_dict({"questions": cleaned_questions, "answers": cleaned_answers})

# Updated inference function to handle unknown answers
def inference_with_default_response(model, tokenizer, question, max_seq_length=200):
    """Generate a response and return 'Sorry, I don't know' if the model does not provide a valid answer."""
    prompt = preprocess_prompt(question)
    generated_text = inference_func(model, tokenizer, prompt, max_seq_length=max_seq_length)

    # Extract the assistant's response
    response_start = "<|start_header_id|>assistant<|end_header_id|>"
    if response_start in generated_text:
        prediction = generated_text.split(response_start, 1)[1].strip()
    else:
        prediction = generated_text.strip()

    # Handle cases where the model cannot generate a response
    if not prediction or prediction.lower() in ["i don't know", "i am not sure", ""]:
        return "Sorry, I don't know."
    return prediction

# # Example Dataset (Replace this with your actual dataset)
# test_dataset = Dataset.from_dict({
#     "questions": [
#         "What engines are available for the A318?",
#         "What is the first step in the process of clearing technical defects?",
#         "How long is the wingspan of the A320?",
#         "What is the purpose of life?"  # Example for a question the model might not know
#     ],
#     "answers": [
#         "The A318 utilizes the CFM56-5B series and the PW6000 series.",
#         "Logging and troubleshooting the defects.",
#         "The wingspan of the A320 is approximately 34.1 meters.",
#         "Sorry, I don't know."
#     ],
# })

# Clean and validate dataset
test_dataset = clean_dataset(test_dataset)

# Ensure dataset columns are properly typed
test_dataset = test_dataset.cast_column("questions", Value("string"))
test_dataset = test_dataset.cast_column("answers", Value("string"))

# # Example usage (Replace `model` and `tokenizer` with your fine-tuned LLaMA model and tokenizer)
# from transformers import AutoModelForCausalLM
# model = AutoModelForCausalLM.from_pretrained("your_model_path")
# tokenizer = AutoTokenizer.from_pretrained("your_model_path")

# Testing the model on questions
for idx, example in enumerate(test_dataset):
    question = example["questions"]
    expected_answer = example["answers"]
    generated_answer = inference_with_default_response(model, tokenizer, question)
    print(f"Q{idx+1}: {question}")
    print(f"Generated Answer: {generated_answer}")
    print(f"Expected Answer: {expected_answer}")
    print("-" * 30)


Casting the dataset:   0%|          | 0/23 [00:00<?, ? examples/s]

Casting the dataset: 100%|██████████| 23/23 [00:00<00:00, 5256.02 examples/s]
Casting the dataset: 100%|██████████| 23/23 [00:00<00:00, 7572.73 examples/s]


Q1: What thrust ratings are available for the CFM56-5B engines on the A321?
Generated Answer: The available thrust ratings are 25,000 lbs. (11.4 kN) and 27,000 lbs. (12.1 kN).
Expected Answer: The thrust ratings include -5B4 (27,000 lbs), -5B1 (30,000 lbs), -5B2 (31,000 lbs), and -5B3 (33,000 lbs).
------------------------------
Q2: How are post-flight technical logs maintained?
Generated Answer: Post-flight technical logs are typically maintained on the flight’s maintenance record.
Expected Answer: Flight crew log ECAM messages in the post-flight technical log.
------------------------------
Q3: How many MH are typically consumed during a C8 check?
Generated Answer: Six MH.
Expected Answer: A C8 check can consume around 20,000 MH, including various tasks.
------------------------------
Q4: How does the A321’s seating capacity compare to the 737-900ER?
Generated Answer: The A321’s two-class seating capacity is 185 versus 177 for the 737-900ER.
Expected Answer: The A321 has five more se