In [1]:
import os 
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


### Follow-Up Prompt Format 3.2 3b-instruct model chat template

To render the following as a block of text, use triple backticks and write it as code:

```python
follow_up_prompt = (
    "<|begin_of_text|>"                              # start of prompt
    "<|start_header_id|>user<|end_header_id|>"        # past  
    f"{question}"                                     # past
    "<|eot_id|>"                                      # past
    "<|start_header_id|>assistant<|end_header_id|>"   # past
    f"{response}"                                     # past
    "<|eot_id|>"                                      # past
    "<|start_header_id|>user<|end_header_id|>"       # new
    f"{follow_up_question}"                          # new
    "<|eot_id|>"                                     # new
    "<|start_header_id|>assistant<|end_header_id|>"  # new
)


In [2]:
df = pd.read_csv('qa_data.csv')
df.head()

Unnamed: 0,questions,answers
0,What are the four main variants of the A320 fa...,"The four main variants are A318, A319, A320, a..."
1,What is the common fuselage design feature of ...,They share a standard six-abreast economy clas...
2,How many seats can the A320 family accommodate?,The A320 family can accommodate between 107 an...
3,How does the A321’s seating capacity compare t...,The A321 has five more seats than the 737-900ER.
4,What is the width of the seats in the A320 fam...,The A320 family offers seats that are 1 inch w...


In [5]:
model_name = '/home/hamna/llama/3.2_3b_instruct'

new_model = 'llama_32_3b_instruct_finetune_60_EPOCHS_1'

output_dir = '/media/hamna/New Volume/pdf_fine_tune_llama_work/model_fine_tune_qa_hamna_dataset'

lora_r = 64

lora_alpha = 16

lora_dropout = 0.1

use_4bit = True

bnb_4bit_compute_dtype = 'float16'

bnb_4bit_quant_type = 'nf4'

use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

output = './results'

num_train_epochs = 60

fp16 = False
bf16 = False

per_device_train_batch_size = 4

per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

gradient_checkpointing = True

max_grad_norm = 0.3 

learning_rate = 2e-4

weight_decay = 0.001

optim = 'paged_adamw_32bit'

lr_scheduler_type = "cosine"

max_steps =  -1

warmup_ratio = 0.03

group_by_length = True

save_steps = 0

logging_steps = 25

################################################################################
# SFT parameters
################################################################################

max_seq_length = None

packing = False 

device_map = {"": 0}



In [4]:
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
bnb_config = BitsAndBytesConfig(
    load_in_4bit = use_4bit,
    bnb_4bit_quant_type = bnb_4bit_quant_type,
    bnb_4bit_computer_dtype = compute_dtype,
    bnb_4bit_use_double_quant = use_nested_quant
)

if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 8)
        print(" Your GPU supports bfloat16: accelearate training with bf16=True")
        print("=" * 80)

#Loading the base Model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config  =  bnb_config,
    device_map = device_map
)

model.config.use_cache = False
model.config.pretraining_tp = 1

#Now Loading the Llama Model architecture Tokenizier
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Unused kwargs: ['bnb_4bit_computer_dtype']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


 Your GPU supports bfloat16: accelearate training with bf16=True


Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.09s/it]


In [15]:
# Load the training dataset
dataset = load_dataset("csv", data_files="qa_data.csv", split="train")
# dataset = load_dataset(dataset, split="train")

# Define a function to apply the chat template
def apply_chat_template(example):
    messages = [
        {"role": "user", "content": example['questions']},
        {"role": "assistant", "content": example['answers']}
    ]
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    return {"prompt": prompt}

# Apply the chat template function to the dataset
new_dataset = dataset.map(apply_chat_template)
new_dataset = new_dataset.train_test_split(0.1, seed=42)

Map: 100%|██████████| 225/225 [00:00<00:00, 5604.39 examples/s]


In [16]:
new_dataset

DatasetDict({
    train: Dataset({
        features: ['questions', 'answers', 'prompt'],
        num_rows: 202
    })
    test: Dataset({
        features: ['questions', 'answers', 'prompt'],
        num_rows: 23
    })
})

In [17]:
train_dataset = new_dataset['train']
test_dataset = new_dataset['test']

In [18]:
train_dataset

Dataset({
    features: ['questions', 'answers', 'prompt'],
    num_rows: 202
})

In [19]:
train_dataset['questions']

['How are A checks structured under Revision 28 of the A320’s MPD?',
 'What is the estimated current market value (CMV) of a 1988 A320-200 powered by CFM56-5A?',
 'Which engine variant has historically had higher sales in the A320 family?',
 'Which airlines have selected the common CFM56-5B series engine?',
 'How many A319 aircraft does United Airlines operate?',
 'What is the purpose of the Tech Insertion package for the CFM56-5B?',
 'What is the main purpose of the structural modifications in ATA chapter 53?',
 'How many total pre-flight checks are assumed to be performed annually?',
 'How does improving fault detection affect flight delays?',
 'What engine variant powers the majority of the A319 fleet?',
 'How are heavy components categorized in A320 maintenance?',
 'What engine was developed by International Aero Engines (IAE) for the A320-200?',
 'What thrust rating was first offered on the A320 in the mid-1990s?',
 'What percentage of the A320 family fleet is made up of the A320 

In [20]:
train_dataset['answers']

['A checks are grouped as a generic check and typically performed every 450 FH.',
 'The CMV is approximately $13.5 million.',
 'CFM56-powered A320s have historically outsold those powered by V.2500 engines.',
 'Air France and Iberia use the CFM56-5B across all four A320 family variants.',
 'United Airlines operates 78 A319 aircraft.',
 'To improve fuel burn, increase durability, and enhance exhaust gas temperature (EGT) margin.',
 'To address cracks and fatigue issues around rivets and fittings in the fuselage and landing gear areas.',
 'Approximately 355 pre-flight checks.',
 'It allows line mechanics to prepare in advance, reducing scheduled gate time and minimizing delays.',
 'The -5B5 variant powers the majority of the A319 fleet.',
 'Heavy components are categorized into wheels, tyres, brakes, landing gears, thrust reversers, and the APU.',
 'The V.2500-A1 engine was developed for the A320-200.',
 'The CFM56-5B4 engine rated at 27,000 lbs thrust.',
 'The A320 and A319 account for 

In [10]:
#Loading the Lora Configration
# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

#Setting the Training Parameters for model Training
training_arguments = TrainingArguments(
    output_dir = output_dir,
    num_train_epochs = num_train_epochs,
    per_device_train_batch_size = per_device_train_batch_size,
    gradient_accumulation_steps = gradient_accumulation_steps, 
    optim = optim,
    save_steps = save_steps,
    logging_steps = logging_steps,
    learning_rate = learning_rate,
    weight_decay = weight_decay,
    save_total_limit=5,
    fp16 = fp16,
    bf16=bf16,
    max_grad_norm = max_grad_norm,
    max_steps = max_steps,
    warmup_ratio = warmup_ratio,
    group_by_length = group_by_length,
    lr_scheduler_type = lr_scheduler_type,
    report_to = 'tensorboard'
)

trainer = SFTTrainer(
    model = model,
    train_dataset = train_dataset,
    peft_config = peft_config,
    dataset_text_field = "prompt",
    max_seq_length = max_seq_length,
    tokenizer = tokenizer,
    args = training_arguments,
    packing = packing
)

trainer.train()



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
Map: 100%|██████████| 202/202 [00:00<00:00, 6877.19 examples/s]
  1%|          | 25/3060 [00:10<20:05,  2.52it/s]

{'loss': 4.0102, 'grad_norm': 0.7412229776382446, 'learning_rate': 5.4347826086956524e-05, 'epoch': 0.49}


  2%|▏         | 50/3060 [00:19<19:26,  2.58it/s]

{'loss': 2.769, 'grad_norm': 0.6650186777114868, 'learning_rate': 0.00010869565217391305, 'epoch': 0.98}


  2%|▏         | 75/3060 [00:28<17:00,  2.93it/s]

{'loss': 1.7183, 'grad_norm': 0.6771199703216553, 'learning_rate': 0.00016304347826086955, 'epoch': 1.47}


  3%|▎         | 100/3060 [00:38<18:03,  2.73it/s]

{'loss': 1.3172, 'grad_norm': 0.5408714413642883, 'learning_rate': 0.0001999964147509006, 'epoch': 1.96}


  4%|▍         | 125/3060 [00:48<18:28,  2.65it/s]

{'loss': 1.2018, 'grad_norm': 0.5721356272697449, 'learning_rate': 0.00019993900058381424, 'epoch': 2.45}


  5%|▍         | 150/3060 [00:58<18:27,  2.63it/s]

{'loss': 1.2059, 'grad_norm': 0.6049203872680664, 'learning_rate': 0.0001998116084001951, 'epoch': 2.94}


  6%|▌         | 175/3060 [01:09<20:04,  2.40it/s]

{'loss': 1.1066, 'grad_norm': 0.6771843433380127, 'learning_rate': 0.00019961432740097854, 'epoch': 3.43}


  7%|▋         | 200/3060 [01:19<19:57,  2.39it/s]

{'loss': 1.0826, 'grad_norm': 0.7544880509376526, 'learning_rate': 0.00019934729572375793, 'epoch': 3.92}


  7%|▋         | 225/3060 [01:30<18:43,  2.52it/s]

{'loss': 1.0207, 'grad_norm': 0.7622149586677551, 'learning_rate': 0.0001990107003460597, 'epoch': 4.41}


  8%|▊         | 250/3060 [01:40<18:02,  2.60it/s]

{'loss': 0.9916, 'grad_norm': 0.883090615272522, 'learning_rate': 0.00019860477695442027, 'epoch': 4.9}


  9%|▉         | 275/3060 [01:49<18:58,  2.45it/s]

{'loss': 0.9034, 'grad_norm': 0.7297604084014893, 'learning_rate': 0.00019812980977935678, 'epoch': 5.39}


 10%|▉         | 300/3060 [02:00<19:14,  2.39it/s]

{'loss': 0.8921, 'grad_norm': 1.0510079860687256, 'learning_rate': 0.0001975861313963466, 'epoch': 5.88}


 11%|█         | 325/3060 [02:11<20:12,  2.26it/s]

{'loss': 0.8258, 'grad_norm': 1.5527222156524658, 'learning_rate': 0.00019697412249295565, 'epoch': 6.37}


 11%|█▏        | 350/3060 [02:21<18:39,  2.42it/s]

{'loss': 0.8148, 'grad_norm': 0.8509286642074585, 'learning_rate': 0.00019629421160227822, 'epoch': 6.86}


 12%|█▏        | 375/3060 [02:31<18:29,  2.42it/s]

{'loss': 0.7461, 'grad_norm': 0.892947256565094, 'learning_rate': 0.00019554687480287497, 'epoch': 7.35}


 13%|█▎        | 400/3060 [02:42<18:48,  2.36it/s]

{'loss': 0.7195, 'grad_norm': 1.0358073711395264, 'learning_rate': 0.00019473263538541914, 'epoch': 7.84}


 14%|█▍        | 425/3060 [02:52<19:27,  2.26it/s]

{'loss': 0.6835, 'grad_norm': 1.0299057960510254, 'learning_rate': 0.0001938520634862848, 'epoch': 8.33}


 15%|█▍        | 450/3060 [03:03<19:23,  2.24it/s]

{'loss': 0.6514, 'grad_norm': 1.4353010654449463, 'learning_rate': 0.00019290577568833306, 'epoch': 8.82}


 16%|█▌        | 475/3060 [03:13<18:52,  2.28it/s]

{'loss': 0.6115, 'grad_norm': 1.640794038772583, 'learning_rate': 0.00019189443458917624, 'epoch': 9.31}


 16%|█▋        | 500/3060 [03:24<18:09,  2.35it/s]

{'loss': 0.6049, 'grad_norm': 1.3335316181182861, 'learning_rate': 0.00019081874833722236, 'epoch': 9.8}


 17%|█▋        | 525/3060 [03:34<18:00,  2.35it/s]

{'loss': 0.5676, 'grad_norm': 1.3223310708999634, 'learning_rate': 0.00018967947013582426, 'epoch': 10.29}


 18%|█▊        | 550/3060 [03:45<18:24,  2.27it/s]

{'loss': 0.5579, 'grad_norm': 1.4487848281860352, 'learning_rate': 0.0001884773977158813, 'epoch': 10.78}


 19%|█▉        | 575/3060 [03:55<17:30,  2.36it/s]

{'loss': 0.5283, 'grad_norm': 1.4932363033294678, 'learning_rate': 0.0001872133727772622, 'epoch': 11.27}


 20%|█▉        | 600/3060 [04:06<18:26,  2.22it/s]

{'loss': 0.5276, 'grad_norm': 1.2499629259109497, 'learning_rate': 0.00018588828039944086, 'epoch': 11.76}


 20%|██        | 625/3060 [04:16<16:23,  2.48it/s]

{'loss': 0.4971, 'grad_norm': 0.7460174560546875, 'learning_rate': 0.00018450304842175706, 'epoch': 12.25}


 21%|██        | 650/3060 [04:27<17:10,  2.34it/s]

{'loss': 0.4933, 'grad_norm': 1.2233078479766846, 'learning_rate': 0.00018305864679373667, 'epoch': 12.75}


 22%|██▏       | 675/3060 [04:37<15:33,  2.55it/s]

{'loss': 0.4757, 'grad_norm': 0.586082935333252, 'learning_rate': 0.00018155608689592604, 'epoch': 13.24}


 23%|██▎       | 700/3060 [04:48<16:51,  2.33it/s]

{'loss': 0.453, 'grad_norm': 1.2250467538833618, 'learning_rate': 0.00017999642083171575, 'epoch': 13.73}


 24%|██▎       | 725/3060 [04:58<15:18,  2.54it/s]

{'loss': 0.424, 'grad_norm': 0.49558237195014954, 'learning_rate': 0.00017838074069065032, 'epoch': 14.22}


 25%|██▍       | 750/3060 [05:08<14:09,  2.72it/s]

{'loss': 0.3711, 'grad_norm': 0.6939971446990967, 'learning_rate': 0.00017671017778373913, 'epoch': 14.71}


 25%|██▌       | 775/3060 [05:18<14:56,  2.55it/s]

{'loss': 0.2775, 'grad_norm': 0.6757650375366211, 'learning_rate': 0.00017498590185130412, 'epoch': 15.2}


 26%|██▌       | 800/3060 [05:28<13:18,  2.83it/s]

{'loss': 0.2546, 'grad_norm': 0.9811782240867615, 'learning_rate': 0.00017320912024391917, 'epoch': 15.69}


 27%|██▋       | 825/3060 [05:37<13:35,  2.74it/s]

{'loss': 0.2143, 'grad_norm': 0.5948917269706726, 'learning_rate': 0.00017138107707701433, 'epoch': 16.18}


 28%|██▊       | 850/3060 [05:47<14:03,  2.62it/s]

{'loss': 0.1867, 'grad_norm': 0.8924103379249573, 'learning_rate': 0.0001695030523597374, 'epoch': 16.67}


 29%|██▊       | 875/3060 [05:56<13:40,  2.66it/s]

{'loss': 0.1757, 'grad_norm': 1.1314976215362549, 'learning_rate': 0.0001675763610986817, 'epoch': 17.16}


 29%|██▉       | 900/3060 [06:06<13:35,  2.65it/s]

{'loss': 0.1637, 'grad_norm': 0.6919769048690796, 'learning_rate': 0.0001656023523771095, 'epoch': 17.65}


 30%|███       | 925/3060 [06:15<13:32,  2.63it/s]

{'loss': 0.1629, 'grad_norm': 1.6178779602050781, 'learning_rate': 0.00016358240841031352, 'epoch': 18.14}


 31%|███       | 950/3060 [06:25<13:08,  2.68it/s]

{'loss': 0.1503, 'grad_norm': 0.6058648824691772, 'learning_rate': 0.00016151794357778006, 'epoch': 18.63}


 32%|███▏      | 975/3060 [06:34<13:40,  2.54it/s]

{'loss': 0.1523, 'grad_norm': 0.41261592507362366, 'learning_rate': 0.00015941040343282997, 'epoch': 19.12}


 33%|███▎      | 1000/3060 [06:44<13:57,  2.46it/s]

{'loss': 0.144, 'grad_norm': 0.8192616701126099, 'learning_rate': 0.00015726126369043191, 'epoch': 19.61}


 33%|███▎      | 1025/3060 [06:53<12:56,  2.62it/s]

{'loss': 0.1515, 'grad_norm': 1.0715988874435425, 'learning_rate': 0.00015507202919389572, 'epoch': 20.1}


 34%|███▍      | 1050/3060 [07:03<12:37,  2.65it/s]

{'loss': 0.1459, 'grad_norm': 0.6387449502944946, 'learning_rate': 0.00015284423286117033, 'epoch': 20.59}


 35%|███▌      | 1075/3060 [07:12<12:25,  2.66it/s]

{'loss': 0.1438, 'grad_norm': 0.30082210898399353, 'learning_rate': 0.0001505794346114834, 'epoch': 21.08}


 36%|███▌      | 1100/3060 [07:22<12:32,  2.61it/s]

{'loss': 0.1305, 'grad_norm': 0.7128223776817322, 'learning_rate': 0.00014827922027307451, 'epoch': 21.57}


 37%|███▋      | 1125/3060 [07:31<12:22,  2.61it/s]

{'loss': 0.1434, 'grad_norm': 0.4769172668457031, 'learning_rate': 0.00014594520047278662, 'epoch': 22.06}


 38%|███▊      | 1150/3060 [07:40<12:13,  2.60it/s]

{'loss': 0.1327, 'grad_norm': 0.9244122505187988, 'learning_rate': 0.0001435790095082935, 'epoch': 22.55}


 38%|███▊      | 1175/3060 [07:50<12:20,  2.55it/s]

{'loss': 0.1344, 'grad_norm': 0.34200817346572876, 'learning_rate': 0.00014118230420375258, 'epoch': 23.04}


 39%|███▉      | 1200/3060 [08:00<11:49,  2.62it/s]

{'loss': 0.1293, 'grad_norm': 0.5858622193336487, 'learning_rate': 0.0001387567627496845, 'epoch': 23.53}


 40%|████      | 1225/3060 [08:09<11:36,  2.64it/s]

{'loss': 0.1354, 'grad_norm': 0.2988518476486206, 'learning_rate': 0.000136304083527892, 'epoch': 24.02}


 41%|████      | 1250/3060 [08:18<11:56,  2.53it/s]

{'loss': 0.126, 'grad_norm': 0.3134332299232483, 'learning_rate': 0.00013382598392224052, 'epoch': 24.51}


 42%|████▏     | 1275/3060 [08:28<10:33,  2.82it/s]

{'loss': 0.1351, 'grad_norm': 0.26239413022994995, 'learning_rate': 0.0001313241991161336, 'epoch': 25.0}


 42%|████▏     | 1300/3060 [08:37<11:29,  2.55it/s]

{'loss': 0.1228, 'grad_norm': 0.6165054440498352, 'learning_rate': 0.00012880048087752458, 'epoch': 25.49}


 43%|████▎     | 1325/3060 [08:47<10:29,  2.76it/s]

{'loss': 0.1294, 'grad_norm': 0.27439171075820923, 'learning_rate': 0.0001262565963323163, 'epoch': 25.98}


 44%|████▍     | 1350/3060 [08:56<09:19,  3.06it/s]

{'loss': 0.1206, 'grad_norm': 0.2858702838420868, 'learning_rate': 0.00012369432672700633, 'epoch': 26.47}


 45%|████▍     | 1375/3060 [09:06<11:08,  2.52it/s]

{'loss': 0.1271, 'grad_norm': 1.5910996198654175, 'learning_rate': 0.00012111546618144529, 'epoch': 26.96}


 46%|████▌     | 1400/3060 [09:16<10:06,  2.74it/s]

{'loss': 0.1158, 'grad_norm': 0.3061148226261139, 'learning_rate': 0.00011852182043258111, 'epoch': 27.45}


 47%|████▋     | 1425/3060 [09:25<09:31,  2.86it/s]

{'loss': 0.1271, 'grad_norm': 0.26312389969825745, 'learning_rate': 0.00011591520557006885, 'epoch': 27.94}


 47%|████▋     | 1450/3060 [09:34<09:55,  2.71it/s]

{'loss': 0.1188, 'grad_norm': 0.27801769971847534, 'learning_rate': 0.00011329744676463143, 'epoch': 28.43}


 48%|████▊     | 1475/3060 [09:44<09:57,  2.65it/s]

{'loss': 0.1217, 'grad_norm': 0.28262442350387573, 'learning_rate': 0.0001106703769900618, 'epoch': 28.92}


 49%|████▉     | 1500/3060 [09:54<09:57,  2.61it/s]

{'loss': 0.1157, 'grad_norm': 0.2023785263299942, 'learning_rate': 0.00010803583573976138, 'epoch': 29.41}


 50%|████▉     | 1525/3060 [10:03<09:40,  2.65it/s]

{'loss': 0.1204, 'grad_norm': 0.26079174876213074, 'learning_rate': 0.00010539566773871332, 'epoch': 29.9}


 51%|█████     | 1550/3060 [10:13<09:40,  2.60it/s]

{'loss': 0.1143, 'grad_norm': 0.2595822215080261, 'learning_rate': 0.00010275172165179268, 'epoch': 30.39}


 51%|█████▏    | 1575/3060 [10:23<09:25,  2.63it/s]

{'loss': 0.1184, 'grad_norm': 0.22340944409370422, 'learning_rate': 0.0001001058487893178, 'epoch': 30.88}


 52%|█████▏    | 1600/3060 [10:32<09:17,  2.62it/s]

{'loss': 0.1121, 'grad_norm': 0.2906337082386017, 'learning_rate': 9.74599018107492e-05, 'epoch': 31.37}


 53%|█████▎    | 1625/3060 [10:42<09:06,  2.63it/s]

{'loss': 0.1171, 'grad_norm': 0.22232216596603394, 'learning_rate': 9.481573342744419e-05, 'epoch': 31.86}


 54%|█████▍    | 1650/3060 [10:51<08:57,  2.62it/s]

{'loss': 0.1118, 'grad_norm': 0.1994546502828598, 'learning_rate': 9.217519510537456e-05, 'epoch': 32.35}


 55%|█████▍    | 1675/3060 [11:00<09:01,  2.56it/s]

{'loss': 0.1133, 'grad_norm': 0.23864911496639252, 'learning_rate': 8.954013576871691e-05, 'epoch': 32.84}


 56%|█████▌    | 1700/3060 [11:10<08:51,  2.56it/s]

{'loss': 0.1126, 'grad_norm': 0.2187819629907608, 'learning_rate': 8.691240050522216e-05, 'epoch': 33.33}


 56%|█████▋    | 1725/3060 [11:19<08:15,  2.69it/s]

{'loss': 0.1141, 'grad_norm': 0.20704321563243866, 'learning_rate': 8.429382927427199e-05, 'epoch': 33.82}


 57%|█████▋    | 1750/3060 [11:28<08:01,  2.72it/s]

{'loss': 0.1116, 'grad_norm': 0.18179135024547577, 'learning_rate': 8.168625561852559e-05, 'epoch': 34.31}


 58%|█████▊    | 1775/3060 [11:38<08:47,  2.43it/s]

{'loss': 0.1121, 'grad_norm': 0.1639907956123352, 'learning_rate': 7.90915053800599e-05, 'epoch': 34.8}


 59%|█████▉    | 1800/3060 [11:47<08:07,  2.59it/s]

{'loss': 0.1095, 'grad_norm': 0.20370130240917206, 'learning_rate': 7.651139542190164e-05, 'epoch': 35.29}


 60%|█████▉    | 1825/3060 [11:57<08:12,  2.51it/s]

{'loss': 0.1121, 'grad_norm': 0.19303682446479797, 'learning_rate': 7.394773235584651e-05, 'epoch': 35.78}


 60%|██████    | 1850/3060 [12:06<08:10,  2.47it/s]

{'loss': 0.1103, 'grad_norm': 0.23137012124061584, 'learning_rate': 7.14023112774566e-05, 'epoch': 36.27}


 61%|██████▏   | 1875/3060 [12:16<07:44,  2.55it/s]

{'loss': 0.1086, 'grad_norm': 0.20482197403907776, 'learning_rate': 6.887691450912112e-05, 'epoch': 36.76}


 62%|██████▏   | 1900/3060 [12:25<07:30,  2.58it/s]

{'loss': 0.1114, 'grad_norm': 0.2194482386112213, 'learning_rate': 6.637331035206166e-05, 'epoch': 37.25}


 63%|██████▎   | 1925/3060 [12:35<07:45,  2.44it/s]

{'loss': 0.1086, 'grad_norm': 0.21054568886756897, 'learning_rate': 6.389325184815438e-05, 'epoch': 37.75}


 64%|██████▎   | 1950/3060 [12:44<06:22,  2.90it/s]

{'loss': 0.1104, 'grad_norm': 0.2558169662952423, 'learning_rate': 6.14384755524377e-05, 'epoch': 38.24}


 65%|██████▍   | 1975/3060 [12:53<06:34,  2.75it/s]

{'loss': 0.1089, 'grad_norm': 0.26625439524650574, 'learning_rate': 5.9010700317163404e-05, 'epoch': 38.73}


 65%|██████▌   | 2000/3060 [13:03<06:27,  2.74it/s]

{'loss': 0.1091, 'grad_norm': 0.22427505254745483, 'learning_rate': 5.6611626088244194e-05, 'epoch': 39.22}


 66%|██████▌   | 2025/3060 [13:13<06:18,  2.74it/s]

{'loss': 0.1071, 'grad_norm': 0.27342817187309265, 'learning_rate': 5.424293271493881e-05, 'epoch': 39.71}


 67%|██████▋   | 2050/3060 [13:23<06:39,  2.53it/s]

{'loss': 0.1081, 'grad_norm': 0.21909743547439575, 'learning_rate': 5.190627877360953e-05, 'epoch': 40.2}


 68%|██████▊   | 2075/3060 [13:33<06:13,  2.63it/s]

{'loss': 0.108, 'grad_norm': 0.24414531886577606, 'learning_rate': 4.960330040637493e-05, 'epoch': 40.69}


 69%|██████▊   | 2100/3060 [13:43<06:24,  2.50it/s]

{'loss': 0.106, 'grad_norm': 0.218270406126976, 'learning_rate': 4.7335610175471036e-05, 'epoch': 41.18}


 69%|██████▉   | 2125/3060 [13:53<06:19,  2.46it/s]

{'loss': 0.1055, 'grad_norm': 0.22200289368629456, 'learning_rate': 4.510479593412383e-05, 'epoch': 41.67}


 70%|███████   | 2150/3060 [14:03<05:56,  2.55it/s]

{'loss': 0.1086, 'grad_norm': 0.2111799716949463, 'learning_rate': 4.2912419714722496e-05, 'epoch': 42.16}


 71%|███████   | 2175/3060 [14:12<05:35,  2.64it/s]

{'loss': 0.105, 'grad_norm': 0.19801850616931915, 'learning_rate': 4.076001663507325e-05, 'epoch': 42.65}


 72%|███████▏  | 2200/3060 [14:22<05:24,  2.65it/s]

{'loss': 0.107, 'grad_norm': 0.211821511387825, 'learning_rate': 3.864909382349849e-05, 'epoch': 43.14}


 73%|███████▎  | 2225/3060 [14:32<05:34,  2.49it/s]

{'loss': 0.1057, 'grad_norm': 0.21828190982341766, 'learning_rate': 3.6581129363534636e-05, 'epoch': 43.63}


 74%|███████▎  | 2250/3060 [14:41<05:28,  2.46it/s]

{'loss': 0.1062, 'grad_norm': 0.16990147531032562, 'learning_rate': 3.455757125896725e-05, 'epoch': 44.12}


 74%|███████▍  | 2275/3060 [14:51<05:19,  2.46it/s]

{'loss': 0.1033, 'grad_norm': 0.20619824528694153, 'learning_rate': 3.257983641992813e-05, 'epoch': 44.61}


 75%|███████▌  | 2300/3060 [15:01<05:09,  2.46it/s]

{'loss': 0.1082, 'grad_norm': 0.19844424724578857, 'learning_rate': 3.064930967076477e-05, 'epoch': 45.1}


 76%|███████▌  | 2325/3060 [15:11<05:04,  2.41it/s]

{'loss': 0.1043, 'grad_norm': 0.23884986340999603, 'learning_rate': 2.8767342780375926e-05, 'epoch': 45.59}


 77%|███████▋  | 2350/3060 [15:22<05:00,  2.37it/s]

{'loss': 0.1062, 'grad_norm': 0.18344789743423462, 'learning_rate': 2.693525351569347e-05, 'epoch': 46.08}


 78%|███████▊  | 2375/3060 [15:32<04:47,  2.38it/s]

{'loss': 0.1035, 'grad_norm': 0.20458543300628662, 'learning_rate': 2.515432471897221e-05, 'epoch': 46.57}


 78%|███████▊  | 2400/3060 [15:42<04:38,  2.37it/s]

{'loss': 0.1061, 'grad_norm': 0.1660047322511673, 'learning_rate': 2.3425803409534508e-05, 'epoch': 47.06}


 79%|███████▉  | 2425/3060 [15:52<04:32,  2.33it/s]

{'loss': 0.1024, 'grad_norm': 0.2324872612953186, 'learning_rate': 2.1750899910598087e-05, 'epoch': 47.55}


 80%|████████  | 2450/3060 [16:02<04:08,  2.45it/s]

{'loss': 0.1067, 'grad_norm': 0.1865738332271576, 'learning_rate': 2.01307870017991e-05, 'epoch': 48.04}


 81%|████████  | 2475/3060 [16:13<04:06,  2.37it/s]

{'loss': 0.1035, 'grad_norm': 0.1959274709224701, 'learning_rate': 1.856659909800318e-05, 'epoch': 48.53}


 82%|████████▏ | 2500/3060 [16:23<03:44,  2.49it/s]

{'loss': 0.1048, 'grad_norm': 0.19315871596336365, 'learning_rate': 1.7059431454979824e-05, 'epoch': 49.02}


 83%|████████▎ | 2525/3060 [16:33<03:42,  2.40it/s]

{'loss': 0.102, 'grad_norm': 0.20450405776500702, 'learning_rate': 1.5610339402496476e-05, 'epoch': 49.51}


 83%|████████▎ | 2550/3060 [16:43<03:14,  2.63it/s]

{'loss': 0.1057, 'grad_norm': 0.3233512043952942, 'learning_rate': 1.4220337605368816e-05, 'epoch': 50.0}


 84%|████████▍ | 2575/3060 [16:54<03:27,  2.34it/s]

{'loss': 0.1009, 'grad_norm': 0.2106303870677948, 'learning_rate': 1.2890399352985094e-05, 'epoch': 50.49}


 85%|████████▍ | 2600/3060 [17:04<03:00,  2.55it/s]

{'loss': 0.105, 'grad_norm': 0.22544576227664948, 'learning_rate': 1.1621455877801757e-05, 'epoch': 50.98}


 86%|████████▌ | 2625/3060 [17:14<02:34,  2.81it/s]

{'loss': 0.1041, 'grad_norm': 0.252002090215683, 'learning_rate': 1.0414395703287849e-05, 'epoch': 51.47}


 87%|████████▋ | 2650/3060 [17:24<02:36,  2.62it/s]

{'loss': 0.1015, 'grad_norm': 0.198775976896286, 'learning_rate': 9.270064021774194e-06, 'epoch': 51.96}


 87%|████████▋ | 2675/3060 [17:34<02:19,  2.76it/s]

{'loss': 0.1024, 'grad_norm': 0.2487664520740509, 'learning_rate': 8.189262102643746e-06, 'epoch': 52.45}


 88%|████████▊ | 2700/3060 [17:44<02:09,  2.77it/s]

{'loss': 0.1035, 'grad_norm': 0.2417977899312973, 'learning_rate': 7.172746731276769e-06, 'epoch': 52.94}


 89%|████████▉ | 2725/3060 [17:54<02:06,  2.64it/s]

{'loss': 0.101, 'grad_norm': 0.2261212170124054, 'learning_rate': 6.221229679144414e-06, 'epoch': 53.43}


 90%|████████▉ | 2750/3060 [18:03<01:51,  2.77it/s]

{'loss': 0.1029, 'grad_norm': 0.22063425183296204, 'learning_rate': 5.335377205420911e-06, 'epoch': 53.92}


 91%|█████████ | 2775/3060 [18:13<01:50,  2.58it/s]

{'loss': 0.101, 'grad_norm': 0.2392674833536148, 'learning_rate': 4.515809590464159e-06, 'epoch': 54.41}


 92%|█████████▏| 2800/3060 [18:23<01:50,  2.36it/s]

{'loss': 0.1026, 'grad_norm': 0.20660385489463806, 'learning_rate': 3.763100701490929e-06, 'epoch': 54.9}


 92%|█████████▏| 2825/3060 [18:33<01:37,  2.40it/s]

{'loss': 0.1022, 'grad_norm': 0.274705171585083, 'learning_rate': 3.0777775907507632e-06, 'epoch': 55.39}


 93%|█████████▎| 2850/3060 [18:44<01:28,  2.38it/s]

{'loss': 0.1018, 'grad_norm': 0.21628373861312866, 'learning_rate': 2.460320126480242e-06, 'epoch': 55.88}


 94%|█████████▍| 2875/3060 [18:54<01:15,  2.46it/s]

{'loss': 0.1029, 'grad_norm': 0.19541339576244354, 'learning_rate': 1.9111606568956497e-06, 'epoch': 56.37}


 95%|█████████▍| 2900/3060 [19:04<01:04,  2.49it/s]

{'loss': 0.1013, 'grad_norm': 0.22163191437721252, 'learning_rate': 1.4306837074597235e-06, 'epoch': 56.86}


 96%|█████████▌| 2925/3060 [19:15<00:54,  2.48it/s]

{'loss': 0.1007, 'grad_norm': 0.22889012098312378, 'learning_rate': 1.0192257116340197e-06, 'epoch': 57.35}


 96%|█████████▋| 2950/3060 [19:25<00:43,  2.54it/s]

{'loss': 0.1016, 'grad_norm': 0.24244970083236694, 'learning_rate': 6.770747753057749e-07, 'epoch': 57.84}


 97%|█████████▋| 2975/3060 [19:35<00:36,  2.31it/s]

{'loss': 0.1013, 'grad_norm': 0.23279084265232086, 'learning_rate': 4.044704750541084e-07, 'epoch': 58.33}


 98%|█████████▊| 3000/3060 [19:47<00:27,  2.17it/s]

{'loss': 0.1014, 'grad_norm': 0.2306096851825714, 'learning_rate': 2.0160369039670113e-07, 'epoch': 58.82}


 99%|█████████▉| 3025/3060 [19:56<00:14,  2.41it/s]

{'loss': 0.1028, 'grad_norm': 0.18696337938308716, 'learning_rate': 6.861647013461925e-08, 'epoch': 59.31}


100%|█████████▉| 3050/3060 [20:06<00:04,  2.43it/s]

{'loss': 0.1028, 'grad_norm': 0.2276427447795868, 'learning_rate': 5.601932888632533e-09, 'epoch': 59.8}


100%|██████████| 3060/3060 [20:13<00:00,  2.52it/s]

{'train_runtime': 1213.2627, 'train_samples_per_second': 9.99, 'train_steps_per_second': 2.522, 'train_loss': 0.326123194795808, 'epoch': 60.0}





TrainOutput(global_step=3060, training_loss=0.326123194795808, metrics={'train_runtime': 1213.2627, 'train_samples_per_second': 9.99, 'train_steps_per_second': 2.522, 'total_flos': 1.5212301025456128e+16, 'train_loss': 0.326123194795808, 'epoch': 60.0})

In [11]:
trainer.model.save_pretrained(new_model)

'''python
<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 22 Nov 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHow are A checks structured under Revision 28 of the A320’s MPD?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nA checks are grouped as a generic check and typically performed every 450 FH.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

In [None]:
question = "What is the range of seating capacities for the A320 family?"

prompt = (
    "<|begin_of_text|>"                              # start of prompt
    "<|start_header_id|>user<|end_header_id|>"       # user header
    f"{question}"                                    # user input
    "<|eot_id|>"                                     #end of turn
    "<|start_header_id|>assistant<|end_header_id|>"  #assistant header
)

print(prompt)

<|begin_of_text|><|start_header_id|>user<|end_header_id|>How are A checks structured under Revision 28 of the A320’s MPD?<|eot_id|><|start_header_id|>assistant<|end_header_id|>


In [13]:

# Ignore warnings
logging.set_verbosity(logging.CRITICAL)

# Run text generation pipeline with our next model
# prompt = "What is a large language model?"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(prompt)
print(result[0]['generated_text'])

<|begin_of_text|><|start_header_id|>user<|end_header_id|>How are A checks structured under Revision 28 of the A320’s MPD?<|eot_id|><|start_header_id|>assistant<|end_header_id|>How are A checks structured under Revision 28?


In [14]:

# Ignore warnings
logging.set_verbosity(logging.CRITICAL)

# Run text generation pipeline with our next model
# prompt = "What is a large language model?"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(prompt)
print(result[0]['generated_text'])

<|begin_of_text|><|start_header_id|>user<|end_header_id|>How are A checks structured under Revision 28 of the A320’s MPD?<|eot_id|><|start_header_id|>assistant<|end_header_id|>How are A checks structured under Revision 28?


In [15]:
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.04s/it]


In [None]:
# trainer.model.save_pretrained('/media/hamna/New Volume/pdf_fine_tune_llama_work/fine_tune_llama_32_3b_pdf/fine_tune_model_weights_60epochs')

In [16]:

import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [None]:
# 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

In [17]:
model.push_to_hub("Hassan883/llama_32_3b_fine_tune_model_50epoch_1", check_pr=True)

tokenizer.push_to_hub("Hassan883/llama_32_3b_fine_tune_model_50epoch_1",check_pr=True)

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]
model-00002-of-00002.safetensors: 100%|██████████| 1.46G/1.46G [10:15<00:00, 2.37MB/s] 

model-00001-of-00002.safetensors: 100%|██████████| 4.97G/4.97G [25:05<00:00, 3.30MB/s]

Upload 2 LFS files: 100%|██████████| 2/2 [25:05<00:00, 752.94s/it]
tokenizer.json: 100%|██████████| 17.2M/17.2M [00:08<00:00, 1.99MB/s]


CommitInfo(commit_url='https://huggingface.co/Hassan883/llama_32_3b_fine_tune_model_50epoch_1/commit/c24181ea42d796f37fde14e9b1398698ea149e40', commit_message='Upload tokenizer', commit_description='', oid='c24181ea42d796f37fde14e9b1398698ea149e40', pr_url=None, pr_revision=None, pr_num=None)

# Inferencing on finetuned model

In [6]:
model_name = 'Hassan883/llama_32_3b_fine_tune_model_50epoch_1'
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
bnb_config = BitsAndBytesConfig(
    load_in_4bit = use_4bit,
    bnb_4bit_quant_type = bnb_4bit_quant_type,
    bnb_4bit_computer_dtype = compute_dtype,
    bnb_4bit_use_double_quant = use_nested_quant
)

if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 8)
        print(" Your GPU supports bfloat16: accelearate training with bf16=True")
        print("=" * 80)

#Loading the base Model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config  =  bnb_config,
    device_map = device_map
)

model.config.use_cache = False
model.config.pretraining_tp = 1

#Now Loading the Llama Model architecture Tokenizier
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Unused kwargs: ['bnb_4bit_computer_dtype']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


 Your GPU supports bfloat16: accelearate training with bf16=True


Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.68s/it]


In [19]:
question = "What is the range of seating capacities for the A320 family?"

prompt = (
    "<|begin_of_text|>"                              # start of prompt
    "<|start_header_id|>user<|end_header_id|>"       # user header
    f"{question}"                                    # user input
    "<|eot_id|>"                                     #end of turn
    "<|start_header_id|>assistant<|end_header_id|>"  #assistant header
)

print(prompt)

<|begin_of_text|><|start_header_id|>user<|end_header_id|>What is the range of seating capacities for the A320 family?<|eot_id|><|start_header_id|>assistant<|end_header_id|>


In [25]:

# Ignore warnings
logging.set_verbosity(logging.CRITICAL)

# Run text generation pipeline with our next model
# prompt = "What is a large language model?"
def inference_func(model, tokenizer,prompt, max_seq_length=200):
    pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=max_seq_length)
    result = pipe(prompt)
    print(result[0]['generated_text'])

In [32]:
def prompt_template_func(question):
    question = question

    prompt = (
        "<|begin_of_text|>"                              # start of prompt
        "<|start_header_id|>user<|end_header_id|>"       # user header
        f"{question}"                                    # user input
        "<|eot_id|>"                                     #end of turn
        "<|start_header_id|>assistant<|end_header_id|>"  #assistant header
    )

    print(prompt)
    return prompt

In [27]:
inference_func(model=model,tokenizer=tokenizer, prompt=prompt)

<|begin_of_text|><|start_header_id|>user<|end_header_id|>What is the initial process for clearing technical defects in the A320?<|eot_id|><|start_header_id|>assistant<|end_header_id|>The process starts with logging and troubleshooting, followed by either clearing or deferring the defects.


In [35]:
test_dataset

Dataset({
    features: ['questions', 'answers', 'prompt'],
    num_rows: 23
})

In [11]:
test_dataset['questions']

['What is the estimated man-hour (MH) expenditure for pre-flight checks per year?',
 'How does the maintenance cost differ for the CFM56-5B and V.2500-A5 engines?',
 'What is the initial process for clearing technical defects in the A320?',
 'What type of pilot rating do A320 family variants share?',
 'How are A checks structured under Revision 28 of the A320’s MPD?',
 'What is the average cost for materials and consumables for weekly checks?',
 'What is the fuel capacity of the A320-200?',
 'Who was the first customer to place an order for the A320?',
 'How much additional fuel does the A319 burn compared to the A321 per passenger?',
 'What was the route used to analyze fuel burn performance?',
 'What MTOWs were analyzed for the A320?',
 'What is the estimated MH consumption for daily checks?',
 'Which engine variants power the A320 family?',
 'What is the reserve cost per engine flight hour (EFH) for the CFM56-5B powering the A320?',
 'How many passengers were analyzed on the A319 du

In [12]:
test_dataset['answers']

['About 180 MH per year.',
 'Maintenance costs can vary based on the engine type, operational conditions, and maintenance intervals.',
 'The process starts with logging and troubleshooting, followed by either clearing or deferring the defects.',
 'They have a common pilot type rating.',
 'A checks are grouped as a generic check and typically performed every 450 FH.',
 'Approximately $42,000 per year.',
 'The fuel capacity is 6,300 USG.',
 'Air France signed a letter of intent for 25 aircraft in 1981.',
 'About 1.5 US gallons more per passenger.',
 'The London Heathrow to Munich route.',
 '166,450lbs (75.5 tonnes) and 169,800lbs (77 tonnes).',
 'Approximately 1,250 MH annually.',
 'The A320 family is powered by CFM56-5A, CFM56-5B, and V.2500 engines.',
 'The reserve cost for the CFM56-5B is approximately $56 per EFH.',
 '124 passengers.',
 'It utilizes an onboard fault detection and analysis system and transmits fault messages to ground stations.',
 'A 2005 A321-200 can realize a lease 

In [13]:
question = 'How often are TR checks typically performed for the A320?'

prompt = prompt_template_func(question=question)

NameError: name 'prompt_template_func' is not defined

In [34]:
inference_func(model=model, tokenizer=tokenizer, prompt=prompt)

<|begin_of_text|><|start_header_id|>user<|end_header_id|>How often are TR checks typically performed for the A320?<|eot_id|><|start_header_id|>assistant<|end_header_id|>How often are TR checks typically performed for the A320? TR checks are typically performed every 5-6 months.


In [43]:
from datasets import Dataset
from transformers import pipeline
import logging
import re
# from datasets import load_metric
# Function for inference
def inference_func(model, tokenizer, prompt, max_seq_length=200):
    pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=max_seq_length)
    result = pipe(prompt)
    return result[0]['generated_text']

# Function to preprocess test dataset
def preprocess_prompt(question):
    prompt = (
        "<|begin_of_text|>"                               # Start of prompt
        "<|start_header_id|>system<|end_header_id|>\n\n"  # System role
        "Please answer the question factually and concisely.\n\n"  # Instruction
        "<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"  # User header
        f"{question}\n\n"                                 # User input
        "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"  # Assistant header
    )
    return prompt


# Load test dataset (Replace 'your_test_dataset' with your actual dataset)
# test_dataset = Dataset.from_dict({
#     "questions": [
#         "What engines are available for the A318?",
#         "What is the first step in the process of clearing technical defects?",
#         # Add other test questions here...
#     ],
#     "answers": [
#         "The A318 utilizes the CFM56-5B series and the PW6000 series.",
#         "Logging and troubleshooting the defects.",
#         # Add corresponding ground truth answers here...
#     ]
# })

# Evaluate the model on the test dataset
# Load F1 metric



In [51]:
from datasets import Dataset, Value
from transformers import pipeline, AutoTokenizer
import logging
import re
import evaluate

# Function for inference
def inference_func(model, tokenizer, prompt, max_seq_length=200):
    pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=max_seq_length)
    result = pipe(prompt)
    return result[0]['generated_text']

# Function to preprocess test dataset
def preprocess_prompt(question):
    prompt = (
        "<|begin_of_text|>"                               # Start of prompt
        "<|start_header_id|>system<|end_header_id|>\n\n"  # System role
        "Please answer the question factually and concisely.\n\n"  # Instruction
        "<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"  # User header
        f"{question}\n\n"                                 # User input
        "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"  # Assistant header
    )
    return prompt

# F1 metric
f1_metric = evaluate.load("f1")

# Normalize function for consistent string comparison
def normalize_answer(answer):
    """Normalize text for fair comparison."""
    if not isinstance(answer, str):
        answer = str(answer)  # Convert non-string types to strings
    answer = answer.lower().strip()
    answer = re.sub(r"[^a-z0-9\s]", "", answer)  # Remove special characters
    answer = re.sub(r"\s+", " ", answer)        # Normalize spaces
    return answer

# Function to clean and validate the dataset
def clean_dataset(dataset):
    """Ensure all entries in the dataset are strings."""
    cleaned_questions = []
    cleaned_answers = []
    for question, answer in zip(dataset["questions"], dataset["answers"]):
        if not isinstance(question, str):
            question = str(question)  # Convert to string if not already
        if not isinstance(answer, str):
            answer = str(answer)  # Convert to string if not already
        cleaned_questions.append(question.strip())
        cleaned_answers.append(answer.strip())
    return Dataset.from_dict({"questions": cleaned_questions, "answers": cleaned_answers})

# Updated evaluation function
def evaluate_model_with_f1(model, tokenizer, test_dataset):
    predictions = []
    references = []
    
    for example in test_dataset:
        # Prepare the prompt
        prompt = preprocess_prompt(example["questions"])

        # Generate the model's prediction
        generated_text = inference_func(model, tokenizer, prompt, max_seq_length=200)

        # Extract and clean prediction
        response_start = "<|start_header_id|>assistant<|end_header_id|>"
        if response_start in generated_text:
            prediction = generated_text.split(response_start, 1)[1].strip()
        else:
            prediction = generated_text.strip()

        predictions.append(prediction)
        references.append(example["answers"])

    # Validate data types
    for pred, ref in zip(predictions, references):
        print(f"Prediction: {pred} ({type(pred)}), Reference: {ref} ({type(ref)})")

test_dataset = clean_dataset(test_dataset)

# Ensure dataset columns are properly typed
test_dataset_t = test_dataset.cast_column("questions", Value("string"))
test_dataset_t = test_dataset.cast_column("answers", Value("string"))

# Example usage (Replace `model` and `tokenizer` with your fine-tuned LLaMA model and tokenizer)
# from transformers import AutoModelForCausalLM
# model = AutoModelForCausalLM.from_pretrained("your_model_path")
# tokenizer = AutoTokenizer.from_pretrained("your_model_path")

# Run evaluation
evaluate_model_with_f1(model, tokenizer, test_dataset_t)


Casting the dataset: 100%|██████████| 23/23 [00:00<00:00, 5714.31 examples/s]
Casting the dataset: 100%|██████████| 23/23 [00:00<00:00, 5503.39 examples/s]


Prediction: TR checks are typically performed every 5-6 months. (<class 'str'>), Reference: TR checks are performed about 1,480 times a year. (<class 'str'>)
Prediction: There are 97 A320s powered by CFM56-5B4 engines. (<class 'str'>), Reference: 766 aircraft are powered by CFM56-5B4 engines. (<class 'str'>)
Prediction: The A320 family has delivered over 1,700 aircraft until now. (<class 'str'>), Reference: More than 2,600 aircraft have been delivered. (<class 'str'>)
Prediction: The A318 received 95 orders, while the 737-600 received 127. (<class 'str'>), Reference: The A318's performance, while disappointing, still exceeds that of the 737-600. (<class 'str'>)
Prediction: The supplementary fuel tanks increase the total capacity to 7,000 USG. (<class 'str'>), Reference: The total capacity can be increased to 7,066 USG. (<class 'str'>)
Prediction: Approximately 180 MH annually. (<class 'str'>), Reference: Around 520 MH per year. (<class 'str'>)
Prediction: The CFM56-5B and V.2500-A5 ser

In [21]:
from datasets import Dataset, Value
from transformers import pipeline, AutoTokenizer
import re
import evaluate

# Function for inference
def inference_func(model, tokenizer, prompt, max_seq_length=200):
    """Generate text using the model and tokenizer."""
    pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=max_seq_length)
    result = pipe(prompt)
    return result[0]['generated_text']

# Function to preprocess a question into the desired prompt format
def preprocess_prompt(question):
    """Format a question into the desired prompt template."""
    prompt = (
        "<|begin_of_text|>"                               # Start of prompt
        "<|start_header_id|>system<|end_header_id|>\n\n"  # System role
        "Please answer the question factually and concisely.\n\n"  # Instruction
        "<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"  # User header
        f"{question}\n\n"                                 # User input
        "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"  # Assistant header
    )
    return prompt

# Normalize function for consistent string comparison
def normalize_answer(answer):
    """Normalize text for fair comparison."""
    if not isinstance(answer, str):
        answer = str(answer)  # Convert non-string types to strings
    answer = answer.lower().strip()
    answer = re.sub(r"[^a-z0-9\s]", "", answer)  # Remove special characters
    answer = re.sub(r"\s+", " ", answer)        # Normalize spaces
    return answer

# Function to clean and validate the dataset
def clean_dataset(dataset):
    """Ensure all entries in the dataset are strings."""
    cleaned_questions = []
    cleaned_answers = []
    for question, answer in zip(dataset["questions"], dataset["answers"]):
        if not isinstance(question, str):
            question = str(question)  # Convert to string if not already
        if not isinstance(answer, str):
            answer = str(answer)  # Convert to string if not already
        cleaned_questions.append(question.strip())
        cleaned_answers.append(answer.strip())
    return Dataset.from_dict({"questions": cleaned_questions, "answers": cleaned_answers})

# Updated inference function to handle unknown answers
def inference_with_default_response(model, tokenizer, question, max_seq_length=200):
    """Generate a response and return 'Sorry, I don't know' if the model does not provide a valid answer."""
    prompt = preprocess_prompt(question)
    generated_text = inference_func(model, tokenizer, prompt, max_seq_length=max_seq_length)

    # Extract the assistant's response
    response_start = "<|start_header_id|>assistant<|end_header_id|>"
    if response_start in generated_text:
        prediction = generated_text.split(response_start, 1)[1].strip()
    else:
        prediction = generated_text.strip()

    # Handle cases where the model cannot generate a response
    if not prediction or prediction.lower() in ["i don't know", "i am not sure", ""]:
        return "Sorry, I don't know."
    return prediction

# # Example Dataset (Replace this with your actual dataset)
# test_dataset = Dataset.from_dict({
#     "questions": [
#         "What engines are available for the A318?",
#         "What is the first step in the process of clearing technical defects?",
#         "How long is the wingspan of the A320?",
#         "What is the purpose of life?"  # Example for a question the model might not know
#     ],
#     "answers": [
#         "The A318 utilizes the CFM56-5B series and the PW6000 series.",
#         "Logging and troubleshooting the defects.",
#         "The wingspan of the A320 is approximately 34.1 meters.",
#         "Sorry, I don't know."
#     ],
# })

# Clean and validate dataset
test_dataset = clean_dataset(test_dataset)

# Ensure dataset columns are properly typed
test_dataset = test_dataset.cast_column("questions", Value("string"))
test_dataset = test_dataset.cast_column("answers", Value("string"))

# # Example usage (Replace `model` and `tokenizer` with your fine-tuned LLaMA model and tokenizer)
# from transformers import AutoModelForCausalLM
# model = AutoModelForCausalLM.from_pretrained("your_model_path")
# tokenizer = AutoTokenizer.from_pretrained("your_model_path")

# Testing the model on questions
for idx, example in enumerate(test_dataset):
    question = example["questions"]
    expected_answer = example["answers"]
    generated_answer = inference_with_default_response(model, tokenizer, question)
    print(f"Q{idx+1}: {question}")
    print(f"Generated Answer: {generated_answer}")
    print(f"Expected Answer: {expected_answer}")
    print("-" * 30)


Casting the dataset: 100%|██████████| 23/23 [00:00<00:00, 4848.42 examples/s]
Casting the dataset: 100%|██████████| 23/23 [00:00<00:00, 5542.60 examples/s]




Q1: What thrust ratings are available for the CFM56-5B engines on the A321?
Generated Answer: The thrust ratings are -5B4 (27,000 lbs), -5B1 (30,000 lbs), and -5B2 (31,000 lbs).
Expected Answer: The thrust ratings include -5B4 (27,000 lbs), -5B1 (30,000 lbs), -5B2 (31,000 lbs), and -5B3 (33,000 lbs).
------------------------------
Q2: How are post-flight technical logs maintained?
Generated Answer: Flight crew log ECAM messages in the post-flight technical log.
Expected Answer: Flight crew log ECAM messages in the post-flight technical log.
------------------------------
Q3: How many MH are typically consumed during a C8 check?
Generated Answer: About 818 MH.
Expected Answer: A C8 check can consume around 20,000 MH, including various tasks.
------------------------------
Q4: How does the A321’s seating capacity compare to the 737-900ER?
Generated Answer: The A321 has five more seats than the 737-900ER.
Expected Answer: The A321 has five more seats than the 737-900ER.
------------------