In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader, Dataset


In [2]:
model_path = r"d:\models\llama8b"
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_path)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [3]:
# Define LoRA adapter config
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],  # LLaMA convention
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)

# Attach LoRA adapters
model = get_peft_model(model, lora_config)

print(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): Linear(in_fe

In [4]:
import json
from torch.utils.data import Dataset, DataLoader
class EmailDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data[idx]["input_text"]
        encodings = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        input_ids = encodings["input_ids"].squeeze(0)
        attention_mask = encodings["attention_mask"].squeeze(0)
        # Labels = input_ids for self-supervised LM
        labels = input_ids.clone()

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }


In [None]:
#prompt template all the mail contents are passed onto the prompt 


prompt_template ="""
You are a cargo email information extraction assistant. Your job is to extract structured JSON data from a single airline cargo-related email.
Subject: {subject}
From: {from_}
To: {to}
Body:{body}
Your task is to extract only the entities listed below from this one email. Output a clean, valid **JSON array of dictionaries**, where each dictionary corresponds to one AWB (Air Waybill). Do **not** generate anything outside the JSON.
Each dictionary should contain only the following fields:
[
    {{
        "AWB": "",
        "FlightNo": "",
        "Departure-date": "",
        "total-pieces": ,
        "pieces@dimensions": [""],
        "dimension-unit": [""],
        "Weight": ,
        "weight-unit": "",
        "special-instruction": "",
        "commodity-description": "",
        "product-code": "",
        "Source": "",
        "Destination": ""
    }}
]

### EXTRACTION RULES:
AWB (Air Waybill):
Must be 11-digit numbers starting with valid airline prefixes: <AWB_PREFIX>.
May be referred to as "MAWB" or "GUIA".
Remove hyphens or spaces.
One dictionary per AWB; multiple AWBs = multiple dictionaries.
 
FlightNo:
Must start with valid carrier codes: <AIRLINE_PREFIX>.
Format: airline code + number
do not take the date value for the flight number if there is flight date attached with flight number (eg KE706/18APR in this only take KE706)
if no values are found keep as null
 
Departure-date:
Extract in YYYY-MM-DD format.
If given as a range like 23/24/07, choose the latest date (i.e., 2025-07-24).
If given as a relative day (e.g., "next Monday"), assume today's date is 2025-03-22 (Saturday) and resolve accordingly.
 
total-pieces:
Integer value representing total cargo pieces.
 
pieces@dimensions:
Format: list like ["2@24x17x9"].
May appear as pcs x l x b x h or pcs @ l x b x h.
Extract all combinations; prioritize individual dimensions over total.
 
dimension-unit:
Supported units: "CM", "M", "IN", "OTH".
Provide as a list matching the sequence of pieces@dimensions.
 
Weight:
If individual weights are given, compute the total.
Use chargeable weight (CW) or gross weight (G/W) if explicitly mentioned.
If weight is embedded in piece-dimension combos, extract accordingly.
 
weight-unit:
Supported values: "KG", "KGS", "LBS", "OTH".
 
special-instruction:
Extract any special handling notes.
Always translate to English.
 
commodity-description:
Free text describing the goods.
Always translate to English.
 
product-code:
If not explicitly given, infer from commodity description:
"GEN" for general cargo
"HAZ" for hazardous materials
"DG" for dangerous goods
 
Source / Destination:
Extract from IATA codes in formats like EWR-OME (EWR = Source, OME = Destination).
Do not assume source location from sender's location or from flight number.

### EXTRACTION RULES:
- Extract **only for the single email above**.
- Output must include only fields you found.
- If a field is missing, return null (no guessing).
- Do NOT include any other text, explanation, markdown, or tags.
- Output must be valid JSON and must **end after the first array**.

### STOP CONDITIONS:
- ❌ Do NOT repeat the JSON.
- ❌ Do NOT generate for other emails.
- ✅ Your response must end immediately after the closing square bracket `]`.
- ✅ Append `### END JSON` after the closing bracket to indicate completion.
- ⚠️ If your generation continues after that, it will be rejected.

Now return only the extracted JSON output for the above email.
ABSOLUTLEY NO EXPLANATION
OMIT ALL EXPLANATIONS AND FORCE STOP GENERATION AFTER JSON FILES ARE GENERATED
"""

In [6]:
import json
with open(r"D:\crf\processed_emails_with_ground_truth.json", "r", encoding="utf-8") as f:
    raw_data = json.load(f)

# Preprocess
processed = []
for item in raw_data:
    subject = item.get("subject", "")
    body = item.get("body", "")
    sender = item.get("from", "")
    receiver = item.get("to", "")
    text = f"{prompt_template} Subject: {subject} | Body: {body} | From: {sender} | To: {receiver}"
    processed.append({"input_text": text})

# Initialize dataset and dataloader
dataset = EmailDataset(processed, tokenizer)

In [7]:
from datasets import Dataset
tokenizer.pad_token = tokenizer.eos_token

# Convert your processed list of dicts to a HuggingFace dataset
hf_dataset = Dataset.from_list(processed)

# Define tokenizer function
def tokenize(batch):
    return tokenizer(batch["input_text"], truncation=True, padding="max_length", max_length=128)

# Apply tokenizer to the dataset
tokenized_ds = hf_dataset.map(tokenize, batched=True)

# Optional: set format to PyTorch tensors
tokenized_ds.set_format(type="torch", columns=["input_ids", "attention_mask"])


Map:   0%|          | 0/148 [00:00<?, ? examples/s]

In [8]:
# For causal LM, labels = input_ids
def add_labels(batch):
    batch["labels"] = batch["input_ids"].clone()
    return batch

tokenized_ds = tokenized_ds.map(add_labels)

tokenized_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    learning_rate=5e-5,
    weight_decay=0.01,
    fp16=True,  # if using GPU
)

from transformers import AutoModelForCausalLM

# Replace 'model_name' with your model checkpoint
# model = AutoModelForCausalLM.from_pretrained("model")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds
)

trainer.train()




Map:   0%|          | 0/148 [00:00<?, ? examples/s]

Step,Training Loss
10,2.7726
20,2.2506
30,1.6027
40,0.8823
50,0.4873
60,0.3083
70,0.2082
80,0.1039
90,0.0584
100,0.0494


TrainOutput(global_step=222, training_loss=0.414139936434793, metrics={'train_runtime': 87.5565, 'train_samples_per_second': 5.071, 'train_steps_per_second': 2.536, 'total_flos': 2561443387932672.0, 'train_loss': 0.414139936434793, 'epoch': 3.0})

In [None]:
model.save_pretrained("./8b_adptv2")
tokenizer.save_pretrained("./8b_adptv2")