In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.utils.data import Dataset
from textbrewer import GeneralDistiller, TrainingConfig, DistillationConfig
from torch.optim import AdamW
from torch.utils.data import Dataset
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = "cpu"  # You can change to 'cuda' if GPU available

teacher_path =  r"C:\Users\211369\Desktop\program\distil\llama3.2_3b"
teacher = AutoModelForCausalLM.from_pretrained(
    teacher_path,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)
# teacher = AutoModelForCausalLM.from_pretrained(teacher_path).to(device)
tokenizer = AutoTokenizer.from_pretrained(teacher_path)


Loading checkpoint shards: 100%|██████████| 2/2 [00:16<00:00,  8.16s/it]


In [3]:
student_path =  r"C:\Users\211369\Desktop\program\distil\llama3.2_1b"
student = AutoModelForCausalLM.from_pretrained(student_path).to(device)

In [4]:

prompt_template = """You are a helpful assistant. return awb number, :
❗ Important:
- Do **not** use given examples for any fields
- Do **not** use any parts from the prompt as fields
- Only return the final JSON object.
- Do **not** add any explanation, markdown formatting, or code.
- Do **not** include backticks (```) or language tags like ```json.
- Do **not** generate Python or any other code.
- If data is missing, return `null`, but do not fabricate.
- Output must be valid and clean JSON.
- Only take values from the given mail
 
 
 
 
Subject: {subject}
From: {from_}
To: {to}
Body:{body}
######################################################
 You are assisting the Airline Cargo team in extracting specific business-critical entities from customer emails. Each email consists of a subject and body, both delimited by triple backticks (```).
 
You must return a JSON array of dictionaries, each representing one AWB (Air Waybill) entry with its corresponding entities. The structure should follow these rules:
Return a JSON array of dictionaries.
Each dictionary must include the extracted AWB and its associated entities.
Omit any fields not found in the email; DO NOT assume values.
 
The final output must be pure JSON: no explanations, no backticks, no extra text.
The json output should appear as per the following format
[
    {{
        "AWB": "",
        "FlightNo": "",
        "Departure-date": "",
        "total-pieces": ,
        "pieces@dimensions": [""],
        "dimension-unit": [""],
        "Weight": ,
        "weight-unit": "",
        "special-instruction": "",
        "commodity-description": "",
        "product-code": "",
        "Source": "",
        "Destination": ""
    }}
]
AWB (Air Waybill):
Must be 11-digit numbers starting with valid airline prefixes: <AWB_PREFIX>.
May be referred to as "MAWB" or "GUIA".
Remove hyphens or spaces.
One dictionary per AWB; multiple AWBs = multiple dictionaries.
 
FlightNo:
Must start with valid carrier codes: <AIRLINE_PREFIX>.
Format: airline code + number
do not take the date value for the flight number if there is flight date attached with flight number (eg KE706/18APR in this only take KE706)
if no values are found keep as null
 
Departure-date:
Extract in YYYY-MM-DD format.
If given as a range like 23/24/07, choose the latest date (i.e., 2025-07-24).
If given as a relative day (e.g., "next Monday"), assume today's date is 2025-03-22 (Saturday) and resolve accordingly.
 
total-pieces:
Integer value representing total cargo pieces.
 
pieces@dimensions:
Format: list like ["2@24x17x9"].
May appear as pcs x l x b x h or pcs @ l x b x h.
Extract all combinations; prioritize individual dimensions over total.
 
dimension-unit:
Supported units: "CM", "M", "IN", "OTH".
Provide as a list matching the sequence of pieces@dimensions.
 
Weight:
If individual weights are given, compute the total.
Use chargeable weight (CW) or gross weight (G/W) if explicitly mentioned.
If weight is embedded in piece-dimension combos, extract accordingly.
 
weight-unit:
Supported values: "KG", "KGS", "LBS", "OTH".
 
special-instruction:
Extract any special handling notes.
Always translate to English.
 
commodity-description:
Free text describing the goods.
Always translate to English.
 
product-code:
If not explicitly given, infer from commodity description:
"GEN" for general cargo
"HAZ" for hazardous materials
"DG" for dangerous goods
 
Source / Destination:
Extract from IATA codes in formats like EWR-OME (EWR = Source, OME = Destination).
Do not assume source location from sender’s location or from flight number.
 
Must Translate all extracted text into  json.
Do not fabricate missing values.
Always return a clean JSON output only — no markdown, no backticks, no wrapping text.
Must Not generate anything other than the base json file. Do NOT generate any code.
Only the output is required do not generate anything else
 
since there is a json format given generate just as the json format. Do not generate in loop. if it start to generate in loop stop the generation.
 
"""

     

In [5]:
class PromptResponseDataset(Dataset):
    def __init__(self, file_path, tokenizer, prompt_template, max_length=512):
        self.tokenizer = tokenizer
        self.prompt_template = prompt_template  # <-- store the prompt
        self.max_length = max_length
        self.samples = []

        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        for item in data:
            subject = item.get("subject", "")
            from_ = item.get("from", "")
            to = item.get("to", "")
            body = item.get("body", "")

            self.samples.append({
                "subject": subject,
                "from_": from_,
                "to": to,
                "body": body
            })

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples[idx]
        subject = sample["subject"]
        from_ = sample["from_"]
        to = sample["to"]
        body = sample["body"]

        # Use the shared prompt template
        prompt = self.prompt_template.format(subject=subject, from_=from_, to=to, body=body)

        inputs = self.tokenizer(
            prompt,
            return_tensors='pt',
            max_length=self.max_length,
            truncation=True,
            padding='max_length'
        )

        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "labels": inputs["input_ids"].squeeze(0).clone(),
            "prompt": prompt,
            "subject": subject,
            "from_": from_,
            "to": to,
            "body": body
        }


In [6]:
file_path = r"C:\Users\211369\Desktop\program\structured_outputs.json"
dataset = PromptResponseDataset(file_path, tokenizer, prompt_template)
train_loader = torch.utils.data.DataLoader(dataset, batch_size=4, shuffle=True)  

In [7]:
from torch.utils.data import DataLoader
tokenizer.pad_token = tokenizer.eos_token
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

for batch in dataloader:
    prompts = batch["prompt"]  # batch["prompt"] is a list of strings
    for prompt in prompts:
        print("📨 Prompt:\n", prompt)
    break



📨 Prompt:
 You are a helpful assistant. return awb number, :
❗ Important:
- Do **not** use given examples for any fields
- Do **not** use any parts from the prompt as fields
- Only return the final JSON object.
- Do **not** add any explanation, markdown formatting, or code.
- Do **not** include backticks (```) or language tags like ```json.
- Do **not** generate Python or any other code.
- If data is missing, return `null`, but do not fabricate.
- Output must be valid and clean JSON.
- Only take values from the given mail




Subject: [External] <AR> : EY9557/ 11FEB2024 DXBAUH
From: Shaheen Shafeek <SShafeek@etihad.ae>
To: 
Body:ATTENTION: This email originated from a source outside of our organization. Please ensure that you recognize the sender and the content is safe before you open any attachments or click any links.
EY BOOKING
Please note below booking is not ready to join subject trucking, please advise Shipper to approach DNATA and complete Security formalities.
Booking is now Q

In [8]:
tokenizer.pad_token = tokenizer.eos_token
batch = next(iter(train_loader))
with torch.no_grad():
    t_out = teacher(**batch)
s_out = student(**batch)
print(t_out.logits.shape)  # [B, T, V]
print(s_out.logits.shape)

torch.Size([1, 512, 128256])
torch.Size([1, 512, 128256])


In [9]:
optimizer = AdamW(student.parameters(), lr=5e-5)

train_config = TrainingConfig(
    device='cpu',                  # or 'cpu' if no GPU
    output_dir='./saved_model',
    log_dir='./log',
)

distill_config = DistillationConfig(
    temperature=2.0,
    kd_loss_type='ce',              # 'ce' for KLDiv, 'mse' for regression
    kd_loss_weight=1.0,             # use 1.0 for pure distillation
    hard_label_weight=1.0,          # set 0.0 to ignore ground-truth
)

In [10]:

def get_adaptor(batch, model_output):
    return {
        "logits": model_output.logits,
        "labels": batch.get("labels", None) 
    }

def batch_postprocessor(batch):
    return {
        "input_ids": batch["input_ids"].to(device),
        "attention_mask": batch["attention_mask"].to(device),
        "labels": batch["labels"].to(device),
    }
def callback(step=None, loss=None, lr=None, model=None):
    if loss is not None:
        print(f"[Step {step}] Total Loss: {loss:.4f} | LR: {lr:.6f}")
    else:
        print(f"[Step {step}] Loss is None")


In [11]:
tokenizer.pad_token = tokenizer.eos_token

distiller = GeneralDistiller(
    train_config=train_config,
    distill_config=distill_config,
    model_T=teacher,
    model_S=student,
    adaptor_T=get_adaptor,
    adaptor_S=get_adaptor,
    
)


In [12]:
print(distiller.__class__)


<class 'textbrewer.distiller_general.GeneralDistiller'>


In [13]:
# batch = next(iter(train_loader))
# loss, loss_dict = distiller.train_on_batch(batch, {})
# print("Loss:", loss)


In [14]:
tokenizer.pad_token = tokenizer.eos_token
print(distiller.__class__)
from types import MethodType

def force_general_train(self, *args, **kwargs):
    print("✅ This is GeneralDistiller.train()")
    return GeneralDistiller.train(self, *args, **kwargs)

distiller.train = MethodType(force_general_train, distiller)

original_train_on_batch = distiller.train_on_batch

def wrapped_train_on_batch(self, batch, args):
    loss, loss_dict = original_train_on_batch(batch, args)
    print(f"[DISTILL STEP] Loss: {loss.item():.4f}")
    return loss, loss_dict

from types import MethodType
distiller.train_on_batch = MethodType(wrapped_train_on_batch, distiller)

distiller.train_with_num_epochs(
    optimizer=optimizer,
    scheduler=None,
    tqdm_disable=False,
    dataloader=train_loader,
    max_grad_norm=1.0,
    num_epochs=5,
    callback=callback,
    batch_postprocessor=None
)   

<class 'textbrewer.distiller_general.GeneralDistiller'>


  0%|          | 0/5 [00:00<?, ?it/s]

[DISTILL STEP] Loss: 9.6001


1it [07:29, 449.20s/it]
 20%|██        | 1/5 [07:29<29:56, 449.22s/it]

[Step 1] Loss is None




[DISTILL STEP] Loss: 10.1587


1it [06:36, 396.78s/it]
 40%|████      | 2/5 [14:06<20:55, 418.44s/it]

[Step 2] Loss is None




[DISTILL STEP] Loss: 9.9698


1it [05:13, 313.59s/it]
 60%|██████    | 3/5 [19:19<12:21, 370.57s/it]

[Step 3] Loss is None




[DISTILL STEP] Loss: 9.7829


1it [05:36, 336.62s/it]
 80%|████████  | 4/5 [24:56<05:57, 357.17s/it]

[Step 4] Loss is None




[DISTILL STEP] Loss: 9.6582


1it [05:28, 328.36s/it]
100%|██████████| 5/5 [30:24<00:00, 364.95s/it]

[Step 5] Loss is None





In [15]:
distilled_model=r"C:\Users\211369\Desktop\program\distil\distil2"
student.save_pretrained(distilled_model)
tokenizer.save_pretrained(distilled_model)

('C:\\Users\\211369\\Desktop\\program\\distil\\distil2\\tokenizer_config.json',
 'C:\\Users\\211369\\Desktop\\program\\distil\\distil2\\special_tokens_map.json',
 'C:\\Users\\211369\\Desktop\\program\\distil\\distil2\\tokenizer.json')

In [16]:
import email
from email import policy

# Load the .eml file
with open(r"C:\Users\211369\Desktop\New folder (3)\[External] (AR) NO SHOW   EY9557 _ 26JAN24_ DXBAUH - Cargo Handling DXB (cargohandling_DXB@etihad.ae) - 2024-01-26 2052.eml", "r", encoding="utf-8") as f:
    msg = email.message_from_file(f, policy=policy.default)

# Extract parts
subject = msg["subject"]
from_ = msg["from"]
to = msg["to"]
date = msg["date"]

# Extract body (handles plain or multipart)
if msg.is_multipart():
    for part in msg.walk():
        if part.get_content_type() == "text/plain":
            body = part.get_content()
            break
else:
    body = msg.get_content()

# Combine all into one string
email_string = f"""From: {from_}
To: {to}
Date: {date}
Subject: {subject}

{body}
"""

NameError: name 'body' is not defined

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Path to your local model (e.g., distilled_model)
model_path = r"C:\Users\211369\Desktop\program\distil\distil_llm"
device = "cpu"

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path).to(device)

# Set pad token (important for causal models)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Input prompt
prompt =  email_string


# Tokenize input
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Generate output
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_length=2000,
        do_sample=True,
        top_k=10,
        top_p=0.5,
        temperature=0.9,
        repetition_penalty=1.2,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id
    )

# Decode and print
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("🧠 Model response:\n", response)
