In [23]:
import torch, os
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from datasets import load_from_disk, Dataset
import pandas as pd

In [24]:
device = "cuda" if torch.cuda.is_available() else "cpu"
torch.cuda.empty_cache()

torch.set_float32_matmul_precision("high")  # Optimize matrix multiplications

In [25]:
df = pd.read_csv("../airline_incidents.csv")  # Ensure the file is in the same directory
df = df.dropna()  # Remove missing values
df = df.sample(frac=1).reset_index(drop=True)  # Shuffle dataset

In [26]:
# Define the Model and Tokenizer
MODEL_NAME = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME).to(device)

In [27]:
# Prepare Dataset for T5 (Text-to-Text)
def preprocess_function(example):
    """Tokenizes input and output text for T5"""
    model_input = f"incident: {example['report']}"  # Prompting format
    target_text = f"failure: {example['part failure']}"

    model_inputs = tokenizer(model_input, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(target_text, max_length=128, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]  # Assign decoder labels

    return model_inputs

In [28]:
# Convert DataFrame to HuggingFace Dataset
if os.path.exists("../processed_dataset"):
    # Load from disk if it exists
    dataset = load_from_disk("../processed_dataset")
else:
    # Process and save if not found
    dataset = Dataset.from_pandas(df)
    dataset = dataset.map(preprocess_function, remove_columns=["report", "part failure"])
    dataset.save_to_disk("processed_dataset")



In [29]:
# Train-Test Split
dataset = dataset.train_test_split(test_size=0.2)  # 80% Training, 20% Validation


In [30]:
# Data Collator (Pads batch inputs dynamically)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [31]:
training_args = TrainingArguments(
    output_dir="../t5_airline_incidents",
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    gradient_accumulation_steps=1,  # Helps with large models
    bf16=False,
    fp16=True ,
    save_total_limit=2,  # Manage checkpoints
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-3,
    weight_decay=0.01,
    lr_scheduler_type="linear",
    optim="adamw_torch_fused",  # Optimized optimizer for ROCm
    report_to="none",
    logging_strategy = "steps",
    logging_steps = 100,  # Log every 100 steps
    eval_steps = 500  # Evaluate every 500 training steps

)



In [32]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

  trainer = Trainer(


In [33]:
# Start Training
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.0428,0.037454
2,0.0349,0.032665
3,0.0323,0.030881


TrainOutput(global_step=3753, training_loss=0.056021749957415254, metrics={'train_runtime': 1737.1039, 'train_samples_per_second': 138.199, 'train_steps_per_second': 2.16, 'total_flos': 3.249096491217715e+16, 'train_loss': 0.056021749957415254, 'epoch': 3.0})

In [34]:
# SAVE THE FINE-TUNED MODEL
model.save_pretrained("./t5_finetuned_airline_incidents")
tokenizer.save_pretrained("./t5_finetuned_airline_incidents")

('./t5_finetuned_airline_incidents/tokenizer_config.json',
 './t5_finetuned_airline_incidents/special_tokens_map.json',
 './t5_finetuned_airline_incidents/spiece.model',
 './t5_finetuned_airline_incidents/added_tokens.json')