In [None]:
from transformers import TrainerCallback, TrainerState

class CaptureMetricsCallback(TrainerCallback):
    def on_log(self, args, state: TrainerState, control, model, logs=None, **kwargs):
        if logs is not None:
            with open('training_logs.txt', 'a') as f:
                f.write(f"Epoch: {state.epoch}, Step: {state.global_step}, Training Loss: {logs.get('loss', 'N/A')}, Validation Loss: {logs.get('eval_loss', 'N/A')}\n")

#Training with intention or procedures only
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM , Seq2SeqTrainer, Seq2SeqTrainingArguments
from torch.utils.data import Dataset
import json
from sklearn.model_selection import train_test_split
import os 
import numpy as np
import random

def set_seed(seed_value=42):
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    np.random.seed(seed_value)
    random.seed(seed_value)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)

os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
os.environ['CUDA_VISIBLE_DEVICES'] = '2'

torch.cuda.empty_cache()

tokenizer = AutoTokenizer.from_pretrained('google/flan-t5-large')

class T5Dataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels['input_ids'][idx])
        return item

    def __len__(self):
        return len(self.encodings.input_ids)

with open('FlanT5LargeProcedure (Data with GPT)/train_data_v2-1.json', 'r') as f:
    train_data = json.load(f)

with open('FlanT5LargeProcedure (Data with GPT)/val_data_v2.json', 'r') as f:
    val_data = json.load(f)

train_texts = []
train_labels = []
val_texts = []
val_labels = []

for item in train_data:
    input_text = f"Generate a procedure based on the text: {item['raw_text']} and the intention: {item['intention']}"
    train_texts.append(input_text)
    train_labels.append(f"{item['procedure']}")  

for item in val_data:
    input_text = f"Generate a procedure based on the text: {item['raw_text']} and the intention: {item['intention']}"
    val_texts.append(input_text)
    val_labels.append(f"{item['procedure']}")
    
#n = 5  
#for sample_index in range(n):
#    print(f"Sample {sample_index+1}")
#    print("Original Text: ", train_texts[sample_index])
#    print("Tokenized Text: ", tokenizer.decode(train_encodings['input_ids'][sample_index]))
#    print("Original Label: ", train_labels[sample_index])
#    print("Tokenized Label: ", tokenizer.decode(tokenizer(train_labels[sample_index])['input_ids']))
#    print("------")\
    
#for item in train_data:
#    train_texts.append(item['intention'])
#    train_labels.append(f"Procedure: {item['procedure']}")

#for item in val_data:
#    val_texts.append(item['intention'])
#    val_labels.append(f"Procedure: {item['procedure']}")    
                        
#for item in train_data:
#    train_texts.append(item['raw_text'])
#    train_labels.append(f"Procedure: {item['procedure']}")

#for item in val_data:
#    val_texts.append(item['raw_text'])
#   val_labels.append(f"Procedure: {item['procedure']}")

train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
train_labels = tokenizer(train_labels, truncation=True, padding=True)
val_labels = tokenizer(val_labels, truncation=True, padding=True)



train_dataset = T5Dataset(train_encodings, train_labels)
val_dataset = T5Dataset(val_encodings, val_labels)

training_args = Seq2SeqTrainingArguments(
    output_dir='FlanT5LargeProcedure (Data with GPT)',          
    num_train_epochs=12,              
    per_device_train_batch_size=4,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    #warmup_steps=140,
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logsfolder',            # directory for storing logs
    load_best_model_at_end=True,     
    logging_steps=10,                # log & save weights each logging_steps
    evaluation_strategy='epoch',     # evaluation strategy to adopt during training
    save_strategy='epoch',
    learning_rate=5e-5,              # learning rate
    report_to=[],                
)

#model = AutoModelForSeq2SeqLM.from_pretrained('t5-large')
#model = AutoModelForSeq2SeqLM.from_pretrained('t5-large')
model = AutoModelForSeq2SeqLM.from_pretrained('FlanT5LargeProcedure (Data with GPT)/checkpoint-1190')

torch.cuda.empty_cache()

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[CaptureMetricsCallback()],
)

trainer.train(resume_from_checkpoint=True)
#trainer.train()

trainer.save_model('./FlanT5LargeProcedure -- (Data with GPT))')

In [None]:
!pip install rouge_score

In [1]:
import json
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model = AutoModelForSeq2SeqLM.from_pretrained('FlanT5LargeProcedure (Data with GPT)/checkpoint-1309')
tokenizer = AutoTokenizer.from_pretrained('google/flan-t5-large')

scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

with open('FlanT5LargeProcedure (Data with GPT)/test_data_v2.json', 'r') as f:
    data = json.load(f)

#raw_texts = [f"{item['raw_text']} [SEP] {item['intention']}" for item in data]
raw_texts = [f"Generate a procedure based on the text: {item['raw_text']} and the intention: {item['intention']}" for item in data]
#raw_texts = [f"{item['intention']}" for item in data]

inputs = tokenizer(raw_texts, return_tensors='pt', truncation=True, padding=True)

outputs = model.generate(**inputs, max_length=256)

predictions = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

data_with_predictions = []
for item, prediction in zip(data, predictions):
    new_item = item.copy()
    new_item['model_output'] = prediction  # The generated procedure

    correct_output = f"{item['procedure']}"
    scores = scorer.score(correct_output, prediction)
    
    # Include ROUGE scores as individual fields
    new_item['rouge1'] = scores['rouge1'].fmeasure
    new_item['rougeL'] = scores['rougeL'].fmeasure

    # For BLEU
    reference = [correct_output.split()]
    candidate = prediction.split()
    bleu_score = sentence_bleu(reference, candidate)
    new_item['bleu_score'] = bleu_score

    data_with_predictions.append(new_item)

with open('Procedure_Predictions(FLAN-T5(1309)).json', 'w') as f:
    json.dump(data_with_predictions, f)

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
