In [18]:
from transformers import BartTokenizer, BartForConditionalGeneration
from transformers import Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch
import pandas as pd

In [None]:
# Read the labelled CSV (cp1252-encoded) and keep only the rows at
# positions 25 through 150 for this experiment.
data = pd.read_csv("data/labeled01.csv", encoding='cp1252').iloc[25:151]

In [None]:
# Hold out 10% of the rows for validation. The seed is pinned so the same
# rows land in the validation set on every Restart & Run All; without it the
# split (and therefore every reported metric) changes from run to run.
train_data, val_data = train_test_split(data, test_size=0.1, random_state=42)

In [None]:
# Define a function to encode the data
def encode_data(data, tokenizer, max_input_length=512, max_target_length=128):
    """Tokenize source texts and target summaries into fixed-length tensors.

    Args:
        data: DataFrame whose rows expose a ``resolution`` attribute (the
            source text) and a ``summary`` attribute (the target text).
        tokenizer: Callable tokenizer that returns a mapping with
            ``'input_ids'`` and ``'attention_mask'`` when called with
            ``max_length``, ``padding`` and ``truncation``.
        max_input_length: Length every source sequence is padded/truncated to.
        max_target_length: Length every target sequence is padded/truncated to.

    Returns:
        Tuple ``(input_ids, attention_masks, labels)`` of ``torch.long``
        tensors, one row per row of ``data``. Padding positions in ``labels``
        are set to -100.
    """
    input_ids = []
    attention_masks = []
    labels = []
    for row in data.itertuples():
        source = tokenizer(
            row.resolution,
            max_length=max_input_length,
            padding='max_length',
            truncation=True,
        )
        input_ids.append(source['input_ids'])
        attention_masks.append(source['attention_mask'])

        target = tokenizer(
            row.summary,
            max_length=max_target_length,
            padding='max_length',
            truncation=True,
        )
        # Padding in the labels must not contribute to the loss. -100 is the
        # index ignored by the cross-entropy loss used by the Hugging Face
        # seq2seq models. The attention mask (rather than the pad token id)
        # identifies padding, so real tokens are never masked by accident.
        labels.append([
            token_id if is_real_token else -100
            for token_id, is_real_token in zip(target['input_ids'], target['attention_mask'])
        ])

    # Explicit dtype keeps the tensors integer-typed even when `data` is empty.
    input_ids = torch.tensor(input_ids, dtype=torch.long)
    attention_masks = torch.tensor(attention_masks, dtype=torch.long)
    labels = torch.tensor(labels, dtype=torch.long)
    return input_ids, attention_masks, labels

In [None]:
from transformers import T5Tokenizer
import torch

# Tokenizer for the 't5-base' checkpoint
tokenizer = T5Tokenizer.from_pretrained('t5-base')

# Encode each split once, then unpack into the individual tensors
train_tensors = encode_data(train_data, tokenizer)
val_tensors = encode_data(val_data, tokenizer)
train_input_ids, train_attention_masks, train_labels = train_tensors
val_input_ids, val_attention_masks, val_labels = val_tensors

# Wrap each (input_ids, attention_mask, labels) triple as a PyTorch dataset
train_dataset = torch.utils.data.TensorDataset(*train_tensors)
val_dataset = torch.utils.data.TensorDataset(*val_tensors)

In [None]:
from transformers import T5ForConditionalGeneration

# Checkpoint the model weights are initialised from
model_checkpoint = 't5-base'
model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)

In [None]:
# Training configuration: evaluate and checkpoint once per epoch so that the
# best checkpoint can be reloaded when training finishes.
training_args = TrainingArguments(
    output_dir='./results',
    # Optimisation
    num_train_epochs=20,
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Evaluation / checkpointing
    evaluation_strategy='epoch',
    save_strategy='epoch',  # kept identical to the evaluation strategy
    load_best_model_at_end=True,  # reload the best checkpoint after training
    save_total_limit=3,  # cap on the number of checkpoints kept on disk
    # Logging
    report_to=[],  # empty list: no reporting integrations requested
)

# Earlier draft of the evaluation-metrics function, kept commented out (inactive):
# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     predictions = torch.argmax(logits, dim=-1)
#     acc = (predictions == labels).float().mean().item()
#     return {'accuracy': acc}

In [None]:
def compute_metrics(eval_pred):
    """Compute token-level accuracy of the arg-max predictions.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` is either
            the logits array itself or a tuple/list whose first element is
            the logits array; ``labels`` holds the target token ids.

    Returns:
        ``{'accuracy': float}``: the fraction of label positions whose
        arg-max prediction equals the label. Positions labelled -100 (the
        ignore index) are excluded; if every position is ignored the accuracy
        is reported as 0.0 rather than NaN.
    """
    logits, labels = eval_pred
    # Only unwrap when the predictions really are a tuple/list of outputs;
    # indexing a plain logits array with [0] would silently drop its first
    # axis and yield a meaningless score.
    if isinstance(logits, (tuple, list)):
        logits = logits[0]
    logits = torch.tensor(logits)  # Convert the logits array to a tensor
    labels = torch.tensor(labels)  # Convert the labels array to a tensor
    predictions = torch.argmax(logits, dim=-1)

    # Score only real target positions; -100 marks positions to ignore.
    scored = labels != -100
    if not scored.any():
        return {'accuracy': 0.0}
    acc = (predictions[scored] == labels[scored]).float().mean().item()
    return {'accuracy': acc}

In [None]:
from torch.utils.data.dataloader import default_collate

def custom_data_collator(features):
    """Collate a list of (input_ids, attention_mask, labels) samples into a batch.

    Each sample is indexed positionally: element 0 is collated under
    'input_ids', element 1 under 'attention_mask' and element 2 under
    'labels'. Every field is batched with ``default_collate``.
    """
    field_names = ('input_ids', 'attention_mask', 'labels')
    return {
        name: default_collate([sample[position] for sample in features])
        for position, name in enumerate(field_names)
    }

In [None]:
from transformers import EarlyStoppingCallback
# Early-stopping callback configured with a patience of 3
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Wire the model, datasets, collator and metric function into a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=custom_data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

# Run fine-tuning; as the cell's last expression, its return value is displayed
trainer.train()