# chuẩn bị dữ liệu

In [1]:
import json
from datasets import Dataset, DatasetDict
# from transformers import T5Tokenizer

def load_data(file_path):
    """Parse the JSON document stored at ``file_path`` and return it.

    The file is opened as UTF-8 text; whatever ``json.load`` produces for its
    contents is handed back unchanged.
    """
    with open(file_path, mode="r", encoding="utf-8") as handle:
        return json.load(handle)

def create_tasks(data):
    """Expand every annotated sample into seven prompt/target records.

    Each sample is read through these keys:
      * ``"raw_words"`` - the sentence text (concatenated into the prompts),
      * ``"aspects"``   - items with ``"term"`` (words, joined by a space),
                          ``"polarity"`` and ``"index"``,
      * ``"opinions"``  - items with ``"term"`` (words, joined by a space)
                          and ``"index"``.

    Aspects and opinions are ordered by ``"index"``; an aspect is paired with
    every opinion whose ``"index"`` equals its own.

    Returns a flat list of dicts with the keys ``"task"`` (``"task_1"`` ..
    ``"task_7"``), ``"text_input"`` (the prompt) and ``"text_label"`` (the
    expected output), seven consecutive entries per sample.
    """
    task_data = []

    for sample in data:
        sentence = sample["raw_words"]

        # (term, polarity, index) per aspect, ordered by index.
        aspect_rows = [(" ".join(a["term"]), a["polarity"], a["index"]) for a in sample["aspects"]]
        aspect_rows.sort(key=lambda row: row[2])

        # (term, index) per opinion, ordered by index.
        opinion_rows = [(" ".join(o["term"]), o["index"]) for o in sample["opinions"]]
        opinion_rows.sort(key=lambda row: row[1])

        # Aspect/opinion/sentiment triples for entries sharing the same index;
        # the aspect-opinion pairs are the same triples without the sentiment.
        triples = [
            (aspect_term, opinion_term, polarity)
            for aspect_term, polarity, aspect_index in aspect_rows
            for opinion_term, opinion_index in opinion_rows
            if opinion_index == aspect_index
        ]

        # Every serialized list is reused by several tasks, so build each once.
        aspect_text = "; ".join(row[0] for row in aspect_rows)
        opinion_text = "; ".join(row[0] for row in opinion_rows)
        pair_text = "; ".join(f"{a}, {o}" for a, o, _ in triples)
        triple_text = "; ".join(f"{a}, {o}, {s}" for a, o, s in triples)

        input_part = "\nInput: " + sentence
        aspects_part = "\nAspect terms: " + aspect_text
        opinions_part = "\nOpinion terms: " + opinion_text

        # (prompt, expected output), listed in task order 1..7.
        prompts = (
            # Task 1: I -> A
            ("Task 1, predict aspect terms" + input_part, aspect_text),
            # Task 2: I -> O
            ("Task 2, predict opinion terms" + input_part, opinion_text),
            # Task 3: I, A -> A-O
            (
                "Task 3, predict aspect term - opinion term sets with these aspect terms"
                + input_part
                + aspects_part,
                pair_text,
            ),
            # Task 4: I, O -> A-O
            (
                "Task 4, predict aspect term - opinion term sets with these opinion terms"
                + input_part
                + opinions_part,
                pair_text,
            ),
            # Task 5: I, A, O -> A-O
            (
                "Task 5, predict aspect term - opinion term sets with these aspect terms and opinion terms"
                + input_part
                + aspects_part
                + opinions_part,
                pair_text,
            ),
            # Task 6: I, A-O -> A-O (the gold pairs stand in for the results of tasks 3-5)
            (
                "Task 6, predict aspect term - opinion term sets with results of task 3, 4, 5"
                + input_part
                + "\nTask 3 results: " + pair_text
                + "\nTask 4 results: " + pair_text
                + "\nTask 5 results: " + pair_text,
                pair_text,
            ),
            # Task 7: I, A-O -> A-O-S
            (
                "Task 7, predict aspect term - opinion term - sentiment polarity sets with these aspect term - opinion term sets"
                + input_part
                + "\nAspect term - opinion term sets: " + pair_text,
                triple_text,
            ),
        )

        for number, (prompt, target) in enumerate(prompts, start=1):
            task_data.append({
                "task": f"task_{number}",
                "text_input": prompt,
                "text_label": target,
            })

    return task_data

# tokenizer = T5Tokenizer.from_pretrained("t5-base")

def convert_to_dataset(task_data):
    """Turn a list of task records into a Hugging Face ``Dataset``.

    Each record must provide the keys ``"task"``, ``"text_input"`` and
    ``"text_label"``. The values are collected column by column and passed to
    ``Dataset.from_dict``; the text is stored as-is, no tokenization happens here.
    """
    columns = {"task": [], "text_input": [], "text_label": []}
    for record in task_data:
        for name, values in columns.items():
            values.append(record[name])
    return Dataset.from_dict(columns)


In [3]:
# Training split: file path -> raw samples -> task records -> Dataset.
train_file = "/chain-of-thought-ABSA/Dataset/SemEval14/Train/Laptops_Opinion_Train.json"
train_data = load_data(train_file)
train_tasks = create_tasks(train_data)
train_dataset = convert_to_dataset(train_tasks)

# Validation split: same pipeline as above.
validation_file = "/chain-of-thought-ABSA/Dataset/SemEval14/Validation/Laptops_Opinion_Validation.json"
validation_data = load_data(validation_file)
validation_tasks = create_tasks(validation_data)
validation_dataset = convert_to_dataset(validation_tasks)

In [4]:
# Sanity check: print the summary of both splits, then the first validation record.
for split in (train_dataset, validation_dataset):
    print(split)
print(validation_dataset[0])

Dataset({
    features: ['task', 'text_input', 'text_label'],
    num_rows: 6342
})
Dataset({
    features: ['task', 'text_input', 'text_label'],
    num_rows: 1533
})
{'task': 'task_1', 'text_input': 'Task 1, predict aspect terms\nInput: In the shop , these MacBooks are encased in a soft rubber enclosure - so you will never know about the razor edge until you buy it , get it home , break the seal and use it ( very clever con ) .', 'text_label': 'rubber enclosure'}


# huấn luyện mô hình pretraining

In [5]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import DataLoader
from transformers import AdamW
from datasets import Dataset
import os

In [6]:
def collate_fn(batch):
    """Tokenize a list of dataset records into one padded model batch.

    ``batch`` is a list of items exposing ``"text_input"`` and ``"text_label"``.
    Both text lists are encoded with the module-level ``tokenizer`` using
    ``padding=True`` and ``return_tensors="pt"``; no truncation is requested.
    Padding positions in the labels are replaced by -100 for the loss
    calculation.

    Returns a dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``.
    """
    sources, targets = [], []
    for example in batch:
        sources.append(example["text_input"])
        targets.append(example["text_label"])

    encoded_inputs = tokenizer(sources, padding=True, return_tensors="pt")
    label_ids = tokenizer(targets, padding=True, return_tensors="pt").input_ids

    # Swap every padding id in the labels for -100.
    labels = label_ids.masked_fill(label_ids == tokenizer.pad_token_id, -100)

    return {
        "input_ids": encoded_inputs["input_ids"],
        "attention_mask": encoded_inputs["attention_mask"],
        "labels": labels,
    }

def get_dataloaders(train_dataset, val_dataset, batch_size=16):
    """Build the training and validation ``DataLoader`` pair.

    Both loaders use ``collate_fn`` for batching and ``pin_memory=True``;
    only the training loader shuffles.

    Returns ``(train_dataloader, val_dataloader)``.
    """
    shared_options = dict(batch_size=batch_size, collate_fn=collate_fn, pin_memory=True)
    return (
        DataLoader(train_dataset, shuffle=True, **shared_options),
        DataLoader(val_dataset, shuffle=False, **shared_options),
    )

def _model_device(model):
    """Return the device of the model's first parameter (CPU if it has none)."""
    first_param = next(model.parameters(), None)
    return first_param.device if first_param is not None else torch.device("cpu")


def _forward_loss(model, batch, device):
    """Move one collated batch to ``device``, run the model and return its loss."""
    outputs = model(
        input_ids=batch["input_ids"].to(device),
        attention_mask=batch["attention_mask"].to(device),
        labels=batch["labels"].to(device),
    )
    return outputs.loss


def train_model(model, train_dataloader, val_dataloader, optimizer, num_epochs=20, patience=5):
    """Train ``model`` with per-epoch validation, loss logging and early stopping.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes ``.loss``.
        train_dataloader: sized iterable of batches (dicts with ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` tensors) used for optimization.
        val_dataloader: sized iterable of the same kind, used for validation only.
        optimizer: optimizer already bound to the model's parameters.
        num_epochs: maximum number of epochs to run.
        patience: stop once this many consecutive epochs bring no improvement
            in validation loss.

    Side effects:
        * appends one ``epoch<TAB>train<TAB>validation`` line per epoch to
          ``pretrain_loss.txt`` (a header is written when the file is created);
        * saves ``model.state_dict()`` to
          ``pretrain_model/best_model_epoch_XX.pt`` whenever the validation
          loss reaches a new minimum.

    Raises:
        ValueError: if either dataloader yields no batches.
    """
    # Fail early with a clear message instead of a ZeroDivisionError when the
    # summed losses are averaged below.
    if len(train_dataloader) == 0 or len(val_dataloader) == 0:
        raise ValueError("train_dataloader and val_dataloader must each contain at least one batch")

    # Batches are sent to wherever the model lives rather than to a
    # module-level `device` variable that may not exist or may not match.
    device = _model_device(model)

    best_val_loss = float("inf")
    no_improve_epochs = 0

    if not os.path.exists("pretrain_loss.txt"):
        with open("pretrain_loss.txt", "w") as f:
            f.write("Epoch\tTrain Loss\tValidation Loss\n")

    os.makedirs("pretrain_model", exist_ok=True)

    for epoch in range(num_epochs):
        torch.cuda.empty_cache()

        # Training pass
        model.train()
        train_loss = 0.0
        for batch in train_dataloader:
            optimizer.zero_grad()
            loss = _forward_loss(model, batch, device)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        train_loss /= len(train_dataloader)

        # Validation pass (no gradients)
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for batch in val_dataloader:
                val_loss += _forward_loss(model, batch, device).item()
        val_loss /= len(val_dataloader)

        # Log the mean per-batch losses of this epoch
        with open("pretrain_loss.txt", "a") as f:
            f.write(f"{epoch + 1}\t{train_loss:.4f}\t{val_loss:.4f}\n")

        print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")

        # Checkpoint on improvement, otherwise count towards early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            model_save_path = os.path.join("pretrain_model", "best_model_epoch_{:02d}.pt".format(epoch + 1))
            torch.save(model.state_dict(), model_save_path)
            no_improve_epochs = 0
        else:
            no_improve_epochs += 1

        if no_improve_epochs >= patience:
            print("Early stopping triggered.")
            break

In [6]:
# `!export ...` only sets the variable inside a short-lived subshell, so the
# kernel process (and therefore PyTorch) never sees it. Set it in this process
# instead. NOTE(review): it presumably has to be in place before the first CUDA
# allocation, so run this cell before any tensor is moved to the GPU.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [7]:
# Load tokenizer and model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = T5Tokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base").to(device)

# Prepare dataloaders
train_dataloader, val_dataloader = get_dataloaders(train_dataset, validation_dataset, batch_size=16)

# Optimizer
# `transformers.AdamW` is deprecated in favour of the PyTorch implementation,
# so use `torch.optim.AdamW` directly. eps and weight_decay are pinned to the
# defaults of the deprecated class (1e-6 and 0.0) so the optimization setup
# stays effectively the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Train the model
# train_model(model, train_dataloader, val_dataloader, optimizer, num_epochs=100, patience=5)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [8]:
# Run through the whole training dataloader (collating every batch) so that
# `batch` is left holding the final batch, then show it as the cell output.
for batch in train_dataloader:
    pass
batch

{'input_ids': tensor([[16107,  3547,  9689,  3474,  1353,    86,  2562,    10,    37,   594,
             47,   182,   207,     3,     6,    11,     8,   556,    19,   420,
            463,     3,     5,     1,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0],
         [16107,  6464,  9689,  2663,  1657,     3,    18,  3474,  1657,  3369,
             28,   175,  3474,  1353,    86,  2562,    10,    37,  8680,    19,
            420,     3, 16360,     3,     5,   411, 22441,  1353,    10,   420,
              3, 16360,     1,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
  

In [20]:
# # Count the number of parameters in the model
# def count_parameters(model):
#     return sum(p.numel() for p in model.parameters() if p.requires_grad)

# num_params = count_parameters(model)
# print(f"The model has {num_params:,} trainable parameters.")