# xây dựng tập huấn luyện

In [1]:
import ast
import random
from torch.utils.data import DataLoader
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW

In [2]:
def process_train_data(file_path):
    """Build the seven-task training table from an annotated text file.

    Every non-blank line of ``file_path`` must have the form
    ``<sentence>####<python literal>``, where the literal evaluates to a
    sequence of 4-item tuples unpacked as
    ``(aspect, category, sentiment, opinion)``. Blank lines are skipped.

    Args:
        file_path: Path of the UTF-8 encoded text file to read.

    Returns:
        A dict keyed by ``"{line_index}_{task}"`` for ``task`` in 1..7, where
        ``line_index`` is the 0-based position of the line in the file. Each
        value is a dict with the keys ``id``, ``task``, ``text_label``,
        ``text_predict`` (initialised to ``""``) and ``text_sentence``.

    Raises:
        ValueError: If a non-blank line does not split into exactly two parts
            on ``"####"`` or the annotation is not a valid literal.
        SyntaxError: If the annotation part cannot be parsed by
            ``ast.literal_eval``.
    """
    task_data = {}
    with open(file_path, "r", encoding="utf-8") as f:
        for i, raw_line in enumerate(f):
            line = raw_line.strip()
            if not line:
                # A trailing or stray empty line would otherwise break the
                # two-way unpacking below.
                continue
            text_sentence, quads_str = line.split("####")
            quads = ast.literal_eval(quads_str)

            # Target strings: de-duplicated (where a set is used), sorted,
            # and joined with "; ".
            aspects_str = "; ".join(sorted({aspect for aspect, _, _, _ in quads}))
            opinions_str = "; ".join(sorted({opinion for _, _, _, opinion in quads}))
            aspect_opinions_str = "; ".join(
                sorted({f"{aspect}, {opinion}" for aspect, _, _, opinion in quads})
            )
            # The full quadruple label is sorted but NOT de-duplicated.
            quads_label = "; ".join(
                sorted(
                    f"{aspect}, {opinion}, {sentiment}, {category}"
                    for aspect, category, sentiment, opinion in quads
                )
            )

            # Label of each task, in task order 1..7:
            #   1: I -> A            2: I -> O
            #   3: I, A -> A-O       4: I, O -> A-O
            #   5: I, A, O -> A-O    6: I, A-O x3 -> A-O
            #   7: I, A-O -> A-C-S-O
            labels_by_task = (
                aspects_str,
                opinions_str,
                aspect_opinions_str,
                aspect_opinions_str,
                aspect_opinions_str,
                aspect_opinions_str,
                quads_label,
            )
            for task, text_label in enumerate(labels_by_task, start=1):
                task_data[f"{i}_{task}"] = {
                    "id": i,
                    "task": task,
                    "text_label": text_label,
                    "text_predict": "",
                    "text_sentence": text_sentence,
                }
    return task_data

In [3]:
# Đường dẫn đến file
# train_data = process_train_data("/chain-of-thought-ABSA/data/acos/rest16/train.txt")
# train_data

In [4]:
def collate_fn(batch_keys, train_data, tokenizer):
    """Turn a batch of task keys into tokenized model inputs and labels.

    For every key the prompt is built from the sample's sentence and, for
    tasks 3-7, from the ``text_predict`` values currently stored in
    ``train_data`` for the earlier tasks of the same sample.

    Args:
        batch_keys: Keys of ``train_data`` that make up this batch.
        train_data: Dict of task records; each record provides ``id``,
            ``task``, ``text_sentence``, ``text_label`` and ``text_predict``.
        tokenizer: Callable tokenizer accepting ``padding`` and
            ``return_tensors`` and exposing ``pad_token_id``.

    Returns:
        A dict with ``batch_keys`` (as received), ``input_ids``,
        ``attention_mask`` and ``labels``; padding positions in ``labels``
        are replaced by ``-100``.

    Raises:
        ValueError: If a record's ``task`` is not one of 1..7. Previously
            this either raised ``UnboundLocalError`` or silently reused the
            prompt of the previous item in the batch.
    """
    text_inputs, text_labels = [], []
    for key in batch_keys:
        item_data = train_data[key]

        sample_id = item_data["id"]
        task = item_data["task"]
        text_sentence = item_data["text_sentence"]

        if task == 1:
            text_input = f"Task 1, predict aspects\nInput: {text_sentence}"
        elif task == 2:
            text_input = f"Task 2, predict opinions\nInput: {text_sentence}"
        elif task == 3:
            # Uses the stored prediction of task 1.
            task_1_output = train_data[f"{sample_id}_1"]["text_predict"]
            text_input = f"Task 3, predict aspect-opinion sets with these aspects\nInput: {text_sentence}\nAspects: {task_1_output}"
        elif task == 4:
            # Uses the stored prediction of task 2.
            task_2_output = train_data[f"{sample_id}_2"]["text_predict"]
            text_input = f"Task 4, predict aspect-opinion sets with these opinions\nInput: {text_sentence}\nOpinions: {task_2_output}"
        elif task == 5:
            # Uses the stored predictions of tasks 1 and 2.
            task_1_output = train_data[f"{sample_id}_1"]["text_predict"]
            task_2_output = train_data[f"{sample_id}_2"]["text_predict"]
            text_input = f"Task 5, predict aspect-opinion sets with these aspects and opinions\nInput: {text_sentence}\nAspects: {task_1_output}\nOpinions: {task_2_output}"
        elif task == 6:
            # Uses the stored predictions of tasks 3, 4 and 5.
            task_3_output = train_data[f"{sample_id}_3"]["text_predict"]
            task_4_output = train_data[f"{sample_id}_4"]["text_predict"]
            task_5_output = train_data[f"{sample_id}_5"]["text_predict"]
            text_input = f"Task 6, predict aspect-opinion sets with results of task 3, 4, 5\nInput: {text_sentence}\nTask 3 results: {task_3_output}\nTask 4 results: {task_4_output}\nTask 5 results: {task_5_output}"
        elif task == 7:
            # Uses the stored prediction of task 6.
            task_6_output = train_data[f"{sample_id}_6"]["text_predict"]
            text_input = f"Task 7, predict aspect-opinion-sentiment-category sets with these aspect-opinion sets\nInput: {text_sentence}\nAspect-opinion sets: {task_6_output}"
        else:
            raise ValueError(f"Unknown task {task!r} for key {key!r}")

        text_inputs.append(text_input)
        text_labels.append(item_data["text_label"])

    # Tokenize the whole batch at once so padding is applied per batch.
    tokenized_inputs = tokenizer(text_inputs, padding=True, return_tensors="pt")
    tokenized_labels = tokenizer(text_labels, padding=True, return_tensors="pt")
    # Replace label padding by -100 so those positions are excluded from the
    # loss (assumes the model uses -100 as its ignore index -- TODO confirm).
    label_ids = tokenized_labels["input_ids"]
    label_ids[label_ids == tokenizer.pad_token_id] = -100

    return {
        "batch_keys": batch_keys,
        "input_ids": tokenized_inputs["input_ids"],
        "attention_mask": tokenized_inputs["attention_mask"],
        "labels": label_ids,
    }

def build_dataloader(train_data, batch_size, tokenizer):
    """Create a shuffling ``DataLoader`` that iterates over the keys of ``train_data``.

    The loader's dataset is the list of dict keys; each batch of keys is
    handed to ``collate_fn`` together with ``train_data`` and ``tokenizer``.

    Args:
        train_data: Dict of task records.
        batch_size: Number of keys per batch.
        tokenizer: Tokenizer forwarded to ``collate_fn``.

    Returns:
        The configured ``DataLoader``.
    """
    def collate_keys(batch_keys):
        # Bind the table and tokenizer so the loader only has to pass keys.
        return collate_fn(batch_keys, train_data, tokenizer)

    sample_keys = list(train_data)
    return DataLoader(
        sample_keys,
        batch_size=batch_size,
        shuffle=True,  # shuffled to add randomness
        collate_fn=collate_keys,
    )

In [5]:
# tokenizer = T5Tokenizer.from_pretrained("t5-base")
# dataloader = build_dataloader(train_data, 2, tokenizer)
# for batch in dataloader:
#     # print(batch)
#     print(tokenizer.decode(batch['input_ids'][0]))
#     break

In [6]:
# dataloader = build_dataloader(train_data, 1, T5Tokenizer.from_pretrained("t5-base"))
# max_len_input_ids = 0
# max_len_labels = 0
# for batch in dataloader:
#     if max_len_input_ids < len(batch['input_ids'][0]):
#         max_len_input_ids = len(batch['input_ids'][0])
    
#     if max_len_labels < len(batch['labels'][0]):
#         max_len_labels = len(batch['labels'][0])
# print(max_len_input_ids, max_len_labels)

# xử lý tập validation và đánh giá mô hình

In [12]:
# === HÀM XỬ LÝ VALIDATION DATA ===
def process_validation_data(validation_file):
    """Load evaluation samples from an annotated text file.

    Every non-blank line must have the form ``<sentence>####<python literal>``.
    Blank lines are skipped.

    Args:
        validation_file: Path of the UTF-8 encoded text file to read.

    Returns:
        A list of dicts, one per non-blank line, with the keys
        ``text_sentence`` and ``aspect_category_sentiment_opinions`` (the
        value produced by ``ast.literal_eval`` on the annotation part).

    Raises:
        ValueError: If a non-blank line does not split into exactly two parts
            on ``"####"`` or the annotation is not a valid literal.
        SyntaxError: If the annotation part cannot be parsed by
            ``ast.literal_eval``.
    """
    data = []
    with open(validation_file, "r", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # A trailing or stray empty line would otherwise break the
                # two-way unpacking below.
                continue
            text_sentence, annotations_str = line.split("####")
            data.append({
                "text_sentence": text_sentence,
                "aspect_category_sentiment_opinions": ast.literal_eval(annotations_str),
            })
    return data

# === HÀM ĐỂ CHẠY SUY LUẬN TRÊN BATCH ===
def batch_predict(model, tokenizer, inputs, device, max_length=124):
    """Generate one decoded output string per input prompt.

    Args:
        model: Object exposing ``generate(**encoded_inputs, max_length=...)``.
        tokenizer: Callable tokenizer whose result supports ``.to(device)``
            and that exposes ``batch_decode``.
        inputs: List of prompt strings.
        device: Device the encoded inputs are moved to.
        max_length: Value forwarded to ``model.generate``; defaults to the
            previously hard-coded 124.

    Returns:
        List of decoded strings (special tokens skipped), in input order.
        An empty ``inputs`` list yields an empty list without calling the
        tokenizer or the model.
    """
    if not inputs:
        return []
    encoded_inputs = tokenizer(inputs, return_tensors="pt", padding=True).to(device)
    outputs = model.generate(**encoded_inputs, max_length=max_length)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

def evaluate_pipeline_batch(model, tokenizer, validation_data, device, batch_size, verbose=True):
    """Run the seven-step prompting pipeline and score the final quadruples.

    For each batch, tasks 1-7 are run in order through ``batch_predict``;
    the outputs of earlier tasks are inserted into the prompts of later
    ones. The task 7 output is split on ``"; "`` and compared, as a set of
    strings, with the gold ``"aspect, opinion, sentiment, category"``
    strings. True/false positives and false negatives are accumulated over
    all samples before precision, recall and F1 are computed.

    Args:
        model: Model passed through to ``batch_predict``.
        tokenizer: Tokenizer passed through to ``batch_predict``.
        validation_data: List of dicts with ``text_sentence`` and
            ``aspect_category_sentiment_opinions`` (an iterable of
            ``(aspect, category, sentiment, opinion)`` tuples).
        device: Device passed through to ``batch_predict``.
        batch_size: Number of samples per generation batch.
        verbose: When true (the default, matching earlier behaviour) the
            predicted and gold sets of every sample are printed.

    Returns:
        Tuple ``(precision, recall, f1)``; each value is 0 when its
        denominator is 0.
    """
    count_true_positive = 0
    count_false_positive = 0
    count_false_negative = 0

    for start in range(0, len(validation_data), batch_size):
        batch = validation_data[start:start + batch_size]
        text_sentences = [sample["text_sentence"] for sample in batch]
        text_labels = [sample["aspect_category_sentiment_opinions"] for sample in batch]

        # Step 1: Task 1 (I -> A)
        task_1_inputs = [
            f"Task 1, predict aspects\nInput: {text_sentence}"
            for text_sentence in text_sentences
        ]
        task_1_outputs = batch_predict(model, tokenizer, task_1_inputs, device)

        # Step 2: Task 2 (I -> O)
        task_2_inputs = [
            f"Task 2, predict opinions\nInput: {text_sentence}"
            for text_sentence in text_sentences
        ]
        task_2_outputs = batch_predict(model, tokenizer, task_2_inputs, device)

        # Step 3: Task 3 (I, A -> A-O)
        task_3_inputs = [
            f"Task 3, predict aspect-opinion sets with these aspects\nInput: {text_sentence}\nAspects: {task_1_output}"
            for text_sentence, task_1_output in zip(text_sentences, task_1_outputs)
        ]
        task_3_outputs = batch_predict(model, tokenizer, task_3_inputs, device)

        # Step 4: Task 4 (I, O -> A-O)
        task_4_inputs = [
            f"Task 4, predict aspect-opinion sets with these opinions\nInput: {text_sentence}\nOpinions: {task_2_output}"
            for text_sentence, task_2_output in zip(text_sentences, task_2_outputs)
        ]
        task_4_outputs = batch_predict(model, tokenizer, task_4_inputs, device)

        # Step 5: Task 5 (I, A, O -> A-O)
        task_5_inputs = [
            f"Task 5, predict aspect-opinion sets with these aspects and opinions\nInput: {text_sentence}\nAspects: {task_1_output}\nOpinions: {task_2_output}"
            for text_sentence, task_1_output, task_2_output in zip(text_sentences, task_1_outputs, task_2_outputs)
        ]
        task_5_outputs = batch_predict(model, tokenizer, task_5_inputs, device)

        # Step 6: Task 6 (I, A-O from tasks 3/4/5 -> A-O)
        task_6_inputs = [
            f"Task 6, predict aspect-opinion sets with results of task 3, 4, 5\nInput: {text_sentence}\nTask 3 results: {task_3_output}\nTask 4 results: {task_4_output}\nTask 5 results: {task_5_output}"
            for text_sentence, task_3_output, task_4_output, task_5_output in zip(text_sentences, task_3_outputs, task_4_outputs, task_5_outputs)
        ]
        task_6_outputs = batch_predict(model, tokenizer, task_6_inputs, device)

        # Step 7: Task 7 (I, A-O -> A-O-S-C)
        task_7_inputs = [
            f"Task 7, predict aspect-opinion-sentiment-category sets with these aspect-opinion sets\nInput: {text_sentence}\nAspect-opinion sets: {task_6_output}"
            for text_sentence, task_6_output in zip(text_sentences, task_6_outputs)
        ]
        task_7_outputs = batch_predict(model, tokenizer, task_7_inputs, device)

        # Score the final predictions against the gold quadruples.
        for task_7_output, text_label in zip(task_7_outputs, text_labels):
            # "".split("; ") yields [""], so blank pieces are dropped;
            # otherwise an empty generation counts as one false positive.
            predicted_set = {piece for piece in task_7_output.split('; ') if piece.strip()}
            true_set = {
                f"{aspect}, {opinion}, {sentiment}, {category}"
                for aspect, category, sentiment, opinion in text_label
            }
            if verbose:
                print('predicted_set =', predicted_set)
                print('true_set =', true_set)
            count_true_positive += len(predicted_set & true_set)
            count_false_positive += len(predicted_set - true_set)
            count_false_negative += len(true_set - predicted_set)

    # Precision, recall and F1 from the accumulated counts.
    predicted_total = count_true_positive + count_false_positive
    gold_total = count_true_positive + count_false_negative
    precision = count_true_positive / predicted_total if predicted_total > 0 else 0
    recall = count_true_positive / gold_total if gold_total > 0 else 0
    f1 = (2 * precision * recall) / (precision + recall) if precision + recall > 0 else 0

    return precision, recall, f1

# Đánh giá trên tập test

In [13]:
# Evaluate a saved checkpoint on the laptop16 test split.
validation_data = process_validation_data("/chain-of-thought-ABSA/data/acos/laptop16/test.txt")

# Resolve the device once and reuse it for loading, the model and evaluation.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = T5ForConditionalGeneration.from_pretrained("t5-base")
# map_location lets a checkpoint saved on GPU load on a CPU-only machine.
# weights_only=True restricts unpickling to tensors and plain containers,
# which is all a state_dict needs.
model.load_state_dict(
    torch.load(
        "/chain-of-thought-ABSA/results/experiment_7/13.pt",
        map_location=device,
        weights_only=True,
    )
)
model.eval()
model.to(device)
# Kept as the last expression so the notebook displays (precision, recall, f1).
evaluate_pipeline_batch(
    model=model,
    tokenizer=T5Tokenizer.from_pretrained("t5-base"),
    validation_data=validation_data,
    device=device,
    batch_size=16)

  model.load_state_dict(torch.load("/chain-of-thought-ABSA/results/experiment_7/13.pt"))


predicted_set = {'unit, not worth, negative, laptop price'}
true_set = {'unit, not worth, negative, laptop price'}
predicted_set = {'NULL, difficult, negative, laptop design_features'}
true_set = {'acer 11, small, negative, laptop design_features', 'acer 11, difficult, negative, laptop design_features'}
predicted_set = {'emmc, NULL, neutral, hard_disc general'}
true_set = {'ssd drive, NULL, negative, hard_disc operation_performance'}
predicted_set = {'computer, difficulty, negative, laptop usability'}
true_set = {'computer, difficulty, negative, laptop usability'}
predicted_set = {'chrome os, NULL, neutral, os general'}
true_set = {'chrome os, NULL, neutral, os general'}
predicted_set = {'chromebook, NULL, negative, laptop operation_performance'}
true_set = {'chrome, NULL, negative, os operation_performance'}
predicted_set = {'NULL, NULL, positive, laptop general'}
true_set = {'NULL, suck, negative, laptop general', 'NULL, badly, negative, laptop general'}
predicted_set = {'NULL, NULL,

(0.4355716878402904, 0.41522491349480967, 0.4251550044286979)

# Huấn luyện

In [9]:
# Hàm huấn luyện
def train_model(model, tokenizer, train_data, validation_data, num_epochs, batch_size, lr,
                patience=10, save_dir="/chain-of-thought-ABSA/results/experiment_7"):
    """Train ``model`` on the multi-task table, with early stopping on validation F1.

    Each epoch iterates once over the loader built by ``build_dataloader``,
    then evaluates with ``evaluate_pipeline_batch``. From the second epoch
    on, the argmax of the training logits for every sample is decoded and
    written to ``train_data[key]["text_predict"]``, which ``collate_fn``
    reads when building the prompts of tasks 3-7. A checkpoint is saved
    whenever the validation F1 improves on the best value so far.

    Args:
        model: Model to train; moved to CUDA when available, else CPU.
        tokenizer: Tokenizer used for batching, decoding and evaluation.
        train_data: Dict of task records; its ``text_predict`` fields are
            updated in place.
        validation_data: Samples passed to ``evaluate_pipeline_batch``.
        num_epochs: Maximum number of epochs.
        batch_size: Batch size for both training and evaluation.
        lr: Learning rate for ``AdamW``.
        patience: Number of consecutive epochs without F1 improvement after
            which training stops (default 10, the former hard-coded value).
        save_dir: Directory where ``{epoch}.pt`` state dicts are written
            (defaults to the former hard-coded path). It is created up
            front if missing.

    Returns:
        None.
    """
    import os

    # Create the checkpoint directory before training so a missing path
    # is not discovered only after the first epoch.
    os.makedirs(save_dir, exist_ok=True)

    no_improve_epochs = 0
    best_f1 = 0
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    dataloader = build_dataloader(train_data, batch_size, tokenizer)
    optimizer = AdamW(model.parameters(), lr=lr)

    # Take the EOS id from the tokenizer; fall back to the formerly
    # hard-coded id 1 if the tokenizer does not define one.
    eos_token_id = tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 1

    for epoch in range(num_epochs):
        torch.cuda.empty_cache()

        # ---- Training ----
        print(f"Training epoch {epoch + 1}")
        model.train()
        total_loss = 0
        for batch in dataloader:
            outputs = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
                labels=batch["labels"].to(device),
            )
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            total_loss += loss.item()

            # Store this step's predictions in train_data. Skipped in the
            # first epoch, so the chained prompts stay empty until then.
            if epoch > 0:
                predicted_ids = torch.argmax(outputs.logits, dim=-1)
                for key, sequence in zip(batch["batch_keys"], predicted_ids):
                    eos_position = (sequence == eos_token_id).nonzero(as_tuple=True)[0]
                    if len(eos_position) > 0:
                        # Keep only the tokens before the first EOS.
                        sequence = sequence[: eos_position[0]]
                    train_data[key]["text_predict"] = tokenizer.decode(sequence, skip_special_tokens=True)

        print(f"Epoch {epoch + 1} finished with loss: {total_loss / len(dataloader):.4f}. Evaluating")

        # ---- Evaluation ----
        model.eval()
        precision, recall, f1 = evaluate_pipeline_batch(model, tokenizer, validation_data, device, batch_size)
        print("precision =", precision,"recall =", recall,"f1 =", f1)
        if f1 > best_f1:
            no_improve_epochs = 0
            best_f1 = f1
            # Save the model weights for this new best epoch.
            torch.save(model.state_dict(), f"{save_dir}/{epoch + 1}.pt")
            print("New best_f1 =", best_f1)
        else:
            no_improve_epochs += 1
            if no_improve_epochs >= patience:
                print("Huấn huyện xong")
                break

In [10]:
# Build the multi-task training table from the laptop16 train split.
train_data = process_train_data("/chain-of-thought-ABSA/data/acos/laptop16/train.txt")

# Fine-tune a fresh t5-base checkpoint, validating on the laptop16 dev split.
# The first four arguments are passed positionally, in the order of the
# train_model signature: model, tokenizer, train_data, validation_data.
train_model(
    T5ForConditionalGeneration.from_pretrained("t5-base"),
    T5Tokenizer.from_pretrained("t5-base"),
    train_data,
    process_validation_data("/chain-of-thought-ABSA/data/acos/laptop16/dev.txt"),
    num_epochs=1000,
    batch_size=16,
    lr=3e-5,
)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Training epoch 1


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch 1 finished with loss: 0.8915. Evaluating
precision = 0.2433734939759036 recall = 0.23006833712984054 f1 = 0.23653395784543327
New best_f1 = 0.23653395784543327
Training epoch 2
Epoch 2 finished with loss: 0.3465. Evaluating
precision = 0.3588516746411483 recall = 0.3416856492027335 f1 = 0.3500583430571762
New best_f1 = 0.3500583430571762
Training epoch 3
Epoch 3 finished with loss: 0.2219. Evaluating
precision = 0.4368932038834951 recall = 0.41002277904328016 f1 = 0.4230317273795534
New best_f1 = 0.4230317273795534
Training epoch 4
Epoch 4 finished with loss: 0.1586. Evaluating
precision = 0.4429223744292237 recall = 0.4419134396355353 f1 = 0.44241733181299886
New best_f1 = 0.44241733181299886
Training epoch 5
Epoch 5 finished with loss: 0.1224. Evaluating
precision = 0.4377880184331797 recall = 0.4328018223234624 f1 = 0.4352806414662085
Training epoch 6
Epoch 6 finished with loss: 0.0980. Evaluating
precision = 0.4583333333333333 recall = 0.4510250569476082 f1 = 0.45464982778415