# xây dựng tập huấn luyện

In [None]:
import ast
import random
from torch.utils.data import DataLoader
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW

In [None]:
def process_train_data(file_path):
    """Build the multi-task training examples from an annotation file.

    Every non-blank line of the file must have the form
    ``sentence####[(aspect, category, sentiment, opinion), ...]``; the part
    after ``####`` is parsed with ``ast.literal_eval``.

    For each sample seven entries are produced, one per task:

    1. sentence -> aspect terms
    2. sentence -> opinion terms
    3. sentence + aspect terms -> aspect/opinion pairs
    4. sentence + opinion terms -> aspect/opinion pairs
    5. sentence + aspect terms + opinion terms -> aspect/opinion pairs
    6. sentence + results of tasks 3, 4, 5 -> aspect/opinion pairs
    7. sentence + aspect/opinion pairs -> full quadruples

    Args:
        file_path: Path of the UTF-8 text file to read.

    Returns:
        A dict keyed by ``"{line_index}_{task}"``. Each value is a dict with
        the keys ``id`` (zero-based line index), ``task``, ``text_input``,
        ``text_label``, ``text_predict`` (initialised to the label) and
        ``text_sentence``.

    Raises:
        ValueError: If a non-blank line does not contain exactly one
            ``####`` separator, or if its annotation part is not a valid
            Python literal (``ast.literal_eval`` may also raise
            ``SyntaxError``).
    """
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    task_data = {}

    def add_task(sample_id, task, text_input, target, text_sentence):
        # "text_predict" starts out as the gold label so that downstream code
        # can always read it, even before any model prediction was stored.
        task_data[f"{sample_id}_{task}"] = {
            "id": sample_id,
            "task": task,
            "text_input": text_input,
            "text_label": target,
            "text_predict": target,
            "text_sentence": text_sentence,
        }

    for i, raw_line in enumerate(lines):
        line = raw_line.strip()
        if not line:
            # Blank lines (e.g. a trailing empty line) carry no sample. The
            # index is still consumed, so the ids of all other samples keep
            # matching their line position in the file.
            continue

        text_sentence, quads_str = line.split("####")
        quads = ast.literal_eval(quads_str)

        # Sorted, "; "-separated target strings. Terms and pairs are
        # de-duplicated; full quadruples are only sorted.
        aspects_str = "; ".join(sorted({aspect for aspect, _, _, _ in quads}))
        opinions_str = "; ".join(sorted({opinion for _, _, _, opinion in quads}))
        aspect_opinions_str = "; ".join(
            sorted({f"{aspect}, {opinion}" for aspect, _, _, opinion in quads})
        )
        quads_joined = "; ".join(
            sorted(
                f"{aspect}, {category}, {sentiment}, {opinion}"
                for aspect, category, sentiment, opinion in quads
            )
        )

        # Task 1: I -> A
        add_task(
            i, 1,
            f"Task 1, predict aspect terms\nInput: {text_sentence}",
            aspects_str, text_sentence,
        )
        # Task 2: I -> O
        add_task(
            i, 2,
            f"Task 2, predict opinion terms\nInput: {text_sentence}",
            opinions_str, text_sentence,
        )
        # Task 3: I, A -> A-O
        add_task(
            i, 3,
            f"Task 3, predict aspect term - opinion term sets with these aspect terms\nInput: {text_sentence}\nAspect terms: {aspects_str}",
            aspect_opinions_str, text_sentence,
        )
        # Task 4: I, O -> A-O
        add_task(
            i, 4,
            f"Task 4, predict aspect term - opinion term sets with these opinion terms\nInput: {text_sentence}\nOpinion terms: {opinions_str}",
            aspect_opinions_str, text_sentence,
        )
        # Task 5: I, A, O -> A-O
        add_task(
            i, 5,
            f"Task 5, predict aspect term - opinion term sets with these aspect terms and opinion terms\nInput: {text_sentence}\nAspect terms: {aspects_str}\nOpinion terms: {opinions_str}",
            aspect_opinions_str, text_sentence,
        )
        # Task 6: I, A-O, A-O, A-O -> A-O
        add_task(
            i, 6,
            f"Task 6, predict aspect term - opinion term sets with results of task 3, 4, 5\nInput: {text_sentence}\nTask 3 results: {aspect_opinions_str}\nTask 4 results: {aspect_opinions_str}\nTask 5 results: {aspect_opinions_str}",
            aspect_opinions_str, text_sentence,
        )
        # Task 7: I, A-O -> A-C-S-O
        add_task(
            i, 7,
            f"Task 7, predict aspect term - aspect category - sentiment polarity - opinion term sets with these aspect term - opinion term sets\nInput: {text_sentence}\nAspect term - opinion term sets: {aspect_opinions_str}",
            quads_joined, text_sentence,
        )
    return task_data

In [None]:
# Đường dẫn đến file
# train_data = process_train_data("/chain-of-thought-ABSA/data/acos/rest16/train.txt")
# train_data

In [None]:
def collate_fn(batch_keys, train_data, q, tokenizer):
    """Turn a batch of sample keys into tokenized model inputs and labels.

    For the chained tasks (3 to 7) the prompt may be rebuilt from the stored
    ``text_predict`` of the upstream task instead of its gold ``text_label``;
    each upstream value is switched independently with probability ``q``.

    Args:
        batch_keys: Keys of ``train_data`` that make up this batch.
        train_data: Mapping ``"{id}_{task}"`` -> sample dict with the keys
            ``id``, ``task``, ``text_input``, ``text_label``,
            ``text_predict`` and ``text_sentence``.
        q: Probability of using a stored prediction rather than the gold
            value for an upstream result.
        tokenizer: Callable tokenizer exposing ``pad_token_id``; called with
            ``padding=True, return_tensors="pt"``.

    Returns:
        A dict with ``batch_keys``, ``input_ids``, ``attention_mask`` and
        ``labels``. Padding positions in ``labels`` are replaced by -100.
    """

    def upstream(sample_id, source_task):
        # One random draw per upstream value: stored prediction with
        # probability q, gold label otherwise.
        field = "text_predict" if random.random() < q else "text_label"
        return train_data[f"{sample_id}_{source_task}"][field]

    prompts = []
    targets = []
    for key in batch_keys:
        record = train_data[key]
        sample_id = record["id"]
        task = record["task"]
        prompt = record["text_input"]
        target = record["text_label"]
        sentence = record["text_sentence"]

        if task == 3:
            # Swap in the task-1 prediction with probability q; otherwise the
            # pre-built prompt is kept untouched.
            if random.random() < q:
                aspects = train_data[f"{sample_id}_1"]["text_predict"]
                prompt = f"Task 3, predict aspect term - opinion term sets with these aspect terms\nInput: {sentence}\nAspect terms: {aspects}"
        elif task == 4:
            # Same idea with the task-2 prediction.
            if random.random() < q:
                opinions = train_data[f"{sample_id}_2"]["text_predict"]
                prompt = f"Task 4, predict aspect term - opinion term sets with these opinion terms\nInput: {sentence}\nOpinion terms: {opinions}"
        elif task == 5:
            # Aspect and opinion terms are sampled independently.
            aspects = upstream(sample_id, 1)
            opinions = upstream(sample_id, 2)
            prompt = f"Task 5, predict aspect term - opinion term sets with these aspect terms and opinion terms\nInput: {sentence}\nAspect terms: {aspects}\nOpinion terms: {opinions}"
        elif task == 6:
            # Results of tasks 3, 4 and 5 are sampled independently.
            result_3 = upstream(sample_id, 3)
            result_4 = upstream(sample_id, 4)
            result_5 = upstream(sample_id, 5)
            prompt = f"Task 6, predict aspect term - opinion term sets with results of task 3, 4, 5\nInput: {sentence}\nTask 3 results: {result_3}\nTask 4 results: {result_4}\nTask 5 results: {result_5}"
        elif task == 7:
            # Swap in the task-6 prediction with probability q.
            if random.random() < q:
                aspect_opinions = train_data[f"{sample_id}_6"]["text_predict"]
                prompt = f"Task 7, predict aspect term - aspect category - sentiment polarity - opinion term sets with these aspect term - opinion term sets\nInput: {sentence}\nAspect term - opinion term sets: {aspect_opinions}"

        prompts.append(prompt)
        targets.append(target)

    # Tokenize the whole batch at once so that padding is batch-wide.
    encoded_inputs = tokenizer(prompts, padding=True, return_tensors="pt")
    encoded_labels = tokenizer(targets, padding=True, return_tensors="pt")
    label_ids = encoded_labels["input_ids"]
    # Mask padding in place with -100 (the conventional "ignore" label value).
    label_ids[label_ids == tokenizer.pad_token_id] = -100

    return {
        "batch_keys": batch_keys,
        "input_ids": encoded_inputs["input_ids"],
        "attention_mask": encoded_inputs["attention_mask"],
        "labels": label_ids,
    }

def build_dataloader(train_data, q, batch_size, tokenizer):
    """Create a shuffling DataLoader over the keys of ``train_data``.

    The loader iterates over sample keys only; each batch of keys is turned
    into tensors by ``collate_fn`` with the given ``train_data``, ``q`` and
    ``tokenizer`` bound in.

    Args:
        train_data: Mapping of sample key -> sample dict.
        q: Probability forwarded to ``collate_fn``.
        batch_size: Number of keys per batch.
        tokenizer: Tokenizer forwarded to ``collate_fn``.

    Returns:
        A ``DataLoader`` yielding the dicts produced by ``collate_fn``.
    """

    def _collate(batch_keys):
        return collate_fn(
            batch_keys=batch_keys,
            train_data=train_data,
            q=q,
            tokenizer=tokenizer,
        )

    sample_keys = list(train_data)
    # shuffle=True so that tasks and samples are mixed within every epoch.
    return DataLoader(
        sample_keys,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=_collate,
    )

In [None]:
# dataloader = build_dataloader(train_data, 0.5, 1, T5Tokenizer.from_pretrained("t5-base"))
# max_len_input_ids = 0
# max_len_labels = 0
# for batch in dataloader:
#     if max_len_input_ids < len(batch['input_ids'][0]):
#         max_len_input_ids = len(batch['input_ids'][0])
    
#     if max_len_labels < len(batch['labels'][0]):
#         max_len_labels = len(batch['labels'][0])
# print(max_len_input_ids, max_len_labels)

# xử lý tập validation và đánh giá mô hình

In [None]:
# === HÀM XỬ LÝ VALIDATION DATA ===
def process_validation_data(validation_file):
    """Load evaluation samples from an annotation file.

    Every non-blank line must have the form
    ``sentence####[(aspect, category, sentiment, opinion), ...]``; the part
    after ``####`` is parsed with ``ast.literal_eval``. Blank lines are
    skipped.

    Args:
        validation_file: Path of the UTF-8 text file to read.

    Returns:
        A list of dicts with the keys ``text_sentence`` and
        ``aspect_category_sentiment_opinions`` (the parsed annotation), in
        file order.

    Raises:
        ValueError: If a non-blank line does not contain exactly one
            ``####`` separator, or if its annotation part is not a valid
            Python literal (``ast.literal_eval`` may also raise
            ``SyntaxError``).
    """
    data = []
    with open(validation_file, "r", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # A blank line (e.g. at the end of the file) would otherwise
                # break the two-way unpacking below.
                continue
            text_sentence, quads_str = line.split("####")
            data.append({
                "text_sentence": text_sentence,
                "aspect_category_sentiment_opinions": ast.literal_eval(quads_str),
            })
    return data

# === HÀM ĐỂ CHẠY SUY LUẬN TRÊN BATCH ===
def batch_predict(model, tokenizer, inputs, device):
    """Generate one output string per input prompt.

    Args:
        model: Object exposing ``generate(**encoded, max_length=...)``.
        tokenizer: Callable tokenizer that also exposes ``batch_decode``.
        inputs: List of prompt strings.
        device: Device the encoded batch is moved to before generation.

    Returns:
        The decoded generations (special tokens skipped), as returned by
        ``tokenizer.batch_decode``.
    """
    encoded = tokenizer(inputs, return_tensors="pt", padding=True)
    encoded = encoded.to(device)
    # Generation is capped at 124 tokens.
    generated = model.generate(**encoded, max_length=124)
    decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)
    return decoded

def evaluate_pipeline_batch(model, tokenizer, validation_data, device, batch_size):
    """Run the full 7-step prediction chain and score the final quadruples.

    For every batch the model is queried seven times via ``batch_predict``;
    the output of each step is inserted into the prompt of the following
    steps. Only the task-7 output is scored, by exact string match of
    ``"aspect, category, sentiment, opinion"`` items against the gold
    annotations.

    Args:
        model: Generation model passed through to ``batch_predict``.
        tokenizer: Tokenizer passed through to ``batch_predict``.
        validation_data: Sequence of dicts with the keys ``text_sentence``
            and ``aspect_category_sentiment_opinions`` (iterable of
            4-tuples).
        device: Device passed through to ``batch_predict``.
        batch_size: Number of samples per generation batch.

    Returns:
        ``(precision, recall, f1)`` micro-averaged over all samples. Each
        value is 0 when its denominator is 0 (e.g. empty
        ``validation_data``).
    """
    count_true_positive = 0
    count_false_positive = 0
    count_false_negative = 0

    def predict(prompts):
        return batch_predict(model, tokenizer, prompts, device)

    for start in range(0, len(validation_data), batch_size):
        batch = validation_data[start:start + batch_size]
        text_sentences = [sample["text_sentence"] for sample in batch]
        text_labels = [sample["aspect_category_sentiment_opinions"] for sample in batch]

        # Step 1: Task 1 (I -> A)
        task_1_outputs = predict([
            f"Task 1, predict aspect terms\nInput: {text_sentence}"
            for text_sentence in text_sentences
        ])
        # Step 2: Task 2 (I -> O)
        task_2_outputs = predict([
            f"Task 2, predict opinion terms\nInput: {text_sentence}"
            for text_sentence in text_sentences
        ])
        # Step 3: Task 3 (I, A -> A-O)
        task_3_outputs = predict([
            f"Task 3, predict aspect term - opinion term sets with these aspect terms\nInput: {text_sentence}\nAspect terms: {task_1_output}"
            for text_sentence, task_1_output in zip(text_sentences, task_1_outputs)
        ])
        # Step 4: Task 4 (I, O -> A-O)
        task_4_outputs = predict([
            f"Task 4, predict aspect term - opinion term sets with these opinion terms\nInput: {text_sentence}\nOpinion terms: {task_2_output}"
            for text_sentence, task_2_output in zip(text_sentences, task_2_outputs)
        ])
        # Step 5: Task 5 (I, A, O -> A-O)
        task_5_outputs = predict([
            f"Task 5, predict aspect term - opinion term sets with these aspect terms and opinion terms\nInput: {text_sentence}\nAspect terms: {task_1_output}\nOpinion terms: {task_2_output}"
            for text_sentence, task_1_output, task_2_output in zip(text_sentences, task_1_outputs, task_2_outputs)
        ])
        # Step 6: Task 6 (I, A-O, A-O, A-O -> A-O)
        task_6_outputs = predict([
            f"Task 6, predict aspect term - opinion term sets with results of task 3, 4, 5\nInput: {text_sentence}\nTask 3 results: {task_3_output}\nTask 4 results: {task_4_output}\nTask 5 results: {task_5_output}"
            for text_sentence, task_3_output, task_4_output, task_5_output in zip(text_sentences, task_3_outputs, task_4_outputs, task_5_outputs)
        ])
        # Step 7: Task 7 (I, A-O -> A-C-S-O)
        task_7_outputs = predict([
            f"Task 7, predict aspect term - aspect category - sentiment polarity - opinion term sets with these aspect term - opinion term sets\nInput: {text_sentence}\nAspect term - opinion term sets: {task_6_output}"
            for text_sentence, task_6_output in zip(text_sentences, task_6_outputs)
        ])

        for task_7_output, text_label in zip(task_7_outputs, text_labels):
            # "".split("; ") returns [""], so without the filter an empty
            # generation would be scored as one (wrong) predicted quadruple.
            predicted_set = {item for item in task_7_output.split("; ") if item.strip()}
            true_set = {
                f"{aspect}, {category}, {sentiment}, {opinion}"
                for aspect, category, sentiment, opinion in text_label
            }
            count_true_positive += len(predicted_set & true_set)
            count_false_positive += len(predicted_set - true_set)
            count_false_negative += len(true_set - predicted_set)

    # Guard every division so that degenerate cases yield 0 instead of raising.
    predicted_total = count_true_positive + count_false_positive
    gold_total = count_true_positive + count_false_negative
    precision = count_true_positive / predicted_total if predicted_total > 0 else 0
    recall = count_true_positive / gold_total if gold_total > 0 else 0
    f1 = (2 * precision * recall) / (precision + recall) if precision + recall > 0 else 0

    return precision, recall, f1

# Đánh giá trên tập test

In [None]:
# Evaluate a saved checkpoint on the laptop16 test split.
validation_data = process_validation_data("/chain-of-thought-ABSA/data/acos/laptop16/test.txt")

model = T5ForConditionalGeneration.from_pretrained("t5-base")
# The model is still on the CPU at this point, so the checkpoint tensors are
# mapped to the CPU as well. Without map_location, a checkpoint saved from a
# GPU run cannot be loaded on a machine without CUDA.
model.load_state_dict(
    torch.load("/chain-of-thought-ABSA/results/experiment_4/17.pt", map_location="cpu")
)
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Last expression of the cell: the (precision, recall, f1) tuple.
evaluate_pipeline_batch(
    model=model,
    tokenizer=T5Tokenizer.from_pretrained("t5-base"),
    validation_data=validation_data,
    device=device,  # same device the model was moved to
    batch_size=16)

# Huấn luyện

In [None]:
def remove_after_eos(output_ids, eos_token_id=1):
    """Cut every sequence just before its first end-of-sequence token.

    Args:
        output_ids: Iterable of 1-D integer tensors (a 2-D tensor is iterated
            row by row).
        eos_token_id: Token id that marks the end of a sequence. Defaults to
            1, the value that used to be hard-coded here (presumably the T5
            ``</s>`` id; confirm against ``tokenizer.eos_token_id``).

    Returns:
        A list with one tensor per input sequence: the tokens before the
        first ``eos_token_id``, or the whole sequence when it contains none.
    """
    cleaned_output = []
    for sequence in output_ids:
        eos_positions = (sequence == eos_token_id).nonzero(as_tuple=True)[0]
        if eos_positions.numel() > 0:
            # Keep everything before the first EOS; the EOS itself is dropped.
            cleaned_output.append(sequence[: eos_positions[0]])
        else:
            cleaned_output.append(sequence)
    return cleaned_output

# Hàm huấn luyện
def train_model(
    model,
    tokenizer,
    train_data,
    validation_data,
    num_epochs,
    batch_size,
    lr,
    q_step,
    save_dir="/chain-of-thought-ABSA/results/experiment_4",
):
    """Fine-tune ``model`` on the multi-task data with a growing ``q``.

    ``q`` starts at 0 and is passed to ``build_dataloader``, where it is the
    probability of building a chained-task prompt from stored predictions
    instead of gold labels. After every epoch the model is evaluated with
    ``evaluate_pipeline_batch``:

    * if F1 improved, the weights are saved as ``{save_dir}/{epoch}.pt``;
    * otherwise ``q`` is increased by ``q_step`` and the dataloader is
      rebuilt, or training stops once ``q`` has already reached 1.

    During training, each batch's teacher-forced argmax predictions are
    written to ``train_data[key]["text_predict"]`` with probability ``q``
    (``train_data`` is mutated in place).

    Args:
        model: Model to train; moved to CUDA when available, else CPU.
        tokenizer: Tokenizer used for batching, decoding and evaluation.
        train_data: Mapping of sample key -> sample dict, as consumed by
            ``build_dataloader``.
        validation_data: Samples passed to ``evaluate_pipeline_batch``.
        num_epochs: Maximum number of epochs.
        batch_size: Batch size for training and evaluation.
        lr: Learning rate for AdamW.
        q_step: Increment applied to ``q`` after a non-improving epoch.
        save_dir: Directory for the checkpoints; created if missing.

    Returns:
        None.
    """
    import os

    # Create the checkpoint directory up front: otherwise a missing directory
    # is only discovered by torch.save after a full epoch of training.
    os.makedirs(save_dir, exist_ok=True)

    q = 0
    best_f1 = 0
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    dataloader = build_dataloader(train_data=train_data, q=q, batch_size=batch_size, tokenizer=tokenizer)
    optimizer = AdamW(model.parameters(), lr=lr)
    for epoch in range(num_epochs):
        torch.cuda.empty_cache()

        # ---- Training ----
        print(f"Training epoch {epoch + 1}")
        model.train()
        total_loss = 0
        for batch in dataloader:
            outputs = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
                labels=batch["labels"].to(device))
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            total_loss += loss.item()
            # With probability q, store this batch's predictions so that later
            # batches can build their prompts from them.
            if random.random() < q:
                output_ids = torch.argmax(outputs.logits, dim=-1)
                output_ids = remove_after_eos(output_ids)
                predictions = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
                for key, prediction in zip(batch["batch_keys"], predictions):
                    train_data[key]["text_predict"] = prediction

        print(f"Epoch {epoch + 1} finished with loss: {total_loss / len(dataloader):.4f}")

        # ---- Evaluation ----
        print("Evaluating")
        model.eval()
        precision, recall, f1 = evaluate_pipeline_batch(model, tokenizer, validation_data, device, batch_size)
        print("precision =", precision, "recall =", recall, "f1 =", f1)
        if f1 > best_f1:
            best_f1 = f1
            # Keep a checkpoint for every new best epoch.
            torch.save(model.state_dict(), f"{save_dir}/{epoch + 1}.pt")
            print("New best_f1 =", best_f1)
        else:
            if q >= 1:
                # Predictions are already always used and F1 no longer
                # improves: stop training.
                print("Huấn huyện xong")
                break
            q = q + q_step
            print("q =", q)
            # The dataloader captures q at build time, so it must be rebuilt.
            dataloader = build_dataloader(train_data=train_data, q=q, batch_size=batch_size, tokenizer=tokenizer)

In [None]:
# Build the multi-task training set from the laptop16 training split.
train_data = process_train_data("/chain-of-thought-ABSA/data/acos/laptop16/train.txt")

# Fine-tune t5-base, evaluating on the laptop16 dev split after every epoch.
train_model(
    model = T5ForConditionalGeneration.from_pretrained("t5-base"), 
    tokenizer = T5Tokenizer.from_pretrained("t5-base"), 
    train_data = train_data,
    validation_data = process_validation_data("/chain-of-thought-ABSA/data/acos/laptop16/dev.txt"),
    # Upper bound only: train_model breaks out of its loop earlier, once q has
    # reached 1 and the dev F1 did not improve.
    num_epochs = 1000,
    batch_size = 16, 
    lr = 3e-5,
    # Amount added to q after every epoch without a dev-F1 improvement.
    q_step=0.05,
)