# xây dựng tập huấn luyện

In [None]:
# Xây dựng tập real_train_data
import json

# Đọc dữ liệu từ file JSON
def load_data(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return data

# Tạo các tác vụ
def create_tasks(data):
    task_data = {}
    for i, sample in enumerate(data):  # Sử dụng chỉ số i để phân biệt các câu
        aspect_opinion_sentiments = []
        for index in range(len(sample["aspects"])):
            aspect = " ".join(sample["aspects"][index]["term"])
            opinion = " ".join(sample["opinions"][index]["term"])
            sentiment = sample["aspects"][index]["polarity"]
            aspect_opinion_sentiments.append((aspect, opinion, sentiment))

        aspects = "; ".join(sorted([aspect for aspect, _, _ in aspect_opinion_sentiments]))
        opinions = "; ".join(sorted([opinion for _, opinion, _ in aspect_opinion_sentiments]))
        aspect_opinions = "; ".join(sorted([f"{aspect}, {opinion}" for aspect, opinion, _ in aspect_opinion_sentiments]))
        aspect_opinion_sentiments = "; ".join(sorted([f"{aspect}, {opinion}, {sentiment}" for aspect, opinion, sentiment in aspect_opinion_sentiments]))
        raw_text = sample["raw_words"]
        # Tác vụ 1: I -> A
        task_data[f"{i}_1"] = {
            "id": i,
            "task": 1,
            "text_input": f"Task 1, predict aspect terms\nInput: {raw_text}",
            "text_label": aspects,
            "text_predict": aspects,
            "text_sentence": raw_text,
        }
        # Tác vụ 2: I -> O
        task_data[f"{i}_2"] = {
            "id": i,
            "task": 2,
            "text_input": f"Task 2, predict opinion terms\nInput: {raw_text}",
            "text_label": opinions,
            "text_predict": opinions,
            "text_sentence": raw_text,
        }
        # Tác vụ 3: I, A -> A-O
        task_data[f"{i}_3"] = {
            "id": i,
            "task": 3,
            "text_input": f"Task 3, predict aspect term - opinion term sets with these aspect terms\nInput: {raw_text}\nAspect terms: {aspects}",
            "text_label": aspect_opinions,
            "text_predict": aspect_opinions,
            "text_sentence": raw_text,
        }
        # Tác vụ 4: I, O -> A-O
        task_data[f"{i}_4"] = {
            "id": i,
            "task": 4,
            "text_input": f"Task 4, predict aspect term - opinion term sets with these opinion terms\nInput: {raw_text}\nOpinion terms: {opinions}",
            "text_label": aspect_opinions,
            "text_predict": aspect_opinions,
            "text_sentence": raw_text,
        }
        # Tác vụ 5: I, A, O -> A-O
        task_data[f"{i}_5"] = {
            "id": i,
            "task": 5,
            "text_input": f"Task 5, predict aspect term - opinion term sets with these aspect terms and opinion terms\nInput: {raw_text}\nAspect terms: {aspects}\nOpinion terms: {opinions}",
            "text_label": aspect_opinions,
            "text_predict": aspect_opinions,
            "text_sentence": raw_text,
        }
        # Tác vụ 6: I, A-O, A-O, A-O -> A-O
        task_data[f"{i}_6"] = {
            "id": i,
            "task": 6,
            "text_input": f"Task 6, predict aspect term - opinion term sets with results of task 3, 4, 5\nInput: {raw_text}\nTask 3 results: {aspect_opinions}\nTask 4 results: {aspect_opinions}\nTask 5 results: {aspect_opinions}",
            "text_label": aspect_opinions,
            "text_predict": aspect_opinions,
            "text_sentence": raw_text,
        }
        # Tác vụ 7: I, A-O -> A-O-S
        task_data[f"{i}_7"] = {
            "id": i,
            "task": 7,
            "text_input": f"Task 7, predict aspect term - opinion term - sentiment polarity sets with these aspect term - opinion term sets\nInput: {raw_text}\nAspect term - opinion term sets: {aspect_opinions}",
            "text_label": aspect_opinion_sentiments,
            "text_predict": aspect_opinion_sentiments,
            "text_sentence": raw_text,
        }
    return task_data

In [None]:
import random
from torch.utils.data import DataLoader

def collate_fn(batch_keys, train_tasks, q, tokenizer):
    text_inputs, text_labels = [], []
    for key in batch_keys:
        item_data = train_tasks[key]

        id = item_data["id"]
        task = item_data['task']
        text_input = item_data["text_input"]
        text_label = item_data["text_label"]
        # text_predict = item_data["text_predict"]
        text_sentence = item_data["text_sentence"]

        # Task 3: Sử dụng predict từ task1
        if task == 3 and random.random() < q:
            aspects = train_tasks[f"{id}_1"]["text_predict"]
            text_input = f"Task 3, predict aspect term - opinion term sets with these aspect terms\nInput: {text_sentence}\nAspect terms: {aspects}"
        # Task 4: Sử dụng predict từ task2
        if task == 4 and random.random() < q:
            opinions = train_tasks[f"{id}_2"]["text_predict"]
            text_input = f"Task 4, predict aspect term - opinion term sets with these opinion terms\nInput: {text_sentence}\nOpinion terms: {opinions}"
        # Task 5: Sử dụng predict từ task1 và task2
        if task == 5:
            aspects = train_tasks[f"{id}_1"]["text_predict"] if random.random() < q else train_tasks[f"{id}_1"]["text_label"]
            opinions = train_tasks[f"{id}_2"]["text_predict"] if random.random() < q else train_tasks[f"{id}_2"]["text_label"]
            text_input = f"Task 5, predict aspect term - opinion term sets with these aspect terms and opinion terms\nInput: {text_sentence}\nAspect terms: {aspects}\nOpinion terms: {opinions}"
        # Task 6: Sử dụng predict từ task3, task4, task5
        if task == 6:
            result_3 = train_tasks[f"{id}_3"]["text_predict"] if random.random() < q else train_tasks[f"{id}_3"]["text_label"]
            result_4 = train_tasks[f"{id}_4"]["text_predict"] if random.random() < q else train_tasks[f"{id}_4"]["text_label"]
            result_5 = train_tasks[f"{id}_5"]["text_predict"] if random.random() < q else train_tasks[f"{id}_5"]["text_label"]
            text_input = f"Task 6, predict aspect term - opinion term sets with results of task 3, 4, 5\nInput: {text_sentence}\nTask 3 results: {result_3}\nTask 4 results: {result_4}\nTask 5 results: {result_5}"
        # Task 7: Sử dụng predict từ task6
        if task == 7 and random.random() < q:
            aspect_opinions = train_tasks[f"{id}_6"]["text_predict"]
            text_input = f"Task 7, predict aspect term - opinion term - sentiment polarity sets with these aspect term - opinion term sets\nInput: {text_sentence}\nAspect term - opinion term sets: {aspect_opinions}"
        
        # Tokenize inputs and labels
        text_inputs.append(text_input)
        text_labels.append(text_label)
    
    # Tokenize the entire batch
    tokenized_inputs = tokenizer(text_inputs, padding=True, return_tensors="pt")
    tokenized_labels = tokenizer(text_labels, padding=True, return_tensors="pt")

    # Return the batch as a dictionary
    return {
        "batch_keys": batch_keys,
        "input_ids": tokenized_inputs["input_ids"],
        "attention_mask": tokenized_inputs["attention_mask"],
        "labels": tokenized_labels["input_ids"],
    }

def build_dataloader(train_tasks, q, batch_size, tokenizer):
    # Tạo danh sách các key để chia batch
    task_keys = list(train_tasks.keys())
    # Tạo DataLoader
    return DataLoader(
        task_keys,
        batch_size=batch_size,
        shuffle=True,  # Shuffle để tăng tính ngẫu nhiên
        collate_fn=lambda batch_keys: collate_fn(batch_keys=batch_keys, train_tasks=train_tasks, q=q, tokenizer=tokenizer)
    )

# xử lý tập validation và đánh giá mô hình

In [32]:
import json

# === HÀM ĐỂ CHẠY SUY LUẬN TRÊN BATCH ===
def batch_predict(model, tokenizer, inputs, device):
    encoded_inputs = tokenizer(inputs, return_tensors="pt", padding=True).to(device)
    outputs = model.generate(**encoded_inputs, max_length=100)
    return [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

def evaluate_pipeline_batch(model, tokenizer, validation_data, device, batch_size):
    # Initialize counters
    count_true_positive = 0
    count_false_positive = 0
    count_false_negative = 0
    
    # Divide validation data into batches
    for i in range(0, len(validation_data), batch_size):
        batch = validation_data[i:i + batch_size]
        raw_texts = [sample["text_input"] for sample in batch]
        ground_truth_batch = [sample["text_label"] for sample in batch]

        # Step 1: Task 1 (I -> A)
        task_1_inputs = [
            f"Task 1, predict aspect terms\nInput: {text}"
            for text in raw_texts]
        task_1_outputs = batch_predict(model, tokenizer, task_1_inputs, device)
        # print("task_1_inputs = ", task_1_inputs)
        # print("task_1_outputs = ", task_1_outputs)
        # Step 2: Task 2 (I -> O)
        task_2_inputs = [
            f"Task 2, predict opinion terms\nInput: {text}" 
            for text in raw_texts]
        task_2_outputs = batch_predict(model, tokenizer, task_2_inputs, device)
        # print("task_2_inputs = ", task_2_inputs)
        # print("task_2_outputs = ", task_2_outputs)
        # Step 3: Task 3 (I, A -> A-O)
        task_3_inputs = [
            f"Task 3, predict aspect term - opinion term sets with these aspect terms\nInput: {text}\nAspect terms: {aspect}"
            for text, aspect in zip(raw_texts, task_1_outputs)
        ]
        task_3_outputs = batch_predict(model, tokenizer, task_3_inputs, device)
        # print("task_3_inputs = ", task_3_inputs)
        # print("task_3_outputs = ", task_3_outputs)
        # Step 4: Task 4 (I, O -> A-O)
        task_4_inputs = [
            f"Task 4, predict aspect term - opinion term sets with these opinion terms\nInput: {text}\nOpinion terms: {opinion}"
            for text, opinion in zip(raw_texts, task_2_outputs)
        ]
        task_4_outputs = batch_predict(model, tokenizer, task_4_inputs, device)
        # print("task_4_inputs = ", task_4_inputs)
        # print("task_4_outputs = ", task_4_outputs)
        # Step 5: Task 5 (I, A, O -> A-O)
        task_5_inputs = [
            f"Task 5, predict aspect term - opinion term sets with these aspect terms and opinion terms\nInput: {text}\nAspect terms: {aspect}\nOpinion terms: {opinion}"
            for text, aspect, opinion in zip(raw_texts, task_1_outputs, task_2_outputs)
        ]
        task_5_outputs = batch_predict(model, tokenizer, task_5_inputs, device)
        # print("task_5_inputs = ", task_5_inputs)
        # print("task_5_outputs = ", task_5_outputs)
        # Step 6: Task 6 (I, A-O -> A-O)
        task_6_inputs = [
            f"Task 6, predict aspect term - opinion term sets with results of task 3, 4, 5\nInput: {text}\nTask 3 results: {task_3}\nTask 4 results: {task_4}\nTask 5 results: {task_5}"
            for text, task_3, task_4, task_5 in zip(raw_texts, task_3_outputs, task_4_outputs, task_5_outputs)
        ]
        task_6_outputs = batch_predict(model, tokenizer, task_6_inputs, device)
        # print("task_6_inputs = ", task_6_inputs)
        # print("task_6_outputs = ", task_6_outputs)
        # Step 7: Task 7 (I, A-O -> A-O-S)
        task_7_inputs = [
            f"Task 7, predict aspect term - opinion term - sentiment polarity sets with these aspect term - opinion term sets\nInput: {text}\nAspect term - opinion term sets: {task_6_output}"
            for text, task_6_output in zip(raw_texts, task_6_outputs)
        ]
        task_7_outputs = batch_predict(model, tokenizer, task_7_inputs, device)
        # print("task_7_inputs = ", task_7_inputs)
        # print("task_7_outputs = ", task_7_outputs)
        # print("ground_truth_batch = ", ground_truth_batch)
        # return 0, 0, 0
        # Evaluate predictions
        for predicted, true in zip(task_7_outputs, ground_truth_batch):
            predicted_set = set(predicted.split('; '))  # Split predictions into sets of tuples
            true_set = set(true.split('; '))  # Split ground truth into sets of tuples
            # Calculate true positives, false positives, and false negatives
            count_true_positive += len(predicted_set & true_set)
            count_false_positive += len(predicted_set - true_set)
            count_false_negative += len(true_set - predicted_set)
    
    # Compute Precision, Recall, and F1-score
    precision = count_true_positive / (count_true_positive + count_false_positive) if count_true_positive + count_false_positive > 0 else 0
    recall = count_true_positive / (count_true_positive + count_false_negative) if count_true_positive + count_false_negative > 0 else 0
    f1 = (2 * precision * recall) / (precision + recall) if precision + recall > 0 else 0
    
    return precision, recall, f1

# === HÀM XỬ LÝ VALIDATION DATA ===
def process_validation_data(validation_file):
    with open(validation_file, "r", encoding="utf-8") as file:
        data = json.load(file)
    processed_data = []
    for sample in data:
        aspect_opinion_sentiments = []
        for index in range(len(sample["aspects"])):
            aspect = " ".join(sample["aspects"][index]["term"])
            opinion = " ".join(sample["opinions"][index]["term"])
            sentiment = sample["aspects"][index]["polarity"]
            aspect_opinion_sentiments.append((aspect, opinion, sentiment))
        # Add processed sample
        processed_data.append({
            "text_input": sample["raw_words"],
            "text_label": "; ".join(sorted([f"{aspect}, {opinion}, {sentiment}" for aspect, opinion, sentiment in aspect_opinion_sentiments]))
        })
    # random.shuffle(processed_data)
    return processed_data

In [33]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

validation_data = process_validation_data("/chain-of-thought-ABSA/Dataset/SemEval14/Validation/Laptops_Opinion_Validation.json")

model = T5ForConditionalGeneration.from_pretrained("t5-base")
model.load_state_dict(torch.load("/chain-of-thought-ABSA/results/32.pt"))
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
evaluate_pipeline_batch(
    model=model, 
    tokenizer = T5Tokenizer.from_pretrained("t5-base"), 
    validation_data = validation_data, 
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu"), 
    batch_size = 16)

  model.load_state_dict(torch.load("/chain-of-thought-ABSA/results/32.pt"))


(0.5782122905027933, 0.6, 0.5889046941678521)

# Huấn luyện

In [None]:
import torch
from transformers import AdamW
# Hàm huấn luyện
def train_model(model, tokenizer, train_tasks, validation_data, num_epochs, batch_size, lr, q_sped):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    q = 0
    dataloader = build_dataloader(train_tasks=train_tasks, q = q, batch_size=batch_size, tokenizer=tokenizer)
    optimizer = AdamW(model.parameters(), lr=lr)
    previous_f1 = 0
    for epoch in range(num_epochs):
        torch.cuda.empty_cache()
        print(f"Epoch {epoch + 1}/{num_epochs}")
        #Huấn luyện
        print("Training")
        model.train()
        total_loss = 0
        for batch in dataloader:
            batch_keys = batch["batch_keys"]
            # Reset gradients
            optimizer.zero_grad()
            # Forward pass
            outputs = model(
                input_ids=batch["input_ids"].to(device), 
                attention_mask=batch["attention_mask"].to(device), 
                labels=batch["labels"].to(device))
            loss = outputs.loss
            logits = outputs.logits
            # Backward pass
            loss.backward()
            optimizer.step()
            # add loss
            total_loss += loss.item()
            # Lưu dự đoán vào train_tasks nếu random.random() < p
            if random.random() < (1 - q):
                for key, prediction in zip(batch_keys, tokenizer.batch_decode(torch.argmax(logits, dim=-1), skip_special_tokens=True)):
                    train_tasks[key]["text_predict"] = prediction
        
        # Log loss mỗi epoch
        avg_loss = total_loss / len(dataloader)
        print(f"Epoch {epoch + 1} finished with loss: {avg_loss:.4f}")

        # Đánh giá
        print("Evaluating")
        model.eval()
        precision, recall, f1 = evaluate_pipeline_batch(model, tokenizer, validation_data, device, batch_size = batch_size)
        print(precision, recall, f1)
        if(f1 > previous_f1):
            previous_f1 = f1
            torch.save(model.state_dict(), f"/chain-of-thought-ABSA/results/{epoch + 1}.pt")
        else:
            if (q >= 1):
                return
            previous_f1 = 0
            q+=q_sped
            print(q)
            dataloader = build_dataloader(train_tasks=train_tasks, q = q, batch_size=batch_size, tokenizer=tokenizer)

In [None]:
# Tạo tập huấn luyện
train_data = load_data("/chain-of-thought-ABSA/Dataset/SemEval14/Train/Laptops_Opinion_Train.json")
train_tasks = create_tasks(train_data)
# tạo tập validation
validation_data = process_validation_data("/chain-of-thought-ABSA/Dataset/SemEval14/Validation/Laptops_Opinion_Validation.json")

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

train_model(
    model = T5ForConditionalGeneration.from_pretrained("t5-base"), 
    tokenizer = T5Tokenizer.from_pretrained("t5-base"), 
    train_tasks = train_tasks,
    validation_data = validation_data,
    num_epochs = 1000,
    batch_size = 16, 
    lr = 3e-5,
    q_sped = 0.05)