In [2]:
from transformers import BertTokenizer, BertModel, BertConfig, get_linear_schedule_with_warmup
from datasets import load_dataset
from evaluate import load
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from tqdm import tqdm
device = "cuda" if torch.cuda.is_available() else "cpu"
import argparse
import os
from pathlib import Path
#  You can install and import any other libraries if needed

parser = argparse.ArgumentParser()
parser.add_argument("--ckpt_dir", type=str, required=True) 

args = parser.parse_args()
checkpoint_dir = Path("checkpoints") / args.ckpt_dir
os.makedirs(checkpoint_dir, exist_ok=True)


usage: ipykernel_launcher.py [-h] --ckpt_dir CKPT_DIR
ipykernel_launcher.py: error: the following arguments are required: --ckpt_dir


AttributeError: 'tuple' object has no attribute 'tb_frame'

In [25]:
# Some Chinese punctuations will be tokenized as [UNK], so we replace them with English ones
token_replacement = [
    ["：" , ":"],
    ["，" , ","],
    ["“" , "\""],
    ["”" , "\""],
    ["？" , "?"],
    ["……" , "..."],
    ["！" , "!"]
]

In [26]:

tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased", cache_dir="./cache/")

In [27]:
class SemevalDataset(Dataset):
    def __init__(self, split="train") -> None:
        super().__init__()
        assert split in ["train", "validation", "test"]
        self.data = load_dataset(
            "SemEvalWorkshop/sem_eval_2014_task_1",  # <-- THIS IS THE FIX
            trust_remote_code=True, split=split, cache_dir="./cache/"
        ).to_list()

    def __getitem__(self, index):
        d = self.data[index]
        # Replace Chinese punctuations with English ones
        for k in ["premise", "hypothesis"]:
            for tok in token_replacement:
                d[k] = d[k].replace(tok[0], tok[1])
        return d

    def __len__(self):
        return len(self.data)

data_sample = SemevalDataset(split="train").data[:100]
# print(f"Dataset example: \n{data_sample[0]} \n{data_sample[1]} \n{data_sample[2]}")
# print(f"example: {data_sample}")
for ds in data_sample:
    if ds['entailment_judgment'] == 2:
        print(ds)

{'sentence_pair_id': 42, 'premise': 'Two people are kickboxing and spectators are not watching', 'hypothesis': 'Two people are kickboxing and spectators are watching', 'relatedness_score': 3.4000000953674316, 'entailment_judgment': 2}
{'sentence_pair_id': 88, 'premise': 'There is no biker jumping in the air', 'hypothesis': 'A lone biker is jumping in the air', 'relatedness_score': 4.199999809265137, 'entailment_judgment': 2}
{'sentence_pair_id': 90, 'premise': 'A man is jumping into an empty pool', 'hypothesis': 'A man is jumping into a full pool', 'relatedness_score': 3.0, 'entailment_judgment': 2}
{'sentence_pair_id': 122, 'premise': 'Five kids are standing close together and one kid has a gun', 'hypothesis': 'Five kids are standing close together and none of the kids has a gun', 'relatedness_score': 3.700000047683716, 'entailment_judgment': 2}
{'sentence_pair_id': 141, 'premise': 'A group of friends are riding the current in a raft', 'hypothesis': 'A group is not riding the current 

In [None]:
# Define the hyperparameters
# You can modify these values if needed
lr = 1e-4 # train from scratch 先試試看比較大的
weight_decay=0.01 # train from scratch 先試比較大
epochs = 50
train_batch_size = 8
validation_batch_size = 8

In [29]:
# TODO1: Create batched data for DataLoader
# `collate_fn` is a function that defines how the data batch should be packed.
# This function will be called in the DataLoader to pack the data batch.

def collate_fn(batch):
    # TODO1-1: Implement the collate_fn function
    # Write your code here
    # The input parameter is a data batch (tuple), and this function packs it into tensors.
    # Use tokenizer to pack tokenize and pack the data and its corresponding labels.
    # Return the data batch and labels for each sub-task.
    premise = []
    hypothesis = []
    relatedness_score = []
    entail_judge = []
    for data in batch:
        premise.append(data["premise"])
        hypothesis.append(data["hypothesis"])
        relatedness_score.append(data["relatedness_score"])
        entail_judge.append(data["entailment_judgment"])
    input_batch = tokenizer(premise, 
                            hypothesis,
                            padding=True,
                            truncation=True,
                            return_tensors="pt",
                            max_length=512 
                            )
    input_batch["relatedness_score"] = torch.tensor(relatedness_score, dtype=torch.float)
    input_batch["entailment_judgment"] = torch.tensor(entail_judge, dtype=torch.long)
    
    return input_batch

# TODO1-2: Define your DataLoader
train_dataset = SemevalDataset(split="train")
val_dataset = SemevalDataset(split="validation")
test_dataset = SemevalDataset(split="test")
dl_train = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True, collate_fn=collate_fn)
dl_validation = DataLoader(val_dataset, batch_size=validation_batch_size, shuffle=False, collate_fn=collate_fn)
dl_test = DataLoader(test_dataset, batch_size=validation_batch_size, shuffle=False, collate_fn=collate_fn)

In [None]:
# TODO2: Construct your model
class MultiLabelModel(torch.nn.Module):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # config = BertConfig.from_pretrained("google-bert/bert-base-uncased")
        self.bert = BertModel.from_pretrained("google-bert/bert-base-uncased")
        self.hidden_layers = self.bert.config.hidden_size
        
        self.classification_head = torch.nn.Linear(self.hidden_layers, 3)
        self.regression_head = torch.nn.Linear(self.hidden_layers, 1)
        
        # Write your code here
        # Define what modules you will use in the model
        # Please use "google-bert/bert-base-uncased" model (https://huggingface.co/google-bert/bert-base-uncased)
        # Besides the base model, you may design additional architectures by incorporating linear layers, activation functions, or other neural components.
        # Remark: The use of any additional pretrained language models is not permitted.
    def forward(self, input_ids, attention_mask=None, token_type_ids=None, **kwargs):
        # Write your code here
        x = self.bert(input_ids, attention_mask, token_type_ids)
        cls_token = x.pooler_output
        
        classification_logits = self.classification_head(cls_token)
        regression_score = self.regression_head(cls_token)
        return classification_logits, regression_score
        

In [None]:
# TODO3: Define your optimizer and loss function

model = MultiLabelModel().to(device)
# TODO3-1: Define your Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
# scheduler
num_training_steps = len(dl_train)
num_warmup_steps = int(0.1 * num_training_steps)
scheduler = get_linear_schedule_with_warmup(optimizer=optimizer, num_warmup_steps=num_training_steps, num_training_steps=num_training_steps)
# TODO3-2: Define your loss functions (you should have two)
# Write your code here
loss_classification = torch.nn.CrossEntropyLoss()
loss_regression = torch.nn.MSELoss()
regression_weight = 2
# scoring functions
psr = load("pearsonr")
acc = load("accuracy")

In [None]:
best_score = 0.0
for ep in range(epochs):
    pbar = tqdm(dl_train)
    pbar.set_description(f"Training epoch [{ep+1}/{epochs}]")
    model.train()
    # TODO4: Write the training loop
    # Write your code here
    # train your model
    for batch in pbar:
        device_batch = {k:v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()
        classification_logits, regression_score = model.forward(**device_batch)
        label_reg = device_batch["relatedness_score"]
        label_cls = device_batch["entailment_judgment"]

        loss_cls = loss_classification(classification_logits, label_cls)
        loss_reg = loss_regression(regression_score, label_reg)
        
        total_loss = loss_cls + loss_reg * regression_weight
        
        total_loss.backward()
        optimizer.step()
        scheduler.step()
        pbar.set_postfix({'loss': total_loss.item()})


    pbar = tqdm(dl_validation)
    pbar.set_description(f"Validation epoch [{ep+1}/{epochs}]")
    model.eval()
    # TODO5: Write the evaluation loop
    # Write your code here
    # Evaluate your model
    # Output all the evaluation scores (PearsonCorr, Accuracy)
    all_preds_cls = []
    all_labels_cls = []
    all_preds_reg = []
    all_labels_reg = []
    with torch.no_grad():
        for batch in pbar:
            device_batch = {k:v.to(device) for k, v in batch.items()}
            classification_logits, regression_score = model.forward(**device_batch)
            label_cls = device_batch["entailment_judgment"]
            label_reg = device_batch["relatedness_score"]
            preds_cls = torch.argmax(classification_logits, dim=1)
            
            all_preds_cls.append(preds_cls.cpu())
            all_labels_cls.append(label_cls.cpu())
            all_preds_reg.append(regression_score.cpu())
            all_labels_reg.append(label_reg.cpu())
        
    all_preds_cls = torch.cat(all_preds_cls)
    all_labels_cls = torch.cat(all_labels_cls)
    all_preds_reg = torch.cat(all_preds_reg).squeeze() # Squeeze to 1D
    all_labels_reg = torch.cat(all_labels_reg)

    accuracy = acc.compute(predictions=all_preds_cls, references=all_labels_cls)
    pearson_corr = psr.compute(predictions=all_preds_reg, references=all_labels_reg)
    accuracy_value = accuracy['accuracy']
    pearson_value = pearson_corr['pearsonr']
            # print(f"F1 Score: {f1.compute()}")
    print(f"Epoch {ep}: validation accuracy: {accuracy_value}, pearson_value: {pearson_value} ")
        
    if pearson_value + accuracy_value > best_score:
        best_score = pearson_value + accuracy_value
        torch.save(model.state_dict(), f'{checkpoint_dir}/best_model.ckpt')
        print(f"New best score: {best_score:.4f}. Model saved.")

  return F.mse_loss(input, target, reduction=self.reduction)
Training epoch [1/3]:  81%|████████  | 454/563 [00:28<00:06, 15.81it/s, loss=1.99] 


KeyboardInterrupt: 

In [None]:
# Load the model
model = MultiLabelModel().to(device)
model.load_state_dict(torch.load(f"./saved_models/best_model.ckpt", weights_only=True))

# Test Loop
pbar = tqdm(dl_test, desc="Test")
model.eval()

# TODO6: Write the test loop
# Write your code here
# We have loaded the best model with the highest evaluation score for you
# Please implement the test loop to evaluate the model on the test dataset
# We will have 10% of the total score for the test accuracy and pearson correlation
all_preds_cls = []
all_labels_cls = []
all_preds_reg = []
all_labels_reg = []
with torch.no_grad():
    for batch in pbar:
        device_batch = {k:v.to(device) for k, v in batch.items()}
        classification_logits, regression_score = model.forward(**device_batch)
        label_cls = device_batch["entailment_judgment"]
        label_reg = device_batch["relatedness_score"]
        preds_cls = torch.argmax(classification_logits, dim=1)
        
        all_preds_cls.append(preds_cls.cpu())
        all_labels_cls.append(label_cls.cpu())
        all_preds_reg.append(regression_score.cpu())
        all_labels_reg.append(label_reg.cpu())
    
all_preds_cls = torch.cat(all_preds_cls)
all_labels_cls = torch.cat(all_labels_cls)
all_preds_reg = torch.cat(all_preds_reg).squeeze() # Squeeze to 1D
all_labels_reg = torch.cat(all_labels_reg)

accuracy = acc.compute(predictions=all_preds_cls, references=all_labels_cls)
pearson_corr = psr.compute(predictions=all_preds_reg, references=all_labels_reg)

print(f"test accuracy: {accuracy}")
print(f"test pearson_corr: {pearson_corr}")