In [7]:
from datasets import load_dataset, DatasetDict, Dataset, concatenate_datasets
import random
import string

from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig, DataCollatorWithPadding, AdamW, get_scheduler, MarianMTModel, MarianTokenizer
import torch
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from IPython.display import clear_output
from torch.nn.utils.rnn import pad_sequence
import gc

In [None]:
# # load tu file hue va seoud-label
# hue_dataset = load_dataset("csv", data_files="../data/hue-label.csv")
# hue_dataset['train'] = hue_dataset["train"].filter(lambda x: x["labels"] is not None)
# seoul_dataset = load_dataset("csv", data_files="../data/seoul-label.csv")
# seoul_dataset['train'] = seoul_dataset["train"].filter(lambda x: x["labels"] is not None)
# # drop cot
# hue_dataset = hue_dataset.remove_columns(["positive_score", "negative_score", "neutral_score", "vn_text"])
# seoul_dataset = seoul_dataset.remove_columns(["vn_text", "confidence"])
# # gop 2 datasets vao datasets
# raw_dataset = DatasetDict({'train': concatenate_datasets([hue_dataset['train'], seoul_dataset['train']])})
# # loai bo "en: " o cot ex_text
# def remove_en(example):
#     return {"en_text": example["en_text"].replace("en: ", "")}
# raw_dataset = raw_dataset.map(remove_en)
# # them aspect all vao cot labels
# def add_all(example):
#     return {"labels": 'all-' + example["labels"]}
# raw_dataset = raw_dataset.map(add_all)
# # chia ra train va test
# raw_dataset = raw_dataset['train'].train_test_split(test_size=0.2, shuffle=True)
# # luu files
# raw_dataset["train"].to_csv('../data/raw_dataset/train.csv')
# raw_dataset["test"].to_csv('../data/raw_dataset/test.csv')

load and process data

In [8]:
data_files = {
    "train": "../data/raw_dataset/train.csv",
    "test": "../data/raw_dataset/test.csv"
}
raw_dataset = load_dataset("csv", data_files=data_files)

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Tang cuong va can bang du lieu

In [None]:
# Sá»­ dá»¥ng thiáº¿t bá»‹ (GPU náº¿u cÃ³ sáºµn, náº¿u khÃ´ng thÃ¬ sá»­ dá»¥ng CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# Danh sÃ¡ch cÃ¡c ngÃ´n ngá»¯
languages = ['de', 'es', 'ru', 'fr']
translate_models = {}
translate_tokenizers = {}
# Táº£i mÃ´ hÃ¬nh vÃ  tokenizer, sau Ä‘Ã³ chuyá»ƒn mÃ´ hÃ¬nh sang thiáº¿t bá»‹
for language in languages:
    translate_models['to' + language] = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-" + language).to(device)
    translate_tokenizers['to' + language] = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-" + language)
    translate_models['from' + language] = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-" + language + "-en").to(device)
    translate_tokenizers['from' + language] = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-" + language + "-en")
languages.append('en')

In [9]:
# HÃ m back-translate
def back_translate(texts):
    language = random.choice(languages)
    if language == 'en':
        return texts
    # Dá»‹ch tá»« English sang ngÃ´n ngá»¯ Ä‘Ã­ch
    model = translate_models['to' + language]
    tokenizer = translate_tokenizers['to' + language]
    with torch.no_grad():
        inputs = tokenizer(texts, return_tensors="pt", padding=True).to(device)
        translated = model.generate(**inputs)
    texts = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
    # Giáº£i phÃ³ng bá»™ nhá»› GPU cho model vÃ  inputs
    del inputs
    del translated
    torch.cuda.empty_cache()
    # Dá»‹ch tá»« ngÃ´n ngá»¯ Ä‘Ã­ch quay láº¡i English
    model = translate_models['from' + language]
    tokenizer = translate_tokenizers['from' + language]
    with torch.no_grad():
        inputs = tokenizer(texts, return_tensors="pt", padding=True).to(device)
        translated = model.generate(**inputs)
    texts = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
    # Giáº£i phÃ³ng bá»™ nhá»› GPU cho model vÃ  inputs
    del inputs
    del translated
    torch.cuda.empty_cache()
    del model
    del tokenizer
    del language
    gc.collect()
    return texts
# loai bo aspect cua cau sau
def remove_aspect(text):
    text = text.replace("all: ", "")
    text = text.replace("amenities: ", "")
    text = text.replace("cultural heritage: ", "")
    text = text.replace("people: ", "")
    text = text.replace("management: ", "")
    text = text.replace("nature: ", "")
    return text
# loai bo dau cau cua cau truoc
def remove_last_punctuation(text):
    while text[-1] in string.punctuation:
        text = text[:-1]
    return text

def create_train_dataset(raw_datasets, batch_size=48, use_back_translate=True):
    datasets = DatasetDict({'train': raw_datasets['train'].shuffle()})
    if use_back_translate:
        en_text = datasets['train']['en_text']
        new_en_text = []
        for i in range(0, len(en_text), batch_size):
            new_en_text.extend(back_translate(en_text[i : i + batch_size]))
        datasets['train'] = datasets['train'].remove_columns('en_text').add_column('en_text', new_en_text)
    #################################################################################################
    for name in datasets:
        data_dict = {"en_text": [], "labels": [], "type": []}
        for item in datasets[name]:
            text = item['en_text']
            aspect2label = {'all': 0, 'amn': 0, 'ch': 0, 'ppl': 0, 'mgt': 0, 'nat': 0,}
            labels = item['labels'].split()
            for label in labels:
                try:
                    key, value = label.split('-')
                except:
                    print("Unknown label with text:" + text)
                if(key not in aspect2label or value not in ['0', '1', '2', '3']):
                    raise Exception("Unknown label:", label)
                aspect2label[key] = int(value)
            data_dict["en_text"].append("all: " + text);        data_dict["labels"].append(aspect2label['all']);  data_dict["type"].append('all-' + str(aspect2label['all']))
            data_dict["en_text"].append("amenities: " + text);         data_dict["labels"].append(aspect2label['amn']);  data_dict["type"].append('amn-' + str(aspect2label['amn']))
            data_dict["en_text"].append("cultural heritage: " + text); data_dict["labels"].append(aspect2label['ch']);   data_dict["type"].append('ch-' + str(aspect2label['ch']))
            data_dict["en_text"].append("people: " + text);            data_dict["labels"].append(aspect2label['ppl']);  data_dict["type"].append('ppl-' + str(aspect2label['ppl']))
            data_dict["en_text"].append("management: " + text);        data_dict["labels"].append(aspect2label['mgt']);  data_dict["type"].append('mgt-' + str(aspect2label['mgt']))
            data_dict["en_text"].append("nature: " + text);            data_dict["labels"].append(aspect2label['nat']);  data_dict["type"].append('nat-' + str(aspect2label['nat']))
        datasets[name] = Dataset.from_dict(DatasetDict(data_dict))
    #################################################################################################
    type_counts = {}
    for type_ in datasets['train']['type']:
        if type_ in type_counts:
            type_counts[type_] += 1
        else:
            type_counts[type_] = 1
    max_count = max(type_counts.values())
    add_datasets = {"en_text": [], "labels": [], "type": []}
    #-------------------------------------------------------------------------------------------------
    type_3 = ["all-3", "amn-3", "ch-3", "ppl-3", "mgt-3", "nat-3"]
    for type in type_3:
        tmp_1_datasets = datasets['train'].filter(lambda x: x["type"] == type.replace('3', '1'))
        tmp_2_datasets = datasets['train'].filter(lambda x: x["type"] == type.replace('3', '2'))
        tmp_3_datasets = datasets['train'].filter(lambda x: x["type"] == type)
        num_rows_1 = tmp_1_datasets.num_rows
        num_rows_2 = tmp_2_datasets.num_rows
        num_rows_3 = tmp_3_datasets.num_rows
        for i in range(num_rows_3, max_count, 2):
            # 3 + any
            first_sentence = tmp_3_datasets[random.randint(0, num_rows_3 - 1)]["en_text"]
            second_sentence = datasets['train'][random.randint(0, datasets['train'].num_rows - 1)]["en_text"]
            if random.randint(0, 1):
                add_datasets['en_text'].append(remove_last_punctuation(first_sentence) + ", " + remove_aspect(second_sentence))
                add_datasets['labels'].append(3)
                add_datasets['type'].append(type)
            else:
                add_datasets['en_text'].append(remove_last_punctuation(second_sentence) + ", " + remove_aspect(first_sentence))
                add_datasets['labels'].append(3)
                add_datasets['type'].append(type)
            # 1 + 2
            first_sentence = tmp_1_datasets[random.randint(0, num_rows_1 - 1)]["en_text"]
            second_sentence = tmp_2_datasets[random.randint(0, num_rows_2 - 1)]["en_text"]
            if random.randint(0, 1):
                add_datasets['en_text'].append(remove_last_punctuation(first_sentence) + ", " + remove_aspect(second_sentence))
                add_datasets['labels'].append(3)
                add_datasets['type'].append(type)
            else:
                add_datasets['en_text'].append(remove_last_punctuation(second_sentence) + ", " + remove_aspect(first_sentence))
                add_datasets['labels'].append(3)
                add_datasets['type'].append(type)
    #-------------------------------------------------------------------------------------------------
    type_0 = ["all-0", "amn-0", "ch-0", "ppl-0", "mgt-0", "nat-0"]
    for type in type_0:
        tmp_datasets = datasets['train'].filter(lambda x: x["type"] == type)
        num_rows = tmp_datasets.num_rows
        for i in range(num_rows, max_count):
            # 0 + 0
            first_sentence = tmp_datasets[random.randint(0, num_rows - 1)]["en_text"]
            second_sentence = tmp_datasets[random.randint(0, num_rows - 1)]["en_text"]
            add_datasets['en_text'].append(remove_last_punctuation(first_sentence) + ", " + remove_aspect(second_sentence))
            add_datasets['labels'].append(0)
            add_datasets['type'].append(type)
    #-------------------------------------------------------------------------------------------------
    type_1 = ["all-1", "amn-1", "ch-1", "ppl-1", "mgt-1", "nat-1"]
    for type in type_1:
        tmp_0_datasets = datasets['train'].filter(lambda x: x["type"] == type.replace('1', '0'))
        tmp_1_datasets = datasets['train'].filter(lambda x: x["type"] == type)
        num_rows_0 = tmp_0_datasets.num_rows
        num_rows_1 = tmp_1_datasets.num_rows
        for i in range(num_rows_1, max_count, 2):
            # 0 + 1
            first_sentence = tmp_0_datasets[random.randint(0, num_rows_0 - 1)]["en_text"]
            second_sentence = tmp_1_datasets[random.randint(0, num_rows_1 - 1)]["en_text"]
            if random.randint(0, 1):
                add_datasets['en_text'].append(remove_last_punctuation(first_sentence) + ", " + remove_aspect(second_sentence))
                add_datasets['labels'].append(1)
                add_datasets['type'].append(type)
            else:
                add_datasets['en_text'].append(remove_last_punctuation(second_sentence) + ", " + remove_aspect(first_sentence))
                add_datasets['labels'].append(1)
                add_datasets['type'].append(type)
            # 1 + 1
            first_sentence = tmp_1_datasets[random.randint(0, num_rows_1 - 1)]["en_text"]
            second_sentence = tmp_1_datasets[random.randint(0, num_rows_1 - 1)]["en_text"]
            add_datasets['en_text'].append(remove_last_punctuation(first_sentence) + ", " + remove_aspect(second_sentence))
            add_datasets['labels'].append(1)
            add_datasets['type'].append(type)
    #-------------------------------------------------------------------------------------------------
    type_2 = ["all-2", "amn-2", "ch-2", "ppl-2", "mgt-2", "nat-2"]
    for type in type_2:
        tmp_0_datasets = datasets['train'].filter(lambda x: x["type"] == type.replace('2', '0'))
        tmp_2_datasets = datasets['train'].filter(lambda x: x["type"] == type)
        num_rows_0 = tmp_0_datasets.num_rows
        num_rows_2 = tmp_2_datasets.num_rows
        for i in range(num_rows_2, max_count, 2):
            # 0 + 2
            first_sentence = tmp_0_datasets[random.randint(0, num_rows_0 - 1)]["en_text"]
            second_sentence = tmp_2_datasets[random.randint(0, num_rows_2 - 1)]["en_text"]
            if random.randint(0, 1):
                add_datasets['en_text'].append(remove_last_punctuation(first_sentence) + ", " + remove_aspect(second_sentence))
                add_datasets['labels'].append(2)
                add_datasets['type'].append(type)
            else:
                add_datasets['en_text'].append(remove_last_punctuation(second_sentence) + ", " + remove_aspect(first_sentence))
                add_datasets['labels'].append(2)
                add_datasets['type'].append(type)
            # 2 + 2
            first_sentence = tmp_2_datasets[random.randint(0, num_rows_2 - 1)]["en_text"]
            second_sentence = tmp_2_datasets[random.randint(0, num_rows_2 - 1)]["en_text"]
            add_datasets['en_text'].append(remove_last_punctuation(first_sentence) + ", " + remove_aspect(second_sentence))
            add_datasets['labels'].append(2)
            add_datasets['type'].append(type)
    clear_output()
    return DatasetDict({"train": concatenate_datasets([datasets["train"], Dataset.from_dict(DatasetDict(add_datasets))])})

def create_test_dataset(raw_datasets):
    datasets = DatasetDict({
        'train': raw_datasets['train'],
        'test': raw_datasets['test']})
    for name in datasets:
        data_dict = {"en_text": [], "labels": [], "type": []}
        for item in datasets[name]:
            text = item['en_text']
            aspect2label = {'all': 0, 'amn': 0, 'ch': 0, 'ppl': 0, 'mgt': 0, 'nat': 0,}
            labels = item['labels'].split()
            for label in labels:
                try:
                    key, value = label.split('-')
                except:
                    print("Unknown label with text:" + text)
                if(key not in aspect2label or value not in ['0', '1', '2', '3']):
                    raise Exception("Unknown label:", label)
                aspect2label[key] = int(value)
            data_dict["en_text"].append("all: " + text);        data_dict["labels"].append(aspect2label['all']);  data_dict["type"].append('all-' + str(aspect2label['all']))
            data_dict["en_text"].append("amenities: " + text);         data_dict["labels"].append(aspect2label['amn']);  data_dict["type"].append('amn-' + str(aspect2label['amn']))
            data_dict["en_text"].append("cultural heritage: " + text); data_dict["labels"].append(aspect2label['ch']);   data_dict["type"].append('ch-' + str(aspect2label['ch']))
            data_dict["en_text"].append("people: " + text);            data_dict["labels"].append(aspect2label['ppl']);  data_dict["type"].append('ppl-' + str(aspect2label['ppl']))
            data_dict["en_text"].append("management: " + text);        data_dict["labels"].append(aspect2label['mgt']);  data_dict["type"].append('mgt-' + str(aspect2label['mgt']))
            data_dict["en_text"].append("nature: " + text);            data_dict["labels"].append(aspect2label['nat']);  data_dict["type"].append('nat-' + str(aspect2label['nat']))
        datasets[name] = Dataset.from_dict(DatasetDict(data_dict))
    return datasets

load model

In [4]:
# upload model to huggingface
# !git config --global user.email "khanh.tq2802@outlook.com"
# !git config --global user.name "khanhtq2802"
# from huggingface_hub import notebook_login
# notebook_login()
tokenizer.push_to_hub("thesis-model")
config.push_to_hub("thesis-model")
model.push_to_hub("thesis-model")
# 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

CommitInfo(commit_url='https://huggingface.co/khanhtq2802/thesis-model/commit/3dca0f5b1458d79e2e256f34b1f3755b2cae55b6', commit_message='Upload RobertaForSequenceClassification', commit_description='', oid='3dca0f5b1458d79e2e256f34b1f3755b2cae55b6', pr_url=None, pr_revision=None, pr_num=None)

In [10]:
# load model, set freeze, set device
tokenizer = AutoTokenizer.from_pretrained("khanhtq2802/thesis-model")
config = AutoConfig.from_pretrained("khanhtq2802/thesis-model")
# model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest", num_labels=4, ignore_mismatched_sizes=True)
# model = AutoModelForSequenceClassification.from_pretrained("khanhtq2802/thesis-model", force_download=True)
# model = AutoModelForSequenceClassification.from_pretrained("../best_weights")
# model = AutoModelForSequenceClassification.from_pretrained("../late_weights")
model = AutoModelForSequenceClassification.from_pretrained("../mlm_backtranslate")
# Freeze all layers
for name, param in model.named_parameters():
    param.requires_grad = False
# unFreeze
for name, param in model.named_parameters():
    if name.startswith("classifier.out_proj"):
        param.requires_grad = True
for name, param in model.named_parameters():
    if name.startswith("classifier.dense"):
        param.requires_grad = True
for name, param in model.named_parameters():
    if name.startswith("roberta.encoder.layer.11"):
        param.requires_grad = True
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
print(device)
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Trainable Parameters:", trainable_params)



config.json:   0%|          | 0.00/970 [00:00<?, ?B/s]

cuda
Trainable Parameters: 7681540


In [None]:
model

In [11]:
# Danh sÃ¡ch cÃ¡c aspect
aspects = ["all: ", "amenities: ", "cultural heritage: ", "people: ", "management: ", "nature: ",]
skip_tokens = 0
# Chuyá»ƒn cÃ¡c aspect sang token vÃ  Ä‘áº¿m sá»‘ lÆ°á»£ng token
for aspect in aspects:
    tokens = tokenizer.tokenize(aspect)
    skip_tokens = max(skip_tokens, len(tokens))
print(skip_tokens)
class CustomDataCollator:
    def __init__(self, tokenizer, mlm_probability=0.15, skip_tokens=4):
        self.tokenizer = tokenizer
        self.mlm_probability = mlm_probability
        self.skip_tokens = skip_tokens
    def __call__(self, examples):
        input_ids = [example['input_ids'] for example in examples]
        attention_mask = [example['attention_mask'] for example in examples]
        labels = [example['labels'] for example in examples]
        # Mask tokens with probability self.mlm_probability
        for i in range(len(input_ids)):
            for j in range(self.skip_tokens + 1, len(input_ids[i])):
                if random.random() < self.mlm_probability:
                    input_ids[i][j] = self.tokenizer.mask_token_id
        # Convert input_ids to tensor
        input_ids = pad_sequence([torch.tensor(sublist) for sublist in input_ids], batch_first=True, padding_value=1)
        # Pad attention_mask and convert to tensor
        attention_mask = pad_sequence([torch.tensor(sublist) for sublist in attention_mask], batch_first=True, padding_value=1)
        return {
            'labels': torch.tensor(labels),
            'input_ids': input_ids,
            'attention_mask': attention_mask,}
def tokenize_function(example):
    return tokenizer(example["en_text"], truncation=True)
data_collator_train = CustomDataCollator(
    tokenizer=tokenizer,
    mlm_probability=0.15,  # 15% masking rate
    skip_tokens=skip_tokens
)
data_collator_test = DataCollatorWithPadding(tokenizer=tokenizer)

4


In [12]:
# load loss history from file
training_losses = []
test_losses = []
try:
    with open("losses.txt", "r") as f:
        for line in f:
            if line.strip():  # Check if line is not empty
                training_loss, test_loss = line.split(",")
                training_losses.append(float(training_loss))
                test_losses.append(float(test_loss))
except:
    print("error when load training history")

train

In [None]:
test_dataloader = DataLoader(
    create_test_dataset(raw_dataset).map(tokenize_function, batched=True).remove_columns(['en_text', 'type'])['test'], 
    shuffle=False, 
    batch_size=24, 
    collate_fn=data_collator_test)
batch_size = 12
train_dataloader = DataLoader(
    create_train_dataset(raw_dataset, use_back_translate=True).map(tokenize_function, batched=True).remove_columns(["en_text", "type"])["train"], 
    shuffle=True, 
    batch_size=batch_size, 
    collate_fn=data_collator_train)

optimizer = AdamW(model.parameters(), lr=3e-5) #before 1e-5
num_epochs = 100
num_training_steps = num_epochs * len(train_dataloader)
decay = "cosine" #constant cosine linear

lr_scheduler = get_scheduler(
    decay,
    optimizer=optimizer,
    num_warmup_steps=len(train_dataloader),
    num_training_steps=num_training_steps)
for epoch in range(num_epochs):
    # train
    model.train()
    epoch_losses = []
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}

        loss = model(**batch).loss
        loss.backward()
        epoch_losses.append(loss.item())
        
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
    training_losses.append(sum(epoch_losses) / len(epoch_losses))
    # evaluation
    model.eval()  # Set the model to evaluation mode
    total_loss = 0
    with torch.no_grad():
        for batch in test_dataloader:
            total_loss += torch.nn.CrossEntropyLoss()(model(**{k: v.to(device) for k, v in batch.items() if k != 'labels'}).logits, batch['labels'].to(device)).item()
    test_losses.append(total_loss / len(test_dataloader))
    # Plotting
    clear_output(wait=True)
    plt.plot(training_losses, label="Training Loss")
    plt.plot(test_losses, label="test Loss")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.legend()
    plt.show()
    # save loss history to file
    with open("losses.txt", "w") as f:
        for i in range(len(training_losses)):
            f.write(f"{training_losses[i]},{test_losses[i]}\n")
    # save model
    model.save_pretrained("../late_weights")
    if test_losses[-1] == min(test_losses):
        model.save_pretrained("../best_weights")
    # re-create train_dataloader
    if len(test_losses) >= 2:
        if test_losses[-1] >= test_losses[-2]:
            torch.cuda.empty_cache()
            train_dataloader = DataLoader(
                create_train_dataset(raw_dataset, use_back_translate=True).map(tokenize_function, batched=True).remove_columns(["en_text", "type"])["train"], 
                shuffle=True,
                batch_size=batch_size, 
                collate_fn=data_collator_train)

In [None]:
# model.save_pretrained("../best_weights")

evaluate

In [13]:
def evaluate_model(model, raw_datasets, device, name='train'):
    dataset = create_test_dataset(raw_datasets)

    dataloader = DataLoader(
    dataset.map(tokenize_function, batched=True).remove_columns(['en_text', 'type'])[name], 
    shuffle=False,
    batch_size=16, 
    collate_fn=data_collator_test)

    model.eval()  # Set the model to evaluation mode
    total_acc, total_loss = 0, 0
    fail_count = {
        "false-positive": 
        {"all-0": 0, "amn-0": 0, "ch-0": 0, "mgt-0": 0, "nat-0": 0, "ppl-0": 0,
        "all-1": 0, "amn-1": 0, "ch-1": 0, "mgt-1": 0, "nat-1": 0, "ppl-1": 0,
        "all-2": 0, "amn-2": 0, "ch-2": 0, "mgt-2": 0, "nat-2": 0, "ppl-2": 0,
        "all-3": 0, "amn-3": 0, "ch-3": 0, "mgt-3": 0, "nat-3": 0, "ppl-3": 0, },

        "false-negative": 
        {"all-0": 0, "amn-0": 0, "ch-0": 0, "mgt-0": 0, "nat-0": 0, "ppl-0": 0,
        "all-1": 0, "amn-1": 0, "ch-1": 0, "mgt-1": 0, "nat-1": 0, "ppl-1": 0,
        "all-2": 0, "amn-2": 0, "ch-2": 0, "mgt-2": 0, "nat-2": 0, "ppl-2": 0,
        "all-3": 0, "amn-3": 0, "ch-3": 0, "mgt-3": 0, "nat-3": 0, "ppl-3": 0, },
    }
    type_counts = {}
    for type_ in dataset[name]['type']:
        if type_ in type_counts:
            type_counts[type_] += 1
        else:
            type_counts[type_] = 1

    with torch.no_grad():
        for batch in dataloader:
            # Move data to the specified device
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
            labels = batch['labels'].to(device)

            outputs = model(**inputs)
            logits = outputs.logits  # Access the model's output logits

            # Calculate loss (optional, for reference)
            loss_fn = torch.nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)
            total_loss += loss.item()

            # Calculate accuracy
            pred_labels = torch.argmax(logits, dim=1)  # Get predicted labels
            match_labels = pred_labels == labels
            for i in range(len(match_labels)):
                if match_labels[i] == True:
                    total_acc+=1
                else:
                    text = tokenizer.decode(batch['input_ids'][i])
                    if text.startswith("<s>all: "):
                        fail_count["false-positive"]["all-" + str(pred_labels[i].item())] += 1
                        fail_count["false-negative"]["all-" + str(labels[i].item())] += 1
                    if text.startswith("<s>amenities: "):
                        fail_count["false-positive"]["amn-" + str(pred_labels[i].item())] += 1
                        fail_count["false-negative"]["amn-" + str(labels[i].item())] += 1
                    if text.startswith("<s>cultural heritage: "):
                        fail_count["false-positive"]["ch-" + str(pred_labels[i].item())] += 1
                        fail_count["false-negative"]["ch-" + str(labels[i].item())] += 1
                    if text.startswith("<s>history: "):
                        fail_count["false-positive"]["hist-" + str(pred_labels[i].item())] += 1
                        fail_count["false-negative"]["hist-" + str(labels[i].item())] += 1
                    if text.startswith("<s>people: "):
                        fail_count["false-positive"]["ppl-" + str(pred_labels[i].item())] += 1
                        fail_count["false-negative"]["ppl-" + str(labels[i].item())] += 1
                    if text.startswith("<s>management: "):
                        fail_count["false-positive"]["mgt-" + str(pred_labels[i].item())] += 1
                        fail_count["false-negative"]["mgt-" + str(labels[i].item())] += 1
                    if text.startswith("<s>nature: "):
                        fail_count["false-positive"]["nat-" + str(pred_labels[i].item())] += 1
                        fail_count["false-negative"]["nat-" + str(labels[i].item())] += 1
                    print(text)
                    print("pred: ", pred_labels[i])
                    print("labels: ", labels[i])
                    print()
    recalls = []; precisions = []; f1s = []
    # recall, precision, f1
    for key in type_counts:
        if type_counts[key] > 0:
            true_positive = type_counts[key] - fail_count["false-negative"][key]
            recall = 0; precision = 0; f1 = 0
            if true_positive != 0:
                recall = true_positive/type_counts[key]
                precision = true_positive/(true_positive + fail_count["false-positive"][key])
                f1 = 2*recall*precision/(recall + precision)
            print(key, "recall=", round(recall, 4), "precision=", round(precision, 4), "f1=", round(f1, 4))
            recalls.append(recall); precisions.append(precision); f1s.append(f1)
    # Accuracy
    print(total_acc, dataloader.dataset.num_rows)
    print("Accuracy:", round(total_acc / dataloader.dataset.num_rows, 5))
    # Loss
    print("Loss:", total_loss / len(dataloader))
    # Marco recall, precision, f1
    print("Marco-recall:", round(sum(recalls)/len(recalls), 5))
    print("Marco-precision:", round(sum(precisions)/len(precisions), 5))
    print("Marco-f1:", round(sum(f1s)/len(f1s), 5))

In [14]:
evaluate_model(model, raw_dataset, device, name='test')

Map:   0%|          | 0/17352 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/4344 [00:00<?, ? examples/s]

<s>all: We learned that the first emperor didn't like western women because they had ugly teeth (Vietnamese women dyed their teeth black, which was more to his liking.</s><pad>
pred:  tensor(2, device='cuda:0')
labels:  tensor(1, device='cuda:0')

<s>cultural heritage: We learned that the first emperor didn't like western women because they had ugly teeth (Vietnamese women dyed their teeth black, which was more to his liking.</s>
pred:  tensor(0, device='cuda:0')
labels:  tensor(1, device='cuda:0')

<s>amenities: The English secret garden tour at 130pm was great.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
pred:  tensor(0, device='cuda:0')
labels:  tensor(1, device='cuda:0')

<s>amenities: And I did not think I saw any timetable for the secret garden at the main entrance, may be it is a little 