In [None]:
import csv
import gc
import os
import random
from ast import literal_eval

import numpy as np
import pandas as pd
import tensorflow as tf
import torch

In [None]:
# Order CUDA devices by PCI bus id and expose none of them to this process
# (CUDA_VISIBLE_DEVICES is set to the empty string), then pick the torch
# device from whatever torch reports as available.
os.environ.update({"CUDA_DEVICE_ORDER": "PCI_BUS_ID", "CUDA_VISIBLE_DEVICES": ""})
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

In [None]:
# Seed every random number generator touched below (Python, NumPy, torch CPU
# and torch CUDA) with one value and switch cuDNN to its deterministic,
# non-benchmarking mode.
random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)
for _seed_fn in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(random_seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [None]:
# Root directory that every other location in this notebook is derived from.
path = "."

# Where the Trainer writes checkpoints and logs.
output_path = path + "/output"
logging_path = path + "/logs"

# Models and data live in the parent of `path`. The separator before ".." is
# required: without it "." + "../model" collapses to the bogus path ".../model".
model_path = path + "/../model"
data_path = path + "/../dataset/HUE_Named_Entity_Recognition"

# One CSV per split.
train_data_path = data_path + "/HUE_Named_Entity_Recognition_train.csv"
eval_data_path = data_path + "/HUE_Named_Entity_Recognition_dev.csv"
test_data_path = data_path + "/HUE_Named_Entity_Recognition_test.csv"

In [None]:
# Registry of the model variants trained below. Each entry names the tokenizer
# to load, the weights to start from (empty for the variant that is built from
# a fresh config), and the per-variant output and logging directories.
model_map = {
    "bert-not-pretrained": dict(
        tokenizer="bert-base-multilingual-cased",
        model="",
        output=f"{output_path}/bert-not-pretrained",
        logs=f"{logging_path}/bert-not-pretrained",
    ),
    "AnchiBERT": dict(
        tokenizer=f"{path}/model/AnchiBERT",  # path to AnchiBERT
        model=f"{path}/model/AnchiBERT",  # path to AnchiBERT
        output=f"{output_path}/AnchiBERT",
        logs=f"{logging_path}/AnchiBERT",
    ),
    "mBERT": dict(
        tokenizer="bert-base-multilingual-cased",
        model="bert-base-multilingual-cased",
        output=f"{output_path}/bert-base-multilingual-cased",
        logs=f"{logging_path}/bert-base-multilingual-cased",
    ),
    "AnchiBERT+AJD-DRS": dict(
        tokenizer=f"{model_path}/AnchiBERT+AJD-DRS",
        model=f"{model_path}/AnchiBERT+AJD-DRS",
        output=f"{output_path}/AnchiBERT+AJD-DRS",
        logs=f"{logging_path}/AnchiBERT+AJD-DRS",
    ),
    "mBERT+AJD-DRS": dict(
        tokenizer=f"{model_path}/mBERT+AJD-DRS",
        model=f"{model_path}/mBERT+AJD-DRS",
        output=f"{output_path}/mBERT+AJD-DRS",
        logs=f"{logging_path}/mBERT+AJD-DRS",
    ),
}

In [None]:
def _read_csv_tolerant(csv_path):
    """Read a CSV file, warning about and skipping malformed rows.

    `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0 in
    favour of `on_bad_lines`. The new keyword is tried first; "warn" matches the
    old behaviour of `error_bad_lines=False` with the default
    `warn_bad_lines=True`.

    Args:
        csv_path: Location of the CSV file.

    Returns:
        The parsed `pandas.DataFrame`.
    """
    try:
        return pd.read_csv(csv_path, on_bad_lines="warn")
    except TypeError:
        # pandas < 1.3 rejects the unknown `on_bad_lines` keyword.
        return pd.read_csv(csv_path, error_bad_lines=False)


train_data = _read_csv_tolerant(train_data_path)
eval_data = _read_csv_tolerant(eval_data_path)
test_data = _read_csv_tolerant(test_data_path)
# Drop every row that has a missing value in any column.
train_data, eval_data, test_data = train_data.dropna(), eval_data.dropna(), test_data.dropna()
train_data.head()

In [None]:
def data2feature(data):
    """Turn one split into parallel per-character token and tag sequences.

    Args:
        data: DataFrame with a `text_ch` column of strings and a `tag` column
            holding the string form of a Python sequence of tags
            (e.g. "['B-PER', 'O']").

    Returns:
        A `(texts, tags)` pair: `texts[i]` is the list of characters of row i's
        `text_ch`, and `tags[i]` is the list parsed from row i's `tag`.
    """
    # Read from the `data` argument (not a global frame) so each split yields
    # its own features.
    texts = [list(text) for text in data['text_ch']]
    # literal_eval only parses Python literals; it does not execute code.
    tags = [list(literal_eval(raw_tags)) for raw_tags in data['tag']]
    return texts, tags

In [None]:
# Extract (texts, tags) for the train, dev and test splits, in that order.
(train_texts, train_tags), (eval_texts, eval_tags), (test_texts, test_tags) = (
    data2feature(split) for split in (train_data, eval_data, test_data)
)

# Tag sequences of all three splits combined into one list.
tags = [*train_tags, *eval_tags, *test_tags]

In [None]:
# Every distinct tag that occurs in any document of `tags`.
unique_tags = set(tag for doc in tags for tag in doc)
# Number the tags in sorted order. Iterating the set directly would hand out
# ids in an order that can change from one interpreter run to the next, so
# id i now always denotes the i-th tag of the sorted tag list.
tag2id = {tag: idx for idx, tag in enumerate(sorted(unique_tags))}
# Inverse lookup: id -> tag.
id2tag = {idx: tag for tag, idx in tag2id.items()}

In [None]:
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with per-example labels.

    `encodings` is a mapping from feature name to an indexable collection with
    one entry per example; `labels` is an indexable collection of the same
    length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Number of examples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example `idx` as a dict of tensors, labels under 'labels'."""
        sample = {}
        for feature_name, column in self.encodings.items():
            sample[feature_name] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [None]:
def encode_tags(tags, encodings):
    """Align per-word tag ids with the token positions of each encoded document.

    Args:
        tags: One list of tag strings per document; every tag must be a key of
            the module-level `tag2id`.
        encodings: Object exposing `offset_mapping`, one sequence of
            (start, end) pairs per document.

    Returns:
        One list of ints per document, as long as that document's offset
        sequence. Positions whose offset starts at 0 and has a non-zero end
        (presumably the first sub-token of each pre-split word) receive the
        document's tag ids in order; every other position holds -100.
    """
    label_ids_per_doc = [[tag2id[tag] for tag in doc] for doc in tags]
    aligned_docs = []
    for label_ids, offsets in zip(label_ids_per_doc, encodings.offset_mapping):
        offset_arr = np.array(offsets)
        # True where the offset is (0, n) with n != 0.
        labelled = (offset_arr[:, 0] == 0) & (offset_arr[:, 1] != 0)
        # Never take more tag ids than there are labelled slots (capped at 512).
        n_slots = min(512, int(labelled.sum()))

        aligned = np.full(len(offsets), -100, dtype=int)
        aligned[labelled] = label_ids[:n_slots]
        aligned_docs.append(aligned.tolist())

    return aligned_docs

In [None]:
from datasets import load_metric

def compute_metrics(p):
    """Score token-level predictions with the `seqeval` metric.

    Args:
        p: A `(predictions, labels)` pair. `predictions` holds per-class scores
            and is arg-maxed over axis 2; `labels` holds the gold label ids,
            with -100 marking positions to ignore.

    Returns:
        Whatever `metric.compute` returns for the decoded tag sequences.
    """
    metric = load_metric("seqeval")

    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Decode ids with `id2tag`, the inverse of the `tag2id` mapping used to
    # encode the labels. Decoding through an independently ordered list would
    # attach the wrong tag names whenever the two orders differ.
    true_predictions = [
        [id2tag[int(pred_id)] for (pred_id, label_id) in zip(prediction, label) if label_id != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id2tag[int(label_id)] for (pred_id, label_id) in zip(prediction, label) if label_id != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)

    return results

In [None]:
from transformers import BertTokenizerFast, BertConfig, BertForTokenClassification, EarlyStoppingCallback, Trainer, TrainingArguments

# Per-model results: `evaluate` collects dev-set metrics, `predict` collects
# the prediction output on the test set.
predict, evaluate = {k: [] for k in model_map.keys()}, {k: [] for k in model_map.keys()}

for model_name, model_dict in model_map.items():
    tokenizer = BertTokenizerFast.from_pretrained(model_dict["tokenizer"])

    # Texts are already split into tokens; offsets are requested because
    # encode_tags uses them to place the labels.
    train_encodings = tokenizer(train_texts, is_split_into_words=True, return_offsets_mapping=True, truncation=True, padding=True)
    eval_encodings = tokenizer(eval_texts, is_split_into_words=True, return_offsets_mapping=True, truncation=True, padding=True)
    test_encodings = tokenizer(test_texts, is_split_into_words=True, return_offsets_mapping=True, truncation=True, padding=True)

    train_labels = encode_tags(train_tags, train_encodings)
    eval_labels = encode_tags(eval_tags, eval_encodings)
    test_labels = encode_tags(test_tags, test_encodings)

    # The offsets were only needed for label alignment; drop them so they are
    # not turned into tensors and handed to the model by CustomDataset.
    train_encodings.pop("offset_mapping")
    eval_encodings.pop("offset_mapping")
    test_encodings.pop("offset_mapping")

    train_dataset = CustomDataset(train_encodings, train_labels)
    eval_dataset = CustomDataset(eval_encodings, eval_labels)
    test_dataset = CustomDataset(test_encodings, test_labels)

    args = TrainingArguments(
        output_dir=model_dict["output"],
        overwrite_output_dir=True,
        logging_dir=model_dict["logs"],
        logging_steps=5000,
        logging_strategy="steps",
        num_train_epochs=30,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=5000,
        weight_decay=0.01,
        load_best_model_at_end=True,
        evaluation_strategy="steps",
        eval_steps=5000,
        save_steps=5000,
        metric_for_best_model="loss",
        do_train=True,
        do_eval=True,
        do_predict=True
    )
    if model_name == "bert-not-pretrained":
        # Randomly initialised model. Size the embedding table to the tokenizer
        # actually in use: the default BertConfig vocabulary is smaller than the
        # multilingual tokenizer's, so its token ids would index past the table.
        config = BertConfig(vocab_size=len(tokenizer))
        config.num_labels = len(unique_tags)
        model = BertForTokenClassification(config)
    else:
        model = BertForTokenClassification.from_pretrained(model_dict["model"], num_labels=len(unique_tags))
    trainer = Trainer(
        model=model,
        args=args,
        callbacks = [EarlyStoppingCallback(early_stopping_patience=3)],
        compute_metrics=compute_metrics,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset
    )

    trainer.train()
    # Dev-set metrics (Trainer.evaluate uses the eval_dataset given above).
    evaluate[model_name].append(trainer.evaluate())
    # Predict on the held-out test split; predicting on eval_dataset would only
    # repeat the evaluation above and leave test_dataset unused.
    predict[model_name].append(trainer.predict(test_dataset))

    # Release this model's objects before the next one is loaded.
    del tokenizer, model, args, trainer

    gc.collect()
    torch.cuda.empty_cache()