In [64]:
import json
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification
from seqeval.metrics import f1_score, accuracy_score, precision_score, recall_score
from transformers import TrainingArguments, Trainer
import shutil
import os

In [65]:
with open("tag.txt", "r") as f:
    base_labels = [line.strip() for line in f.readlines() if line.strip()]

label_list = ["O"]
for label in base_labels:
    label_list.append(f"B-{label}")
    label_list.append(f"I-{label}")

label2id = {label: idx for idx, label in enumerate(label_list)}
id2label = {idx: label for label, idx in label2id.items()}

print("label2id:", label2id)
print("id2label:", id2label)

with open("combined_data.json", "r", encoding="utf-8") as f:
    training_data = json.load(f)

dataset = Dataset.from_list(training_data)
dataset = dataset.train_test_split(test_size=0.2)
train_dataset = dataset["train"]
val_dataset = dataset["test"]

label2id: {'O': 0, 'B-PATIENT': 1, 'I-PATIENT': 2, 'B-DOCTOR': 3, 'I-DOCTOR': 4, 'B-USERNAME': 5, 'I-USERNAME': 6, 'B-PERSONALNAME': 7, 'I-PERSONALNAME': 8, 'B-FAMILYNAME': 9, 'I-FAMILYNAME': 10, 'B-HOSPITAL': 11, 'I-HOSPITAL': 12, 'B-DEPARTMENT': 13, 'I-DEPARTMENT': 14, 'B-ROOM': 15, 'I-ROOM': 16, 'B-STREET': 17, 'I-STREET': 18, 'B-CITY': 19, 'I-CITY': 20, 'B-DISTRICT': 21, 'I-DISTRICT': 22, 'B-COUNTY': 23, 'I-COUNTY': 24, 'B-STATE': 25, 'I-STATE': 26, 'B-COUNTRY': 27, 'I-COUNTRY': 28, 'B-ZIP': 29, 'I-ZIP': 30, 'B-ORGANIZATION': 31, 'I-ORGANIZATION': 32, 'B-LOCATION-OTHER': 33, 'I-LOCATION-OTHER': 34, 'B-AGE': 35, 'I-AGE': 36, 'B-DATE': 37, 'I-DATE': 38, 'B-TIME': 39, 'I-TIME': 40, 'B-DURATION': 41, 'I-DURATION': 42, 'B-SET': 43, 'I-SET': 44, 'B-CONTACT': 45, 'I-CONTACT': 46, 'B-PHONE': 47, 'I-PHONE': 48, 'B-FAX': 49, 'I-FAX': 50, 'B-EMAIL': 51, 'I-EMAIL': 52, 'B-URL': 53, 'I-URL': 54, 'B-IPADDRESS': 55, 'I-IPADDRESS': 56, 'B-SOCIAL_SECURITY_NUMBER': 57, 'I-SOCIAL_SECURITY_NUMBER': 58

In [66]:
model_path = "bert-base-cased"
# model_path = "../model/ner_shi_model"
model = AutoModelForTokenClassification.from_pretrained(model_path, num_labels=len(label_list), id2label=id2label, label2id=label2id)
tokenizer = AutoTokenizer.from_pretrained(model_path)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [67]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        padding="max_length",
        is_split_into_words=True
    )

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

def compute_metrics(p):
    predictions, labels = p
    predictions = predictions.argmax(axis=-1)

    true_labels = []
    true_predictions = []

    for pred, label in zip(predictions, labels):
        true_labels_example = []
        true_predictions_example = []
        for p_val, l_val in zip(pred, label):
            if l_val != -100:
                true_labels_example.append(id2label[l_val])
                true_predictions_example.append(id2label[p_val])
        true_labels.append(true_labels_example)
        true_predictions.append(true_predictions_example)

    return {
        "accuracy": accuracy_score(true_labels, true_predictions),
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions)
    }


In [68]:
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/307 [00:00<?, ? examples/s]

Map:   0%|          | 0/77 [00:00<?, ? examples/s]

In [69]:
training_args = TrainingArguments(
    output_dir="../model/ner_shi_model_checkpoint",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=25,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    save_total_limit=2,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [70]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.609,0.082575,0.987923,0.0,0.0,0.0
2,0.0575,0.05532,0.986175,0.45614,0.346667,0.393939
3,0.0428,0.046264,0.988241,0.589744,0.306667,0.403509
4,0.035,0.044412,0.987446,0.536585,0.293333,0.37931
5,0.0277,0.042342,0.989194,0.627119,0.493333,0.552239
6,0.0211,0.04422,0.989035,0.59322,0.466667,0.522388
7,0.0212,0.044411,0.987923,0.511628,0.586667,0.546584
8,0.0137,0.047328,0.988877,0.564706,0.64,0.6
9,0.0105,0.047987,0.988718,0.535714,0.6,0.566038
10,0.0095,0.048519,0.988718,0.54878,0.6,0.573248


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=390, training_loss=0.06650670507015326, metrics={'train_runtime': 3696.663, 'train_samples_per_second': 0.83, 'train_steps_per_second': 0.106, 'total_flos': 802710473472000.0, 'train_loss': 0.06650670507015326, 'epoch': 10.0})

In [71]:
model_dir = "../model/ner_shi_model"
if os.path.exists(model_dir):
    shutil.rmtree(model_dir)
trainer.save_model(model_dir)
tokenizer.save_pretrained(model_dir)

('../model/ner_shi_model\\tokenizer_config.json',
 '../model/ner_shi_model\\special_tokens_map.json',
 '../model/ner_shi_model\\vocab.txt',
 '../model/ner_shi_model\\added_tokens.json',
 '../model/ner_shi_model\\tokenizer.json')