In [1]:
from transformers import (
    set_seed,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#!pip install torch datasets seqeval

In [3]:
# Fix the global seed (transformers.set_seed) before any data or model work; the same value is also
# passed as `seed=` to TrainingArguments further down.
set_seed(123)

In [4]:
import torch

# Probe CUDA once, derive the device string from that answer, and display both.
cuda_available = torch.cuda.is_available()
DEVICE = "cuda" if cuda_available else "cpu"
cuda_available, DEVICE

(True, 'cuda')

In [5]:


from datasets import load_dataset

# One JSON file holds every example; it is carved into 80% / 10% / 10% slices by percentage.
data_files = "./train_jp.json"
split_slices = {
    "train": "train[:80%]",
    "validation": "train[80%:90%]",
    "test": "train[90%:]",
}
datasets = load_dataset("json", data_files=data_files, split=split_slices)

print(datasets)

DatasetDict({
    train: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 6707
    })
    validation: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 745
    })
})


In [6]:
# Inspect the schema of the training split; both values are reused by later cells.
train_split = datasets["train"]
column_names, features = train_split.column_names, train_split.features

column_names, features

(['tokens', 'tags'],
 {'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
  'tags': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)})

In [7]:
# Resolve which columns hold the words and the per-word labels: prefer the conventional names and
# otherwise fall back on column order (first column = text, second column = labels).
# The label name is a plain literal; the original f-strings had no placeholders.
text_column_name = "tokens" if "tokens" in column_names else column_names[0]
label_column_name = "pos_tags" if "pos_tags" in column_names else column_names[1]

text_column_name, label_column_name

('tokens', 'tags')

In [8]:
from datasets import ClassLabel

# Is the label column already typed as Sequence[ClassLabel]? If not, the tag set is collected manually.
label_feature = features[label_column_name].feature
isinstance(label_feature, ClassLabel)

False

In [9]:
# When the labels are not a `Sequence[ClassLabel]`, the dataset has to be scanned to find the
# unique labels.
def get_label_list(labels):
    """Return every distinct tag found in ``labels`` as an ascending sorted list.

    Args:
        labels: Iterable of per-example tag sequences.

    Returns:
        Sorted list of the unique tags across all examples (empty if there are none).
    """
    seen = set()
    for example_tags in labels:
        seen.update(example_tags)
    return sorted(seen)

# Collect the tag vocabulary from the training split and give each tag its position in the sorted
# list as its integer id.
label_list = get_label_list(datasets["train"][label_column_name])
num_labels = len(label_list)
label_to_id = dict(zip(label_list, range(num_labels)))

label_list, label_to_id

(['B', 'I', 'O'], {'B': 0, 'I': 1, 'O': 2})

In [10]:
from accelerate import Accelerator

# Initialize accelerator; its `device` is where the model is moved right before the Trainer is built.
accelerator = Accelerator()

# Display the device the accelerator selected.
accelerator.device

device(type='cuda')

In [11]:
from transformers import AutoConfig, BertTokenizerFast, BertForTokenClassification


# Hub id (or local directory) of the pretrained encoder that is fine-tuned below.
model_name_or_path = "agne/jobGBERT"

config = AutoConfig.from_pretrained(
    model_name_or_path,
    num_labels=num_labels,
    # Record the tag names in the config so that saved checkpoints report the dataset's own tags
    # instead of the generic LABEL_0 / LABEL_1 / ... placeholders.
    id2label=dict(enumerate(label_list)),
    label2id=dict(label_to_id),
    finetuning_task="pos",
)
tokenizer = BertTokenizerFast.from_pretrained(
    model_name_or_path,
)
# The token-classification head is sized by `config.num_labels`.
model = BertForTokenClassification.from_pretrained(
    model_name_or_path,
    config=config,
)

Some weights of the model checkpoint at agne/jobGBERT were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at agne/jobGBERT and are newly initialized: ['

In [12]:
# Padding strategy forwarded to the tokenizer call inside tokenize_and_align_labels.
# NOTE(review): "max_length" presumably pads every example at tokenization time, while the
# commented-out alternative `False` would leave padding to the data collator — confirm which is intended.
padding = "max_length" # False

In [13]:


# Whether to put the label for one word on all tokens generated by that word or just on the first one
# (in which case the other tokens will have a padding index).
label_all_tokens = False

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and project word-level tags onto the tokens.

    Args:
        examples: Batch mapping that holds, under ``text_column_name``, a list of word lists and,
            under ``label_column_name``, the matching list of per-word tag lists.

    Returns:
        The tokenizer output for the batch with an extra ``"labels"`` entry: one list of label ids
        per example, aligned token by token.
    """
    encoded = tokenizer(
        examples[text_column_name],
        padding=padding,
        truncation=True,
        # The inputs are already lists of words (one label per word), not raw strings.
        is_split_into_words=True,
    )

    aligned_labels = []
    for batch_index, word_labels in enumerate(examples[label_column_name]):
        token_labels = []
        last_word = None
        for word in encoded.word_ids(batch_index=batch_index):
            if word is None:
                # Tokens with no source word (special tokens) get -100 so the loss ignores them.
                token_label = -100
            elif word != last_word or label_all_tokens:
                # First token of a word, or any token of it when label_all_tokens is set.
                token_label = label_to_id[word_labels[word]]
            else:
                # Continuation token of a word that was already labelled.
                token_label = -100
            token_labels.append(token_label)
            last_word = word
        aligned_labels.append(token_labels)

    encoded["labels"] = aligned_labels
    return encoded


# Run tokenize_and_align_labels over every split in batches; the result (original columns plus the
# tokenizer outputs and "labels") is bound back to `datasets`.
#datasets.set_format("torch", columns=["input_ids", "attention_mask"])
datasets = datasets.map(
    tokenize_and_align_labels,
    batched=True,
    num_proc=None,
    load_from_cache_file=None,
)

# Both splits were already processed by the `datasets.map(...)` call above, so they are taken as-is.
train_dataset, eval_dataset = datasets["train"], datasets["validation"]


In [14]:
from transformers import DataCollatorForTokenClassification

# Mixed-precision switch for this cell: batches are padded to a multiple of 8 only when it is on.
fp16 = False
pad_multiple = 8 if fp16 else None

data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=pad_multiple)



In [15]:
# Metrics
from datasets import load_metric
# The metric is loaded from a local script that sits next to the notebook.
metric = load_metric("./my_seqeval.py")

# When True, compute_metrics returns every entry of the metric output (nested dicts flattened);
# when False, only the overall precision / recall / F1 / accuracy.
return_entity_level_metrics = False

def compute_metrics(p):
    """Turn raw model outputs into sequence-labelling scores for the Trainer.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` is arg-maxed over axis 2; both are then
            walked row by row, and positions whose label is ``-100`` are dropped before scoring.

    Returns:
        A flat dict of metrics. When ``return_entity_level_metrics`` is true, every entry of the
        metric output is kept (nested dicts flattened to ``"<key>_<subkey>"``); otherwise only the
        overall precision, recall, F1 and accuracy are returned.

    Side effects:
        In the overall-only mode the complete metric output is also written to ``metrics.json``.
    """
    # Imported locally because no cell of the notebook imports them; the original body raised
    # NameError on `np` (and would have on `json`) the first time an evaluation ran.
    import json

    import numpy as np

    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens) and map the remaining ids back to tag strings.
    true_predictions = [
        [label_list[pred_id] for (pred_id, gold_id) in zip(prediction, label) if gold_id != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[gold_id] for (pred_id, gold_id) in zip(prediction, label) if gold_id != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    if return_entity_level_metrics:
        # Unpack nested dictionaries
        final_results = {}
        for key, value in results.items():
            if isinstance(value, dict):
                for n, v in value.items():
                    final_results[f"{key}_{n}"] = v
            else:
                final_results[key] = value
        return final_results

    with open("metrics.json", "w") as f:
        # Dump `results` (the original referenced an undefined `computed_metrics`). The `default`
        # hook unwraps NumPy scalars, which the json module cannot serialize by itself.
        json.dump(
            results,
            f,
            default=lambda value: value.item() if hasattr(value, "item") else str(value),
        )

    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }



  metric = load_metric("./my_seqeval.py")


In [16]:
import os
from transformers.trainer_utils import get_last_checkpoint

# Where checkpoints and the final model are written, and whether an existing run may be ignored.
output_dir = "./out_new3"
overwrite_output_dir = False

# Look for a checkpoint to resume from only when the directory exists and overwriting is off.
may_resume = (not overwrite_output_dir) and os.path.isdir(output_dir)
last_checkpoint = get_last_checkpoint(output_dir) if may_resume else None


In [17]:
from transformers import Trainer, TrainingArguments

# `per_device_eval_batch_size` replaces the deprecated `per_gpu_eval_batch_size` spelling and matches
# the `per_device_train_batch_size` argument next to it.
training_args = TrainingArguments(
    output_dir=output_dir,
    do_train=True,
    do_eval=True,
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    save_steps=1000,
    seed=123,
)

model.to(accelerator.device)
# Initialize our Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    # The validation split is only attached when evaluation is enabled in the arguments.
    eval_dataset=eval_dataset if training_args.do_eval else None,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
#import torch
#torch.cuda.empty_cache()

In [None]:
# Name of the first CUDA device. Guarded so the cell also runs on the CPU fallback that DEVICE allows
# for, instead of raising when no GPU is present.
gpu_name = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "no CUDA device"
gpu_name

In [18]:
# Resume from the newest checkpoint found in output_dir; failing that, from model_name_or_path when
# it is a local directory; otherwise start training from scratch (checkpoint stays None).
checkpoint = last_checkpoint
if checkpoint is None and os.path.isdir(model_name_or_path):
    checkpoint = model_name_or_path

train_result = trainer.train(resume_from_checkpoint=checkpoint)
trainer.save_model()  # Saves the tokenizer too for easy upload

# Add the training-set size to the metrics before they are logged and written out.
metrics = train_result.metrics
metrics["train_samples"] = len(train_dataset)

trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

Loading model from ./out_new3\checkpoint-2000).
The following columns in the training set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, tags.
***** Running training *****
  Num examples = 6707
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 8390
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 2
  Continuing training from global step 2000
  Will skip the first 2 epochs then the first 322 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.
Skipping the first batches: 100%|████████████████████████████████████████████████████| 322/322 [00:17<00:00, 18.42it/s]


Step,Training Loss
2500,0.2241
3000,0.1762
3500,0.1831
4000,0.1366
4500,0.1149
5000,0.0973
5500,0.0758
6000,0.0628
6500,0.0559
7000,0.0507


Saving model checkpoint to ./out_new3\checkpoint-3000
Configuration saved in ./out_new3\checkpoint-3000\config.json
Model weights saved in ./out_new3\checkpoint-3000\pytorch_model.bin
tokenizer config file saved in ./out_new3\checkpoint-3000\tokenizer_config.json
Special tokens file saved in ./out_new3\checkpoint-3000\special_tokens_map.json
Saving model checkpoint to ./out_new3\checkpoint-4000
Configuration saved in ./out_new3\checkpoint-4000\config.json
Model weights saved in ./out_new3\checkpoint-4000\pytorch_model.bin
tokenizer config file saved in ./out_new3\checkpoint-4000\tokenizer_config.json
Special tokens file saved in ./out_new3\checkpoint-4000\special_tokens_map.json
Saving model checkpoint to ./out_new3\checkpoint-5000
Configuration saved in ./out_new3\checkpoint-5000\config.json
Model weights saved in ./out_new3\checkpoint-5000\pytorch_model.bin
tokenizer config file saved in ./out_new3\checkpoint-5000\tokenizer_config.json
Special tokens file saved in ./out_new3\checkpoi

***** train metrics *****
  epoch                    =       10.0
  total_flos               = 16321737GF
  train_loss               =      0.076
  train_runtime            = 6:17:25.09
  train_samples            =       6707
  train_samples_per_second =      2.962
  train_steps_per_second   =       0.37
