In [136]:
from transformers import BertTokenizerFast, AutoTokenizer, DataCollatorForTokenClassification, AutoModelForTokenClassification
import numpy as np
import pandas as pd
import ast

In [154]:
data = pd.read_csv('encoded.csv').drop(columns=['Unnamed: 0','Polarities'])

In [156]:
train = data[0:2959]
test = data[2959:2959+370].reset_index()
valid = data[2959+370:].reset_index()

In [157]:
train_ds = datasets.Dataset.from_pandas(train)
test_ds = datasets.Dataset.from_pandas(test)
valid_ds = datasets.Dataset.from_pandas(valid)

In [158]:
dataset = datasets.DatasetDict(
    {
        "train": train_ds,
        "test": test_ds,
        "validation": test_ds
    }
)

In [159]:
model_ckpt = '/karayel/bert-base-turkish-uncased'

In [160]:
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [167]:
def tokenize_and_align_labels(row):
  tokenized_data = tokenizer(row["Tokens"], truncation = True, is_split_into_words = True)

  labels = []

  for i, label in enumerate(row["Tags"]):
    word_ids = tokenized_data.word_ids(i)


    label_ids=[]
    pre_word_index = None

    for word_index in word_ids:
      if word_index != pre_word_index:
        pre_word_index = word_index
        if word_index is None:
          label_ids.append(-100)
        else:
          label_ids.append(label[word_index])

      elif word_index is None:
        label_ids.append(-100)

      else:
        l = label[word_index]

        if l%2 == 1:
          l = l + 1
        label_ids.append(l)

    labels.append(label_ids)

  tokenized_data["labels"] = labels
  return tokenized_data

In [188]:
def make_list(example):
    example['Tokens'] = ast.literal_eval(example['Tokens'])
    example['Tags'] = ast.literal_eval(example['Tags'])
    return example

dataset = dataset.map(make_list)

Map:   0%|          | 0/2959 [00:00<?, ? examples/s]

Map:   0%|          | 0/370 [00:00<?, ? examples/s]

Map:   0%|          | 0/370 [00:00<?, ? examples/s]

In [193]:
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched = True)

Map:   0%|          | 0/2959 [00:00<?, ? examples/s]

Map:   0%|          | 0/370 [00:00<?, ? examples/s]

Map:   0%|          | 0/370 [00:00<?, ? examples/s]

In [194]:
tokenized_train_ds = tokenized_dataset["train"]
tokenized_test_ds = tokenized_dataset["test"]
tokenized_eval_ds = tokenized_dataset["validation"]

In [195]:
data_collator = DataCollatorForTokenClassification(tokenizer)

In [196]:
id2label = {0:'O', 1:'B-ORG', 2:'I-ORG'}

In [197]:
label2id = {'O':0, 'B-ORG':1, 'I-ORG':2}

In [198]:
num_labels=3

In [199]:
model_name = model_ckpt.split("/")[-1] + "for_ner"
output_dir_name = "finetuned_turkish_distilbert_for_ner"
output_tokenizer_name = "bert_tokenizer_ddi"
strategy = "epoch"
learning_rate = 2e-5
per_device_train_batch_size = 16
per_device_eval_batch_size = 16
num_train_epochs = 3
weight_decay = 0.01
metric_for_best_model = "eval_accuracy"

In [200]:
model = AutoModelForTokenClassification.from_pretrained(model_ckpt, num_labels= num_labels, label2id=label2id, id2label=id2label)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at /home/camp5/karayel/bert-base-turkish-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [218]:
args = TrainingArguments(
    model_name,
    evaluation_strategy = strategy,
    learning_rate = learning_rate,
    per_device_train_batch_size = per_device_train_batch_size,
    per_device_eval_batch_size = per_device_eval_batch_size,
    num_train_epochs = num_train_epochs,
    weight_decay = weight_decay,
    logging_strategy = strategy,
    logging_first_step = True,
    disable_tqdm = False,
    report_to = "none",
    metric_for_best_model = metric_for_best_model,
    push_to_hub = False,
    save_safetensors=False
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [219]:
from datasets import load_metric
metrics = datasets.load_metric("seqeval")

In [220]:
def compute_metrics(preds):
  pred_logits, labels = preds
  pred_logits = np.argmax(pred_logits, axis=-1) ## axis

  predicted_labels = []
  true_labels = []

  predicted_labels = [
      [id2label[p_label] for (p_label, t_label) in zip(prediction, label) if t_label != -100] for prediction, label in zip(pred_logits, labels)
  ]

  true_labels = [
      [id2label[t_label] for t_label in label if t_label != -100] for label in labels
  ]

  scores = metrics.compute(predictions = predicted_labels, references = true_labels)

  return {
      "accuracy": scores["overall_accuracy"],
      "f1 score": scores["overall_f1"],
      ##"micro F1": scores["micro_f1"],
      ##"macro F1": scores["macro_f1"],
      "recall": scores["overall_recall"],
      ##"micro Recall": scores["micro_recall"],
      ##"macro Recall": scores["macro_recall"],
      "precision": scores["overall_precision"],
      ##"micro Precision": scores["micro_precision"],
      ##"macro Precision": scores["macro_precision"]
  }


In [222]:
trainer = Trainer(
    model,
    args,
    train_dataset = tokenized_train_ds,
    eval_dataset = tokenized_eval_ds,
    data_collator = data_collator,
    tokenizer = tokenizer,
    compute_metrics = compute_metrics
)

In [223]:
train_results = trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1 score,Recall,Precision
1,0.0094,0.085003,0.980393,0.709265,0.777778,0.651846
2,0.0113,0.079098,0.982304,0.739738,0.820821,0.673235
3,0.0106,0.085583,0.980603,0.711297,0.765766,0.664062


In [224]:
trainer.log_metrics("train", train_results.metrics)

***** train metrics *****
  epoch                    =        3.0
  total_flos               =  1714660GF
  train_loss               =     0.0104
  train_runtime            = 0:02:24.54
  train_samples_per_second =     61.414
  train_steps_per_second   =       3.84


In [225]:
output_metrics = trainer.evaluate()
trainer.log_metrics("eval", output_metrics)

***** eval metrics *****
  epoch                   =        3.0
  eval_accuracy           =     0.9806
  eval_f1 score           =     0.7113
  eval_loss               =     0.0856
  eval_precision          =     0.6641
  eval_recall             =     0.7658
  eval_runtime            = 0:00:01.67
  eval_samples_per_second =    221.489
  eval_steps_per_second   =     14.367


In [None]:
model = make_contiguous(model)

In [227]:
model

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [228]:
trainer.save_model('./bert_model')

In [229]:
tokenizer.save_pretrained('./bert_tokenizer')

('./bert_tokenizer/tokenizer_config.json',
 './bert_tokenizer/special_tokens_map.json',
 './bert_tokenizer/vocab.txt',
 './bert_tokenizer/added_tokens.json',
 './bert_tokenizer/tokenizer.json')