In [1]:
import tokenizers
from datetime import datetime
import numpy as np
import evaluate
import pandas as pd
from datasets import load_dataset, load_from_disk
import transformers
from sklearn.model_selection import train_test_split
from transformers import (AutoTokenizer, LineByLineTextDataset, BertConfig, BertForMaskedLM, DataCollatorWithPadding, Trainer, TrainingArguments, BertForSequenceClassification,
                        AutoConfig, BertModel, BertForPreTraining, DataCollatorForLanguageModeling, pipeline, AutoModelForSequenceClassification)

In [2]:
# Load the dataset previously saved to disk; later cells index it by the
# 'train', 'val' and 'test' splits.
dataset = load_from_disk('../CLIdata/datasets/cuneiform-train-val-test/')

In [22]:
# Flip to True to retrain the tokenizer from scratch; when False this cell does nothing
novo_treinamento_tokenizer = False
if novo_treinamento_tokenizer:
    # Unicode has 1360 cuneiform characters in total
    vocab_size = 1000
    max_len = 512
    original_bert_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

    # Gather the text of every split, in train -> val -> test order, as the training corpus
    training_corpus = [
        text
        for split_name in ('train', 'val', 'test')
        for text in dataset[split_name]['text']
    ]

    # Train a new tokenizer with the same pipeline as bert-base-uncased, then persist it
    my_tokenizer = original_bert_tokenizer.train_new_from_iterator(
        training_corpus,
        vocab_size=vocab_size,
    )
    my_tokenizer.save_pretrained("../tokenizers/bert-base-uncased_train_val_test_maxlen_512_vocab_size_1000")

loading configuration file config.json from cache at C:\Users\igorr/.cache\huggingface\hub\models--bert-base-uncased\snapshots\0a6aa9128b6194f4f3c4db429b6cb4891cdb421b\config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.25.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file vocab.txt from cache at C:\Users\igorr/.cache\huggingface\hub\models--bert-base-uncased\snapshots\0a6aa9128b6194f4f3c4db429b6cb4891cdb421b\vocab.t

In [23]:
# (disabled) open the file with the individual characters
#with open('../CLIdata/single-char/single-chars.txt',encoding='utf8') as f:
#    single_chars = f.read().split('\n')

In [3]:
# Reload the tokenizer from the directory the training cell saves to,
# and read the vocabulary size and maximum length from it
tokenizer_dir = '../tokenizers/bert-base-uncased_train_val_test_maxlen_512_vocab_size_1000'
my_tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
vocab_size, max_len = my_tokenizer.vocab_size, my_tokenizer.model_max_length

In [6]:
def tokenize_function(example):
    """Tokenize the "text" entry of ``example`` and keep only the token ids.

    The text is passed to ``my_tokenizer`` with truncation enabled and
    ``max_length=max_len``. Every other field the tokenizer returns is
    dropped; only ``input_ids`` is kept.

    Args:
        example: mapping with a "text" entry. The notebook calls this through
            ``dataset.map(..., batched=True)``.

    Returns:
        dict with a single key, "input_ids", holding a new list with the
        tokenizer's ``input_ids`` entries.
    """
    encoded = my_tokenizer(example["text"], truncation=True, max_length=max_len)
    # list() makes the same shallow copy the element-by-element append loop did
    return {"input_ids": list(encoded["input_ids"])}


In [7]:
# Tokenize every split in batches; the original columns are removed so that
# only what tokenize_function returns is kept
original_columns = dataset["train"].column_names
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=original_columns,
)

Loading cached processed dataset at ../CLIdata/datasets/cuneiform-spaced-indexed/train\cache-64b98f50809e5ebd.arrow


  0%|          | 0/10 [00:00<?, ?ba/s]

Loading cached processed dataset at ../CLIdata/datasets/cuneiform-spaced-indexed/test\cache-e4fe1c576f25c696.arrow


In [8]:
# Collator for masked language modeling (mlm=True) with mlm_probability=0.15.
# Alternative kept for reference: DataCollatorWithPadding(tokenizer=my_tokenizer)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=my_tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

In [9]:
# Start from the bert-base-uncased configuration, overriding the vocabulary size
# and the maximum number of positions with the values read from the custom tokenizer
config = AutoConfig.from_pretrained(
    'bert-base-uncased',
    vocab_size=vocab_size,
    num_hidden_layers=12,
    max_position_embeddings=max_len,
)
# Instantiate the masked-LM model from this configuration
# (no from_pretrained call, so no checkpoint is loaded here)
model = BertForMaskedLM(config)

In [10]:
# Read the clock once so the date and time parts of the run name always agree
run_started_at = datetime.now()
# '%m' is the month; the previous '%M' is minutes, which put the minute
# of the hour in the month position of the output directory name
dia = run_started_at.strftime("%Y-%m-%d")
hora = run_started_at.strftime("%H-%M")

# Pre-training run configuration: step-based logging and checkpointing,
# with the total length of the run fixed by max_steps
training_args = TrainingArguments(
    output_dir=f'../checkpoints/meu_output_dia_{dia}_hora_{hora}',
    overwrite_output_dir=True,
    per_device_train_batch_size=2,
    learning_rate=1e-5,
    logging_strategy='steps',
    logging_steps=1_000,  # integer step count (was the float literal 1e3)
    save_strategy='steps',
    save_steps=10_000,
    max_steps=500_000,  # previously 200_000
)

trainer = Trainer(
    model=model,
    args=training_args,
    # The DataCollatorForLanguageModeling instance configured with mlm=True
    data_collator=data_collator,
    train_dataset=tokenized_dataset['train'],
)

max_steps is given, it will override any value given in num_train_epochs


In [11]:
# Run masked-LM pre-training with the Trainer configured in the previous cell.
# NOTE(review): the recorded output of this cell ends with a CUDA out-of-memory error.
trainer.train()

***** Running training *****
  Num examples = 82017
  Num Epochs = 25
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 500000
  Number of trainable parameters = 86473520


  0%|          | 0/500000 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 5.1533, 'learning_rate': 9.980000000000001e-06, 'epoch': 0.05}
{'loss': 4.8829, 'learning_rate': 9.960000000000001e-06, 'epoch': 0.1}
{'loss': 4.68, 'learning_rate': 9.940000000000001e-06, 'epoch': 0.15}
{'loss': 4.5654, 'learning_rate': 9.920000000000002e-06, 'epoch': 0.2}
{'loss': 4.5089, 'learning_rate': 9.9e-06, 'epoch': 0.24}
{'loss': 4.4408, 'learning_rate': 9.88e-06, 'epoch': 0.29}
{'loss': 4.3799, 'learning_rate': 9.86e-06, 'epoch': 0.34}
{'loss': 4.4248, 'learning_rate': 9.84e-06, 'epoch': 0.39}
{'loss': 4.3516, 'learning_rate': 9.820000000000001e-06, 'epoch': 0.44}


Saving model checkpoint to ../checkpoints/meu_output_dia_2022-11-11_hora_05-11\checkpoint-10000
Configuration saved in ../checkpoints/meu_output_dia_2022-11-11_hora_05-11\checkpoint-10000\config.json


{'loss': 4.3836, 'learning_rate': 9.800000000000001e-06, 'epoch': 0.49}


Model weights saved in ../checkpoints/meu_output_dia_2022-11-11_hora_05-11\checkpoint-10000\pytorch_model.bin


{'loss': 4.3354, 'learning_rate': 9.780000000000001e-06, 'epoch': 0.54}
{'loss': 4.3264, 'learning_rate': 9.760000000000001e-06, 'epoch': 0.59}
{'loss': 4.3798, 'learning_rate': 9.74e-06, 'epoch': 0.63}


RuntimeError: CUDA out of memory. Tried to allocate 44.00 MiB (GPU 0; 4.00 GiB total capacity; 3.23 GiB already allocated; 0 bytes free; 3.47 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

## Inferência no modelo pré-treinado

In [4]:
# Sanity check: tokenize a space-separated cuneiform sequence; the result is shown as the cell output
my_tokenizer.tokenize('𒀳 𒊹 𒉺 𒊹 𒊕 𒁺 𒀭 𒍣 𒄿 𒉺 𒀭 𒍣 𒄿 𒌋 𒌀 𒀭 𒍣 𒄿 𒋁')

['𒀳',
 '𒊹',
 '𒉺',
 '𒊹',
 '𒊕',
 '𒁺',
 '𒀭',
 '𒍣',
 '𒄿',
 '𒉺',
 '𒀭',
 '𒍣',
 '𒄿',
 '𒌋',
 '𒌀',
 '𒀭',
 '𒍣',
 '𒄿',
 '𒋁']

In [6]:

# Fill-mask pipeline over a saved checkpoint, paired with the custom tokenizer.
# The model path is a raw string: it uses backslash separators, and '\c' / '\s'
# are invalid escape sequences in a normal string literal. The path value is unchanged.
classifier = pipeline('fill-mask',
                      model=r'..\checkpoints-albert\standard_2022-45-13_hora_22-45\checkpoint-790000',
                      tokenizer=my_tokenizer)
# Alternative checkpoint kept for reference:
#classifier = pipeline('fill-mask',model='../checkpoints/BERT-MLM/checkpoint-40000',tokenizer=my_tokenizer)

In [11]:
# NOTE(review): simb is assigned here but not used anywhere in this cell.
simb = '𒋁'
# Ask the fill-mask pipeline for candidates at the [MASK] position;
# the returned list is shown as the cell output.
classifier('𒀳 [MASK] 𒉺 𒊹 𒊕 𒁺 𒀭 𒍣 𒄿 𒉺 𒀭 𒍣 𒄿 𒌋 𒌀 𒀭 𒍣 𒄿 𒋁')

[{'score': 0.6733809113502502,
  'token': 411,
  'token_str': '𒊹',
  'sequence': '𒀳 𒊹 𒉺 𒊹 𒊕 𒁺 𒀭 𒍣 𒄿 𒉺 𒀭 𒍣 𒄿 𒌋 𒌀 𒀭 𒍣 𒄿 𒋁'},
 {'score': 0.03319213166832924,
  'token': 32,
  'token_str': '𒀭',
  'sequence': '𒀳 𒀭 𒉺 𒊹 𒊕 𒁺 𒀭 𒍣 𒄿 𒉺 𒀭 𒍣 𒄿 𒌋 𒌀 𒀭 𒍣 𒄿 𒋁'},
 {'score': 0.02297208644449711,
  'token': 467,
  'token_str': '𒌋',
  'sequence': '𒀳 𒌋 𒉺 𒊹 𒊕 𒁺 𒀭 𒍣 𒄿 𒉺 𒀭 𒍣 𒄿 𒌋 𒌀 𒀭 𒍣 𒄿 𒋁'},
 {'score': 0.022430120036005974,
  'token': 351,
  'token_str': '𒉌',
  'sequence': '𒀳 𒉌 𒉺 𒊹 𒊕 𒁺 𒀭 𒍣 𒄿 𒉺 𒀭 𒍣 𒄿 𒌋 𒌀 𒀭 𒍣 𒄿 𒋁'},
 {'score': 0.019839348271489143,
  'token': 160,
  'token_str': '𒄑',
  'sequence': '𒀳 𒄑 𒉺 𒊹 𒊕 𒁺 𒀭 𒍣 𒄿 𒉺 𒀭 𒍣 𒄿 𒌋 𒌀 𒀭 𒍣 𒄿 𒋁'}]

## Fine-tuning

In [25]:
# Load the 'accuracy' metric through the evaluate library; compute_metrics calls metric.compute.
metric = evaluate.load('accuracy')

In [28]:
def compute_metrics(eval_pred):
    """Score a (logits, labels) pair with the module-level ``metric``.

    Args:
        eval_pred: a two-item iterable that unpacks into logits and labels.

    Returns:
        Whatever ``metric.compute`` returns for the predicted indices
        against the labels.
    """
    scores, references = eval_pred
    # The prediction is the index of the largest score along the last axis
    predicted_indices = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_indices, references=references)

In [35]:
# Load a saved MLM checkpoint as a sequence-classification model with 7 labels.
# NOTE(review): this rebinds `classifier`, the name an earlier cell uses for the
# fill-mask pipeline; the Trainer below expects this model, so run order matters.
classifier = AutoModelForSequenceClassification.from_pretrained(
    '../checkpoints/BERT-MLM-Colab-Batch64/checkpoint-60000/',
    num_labels=7,
)

loading configuration file ../checkpoints/BERT-MLM-Colab-Batch64/checkpoint-60000/config.json
Model config BertConfig {
  "_name_or_path": "../checkpoints/BERT-MLM-Colab-Batch64/checkpoint-60000/",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torc

In [36]:
# TODO: check why the tokenization step above drops the original columns
def tokenize_function2(examples):
    """Tokenize ``examples['text']`` with max-length padding and truncation.

    Returns the tokenizer's output unchanged.
    """
    tokenizer_options = {"padding": "max_length", "truncation": True}
    return my_tokenizer(examples['text'], **tokenizer_options)

In [37]:
# Apply tokenize_function2 to every split in batches; no columns are removed here
tokenized_dataset2 = dataset.map(
    tokenize_function2,
    batched=True,
)

  0%|          | 0/83 [00:00<?, ?ba/s]

Loading cached processed dataset at ../CLIdata/datasets/cuneiform-spaced-indexed/val\cache-a655d64ef7f3df32.arrow
Loading cached processed dataset at ../CLIdata/datasets/cuneiform-spaced-indexed/test\cache-f3c5bc878b25cbd8.arrow


In [42]:
# Read the clock once so the date and time parts of the run name always agree
run_started_at = datetime.now()
# '%m' is the month; the previous '%M' is minutes, which put the minute
# of the hour in the month position of the output directory name
dia = run_started_at.strftime("%Y-%m-%d")
hora = run_started_at.strftime("%H-%M")

# Fine-tuning run configuration: step-based logging and checkpointing,
# evaluation once per epoch, total length fixed by max_steps
training_args2 = TrainingArguments(
    output_dir=f'../checkpoints_finetuning/meu_output_dia_{dia}_hora_{hora}',
    overwrite_output_dir=True,
    per_device_train_batch_size=1,
    learning_rate=1e-5,
    logging_strategy='steps',
    logging_steps=1_000,  # integer step count (was the float literal 1e3)
    save_strategy='steps',
    save_steps=10_000,
    max_steps=100_000,  # previously 200_000
    evaluation_strategy="epoch",
)

trainer2 = Trainer(
    model=classifier,
    args=training_args2,
    # No data_collator is passed: tokenize_function2 already pads every example
    # with padding="max_length". Disabled alternative kept for reference:
    #data_collator=data_collator_padding,
    train_dataset=tokenized_dataset2['train'],
    eval_dataset=tokenized_dataset2['val'],
    compute_metrics=compute_metrics,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
max_steps is given, it will override any value given in num_train_epochs


In [43]:
# Run fine-tuning with the Trainer configured in the previous cell.
# NOTE(review): the recorded output of this cell ends with a CUDA out-of-memory error.
trainer2.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 82017
  Num Epochs = 2
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 100000
  Number of trainable parameters = 86476807


  0%|          | 0/100000 [00:00<?, ?it/s]

RuntimeError: CUDA out of memory. Tried to allocate 12.00 MiB (GPU 0; 4.00 GiB total capacity; 3.27 GiB already allocated; 0 bytes free; 3.47 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF