## Install requirements

In [1]:
!pip install -U -q datasets transformers evaluate torch torchinfo tokenizers sentencepiece huggingface_hub

[K     |████████████████████████████████| 441 kB 7.5 MB/s 
[K     |████████████████████████████████| 5.5 MB 49.5 MB/s 
[K     |████████████████████████████████| 72 kB 889 kB/s 
[K     |██████████████████████████████  | 834.1 MB 1.1 MB/s eta 0:00:50tcmalloc: large alloc 1147494400 bytes == 0x3afb4000 @  0x7f2786ada615 0x58ead6 0x4f355e 0x4d222f 0x51041f 0x5b4ee6 0x58ff2e 0x510325 0x5b4ee6 0x58ff2e 0x50d482 0x4d00fb 0x50cb8d 0x4d00fb 0x50cb8d 0x4d00fb 0x50cb8d 0x4bac0a 0x538a76 0x590ae5 0x510280 0x5b4ee6 0x58ff2e 0x50d482 0x5b4ee6 0x58ff2e 0x50c4fc 0x58fd37 0x50ca37 0x5b4ee6 0x58ff2e
[K     |████████████████████████████████| 890.2 MB 7.0 kB/s 
[K     |████████████████████████████████| 7.6 MB 47.0 MB/s 
[K     |████████████████████████████████| 1.3 MB 60.8 MB/s 
[K     |████████████████████████████████| 163 kB 76.3 MB/s 
[K     |████████████████████████████████| 115 kB 75.2 MB/s 
[K     |████████████████████████████████| 95 kB 5.1 MB/s 
[K     |████████████████████████████████|

In [64]:
!nvidia-smi

Sat Nov  5 16:11:42 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   70C    P0    31W /  70W |  15008MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Imports

In [3]:
import torch
import pandas as pd
from transformers import PreTrainedTokenizerFast, EarlyStoppingCallback, AlbertForPreTraining, AlbertForMaskedLM, AutoConfig, AutoModelForCausalLM, AutoTokenizer, TextClassificationPipeline, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from transformers.optimization import get_scheduler
from datasets import Dataset, DatasetDict, ClassLabel, Value, load_dataset
from transformers.pipelines.pt_utils import KeyDataset
import numpy as np
from torchinfo import summary
from tokenizers import ByteLevelBPETokenizer
from tqdm import tqdm
import evaluate
from huggingface_hub import notebook_login

In [4]:
IS_CUDA_AVAILABLE = torch.cuda.is_available()
IS_CUDA_AVAILABLE

True

In [5]:
notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token


In [6]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Load datasets and create hf 🤗 `Dataset`

In [6]:
ds = load_dataset(
    'csv', 
    data_files={'train': 'train_preprocessed.csv', 'val': 'val_preprocessed.csv', 'test': 'test_preprocessed.csv'}
)

cl = ClassLabel(names=list(ds['train'].unique('label')))
ds = ds.cast_column('label', cl)

ds



  0%|          | 0/3 [00:00<?, ?it/s]



DatasetDict({
    train: Dataset({
        features: ['tweet', 'label', 'raw'],
        num_rows: 12240
    })
    val: Dataset({
        features: ['tweet', 'label', 'raw'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['tweet', 'label', 'raw'],
        num_rows: 860
    })
})

In [7]:
ds['test'].features

{'tweet': Value(dtype='string', id=None),
 'label': ClassLabel(names=['OFF', 'NOT'], id=None),
 'raw': Value(dtype='string', id=None)}

## Training LM from scratch 

In [8]:
# model_name = 'roberta-base'
# model_name = 'albert-base-v2'
# model_name = 'bert-base-uncased'
model_name = 'GroNLP/hateBERT'

### Preprocess data

In [35]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading:   0%|          | 0.00/151 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--GroNLP--hateBERT/snapshots/f56d507e4b6a64413aff29e541e1b2178ee79d67/config.json
Model config BertConfig {
  "_name_or_path": "GroNLP/hateBERT",
  "_num_labels": 2,
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--GroNLP--hateBERT/snapshots/f56d507e4b6a64413aff29e541e1b2178ee79d67/vocab.txt
loading file tokenizer.json from cache at None
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--GroNLP--hateBERT/snapshots/f56d507e4b6a64413aff29e541e1b2178ee79d67/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--GroNLP--hateBERT/snapshots/f56d507e4b6a64413aff29e541e1b2178ee79d67/tokenizer_config.json
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--GroNLP--hateBERT/snapshots/f56d507e4b6a64413aff29e541e1b2178ee79d67/config.json
Model config BertConfig {
  "_name_or_path": "GroNLP/hateBERT",
  "_num_labels": 2,
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",


In [36]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def tokenize_function(examples):
    return tokenizer(examples['tweet'], padding='max_length', truncation=True)

tokenized_ds = ds.map(tokenize_function, batched=True)

  0%|          | 0/13 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [37]:
def group_texts(examples):
    examples["label"] = examples["input_ids"].copy()
    return examples

In [38]:
lm_datasets = tokenized_ds.remove_columns(['label']).map(
    group_texts,
    batched=True,
)

  0%|          | 0/13 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

### Create model

In [39]:
if 'albert' in model_name:
    model = AlbertForMaskedLM.from_pretrained(model_name)
else:
    model = AutoModelForCausalLM.from_pretrained(model_name)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--GroNLP--hateBERT/snapshots/f56d507e4b6a64413aff29e541e1b2178ee79d67/config.json
Model config BertConfig {
  "_name_or_path": "GroNLP/hateBERT",
  "_num_labels": 2,
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--GroNLP--hateBERT/snapshots/f56d507e4b6a64413aff29e541e1b2178ee79d67/pytorch_model.bin
If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`
All model checkpoint weights were used when initializing BertLMHeadModel.

All the weights of BertLMHeadModel were initialized from the model checkpoint at GroNLP/hateBERT.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertLMHeadModel for predictions without further training.


In [40]:
summary(model)

Layer (type:depth-idx)                                  Param #
BertLMHeadModel                                         --
├─BertModel: 1-1                                        --
│    └─BertEmbeddings: 2-1                              --
│    │    └─Embedding: 3-1                              23,440,896
│    │    └─Embedding: 3-2                              393,216
│    │    └─Embedding: 3-3                              1,536
│    │    └─LayerNorm: 3-4                              1,536
│    │    └─Dropout: 3-5                                --
│    └─BertEncoder: 2-2                                 --
│    │    └─ModuleList: 3-6                             85,054,464
├─BertOnlyMLMHead: 1-2                                  --
│    └─BertLMPredictionHead: 2-3                        --
│    │    └─BertPredictionHeadTransform: 3-7            592,128
│    │    └─Linear: 3-8                                 23,471,418
Total params: 109,514,298
Trainable params: 109,514,298
Non-trainable 

### Train LM

In [42]:
training_args = TrainingArguments(
    output_dir=f"./{model_name}-results/",
    report_to='all',

    learning_rate=1e-4,
    # learning_rate=5e-5,
    lr_scheduler_type='linear',
    weight_decay=0.01,
    adam_epsilon=1e-6,
    adam_beta1=0.9,
    adam_beta2=0.98,

    per_device_train_batch_size=8,
    per_device_eval_batch_size=28,
    num_train_epochs=16,
    warmup_steps=5,

    no_cuda=not IS_CUDA_AVAILABLE,
    # bf16=IS_CUDA_AVAILABLE,
    fp16=IS_CUDA_AVAILABLE,
    fp16_full_eval=IS_CUDA_AVAILABLE,
    
    logging_strategy='steps',
    logging_steps=100,
    evaluation_strategy='steps',
    eval_steps=100,

    metric_for_best_model='eval_loss',
    load_best_model_at_end=True,
    save_total_limit=5,

    hub_model_id=f'k4black/{model_name.replace("/", "-")}-offensive-lm-tapt',
    push_to_hub=True,
    hub_strategy='checkpoint',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets['train'],
    eval_dataset=lm_datasets['val'],
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
)

trainer.train()

PyTorch: setting up devices
Cloning https://huggingface.co/k4black/GroNLP-hateBERT-offensive-lm-tapt into local empty directory.
Using cuda_amp half precision backend
The following columns in the training set don't have a corresponding argument in `BertLMHeadModel.forward` and have been ignored: tweet. If tweet are not expected by `BertLMHeadModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 12240
  Num Epochs = 16
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 24480
  Number of trainable parameters = 109514298
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
100,2.1709,0.078865
200,0.0521,0.020261
300,0.0225,0.00917
400,0.0126,0.005824
500,0.0103,0.003642
600,0.0332,0.002543
700,0.006,0.001616
800,0.0051,0.001586
900,0.0049,0.001464
1000,0.0047,0.001111


The following columns in the evaluation set don't have a corresponding argument in `BertLMHeadModel.forward` and have been ignored: tweet. If tweet are not expected by `BertLMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 28
The following columns in the evaluation set don't have a corresponding argument in `BertLMHeadModel.forward` and have been ignored: tweet. If tweet are not expected by `BertLMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 28
The following columns in the evaluation set don't have a corresponding argument in `BertLMHeadModel.forward` and have been ignored: tweet. If tweet are not expected by `BertLMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 28
The following columns in the evaluation set don't have a corresponding argument in `BertLMH

Step,Training Loss,Validation Loss
100,2.1709,0.078865
200,0.0521,0.020261
300,0.0225,0.00917
400,0.0126,0.005824
500,0.0103,0.003642
600,0.0332,0.002543
700,0.006,0.001616
800,0.0051,0.001586
900,0.0049,0.001464
1000,0.0047,0.001111


The following columns in the evaluation set don't have a corresponding argument in `BertLMHeadModel.forward` and have been ignored: tweet. If tweet are not expected by `BertLMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 28
Saving model checkpoint to ./GroNLP/hateBERT-results/checkpoint-2500
Configuration saved in ./GroNLP/hateBERT-results/checkpoint-2500/config.json
Model weights saved in ./GroNLP/hateBERT-results/checkpoint-2500/pytorch_model.bin
tokenizer config file saved in ./GroNLP/hateBERT-results/checkpoint-2500/tokenizer_config.json
Special tokens file saved in ./GroNLP/hateBERT-results/checkpoint-2500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `BertLMHeadModel.forward` and have been ignored: tweet. If tweet are not expected by `BertLMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples 

KeyboardInterrupt: ignored

In [43]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `BertLMHeadModel.forward` and have been ignored: tweet. If tweet are not expected by `BertLMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 28


Step,Training Loss,Validation Loss
100,2.1709,0.078865
200,0.0521,0.020261
300,0.0225,0.00917
400,0.0126,0.005824
500,0.0103,0.003642
600,0.0332,0.002543
700,0.006,0.001616
800,0.0051,0.001586
900,0.0049,0.001464
1000,0.0047,0.001111


{'eval_loss': 0.0001844609359977767}

In [44]:
trainer.save_model()

Saving model checkpoint to ./GroNLP/hateBERT-results/
Configuration saved in ./GroNLP/hateBERT-results/config.json
Model weights saved in ./GroNLP/hateBERT-results/pytorch_model.bin
tokenizer config file saved in ./GroNLP/hateBERT-results/tokenizer_config.json
Special tokens file saved in ./GroNLP/hateBERT-results/special_tokens_map.json
Saving model checkpoint to ./GroNLP/hateBERT-results/
Configuration saved in ./GroNLP/hateBERT-results/config.json
Model weights saved in ./GroNLP/hateBERT-results/pytorch_model.bin
tokenizer config file saved in ./GroNLP/hateBERT-results/tokenizer_config.json
Special tokens file saved in ./GroNLP/hateBERT-results/special_tokens_map.json
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 3.34k/418M [00:00<?, ?B/s]

Upload file runs/Nov04_20-07-09_3ccaa5222542/events.out.tfevents.1667592439.3ccaa5222542.77.7:  18%|#7        …

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/k4black/GroNLP-hateBERT-offensive-lm-tapt
   f51a9bc..b6fd12f  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/k4black/GroNLP-hateBERT-offensive-lm-tapt
   f51a9bc..b6fd12f  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}
To https://huggingface.co/k4black/GroNLP-hateBERT-offensive-lm-tapt
   b6fd12f..fd3bb97  main -> main

   b6fd12f..fd3bb97  main -> main



## Finetune with classification task

In [8]:
# model_name = 'GroNLP/hateBERT'
model_name = 'bert-base-uncased'

finetuned_model_name = f'k4black/{model_name.replace("/", "-")}-offensive-lm-tapt'
# finetuned_model_name = model_name

finetuned_model_name

'k4black/bert-base-uncased-offensive-lm-tapt'

### Process data

In [9]:
tokenizer = AutoTokenizer.from_pretrained(finetuned_model_name)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def tokenize_function(examples):
    return tokenizer(examples['tweet'], padding='max_length', truncation=True)

tokenized_ds = ds.map(tokenize_function, batched=True)



### Create model

In [10]:
finetune_model = AutoModelForSequenceClassification.from_pretrained(finetuned_model_name, num_labels=cl.num_classes)

Some weights of the model checkpoint at k4black/bert-base-uncased-offensive-lm-tapt were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at k

In [11]:
summary(finetune_model)

Layer (type:depth-idx)                                  Param #
BertForSequenceClassification                           --
├─BertModel: 1-1                                        --
│    └─BertEmbeddings: 2-1                              --
│    │    └─Embedding: 3-1                              23,440,896
│    │    └─Embedding: 3-2                              393,216
│    │    └─Embedding: 3-3                              1,536
│    │    └─LayerNorm: 3-4                              1,536
│    │    └─Dropout: 3-5                                --
│    └─BertEncoder: 2-2                                 --
│    │    └─ModuleList: 3-6                             85,054,464
│    └─BertPooler: 2-3                                  --
│    │    └─Linear: 3-7                                 590,592
│    │    └─Tanh: 3-8                                   --
├─Dropout: 1-2                                          --
├─Linear: 1-3                                           1,538
Total params: 10

### Evaluational functions

In [12]:
# create matrics function 

metric_f1 = evaluate.load('f1')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric_f1.compute(predictions=predictions, references=labels, average='macro')

### Train model

In [13]:
# create training args and run training 
# train on train data, validate on val data =)

training_args = TrainingArguments(
    output_dir=f"./finetuning-{finetuned_model_name}-results",
    report_to='all',
    
    learning_rate=1e-5,
    weight_decay=0.01,

    per_device_train_batch_size=16,
    # per_device_train_batch_size=14,
    per_device_eval_batch_size=32,
    num_train_epochs=4,
    warmup_steps=5,

    no_cuda=not IS_CUDA_AVAILABLE,
    # bf16=IS_CUDA_AVAILABLE,
    fp16=IS_CUDA_AVAILABLE,
    fp16_full_eval=IS_CUDA_AVAILABLE,
    
    logging_strategy='steps',
    logging_steps=100,
    evaluation_strategy='steps',
    eval_steps=100,

    metric_for_best_model='eval_loss',
    load_best_model_at_end=True,
    save_total_limit=1,
    save_steps=100,

    hub_model_id=finetuned_model_name,
    push_to_hub=False,
)

trainer = Trainer(
    model=finetune_model,
    args=training_args,
    train_dataset=tokenized_ds['train'],
    eval_dataset=tokenized_ds['val'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
)

trainer.train()

Using cuda_amp half precision backend
The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: tweet, raw. If tweet, raw are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 12240
  Num Epochs = 4
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 3060
  Number of trainable parameters = 109483778
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,F1
100,0.6568,0.643766,0.393204
200,0.6205,0.627013,0.393204
300,0.6106,0.564239,0.583333
400,0.5345,0.481284,0.723735
500,0.4899,0.491058,0.739626
600,0.4691,0.46364,0.756272
700,0.4605,0.459117,0.756881
800,0.4494,0.44643,0.771468
900,0.4505,0.452495,0.764583
1000,0.4355,0.446516,0.769801


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: tweet, raw. If tweet, raw are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 32
Saving model checkpoint to ./finetuning-k4black/bert-base-uncased-offensive-lm-tapt-results/checkpoint-100
Configuration saved in ./finetuning-k4black/bert-base-uncased-offensive-lm-tapt-results/checkpoint-100/config.json
Model weights saved in ./finetuning-k4black/bert-base-uncased-offensive-lm-tapt-results/checkpoint-100/pytorch_model.bin
tokenizer config file saved in ./finetuning-k4black/bert-base-uncased-offensive-lm-tapt-results/checkpoint-100/tokenizer_config.json
Special tokens file saved in ./finetuning-k4black/bert-base-uncased-offensive-lm-tapt-results/checkpoint-100/special_tokens_map.json
The following columns in the evaluation set don't 

TrainOutput(global_step=1700, training_loss=0.46911391089944277, metrics={'train_runtime': 1270.1051, 'train_samples_per_second': 38.548, 'train_steps_per_second': 2.409, 'total_flos': 7156620705792000.0, 'train_loss': 0.46911391089944277, 'epoch': 2.22})

### Validate trained model

In [14]:
trainer.predict(tokenized_ds['val'])[-1]

The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: tweet, raw. If tweet, raw are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 1000
  Batch size = 32


{'test_loss': 0.4308520555496216,
 'test_f1': 0.7829085577446537,
 'test_runtime': 11.0491,
 'test_samples_per_second': 90.505,
 'test_steps_per_second': 2.896}

In [15]:
trainer.predict(tokenized_ds['test'])[-1]

The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: tweet, raw. If tweet, raw are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 860
  Batch size = 32


{'test_loss': 0.3844267427921295,
 'test_f1': 0.7849324073280174,
 'test_runtime': 9.6843,
 'test_samples_per_second': 88.803,
 'test_steps_per_second': 2.788}

In [52]:
trainer.save_model()

Saving model checkpoint to ./k4black/GroNLP-hateBERT-offensive-lm-tapt-results
Configuration saved in ./k4black/GroNLP-hateBERT-offensive-lm-tapt-results/config.json
Model weights saved in ./k4black/GroNLP-hateBERT-offensive-lm-tapt-results/pytorch_model.bin
tokenizer config file saved in ./k4black/GroNLP-hateBERT-offensive-lm-tapt-results/tokenizer_config.json
Special tokens file saved in ./k4black/GroNLP-hateBERT-offensive-lm-tapt-results/special_tokens_map.json
Saving model checkpoint to ./k4black/GroNLP-hateBERT-offensive-lm-tapt-results
Configuration saved in ./k4black/GroNLP-hateBERT-offensive-lm-tapt-results/config.json
Model weights saved in ./k4black/GroNLP-hateBERT-offensive-lm-tapt-results/pytorch_model.bin
tokenizer config file saved in ./k4black/GroNLP-hateBERT-offensive-lm-tapt-results/tokenizer_config.json
Special tokens file saved in ./k4black/GroNLP-hateBERT-offensive-lm-tapt-results/special_tokens_map.json
Several commits (2) will be pushed upstream.
The progress bars

Upload file pytorch_model.bin:   0%|          | 3.34k/209M [00:00<?, ?B/s]

Upload file runs/Nov04_20-48-22_3ccaa5222542/events.out.tfevents.1667595082.3ccaa5222542.77.9:  24%|##4       …

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/k4black/GroNLP-hateBERT-offensive-lm-tapt
   fd3bb97..a96c193  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/k4black/GroNLP-hateBERT-offensive-lm-tapt
   fd3bb97..a96c193  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Text Classification', 'type': 'text-classification'}, 'metrics': [{'name': 'F1', 'type': 'f1', 'value': 0.7805893216461313}]}
To https://huggingface.co/k4black/GroNLP-hateBERT-offensive-lm-tapt
   a96c193..e06f34e  main -> main

   a96c193..e06f34e  main -> main

