## Install requirements

In [1]:
!pip install -U -q datasets transformers evaluate torch torchinfo pytorch-lightning tokenizers sentencepiece huggingface_hub

[K     |████████████████████████████████| 441 kB 8.4 MB/s 
[K     |████████████████████████████████| 5.3 MB 58.0 MB/s 
[K     |████████████████████████████████| 72 kB 1.4 MB/s 
[K     |████████████████████████████████| 708 kB 69.3 MB/s 
[K     |████████████████████████████████| 7.6 MB 53.1 MB/s 
[K     |████████████████████████████████| 1.3 MB 36.9 MB/s 
[K     |████████████████████████████████| 163 kB 69.7 MB/s 
[K     |████████████████████████████████| 212 kB 57.5 MB/s 
[K     |████████████████████████████████| 115 kB 11.5 MB/s 
[K     |████████████████████████████████| 127 kB 56.3 MB/s 
[K     |████████████████████████████████| 529 kB 51.5 MB/s 
[?25h

In [3]:
!nvidia-smi

Sun Oct 16 16:53:50 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   44C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Imports

In [4]:
import torch
import pandas as pd
from transformers import PreTrainedTokenizerFast, EarlyStoppingCallback, AutoConfig, AutoModelForCausalLM, AutoTokenizer, TextClassificationPipeline, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from datasets import Dataset, DatasetDict, ClassLabel, Value, load_dataset
from transformers.pipelines.pt_utils import KeyDataset
import numpy as np
from torchinfo import summary
from tokenizers import ByteLevelBPETokenizer
from tqdm import tqdm
import evaluate
from huggingface_hub import notebook_login

In [5]:
IS_CUDA_AVAILABLE = torch.cuda.is_available()
IS_CUDA_AVAILABLE

True

In [6]:
notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token


## Load datasets and create hf 🤗 `Dataset`

In [7]:
ds = load_dataset(
    'csv', 
    data_files={'train': 'train.csv', 'val': 'val.csv', 'test': 'test.csv'}
)

cl = ClassLabel(names=list(ds['train'].unique('label')))
ds = ds.cast_column('label', cl).remove_columns(['label_sentiment'])

ds



Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-ce5e8dfc34ae613f/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-ce5e8dfc34ae613f/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'id', 'text'],
        num_rows: 4220
    })
    val: Dataset({
        features: ['label', 'id', 'text'],
        num_rows: 880
    })
    test: Dataset({
        features: ['label', 'id', 'text'],
        num_rows: 900
    })
})

In [8]:
ds['test'].features

{'label': ClassLabel(names=['dvd', 'books', 'camera', 'health', 'software', 'music'], id=None),
 'id': Value(dtype='string', id=None),
 'text': Value(dtype='string', id=None)}

## Encoder only models

### Select model

In [9]:
# select one of the models to finetune

# model_name = 'bert-base-uncased'
# model_name = 'xlm-roberta-base'
# model_name = 'albert-base-v2'
model_name = 'distilbert-base-uncased'

### Preprocess data

In [10]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

tokenized_ds = ds.map(tokenize_function, batched=True)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [11]:
tokenized_ds

DatasetDict({
    train: Dataset({
        features: ['label', 'id', 'text', 'input_ids', 'attention_mask'],
        num_rows: 4220
    })
    val: Dataset({
        features: ['label', 'id', 'text', 'input_ids', 'attention_mask'],
        num_rows: 880
    })
    test: Dataset({
        features: ['label', 'id', 'text', 'input_ids', 'attention_mask'],
        num_rows: 900
    })
})

### Create model

In [12]:
# load pretrained model from HuggingFace Model Hub
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=cl.num_classes)

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'classifi

In [13]:
# freeze some layers if needed 

# for name, param in model.named_parameters():
#     if 'embeddings' in name:
#         param.requires_grad = False

#     if 'encoder' in name:
#         param.requires_grad = False

In [14]:
summary(model)

Layer (type:depth-idx)                                  Param #
DistilBertForSequenceClassification                     --
├─DistilBertModel: 1-1                                  --
│    └─Embeddings: 2-1                                  --
│    │    └─Embedding: 3-1                              23,440,896
│    │    └─Embedding: 3-2                              393,216
│    │    └─LayerNorm: 3-3                              1,536
│    │    └─Dropout: 3-4                                --
│    └─Transformer: 2-2                                 --
│    │    └─ModuleList: 3-5                             42,527,232
├─Linear: 1-2                                           590,592
├─Linear: 1-3                                           4,614
├─Dropout: 1-4                                          --
Total params: 66,958,086
Trainable params: 66,958,086
Non-trainable params: 0

### Evaluation functions

In [15]:
# create matrics function 

metric_acc = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric_acc.compute(predictions=predictions, references=labels)

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

### Train model

In [None]:
# create training args and run training 
# train on train data, validate on val data =)

training_args = TrainingArguments(
    output_dir="./results",
    report_to='all',
    learning_rate=5e-6,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=4,
    weight_decay=0.01,
    no_cuda=not IS_CUDA_AVAILABLE,
    # bf16=IS_CUDA_AVAILABLE,
    fp16=IS_CUDA_AVAILABLE,
    evaluation_strategy='epoch',
    logging_strategy='epoch',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds['train'],
    eval_dataset=tokenized_ds['val'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

### Validate trained model

In [21]:
trainer.predict(tokenized_ds['test'])[-1]

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text, id. If text, id are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 900
  Batch size = 32


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0668,0.244372,0.939773,0.939408
2,0.0469,0.25845,0.942045,0.941744


{'test_loss': 0.2570228576660156,
 'test_accuracy': 0.9433333333333334,
 'test_f1': 0.9432863785846166,
 'test_runtime': 5.5329,
 'test_samples_per_second': 162.663,
 'test_steps_per_second': 5.241}

### Push to hub

In [34]:
id2label = {cl.str2int(i): i for i in cl.names}
label2id = {v: k for k, v in id2label.items()}
id2label

{0: 'dvd', 1: 'books', 2: 'camera', 3: 'health', 4: 'software', 5: 'music'}

In [35]:
model.config.update({
    'id2label': id2label,
    'label2id': label2id,
})
model.config

DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "dvd",
    "1": "books",
    "2": "camera",
    "3": "health",
    "4": "software",
    "5": "music"
  },
  "initializer_range": 0.02,
  "label2id": {
    "books": 1,
    "camera": 2,
    "dvd": 0,
    "health": 3,
    "music": 5,
    "software": 4
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.23.1",
  "vocab_size": 30522
}

In [36]:
hub_name = 'distilbert-base-uncased-reviews-finetuned'
tokenizer.push_to_hub(hub_name)
model.push_to_hub(hub_name)

tokenizer config file saved in /tmp/tmp2neppujh/tokenizer_config.json
Special tokens file saved in /tmp/tmp2neppujh/special_tokens_map.json
Uploading the following files to k4black/distilbert-base-uncased-reviews-finetuned: tokenizer.json,special_tokens_map.json,tokenizer_config.json,vocab.txt
Configuration saved in /tmp/tmp9d142mgt/config.json
Model weights saved in /tmp/tmp9d142mgt/pytorch_model.bin
Uploading the following files to k4black/distilbert-base-uncased-reviews-finetuned: pytorch_model.bin,config.json


CommitInfo(commit_url='https://huggingface.co/k4black/distilbert-base-uncased-reviews-finetuned/commit/9574a1005ce3ab3712bd3a050685e8ec83ba8c99', commit_message='Upload DistilBertForSequenceClassification', commit_description='', oid='9574a1005ce3ab3712bd3a050685e8ec83ba8c99', pr_url=None, pr_revision=None, pr_num=None)

## Training LM from scratch 

In [None]:
model_name = 'roberta-base'

### Train tokenizer

In [None]:
unique_words = set()
for text in ds['train']['text']:
    unique_words.update(text.split())
len(unique_words)

32420

In [None]:
# load pretrained to copy settings 
old_tokenizer = AutoTokenizer.from_pretrained(model_name)
old_tokenizer

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

PreTrainedTokenizerFast(name_or_path='roberta-base', vocab_size=50265, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)})

In [None]:
tokenizer = old_tokenizer.train_new_from_iterator(
    ds['train']['text'], 
    show_progress=True,
    vocab_size=25_000, 
)
tokenizer

PreTrainedTokenizerFast(name_or_path='roberta-base', vocab_size=25000, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)})

In [None]:
tokenizer('I love you')

{'input_ids': [0, 45, 712, 327, 2], 'attention_mask': [1, 1, 1, 1, 1]}

In [None]:
from google.colab import drive
drive.mount('/content/drive')

!cp /content/drive/MyDrive/custom-model.zip custom-model.zip
!unzip custom-model.zip

In [None]:
tokenizer = AutoTokenizer.from_pretrained('custom-model')

### Preprocess data

In [None]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

tokenized_ds = ds.map(tokenize_function, batched=True)

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
def group_texts(examples):
    examples["label"] = examples["input_ids"].copy()
    return examples

In [None]:
lm_datasets = tokenized_ds.remove_columns(['label']).map(
    group_texts,
    batched=True,
)

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

### Create model

In [None]:
config = AutoConfig.from_pretrained(model_name)
model = AutoModelForCausalLM.from_config(config)

If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`


In [40]:
model = AutoModelForCausalLM.from_pretrained('custom-model')

loading configuration file custom-model/config.json
Model config RobertaConfig {
  "_name_or_path": "custom-model",
  "architectures": [
    "RobertaForCausalLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.23.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weights file custom-model/pytorch_model.bin
If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`
All model checkpoint weights were used when initializing RobertaForCausalLM.

All the weights of RobertaForCausalLM 

In [None]:
summary(model)

Layer (type:depth-idx)                                       Param #
RobertaForCausalLM                                           --
├─RobertaModel: 1-1                                          --
│    └─RobertaEmbeddings: 2-1                                --
│    │    └─Embedding: 3-1                                   38,603,520
│    │    └─Embedding: 3-2                                   394,752
│    │    └─Embedding: 3-3                                   768
│    │    └─LayerNorm: 3-4                                   1,536
│    │    └─Dropout: 3-5                                     --
│    └─RobertaEncoder: 2-2                                   --
│    │    └─ModuleList: 3-6                                  85,054,464
├─RobertaLMHead: 1-2                                         --
│    └─Linear: 2-3                                           590,592
│    └─LayerNorm: 2-4                                        1,536
│    └─Linear: 2-5                                           38,65

### Train LM

In [None]:
training_args = TrainingArguments(
    output_dir="./results-lm/",
    report_to='all',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=32,
    weight_decay=0.01,
    no_cuda=not IS_CUDA_AVAILABLE,
    # bf16=IS_CUDA_AVAILABLE,
    fp16=IS_CUDA_AVAILABLE,
    logging_strategy='steps',
    logging_steps=100,
    evaluation_strategy='steps',
    eval_steps=100,
    metric_for_best_model='eval_loss',
    load_best_model_at_end=True,
    save_total_limit=5,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets['train'],
    eval_dataset=lm_datasets['val'],
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

trainer.train()

Using cuda_amp half precision backend
The following columns in the training set don't have a corresponding argument in `RobertaForCausalLM.forward` and have been ignored: id, text. If id, text are not expected by `RobertaForCausalLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 4220
  Num Epochs = 32
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 16896
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
100,0.7208,0.791032
200,0.7038,0.75689
300,0.7163,0.723797
400,0.6566,0.695602
500,0.6886,0.667445
600,0.6141,0.640812
700,0.564,0.60981
800,0.5605,0.575085
900,0.5741,0.537643
1000,0.4974,0.505667


The following columns in the evaluation set don't have a corresponding argument in `RobertaForCausalLM.forward` and have been ignored: id, text. If id, text are not expected by `RobertaForCausalLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 880
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `RobertaForCausalLM.forward` and have been ignored: id, text. If id, text are not expected by `RobertaForCausalLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 880
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `RobertaForCausalLM.forward` and have been ignored: id, text. If id, text are not expected by `RobertaForCausalLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 880
  Batch size = 16
The following columns in the evaluation set don't have a c

TrainOutput(global_step=16896, training_loss=0.10809235852635042, metrics={'train_runtime': 10899.9015, 'train_samples_per_second': 12.389, 'train_steps_per_second': 1.55, 'total_flos': 3.555136816349184e+16, 'train_loss': 0.10809235852635042, 'epoch': 32.0})

In [None]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `RobertaForCausalLM.forward` and have been ignored: id, text. If id, text are not expected by `RobertaForCausalLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 880
  Batch size = 16


{'eval_loss': 0.014477109536528587,
 'eval_runtime': 16.0399,
 'eval_samples_per_second': 54.863,
 'eval_steps_per_second': 3.429,
 'epoch': 32.0}

### Save model and push to hub

In [None]:
trainer.save_model('custom-model')

Saving model checkpoint to custom-model
Configuration saved in custom-model/config.json
Model weights saved in custom-model/pytorch_model.bin
tokenizer config file saved in custom-model/tokenizer_config.json
Special tokens file saved in custom-model/special_tokens_map.json


In [None]:
!rm -rf custom-model.zip
!zip -r custom-model.zip custom-model

In [None]:
!ls -lah

total 447M
drwxr-xr-x 1 root root 4.0K Oct 13 18:04 .
drwxr-xr-x 1 root root 4.0K Oct 13 14:55 ..
drwxr-xr-x 4 root root 4.0K Oct  7 13:34 .config
drwxr-xr-x 2 root root 4.0K Oct 13 12:12 custom-model
-rw-r--r-- 1 root root 442M Oct 13 18:04 custom-model.zip
drwx------ 5 root root 4.0K Oct 13 14:59 drive
drwxr-xr-x 8 root root 4.0K Oct 13 17:59 results-lm
drwxr-xr-x 1 root root 4.0K Oct  7 13:35 sample_data
-rw-r--r-- 1 root root 701K Oct 13 14:57 test.csv
-rw-r--r-- 1 root root 3.2M Oct 13 14:57 train.csv
-rw-r--r-- 1 root root 669K Oct 13 14:57 val.csv


In [None]:
from google.colab import drive
drive.mount('/content/drive')
!cp custom-model.zip /content/drive/MyDrive
!ls -lah | grep custom-model.zip

In [42]:
hub_name = 'roberta-reviews-lm'
tokenizer.push_to_hub(hub_name)
model.push_to_hub(hub_name)

tokenizer config file saved in /tmp/tmpyeidnpz8/tokenizer_config.json
Special tokens file saved in /tmp/tmpyeidnpz8/special_tokens_map.json
Uploading the following files to k4black/roberta-reviews-lm: merges.txt,vocab.json,tokenizer.json,special_tokens_map.json,tokenizer_config.json
Configuration saved in /tmp/tmp8op61pzb/config.json
Model weights saved in /tmp/tmp8op61pzb/pytorch_model.bin
Uploading the following files to k4black/roberta-reviews-lm: pytorch_model.bin,config.json


CommitInfo(commit_url='https://huggingface.co/k4black/roberta-reviews-lm/commit/6aaa5fd72e2a2424c514ec8c01ff7583014c8766', commit_message='Upload RobertaForCausalLM', commit_description='', oid='6aaa5fd72e2a2424c514ec8c01ff7583014c8766', pr_url=None, pr_revision=None, pr_num=None)

### Finetune with classification task

In [37]:
from google.colab import drive
drive.mount('/content/drive')
!cp /content/drive/MyDrive/custom-model.zip custom-model.zip

Mounted at /content/drive


In [38]:
!unzip custom-model.zip

Archive:  custom-model.zip
   creating: custom-model/
  inflating: custom-model/merges.txt  
  inflating: custom-model/vocab.json  
  inflating: custom-model/tokenizer.json  
  inflating: custom-model/training_args.bin  
  inflating: custom-model/special_tokens_map.json  
  inflating: custom-model/tokenizer_config.json  
  inflating: custom-model/config.json  
  inflating: custom-model/pytorch_model.bin  


### Process data

In [39]:
tokenizer = AutoTokenizer.from_pretrained('custom-model')
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

tokenized_ds = ds.map(tokenize_function, batched=True)

loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json


  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

#### Create model

In [None]:
model = AutoModelForSequenceClassification.from_pretrained('custom-model', num_labels=cl.num_classes)

loading configuration file custom-model/config.json
Model config RobertaConfig {
  "_name_or_path": "custom-model",
  "architectures": [
    "RobertaForCausalLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.23.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vo

In [None]:
# freeze some layers
for name, param in model.named_parameters():
#     if 'embeddings' in name:
#         param.requires_grad = False

    if 'encoder' in name:
        param.requires_grad = False

In [None]:
summary(model)

Layer (type:depth-idx)                                       Param #
RobertaForSequenceClassification                             --
├─RobertaModel: 1-1                                          --
│    └─RobertaEmbeddings: 2-1                                --
│    │    └─Embedding: 3-1                                   38,603,520
│    │    └─Embedding: 3-2                                   394,752
│    │    └─Embedding: 3-3                                   768
│    │    └─LayerNorm: 3-4                                   1,536
│    │    └─Dropout: 3-5                                     --
│    └─RobertaEncoder: 2-2                                   --
│    │    └─ModuleList: 3-6                                  (85,054,464)
├─RobertaClassificationHead: 1-2                             --
│    └─Linear: 2-3                                           590,592
│    └─Dropout: 2-4                                          --
│    └─Linear: 2-5                                           4,614


#### Evaluational functions

In [None]:
# create matrics function 

metric_acc = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric_acc.compute(predictions=predictions, references=labels)

#### Train model

In [None]:
# create training args and run training 
# train on train data, validate on val data =)

training_args = TrainingArguments(
    output_dir="./results",
    report_to='all',
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=8,
    weight_decay=0.01,
    no_cuda=not IS_CUDA_AVAILABLE,
    # bf16=IS_CUDA_AVAILABLE,
    fp16=IS_CUDA_AVAILABLE,
    evaluation_strategy='epoch',
    logging_strategy='epoch',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds['train'],
    eval_dataset=tokenized_ds['val'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

PyTorch: setting up devices
Using cuda_amp half precision backend
The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: id, text. If id, text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 4220
  Num Epochs = 8
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2112


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.7926,1.767094,0.230682,0.219178
2,1.7665,1.753293,0.225,0.201646
3,1.7492,1.744324,0.227273,0.204233
4,1.7344,1.735245,0.234091,0.227024
5,1.722,1.728845,0.2375,0.225881


The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: id, text. If id, text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 880
  Batch size = 32
Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: id, text. If id, text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 880


KeyboardInterrupt: ignored

#### Validate trained model

In [None]:
trainer.predict(tokenized_ds['test'])[-1]

## Zero-Shot Classification

### Select model

In [None]:
zs_model_name = 'facebook/bart-large-mnli'

### Create pipeline

In [None]:
from transformers import pipeline

zs_model = pipeline('zero-shot-classification', model=zs_model_name, device=0)

In [None]:
%%time
predict = []
for out in tqdm(zs_model(KeyDataset(ds['test'], 'text'), list(cl.names), batch_size=16), total=len(ds['test'])):
    predict.append(out['labels'][0])

len(predict)

100%|██████████| 900/900 [09:35<00:00,  1.56it/s]

CPU times: user 8min 53s, sys: 25.2 s, total: 9min 18s
Wall time: 9min 35s





900

### Evaluation

In [None]:
metric_f1 = evaluate.load('f1')
metric_acc = evaluate.load('accuracy')

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
labels = KeyDataset(ds['test'], 'label')
_predict = [cl.str2int(p) for p in predict]
{
    **metric_acc.compute(predictions=_predict, references=labels),
    **metric_f1.compute(predictions=_predict, references=labels, average='macro'),
}

{'accuracy': 0.7511111111111111, 'f1': 0.7367648519861256}