Fine-tuning RoBERTa: embedding layers frozen, 4 epochs with a learning rate of 5e-6, a weight decay of 0.01, and a training batch size of 16 (evaluation batch size of 32). Inputs were truncated at 512 tokens.

# Install

In [None]:
!pip install -U -q datasets transformers torchinfo

[K     |████████████████████████████████| 441 kB 14.1 MB/s 
[K     |████████████████████████████████| 5.3 MB 57.1 MB/s 
[K     |████████████████████████████████| 212 kB 63.5 MB/s 
[K     |████████████████████████████████| 115 kB 69.0 MB/s 
[K     |████████████████████████████████| 163 kB 65.1 MB/s 
[K     |████████████████████████████████| 127 kB 52.5 MB/s 
[K     |████████████████████████████████| 7.6 MB 25.9 MB/s 
[K     |████████████████████████████████| 115 kB 39.3 MB/s 
[?25h

# Imports

In [None]:
import numpy as np
import torch
from datasets import Dataset, DatasetDict, ClassLabel, Value, load_dataset, load_metric
from huggingface_hub import notebook_login
from torchinfo import summary
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, set_seed

# Load Data

In [None]:
# Authenticate with the Hugging Face Hub; later cells create a repository
# and push the tokenizer and model to it.
notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token


In [None]:
# Map each split name to its CSV file inside the dataset repository.
split_files = {'train': 'train.csv', 'val': 'val.csv', 'test': 'test.csv'}
ds = load_dataset('sara-nabhani/lfd-proj', 'csv', data_files=split_files)

# Build the label vocabulary from the distinct values found in the training
# split, then convert the 'label' column of every split to that ClassLabel.
label_names = list(ds['train'].unique('label'))
cl = ClassLabel(names=label_names)
ds = ds.cast_column('label', cl)

ds



Downloading and preparing dataset csv/sara-nabhani--lfd-proj to /root/.cache/huggingface/datasets/sara-nabhani___csv/sara-nabhani--lfd-proj-a35fbd5b9bbbc3d0/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.62M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/134k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/132k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/sara-nabhani___csv/sara-nabhani--lfd-proj-a35fbd5b9bbbc3d0/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Casting the dataset:   0%|          | 0/2 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['tweet', 'label'],
        num_rows: 12240
    })
    val: Dataset({
        features: ['tweet', 'label'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['tweet', 'label'],
        num_rows: 860
    })
})

In [None]:
# Show the column schema of the training split after the label cast.
ds['train'].features

{'tweet': Value(dtype='string', id=None),
 'label': ClassLabel(names=['OFF', 'NOT'], id=None)}

In [None]:
# Show the column schema of the test split after the label cast.
ds['test'].features

{'tweet': Value(dtype='string', id=None),
 'label': ClassLabel(names=['OFF', 'NOT'], id=None)}

In [None]:
# Show the column schema of the validation split after the label cast.
ds['val'].features

{'tweet': Value(dtype='string', id=None),
 'label': ClassLabel(names=['OFF', 'NOT'], id=None)}

In [None]:
import torch
# Report whether PyTorch can see a CUDA device (training below enables fp16).
torch.cuda.is_available()

True

# Preprocessing

In [None]:
# Checkpoint identifier used to load both the tokenizer and the model.
model_id = 'roberta-base'

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Pads each batch to the length of its longest sequence at collation time.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


def tokenize_function(examples):
    """Tokenize a batch of tweets, truncating inputs that exceed the model limit.

    No padding is applied here. Padding every example to the maximum length
    would make each short tweet as expensive to process as a full-length
    sequence; `data_collator` pads per batch instead.

    Args:
        examples: a batch from the dataset; only the 'tweet' column is read.

    Returns:
        The tokenizer output for the batch (variable-length sequences).
    """
    return tokenizer(examples['tweet'], truncation=True)


tokenized_ds = ds.map(tokenize_function, batched=True)

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

  0%|          | 0/13 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

# Load model and retrain

In [None]:
# Accuracy metric object consumed by compute_metrics below.
metric = load_metric("accuracy")


def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: a pair ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` on the arg-max class of each row of
        logits versus the reference labels.
    """
    scores, gold = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted = np.argmax(scores, axis=-1)
    return metric.compute(references=gold, predictions=predicted)


# Seed the RNGs before the model is built: the sequence-classification head is
# newly initialised (the checkpoint is a masked-LM), so without a seed its
# starting weights differ from run to run. 42 is the default seed of
# TrainingArguments, which the Trainer applies only after the model exists.
set_seed(42)
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=2)

# Freeze every parameter whose name contains 'embeddings' so that only the
# remaining parameters are updated during fine-tuning.
frozen_names = []
for name, param in model.named_parameters():
    if 'embeddings' in name:
        param.requires_grad = False
        frozen_names.append(name)

# Report only what was frozen instead of dumping every parameter name.
print(f"Froze {len(frozen_names)} parameter tensors:")
print("\n".join(frozen_names))

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.23.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/pytorch_

roberta.embeddings.word_embeddings.weight
roberta.embeddings.position_embeddings.weight
roberta.embeddings.token_type_embeddings.weight
roberta.embeddings.LayerNorm.weight
roberta.embeddings.LayerNorm.bias
roberta.encoder.layer.0.attention.self.query.weight
roberta.encoder.layer.0.attention.self.query.bias
roberta.encoder.layer.0.attention.self.key.weight
roberta.encoder.layer.0.attention.self.key.bias
roberta.encoder.layer.0.attention.self.value.weight
roberta.encoder.layer.0.attention.self.value.bias
roberta.encoder.layer.0.attention.output.dense.weight
roberta.encoder.layer.0.attention.output.dense.bias
roberta.encoder.layer.0.attention.output.LayerNorm.weight
roberta.encoder.layer.0.attention.output.LayerNorm.bias
roberta.encoder.layer.0.intermediate.dense.weight
roberta.encoder.layer.0.intermediate.dense.bias
roberta.encoder.layer.0.output.dense.weight
roberta.encoder.layer.0.output.dense.bias
roberta.encoder.layer.0.output.LayerNorm.weight
roberta.encoder.layer.0.output.LayerNorm

In [None]:
# Hyper-parameters for the fine-tuning run.
training_args = TrainingArguments(
  output_dir="./results",
  report_to='all',
  learning_rate=5e-6,
  per_device_train_batch_size=16,
  per_device_eval_batch_size=32,
  num_train_epochs=4,
  weight_decay=0.01,
  no_cuda=False,
  # Mixed-precision training needs a CUDA device; enable it only when one is
  # available so the notebook still runs (in full precision) on CPU.
  fp16=torch.cuda.is_available(),
  # Evaluate on the validation split and log once per epoch.
  evaluation_strategy='epoch',
  logging_strategy='epoch',
)
  
# Wire the model, data, collator and metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_ds['train'],
    eval_dataset=tokenized_ds['val'],  # evaluated according to training_args
    compute_metrics=compute_metrics,
)

# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

PyTorch: setting up devices
Using cuda_amp half precision backend
The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: tweet. If tweet are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 12240
  Num Epochs = 4
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 3060


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5085,0.444456,0.814
2,0.421,0.432985,0.819
3,0.387,0.434918,0.819
4,0.3623,0.446252,0.813


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: tweet. If tweet are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 32
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1000/special_tokens

TrainOutput(global_step=3060, training_loss=0.4196815839779922, metrics={'train_runtime': 1818.2839, 'train_samples_per_second': 26.926, 'train_steps_per_second': 1.683, 'total_flos': 1.28819172704256e+16, 'train_loss': 0.4196815839779922, 'epoch': 4.0})

# Evaluation


In [None]:
# Score the validation split and show only the metrics of the prediction
# output (its last field).
print("Prediction on validation data:")
val_prediction = trainer.predict(tokenized_ds['val'])
print(val_prediction.metrics)

The following columns in the test set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: tweet. If tweet are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 1000
  Batch size = 32


Prediction on validation data:


{'test_loss': 0.4462519586086273, 'test_accuracy': 0.813, 'test_runtime': 11.8766, 'test_samples_per_second': 84.199, 'test_steps_per_second': 2.694}


In [None]:
# Score the held-out test split and show only the metrics of the prediction
# output (its last field).
print("Prediction on test data:")
test_prediction = trainer.predict(tokenized_ds['test'])
print(test_prediction.metrics)

The following columns in the test set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: tweet. If tweet are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 860
  Batch size = 32


Prediction on test data:


{'test_loss': 0.3676394522190094, 'test_accuracy': 0.8383720930232558, 'test_runtime': 10.3783, 'test_samples_per_second': 82.865, 'test_steps_per_second': 2.602}


# Model summary

In [None]:
# Print the model configuration, followed by torchinfo's layer summary.
config_report = model.config
print("model config")
print(config_report)

print("model summary")
layer_report = summary(model)
print(layer_report)

model config
RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.23.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

model summary
Layer (type:depth-idx)                                       Param #
RobertaForSequenceClassification                             --
├─RobertaModel: 1-1                                          --
│    └─RobertaEmbeddings: 

# Pushing and saving output

In [None]:
# Build the integer-id -> class-name mapping from the ClassLabel feature,
# then invert it to get class-name -> integer-id.
id2label = {}
for class_name in cl.names:
    id2label[cl.str2int(class_name)] = class_name
label2id = {class_name: class_id for class_id, class_name in id2label.items()}
id2label

{0: 'OFF', 1: 'NOT'}

In [None]:
# Store the label mappings in the model config so that they are saved
# together with the model.
label_maps = dict(id2label=id2label, label2id=label2id)
model.config.update(label_maps)
model.config

RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "OFF",
    "1": "NOT"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "NOT": 1,
    "OFF": 0
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.23.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

In [None]:
from huggingface_hub import HfApi
api = HfApi()
# Create the private target repository. exist_ok=True makes this cell safe to
# re-run: without it the call raises once the repository already exists.
api.create_repo(repo_id="roberta-base-finetuned-1", private=True, exist_ok=True)

'https://huggingface.co/sara-nabhani/roberta-base-fintuned-1'

In [None]:
# Fully qualified repository id (namespace/name) to upload to.
# NOTE(review): must name the same repository that the create_repo cell created — confirm the two literals stay in sync.
hub_name = 'sara-nabhani/roberta-base-finetuned-1'
# Upload the tokenizer files first, then the model config and weights.
tokenizer.push_to_hub(hub_name)
model.push_to_hub(hub_name)

tokenizer config file saved in /tmp/tmpab0cv4zc/tokenizer_config.json
Special tokens file saved in /tmp/tmpab0cv4zc/special_tokens_map.json
Uploading the following files to sara-nabhani/roberta-base-fintuned-1: tokenizer.json,tokenizer_config.json,merges.txt,vocab.json,special_tokens_map.json
Configuration saved in /tmp/tmp67yf8z8j/config.json
Model weights saved in /tmp/tmp67yf8z8j/pytorch_model.bin
Uploading the following files to sara-nabhani/roberta-base-fintuned-1: config.json,pytorch_model.bin


CommitInfo(commit_url='https://huggingface.co/sara-nabhani/roberta-base-fintuned-1/commit/b114e5622dc427a4716db260f316dc17a8425aff', commit_message='Upload RobertaForSequenceClassification', commit_description='', oid='b114e5622dc427a4716db260f316dc17a8425aff', pr_url=None, pr_revision=None, pr_num=None)