## Install requirements

In [None]:
!pip install -U -q datasets transformers evaluate torch torchinfo pytorch-lightning tokenizers

In [None]:
# Ask the NVIDIA driver which GPUs are visible to this runtime.
# The command prints an error instead when it cannot communicate with a driver.
!nvidia-smi

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



In [None]:
# Mount Google Drive at /content/gdrive; the dataset CSV files are read from there below.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Imports

In [None]:
import torch
import pandas as pd
from transformers import PreTrainedTokenizerFast, EarlyStoppingCallback, AutoConfig, AutoModelForCausalLM, AutoTokenizer, TextClassificationPipeline, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from datasets import Dataset, DatasetDict, ClassLabel, Value, load_dataset
from transformers.pipelines.pt_utils import KeyDataset
import numpy as np
from torchinfo import summary
from tokenizers import ByteLevelBPETokenizer
from tqdm import tqdm
import evaluate

In [None]:
# Record once whether PyTorch can use a CUDA device; the training arguments below
# use this flag to choose between CPU and GPU / fp16 execution.
IS_CUDA_AVAILABLE = torch.cuda.is_available()
IS_CUDA_AVAILABLE

False

## Load datasets

In [None]:
# Directory on the mounted Drive that holds train.csv, val.csv and test.csv.
# Change this single constant to point the notebook at another copy of the data.
DATA_DIR = '/content/gdrive/MyDrive/LFD-3'

# Load the three CSV splits into one DatasetDict keyed by split name.
ds = load_dataset(
    'csv',
    data_files={split: f'{DATA_DIR}/{split}.csv' for split in ('train', 'val', 'test')},
)

# Build the label vocabulary from the training split; class ids follow the order
# in which `unique` returns the label values.
cl = ClassLabel(names=list(ds['train'].unique('label')))
# Encode the `label` column as class ids and drop the `label_sentiment` column,
# which the steps shown in this notebook do not use.
ds = ds.cast_column('label', cl).remove_columns(['label_sentiment'])

ds



  0%|          | 0/3 [00:00<?, ?it/s]



DatasetDict({
    train: Dataset({
        features: ['label', 'id', 'text'],
        num_rows: 4220
    })
    val: Dataset({
        features: ['label', 'id', 'text'],
        num_rows: 880
    })
    test: Dataset({
        features: ['label', 'id', 'text'],
        num_rows: 900
    })
})

In [None]:
# Inspect the feature schema of the test split (label names and column dtypes).
ds['test'].features

{'label': ClassLabel(names=['dvd', 'books', 'camera', 'health', 'software', 'music'], id=None),
 'id': Value(dtype='string', id=None),
 'text': Value(dtype='string', id=None)}

## Encoder-only models

### Select model

In [None]:
# Name of the pretrained checkpoint; used below to load both the tokenizer and the model.
model_name = 'xlm-roberta-base'

### Preprocess data

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The collator pads each batch to the length of its longest sequence, so the
# tokenization step below must NOT pad by itself.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def tokenize_function(examples, max_length=256):
    """Tokenize the ``text`` field of a batch of examples.

    Sequences longer than ``max_length`` tokens are truncated. No padding is
    applied here: padding every example to ``max_length`` up front would make
    ``data_collator`` a no-op and force every batch to be processed at full
    length, even when all of its texts are short.

    Args:
        examples: Batch passed in by ``Dataset.map(..., batched=True)``; only
            its ``'text'`` entry is read.
        max_length: Maximum number of tokens kept per example.

    Returns:
        The tokenizer output for the batch (unpadded, truncated encodings).
    """
    return tokenizer(examples['text'], max_length=max_length, truncation=True)

tokenized_ds = ds.map(tokenize_function, batched=True)



### Create model

In [None]:
# Load the pretrained checkpoint with a sequence-classification head that has
# one output per class of the label feature.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=cl.num_classes,
)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_p

In [None]:
# Freeze part of the network: every parameter whose name contains 'embeddings'
# stops receiving gradients; all other parameters are left as they are.
# (The encoder could be frozen the same way by matching on 'encoder'.)
for param_name, parameter in model.named_parameters():
    if 'embeddings' not in param_name:
        continue
    parameter.requires_grad = False

In [None]:
# Show the model's layer tree together with the parameter count of each layer.
summary(model)

Layer (type:depth-idx)                                       Param #
XLMRobertaForSequenceClassification                          --
├─RobertaModel: 1-1                                          --
│    └─RobertaEmbeddings: 2-1                                --
│    │    └─Embedding: 3-1                                   (192,001,536)
│    │    └─Embedding: 3-2                                   (394,752)
│    │    └─Embedding: 3-3                                   (768)
│    │    └─LayerNorm: 3-4                                   (1,536)
│    │    └─Dropout: 3-5                                     --
│    └─RobertaEncoder: 2-2                                   --
│    │    └─ModuleList: 3-6                                  85,054,464
├─RobertaClassificationHead: 1-2                             --
│    └─Linear: 2-3                                           590,592
│    └─Dropout: 2-4                                          --
│    └─Linear: 2-5                                          

### Evaluation functions

In [None]:
# Create the metrics function.

metric_f1 = evaluate.load('f1')
metric_acc = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class of each
            example is the argmax of its logits over the last axis.

    Returns:
        A dict that merges the results of the accuracy and F1 metrics
        (keys ``'accuracy'`` and ``'f1'``).
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {
        **metric_acc.compute(predictions=predictions, references=labels),
        # Macro averaging on purpose: for single-label multi-class predictions the
        # micro-averaged F1 is always equal to accuracy, so it would only repeat
        # the number above. The macro average weights every class equally.
        **metric_f1.compute(predictions=predictions, references=labels, average='macro'),
    }

### Train model

In [None]:
# Build the training configuration and run fine-tuning:
# the model is trained on the train split and evaluated on the val split.

# Early stopping with a patience of 3.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

training_args = TrainingArguments(
    output_dir="./results",
    report_to='all',
    # --- optimisation ---
    num_train_epochs=20,
    learning_rate=5e-6,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    # --- hardware: CPU when CUDA is unavailable, fp16 on GPU otherwise
    #     (the bf16 alternative is left disabled) ---
    no_cuda=not IS_CUDA_AVAILABLE,
    fp16=IS_CUDA_AVAILABLE,
    # --- evaluate, log and checkpoint once per epoch; reload the best checkpoint at the end ---
    evaluation_strategy='epoch',
    logging_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds['train'],
    eval_dataset=tokenized_ds['val'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.add_callback(early_stopping)
trainer.train()

The following columns in the training set don't have a corresponding argument in `XLMRobertaForSequenceClassification.forward` and have been ignored: id, text. If id, text are not expected by `XLMRobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 4220
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 264
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


### Evaluate trained model on the test set

In [None]:
# Run the trained model on the held-out test split and display only the last
# element of the prediction result, which holds the computed metrics.
test_metrics = trainer.predict(tokenized_ds['test'])[-1]
test_metrics

The following columns in the test set don't have a corresponding argument in `XLMRobertaForSequenceClassification.forward` and have been ignored: id, text. If id, text are not expected by `XLMRobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 900
  Batch size = 32


{'test_loss': 0.25266391038894653,
 'test_accuracy': 0.9322222222222222,
 'test_f1': 0.9322222222222222,
 'test_runtime': 2.0983,
 'test_samples_per_second': 428.927,
 'test_steps_per_second': 13.821}