## Loading the data

In [2]:
!pip install datasets
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.2.2-py3-none-any.whl (346 kB)
[K     |████████████████████████████████| 346 kB 5.1 MB/s 
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 57.9 MB/s 
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 5.7 MB/s 
Collecting dill<0.3.5
  Downloading dill-0.3.4-py2.py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 5.7 MB/s 
[?25hCollecting fsspec[http]>=2021.05.0
  Downloading fsspec-2022.5.0-py3-none-any.whl (140 kB)
[K     |████████████████████████████████| 140 kB 67.5 MB/s 
Collecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64

In [3]:
from google.colab import files
 
 
uploaded = files.upload()

Saving OpArticles_ADUs.xlsx to OpArticles_ADUs.xlsx


In [4]:
import pandas as pd
import io 
from sklearn import preprocessing

dataset_ADU = pd.read_excel(io.BytesIO(uploaded['OpArticles_ADUs.xlsx']))

label_encoder = preprocessing.LabelEncoder()

dataset_ADU['label']= label_encoder.fit_transform(dataset_ADU['label'])

#subs = {"Value": 1, "Fact": 2, "Value(-)": 3, "Value(+)": 4, "Policy": 5}
#dataset_ADU['label'] = list(map(subs.get, dataset_ADU['label']))
dataset_ADU.pop("article_id")
dataset_ADU.pop("annotator")
dataset_ADU.pop("node")
dataset_ADU.pop("ranges")

dataset_ADU.head()

Unnamed: 0,tokens,label
0,O facto não é apenas fruto da ignorância,2
1,havia no seu humor mais jornalismo (mais inves...,2
2,É tudo cómico na FIFA,2
3,o que todos nós permitimos que esta organizaçã...,2
4,não nos fazem rir à custa dos poderosos,2


In [5]:
from datasets import Dataset

dataset_hf = Dataset.from_pandas(dataset_ADU)

dataset_hf

Dataset({
    features: ['tokens', 'label'],
    num_rows: 16743
})

In [6]:
from datasets import DatasetDict

train_test = dataset_hf.train_test_split(test_size=0.1)

valid_test = train_test['test'].train_test_split(test_size=0.5)

train_valid_test_dataset = DatasetDict({
  'train': train_test['train'],
  'validation': valid_test['train'],
  'test': valid_test['test']
})

## Finetuning a pretained model

In [7]:
model_name = "distilbert-base-uncased"

In [8]:
from transformers import  AutoTokenizer


tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [9]:
def preprocess_function(sample):
    return tokenizer(sample["tokens"], truncation=True, padding=True)

In [10]:
tokenized_dataset = train_valid_test_dataset.map(preprocess_function, batched=True)

tokenized_dataset

  0%|          | 0/16 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'label', 'input_ids', 'attention_mask'],
        num_rows: 15068
    })
    validation: Dataset({
        features: ['tokens', 'label', 'input_ids', 'attention_mask'],
        num_rows: 837
    })
    test: Dataset({
        features: ['tokens', 'label', 'input_ids', 'attention_mask'],
        num_rows: 838
    })
})

In [11]:
from transformers import AutoModel

model = AutoModel.from_pretrained(model_name)

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

## Fine Tuning

In [13]:
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
from datasets import load_metric
import numpy as np

metric = load_metric("accuracy")


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="epoch", # run validation at the end of each epoch
    save_strategy="epoch",
    load_best_model_at_end=True,
)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

In [14]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: tokens. If tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 15068
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2826


Epoch,Training Loss,Validation Loss,Accuracy
1,1.3164,1.219462,0.530466
2,1.1826,1.160884,0.541219
3,1.0756,1.139285,0.543608


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: tokens. If tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 837
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-942
Configuration saved in ./results/checkpoint-942/config.json
Model weights saved in ./results/checkpoint-942/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-942/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-942/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: tokens. If tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 

TrainOutput(global_step=2826, training_loss=1.1778769955429218, metrics={'train_runtime': 748.1648, 'train_samples_per_second': 60.42, 'train_steps_per_second': 3.777, 'total_flos': 2172877564143120.0, 'train_loss': 1.1778769955429218, 'epoch': 3.0})

In [15]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: tokens. If tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 837
  Batch size = 16


{'epoch': 3.0,
 'eval_accuracy': 0.5436081242532855,
 'eval_loss': 1.1392854452133179,
 'eval_runtime': 3.2085,
 'eval_samples_per_second': 260.869,
 'eval_steps_per_second': 16.519}

In [16]:
trainer.predict(test_dataset=tokenized_dataset["test"])

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: tokens. If tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 838
  Batch size = 16


PredictionOutput(predictions=array([[-0.4928343 , -2.3612316 ,  1.5820063 , -0.64311326,  0.8292114 ],
       [ 3.225784  , -1.4999766 ,  0.25685135, -0.7282769 , -0.86445844],
       [ 0.06863871, -2.4536676 ,  1.4180769 , -0.85461545,  0.9804938 ],
       ...,
       [-0.6869396 , -1.6058493 ,  1.1362314 , -0.80493987,  0.8138253 ],
       [-0.2679285 , -2.1098473 ,  1.2597389 , -0.80082273,  1.2182502 ],
       [ 0.753648  , -2.8182068 ,  1.9539738 , -0.8347323 ,  0.17567222]],
      dtype=float32), label_ids=array([2, 0, 0, 0, 0, 3, 2, 4, 2, 0, 2, 0, 2, 0, 2, 2, 0, 2, 2, 4, 4, 4,
       4, 3, 2, 0, 2, 2, 1, 2, 0, 2, 4, 0, 2, 1, 2, 0, 0, 2, 0, 0, 0, 0,
       2, 0, 3, 0, 2, 2, 2, 2, 2, 3, 2, 2, 2, 3, 0, 0, 2, 2, 2, 4, 2, 0,
       3, 2, 0, 0, 2, 4, 2, 3, 0, 2, 4, 0, 2, 4, 2, 0, 2, 2, 0, 3, 4, 2,
       0, 0, 4, 2, 0, 3, 4, 2, 2, 2, 2, 0, 0, 2, 0, 0, 2, 0, 2, 2, 2, 0,
       2, 3, 2, 0, 0, 4, 2, 0, 3, 2, 4, 2, 4, 2, 0, 0, 2, 3, 0, 2, 2, 4,
       3, 4, 2, 2, 3, 2, 2, 2, 2, 2, 2, 3, 2