# Classificação de Texto com Transformer - DistilBERT

In [None]:
%pip install evaluate

In [3]:
# Importando bibliotecas
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import evaluate

In [4]:
# Carregar dataset IMDB
dataset = load_dataset('imdb')
dataset = dataset.shuffle(seed=42)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 100%|██████████| 25000/25000 [00:00<00:00, 101171.03 examples/s]
Generating test split: 100%|██████████| 25000/25000 [00:00<00:00, 336367.75 examples/s]
Generating unsupervised split: 100%|██████████| 50000/50000 [00:00<00:00, 327970.05 examples/s]


In [6]:
# Tokenização
tokenizer =  DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
def preprocess(examples):
  return tokenizer(examples['text'], truncation=True, padding=True)
encoded = dataset.map(preprocess, batched=True)

Map: 100%|██████████| 25000/25000 [00:09<00:00, 2635.96 examples/s]
Map: 100%|██████████| 25000/25000 [00:09<00:00, 2634.35 examples/s]
Map: 100%|██████████| 50000/50000 [00:19<00:00, 2621.54 examples/s]


In [7]:
# Modelo
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Métrica
accuracy = evaluate.load("accuracy")
def compute_metrics(eval_pred):
  logits, labels = eval_pred
  preds = logits.argmax(-1)
  return accuracy.compute(predictions=preds, references=labels)

Downloading builder script: 100%|██████████| 4.20k/4.20k [00:00<?, ?B/s]


In [9]:
# Treinamento
training_args = TrainingArguments(
  output_dir="./resultados",
  evaluation_strategy="epoch",
  save_strategy="epoch",
  per_device_train_batch_size=8,
  per_device_eval_batch_size=8,
  num_train_epochs=2,
  weight_decay=0.01,
  load_best_model_at_end=True,
  logging_dir="./logs",
  logging_steps=10
)



In [12]:
trainer = Trainer(
  model=model,
  args=training_args,
  train_dataset=encoded['train'].select(range(500)),
  eval_dataset=encoded['test'].select(range(100)),
  tokenizer=tokenizer,
  compute_metrics=compute_metrics
)

  trainer = Trainer(


In [13]:
# Executa o modelo
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.376,0.363116,0.85
2,0.146,0.372396,0.87


TrainOutput(global_step=126, training_loss=0.4086626332903665, metrics={'train_runtime': 986.4645, 'train_samples_per_second': 1.014, 'train_steps_per_second': 0.128, 'total_flos': 132467398656000.0, 'train_loss': 0.4086626332903665, 'epoch': 2.0})

In [14]:
# Resultado
trainer.evaluate()

{'eval_loss': 0.36311641335487366,
 'eval_accuracy': 0.85,
 'eval_runtime': 22.0048,
 'eval_samples_per_second': 4.544,
 'eval_steps_per_second': 0.591,
 'epoch': 2.0}