In [1]:
!pip install transformers==4.28.0
!pip install -U datasets
# !pip install --upgrade accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==4.28.0
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m59.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0 (from transformers==4.28.0)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m29.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.28.0)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m108.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.0

In [5]:
import transformers
import pandas as pd
import numpy as np
from datasets import Dataset, load_metric
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# Metric object for the 'sst2' configuration of 'glue'; compute_metrics()
# reads this module-level name when the Trainer evaluates.
metric = load_metric(path='glue', config_name='sst2')



def compute_metrics(eval_preds):
  """Score one evaluation pass with the module-level ``metric``.

  Args:
    eval_preds: A two-item iterable ``(logits, labels)``.

  Returns:
    Whatever ``metric.compute`` returns for the predicted vs. reference labels.
  """
  raw_scores, gold = eval_preds
  # The index of the largest score along the last axis is the predicted class.
  predicted = np.argmax(raw_scores, axis=-1)
  result = metric.compute(predictions=predicted, references=gold)
  return result

# Load the tagged tweets. The CSV's first column is used as the row index; the
# "tweet" / "prediction" columns are renamed to "text" / "label", the names the
# tokenization step and the Trainer read further down.
data = pd.read_csv("tagged_data.csv", index_col=0).rename(
    columns={"tweet": "text", "prediction": "label"}
)
dataset_ = Dataset.from_pandas(data)

# Hold out 20% of the rows for evaluation. The seed is fixed so the split (and
# therefore every metric computed on it) is the same on each run.
dataset = dataset_.train_test_split(test_size=0.2, seed=42)

model_checkpoint = 'distilbert-base-uncased'
batch_size = 32

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
def process(x):
  """Tokenize the ``text`` field of ``x`` with the module-level ``tokenizer``.

  Truncation is enabled with ``max_length=510``; the tokenizer's return value
  is passed back unchanged.
  """
  tokenizer_options = {'truncation': True, 'max_length': 510}
  return tokenizer(x['text'], **tokenizer_options)

# Seed the RNGs *before* the model is built: the classification head is newly
# initialised rather than loaded from the checkpoint, so without this its
# starting weights (and the resulting metrics) differ on every run.
# 42 is also the default `seed` of TrainingArguments.
transformers.set_seed(42)

# Tokenize both splits. `batched=True` hands `process` a batch of rows per call
# instead of one row at a time; the token ids are the same, the mapping is faster.
train_ds = dataset['train'].map(process, batched=True)
test_ds = dataset['test'].map(process, batched=True)

num_labels = 2
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

# Evaluate and checkpoint once per epoch so the best epoch (by accuracy) can be
# restored when training finishes.
args = TrainingArguments(
    output_dir=f'{model_checkpoint}_sentiment_analysis',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
)

# NOTE(review): the test split doubles as the per-epoch evaluation set and
# drives `load_best_model_at_end`, so the accuracy reported on it is not
# measured on untouched data. Consider carving out a separate validation split.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

Map:   0%|          | 0/12237 [00:00<?, ? examples/s]

Map:   0%|          | 0/3060 [00:00<?, ? examples/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'classifier

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.28128,0.880719
2,0.331800,0.261736,0.892484
3,0.205800,0.291765,0.892157
4,0.125800,0.338843,0.890523
5,0.125800,0.382681,0.895752


TrainOutput(global_step=1915, training_loss=0.19004504848709305, metrics={'train_runtime': 299.9651, 'train_samples_per_second': 203.974, 'train_steps_per_second': 6.384, 'total_flos': 694270174293900.0, 'train_loss': 0.19004504848709305, 'epoch': 5.0})

In [6]:
# Score the trained model on the training split; comparing this with the
# per-epoch validation numbers shows how closely the model has fit its
# training data.
trainer.evaluate(eval_dataset=train_ds)

{'eval_loss': 0.040767982602119446,
 'eval_accuracy': 0.9883141292800524,
 'eval_runtime': 17.6353,
 'eval_samples_per_second': 693.892,
 'eval_steps_per_second': 21.718,
 'epoch': 5.0}