In [20]:
import torch
from os import path as op
import os
import numpy as np
from collections import Counter

# Using the `Trainer` with `PyTorch` requires `accelerate>=0.20.1`: Please run `pip install transformers[torch]` or `pip install accelerate -U`
!pip install accelerate -U

# transformers complained about the newest version (0.0.13), so installing the older version
# ! pip install huggingface-hub==0.0.12

from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [19]:
# Display the installed NumPy version
np.__version__


'2.1.1'

In [13]:
from evaluate import load


In [25]:

# Notebook-wide settings.
# Model checkpoints are written under MODEL_DIR; a dedicated directory
# (e.g., on Google Drive) keeps them easy to find.
MODEL_DIR = 'model_checkpoints'
MODEL_CHECKPOINT = "distilbert-base-uncased"
# NOTE(review): BATCH_SIZE is not referenced by the TrainingArguments cell,
# which hard-codes its own per-device batch sizes -- confirm which is intended.
BATCH_SIZE = 25


def _has_gold_label(example):
    """Return True when the example's label is non-negative."""
    return example['label'] >= 0


snli_data = load_dataset("snli")

# Label distribution of the training split before any filtering.
train_label_counts = Counter(snli_data['train']['label'])
print(train_label_counts)

# SNLI contains examples labelled -1; drop them from every split.
for split_name in snli_data:
    snli_data[split_name] = snli_data[split_name].filter(_has_gold_label)

metric = load('glue', "mnli")

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT, use_fast=True)

# https://huggingface.co/transformers/preprocessing.html
def preprocess_function(d):
    """Tokenize premise/hypothesis pairs with the module-level `tokenizer`.

    d: mapping that provides 'premise' and 'hypothesis' entries
       (presumably lists of strings when used with a batched `map` -- confirm).
    Returns whatever `tokenizer` returns for the pair, with truncation enabled.
    """
    premises = d['premise']
    hypotheses = d['hypothesis']
    return tokenizer(premises, hypotheses, truncation=True)

# Tokenize every split in batches, reusing cached results when they exist.
encoded_snli_data = snli_data.map(
    preprocess_function,
    batched=True,
    load_from_cache_file=True,
)

# Load the pretrained checkpoint with a classification head sized for 3 labels.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=3,
)

Counter({0: 183416, 2: 183187, 1: 182764, -1: 785})


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level `metric`.

    eval_pred unpacks into (predictions, labels). The index of the largest
    value along axis 1 of the predictions is used as the predicted label,
    and the result of `metric.compute` is returned unchanged.
    """
    scores, gold_labels = eval_pred
    predicted_labels = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_labels, references=gold_labels)

# Fine-tuning configuration.
# Evaluation runs on a step schedule rather than per epoch because a full
# epoch takes too long on Colab.
args = TrainingArguments(
    output_dir=MODEL_DIR,  # where checkpoints are saved
    num_train_epochs=1,  # a single pass over the training data
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=18,
    per_device_eval_batch_size=18,
    evaluation_strategy="steps",
    eval_steps=500,  # evaluate after every 500 training steps
    save_steps=500,  # checkpoint every 500 steps; keep this a multiple of eval_steps
    load_best_model_at_end=True,  # after fine-tuning, trainer.model holds the best model
    metric_for_best_model="accuracy",
)

# Checkpoint selection is driven by the validation split. Using "test" here
# would be cheating: the checkpoint would be picked by its test score.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_snli_data["train"],
    eval_dataset=encoded_snli_data["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-tune, resuming from an earlier checkpoint when one is available.
#
# Timing for one epoch over the training set (550K problems):
#   ~32min on V100, ~45min on T4.
#
# If Colab times out part-way through, the checkpoints written so far are
# still in MODEL_DIR. When $MODEL_DIR/checkpoint-5000 exists, training
# continues from it on the remaining problems; the batches already consumed
# in the first 5000 steps (per-device batch size 18) are skipped.
# Otherwise training starts from scratch.
#
# The previous version of this cell always ran a full training pass and then
# a resume pass, and the resume path was built with a misspelled `op.jopin`,
# which raised AttributeError.
resume_checkpoint = op.join(MODEL_DIR, 'checkpoint-5000')

if op.isdir(resume_checkpoint):
    trainer.train(resume_from_checkpoint=resume_checkpoint)
else:
    trainer.train()

  trainer = Trainer(
  0%|          | 11/78481 [10:48<1285:31:02, 58.98s/it]
  0%|          | 12/30521 [00:48<34:03:57,  4.02s/it]

KeyboardInterrupt: 

In [None]:
# Final evaluation of the fine-tuned model.
# The validation split already drove checkpoint selection during training,
# so the reported score is computed on the held-out test split instead.
trainer_eval = Trainer(
    trainer.model,  # the model to evaluate; with load_best_model_at_end=True this is the best fine-tuned model
    args,
    train_dataset=encoded_snli_data["train"],
    eval_dataset=encoded_snli_data["test"],  # evaluate on test
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# The stray trailing " . " after this call was a syntax error and is removed.
trainer_eval.evaluate()