In [None]:
!pip install accelerate datasets dvclive evaluate 'transformers[torch]' --upgrade

In [None]:
# Create a Git repository in the current working directory (-q: quiet).
!git init -q
# Placeholder, repository-local commit identity used by the commit below.
!git config --local user.email "you@example.com"
!git config --local user.name "Your Name"
# Initialise DVC in the same directory (-q: quiet).
!dvc init -q
# Commit what is staged at this point with the message "DVC init"
# (presumably the files created by `dvc init` -- verify with `git status`).
!git commit -m "DVC init"

# Dataset

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Tokenizer for the "distilbert-base-cased" checkpoint; tokenize_function uses it.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")

# IMDB dataset; its "train" and "test" splits are subsampled further down.
dataset = load_dataset("imdb")

def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the module-level ``tokenizer``.

    The tokenizer is called with ``padding="max_length"`` and
    ``truncation=True``; its return value is passed back unchanged.
    """
    texts = examples["text"]
    return tokenizer(texts, padding="max_length", truncation=True)

# Shuffle each split with a fixed seed, keep the first rows of the shuffled
# split (2000 for training, 200 for evaluation), then tokenize in batches.
train_subset = dataset["train"].shuffle(seed=42).select(range(2000))
small_train_dataset = train_subset.map(tokenize_function, batched=True)

eval_subset = dataset["test"].shuffle(seed=42).select(range(200))
small_eval_dataset = eval_subset.map(tokenize_function, batched=True)


In [None]:
import numpy as np
import evaluate

# Load the "f1" metric through `evaluate`; compute_metrics calls its `.compute()`.
metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    ``eval_pred`` is unpacked into ``(logits, labels)``. The predicted class
    is the index of the largest logit along the last axis; predictions and
    labels are handed to ``metric.compute`` and its result is returned.
    """
    raw_logits, gold_labels = eval_pred
    # Index of the largest score along the last axis = predicted class id.
    predicted_labels = np.argmax(raw_logits, axis=-1)
    scores = metric.compute(predictions=predicted_labels, references=gold_labels)
    return scores

# Tracking experiments with DVCLive

In [None]:
from dvclive.huggingface import DVCLiveCallback
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

import inspect

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases and the old keyword was subsequently dropped. The install cell
# upgrades transformers, so use whichever spelling the installed version
# accepts instead of hard-coding one of them.
eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

# One training run per epoch budget. Every run starts from a freshly loaded
# checkpoint and gets its own DVCLiveCallback instance.
for epochs in (5, 10, 15):
    model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-cased", num_labels=2)
    # Freeze every parameter of the base model; parameters outside
    # `model.base_model` keep their default `requires_grad`.
    for param in model.base_model.parameters():
        param.requires_grad = False

    training_args = TrainingArguments(
        learning_rate=3e-4,
        logging_strategy="epoch",
        num_train_epochs=epochs,
        output_dir="output",
        overwrite_output_dir=True,
        load_best_model_at_end=True,
        report_to="none",
        save_strategy="epoch",
        weight_decay=0.01,
        # Evaluate once per epoch, matching the save and logging strategies.
        **{eval_strategy_kwarg: "epoch"},
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=small_train_dataset,
        eval_dataset=small_eval_dataset,
        compute_metrics=compute_metrics,
        # NOTE(review): presumably `save_dvc_exp=True` records each run as a DVC
        # experiment, which the comparison cell reads back -- confirm in dvclive docs.
        callbacks=[DVCLiveCallback(report="notebook", save_dvc_exp=True, log_model=True)],
    )
    trainer.train()

# Comparing experiments

In [None]:
import dvc.api
import pandas as pd

# Columns kept from the rows returned by dvc.api.exp_show().
columns = ["Experiment", "epoch", "eval.f1"]

# Build the frame restricted to those columns, drop rows with a missing value
# in any of them, and renumber the remaining rows from 0.
df = (
    pd.DataFrame(dvc.api.exp_show(), columns=columns)
    .dropna()
    .reset_index(drop=True)
)
df


In [None]:
# Run `dvc plots diff` on the names printed by `dvc exp list --names-only`
# (shell command substitution).
!dvc plots diff $(dvc exp list --names-only)

In [None]:
from IPython.display import HTML
# Display the HTML file ./dvc_plots/index.html as this cell's output
# (presumably written by the `dvc plots diff` cell -- confirm the path).
HTML(filename='./dvc_plots/index.html')