# Experiment Parameters

In [15]:
dino_dataset_names = ["headlines-naive-7600-dataset.jsonl", "headlines-processed-human-dataset.jsonl"]
model_name = "distilbert-base-uncased" # bert-base-uncased
num_examples = 6200
shrink_eval = -1
batch_sz = 32
num_epochs = 2
num_evals_per_epoch = 3
learning_rate = 2e-4

# Installs

In [2]:
!pip install transformers datasets evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m50.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0

# Git

In [3]:
!git clone https://github.com/jasony123123/dino

Cloning into 'dino'...
remote: Enumerating objects: 151, done.[K
remote: Counting objects: 100% (42/42), done.[K
remote: Compressing objects: 100% (29/29), done.[K
remote: Total 151 (delta 20), reused 26 (delta 10), pack-reused 109[K
Receiving objects: 100% (151/151), 2.81 MiB | 6.52 MiB/s, done.
Resolving deltas: 100% (66/66), done.


In [4]:
%cd dino/datasets

/content/dino/datasets


# Load Datasets

In [5]:
from datasets import load_dataset

real_dataset = load_dataset("ag_news")

Downloading builder script:   0%|          | 0.00/4.06k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.65k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

Downloading and preparing dataset ag_news/default to /root/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548...


Downloading data:   0%|          | 0.00/11.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/751k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

Dataset ag_news downloaded and prepared to /root/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
import json
import datasets

def read_jsonl_dataset(filename):
  train_dataset = datasets.load_dataset("json", data_files=filename)
  labels = {"world": 0, "sports": 1, "business": 2, "scitech": 3}
  mapping = lambda example: {"text": example["text_a"], "label": labels[example["label"]]}

  train_dataset = train_dataset.map(mapping, remove_columns=["text_a","text_b"])
  final_dataset = datasets.DatasetDict({"train": train_dataset["train"]})
  return final_dataset

dino_datasets = [(read_jsonl_dataset(fname), fname) for fname in dino_dataset_names]

Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-e79ce5570f83d5dc/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-e79ce5570f83d5dc/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/7637 [00:00<?, ? examples/s]

Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-8d5fa669fdfd83a5/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-8d5fa669fdfd83a5/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/6235 [00:00<?, ? examples/s]

In [7]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_real_datasets = real_dataset.map(tokenize_function, batched=True)
tokenized_dino_datasets = [
    (data.map(tokenize_function, batched=True), name)
    for data, name in dino_datasets
]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

Map:   0%|          | 0/7637 [00:00<?, ? examples/s]

Map:   0%|          | 0/6235 [00:00<?, ? examples/s]

# Experiment

In [16]:
small_eval_real_dataset = tokenized_real_datasets["test"].shuffle(seed=42)
if shrink_eval > 0:
  small_eval_real_dataset = small_eval_real_dataset.select(range(shrink_eval))

null_real_dataset = tokenized_real_datasets["train"].shuffle(seed=42).select(range(1))
small_train_real_dataset = tokenized_real_datasets["train"].shuffle(seed=42).select(range(num_examples))
small_train_dino_datasets = [
    (data["train"].shuffle(seed=42).select(range(num_examples)), name)
    for data, name in tokenized_dino_datasets
]



In [17]:
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
import numpy as np
import evaluate
from transformers import TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4)

eval_steps = (num_examples / batch_sz) // num_evals_per_epoch
training_args = TrainingArguments(
    output_dir="test_trainer",
    evaluation_strategy="steps",
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_sz,
    per_device_eval_batch_size=batch_sz,
    learning_rate=learning_rate,
    eval_steps=eval_steps,
    logging_strategy="steps",
    logging_steps=eval_steps,
    report_to="none",
)
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'classifier

In [18]:
training_datasets = [
    # (null_real_dataset, "none"),
    (small_train_real_dataset, "real"),
    *small_train_dino_datasets
]

for train_dataset, name in training_datasets:
  trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=train_dataset,
      eval_dataset=small_eval_real_dataset,
      compute_metrics=compute_metrics,
  )

  print("------------------")
  print(name)
  print("------------------")
  trainer.train()

------------------
real
------------------




Step,Training Loss,Validation Loss,Accuracy
64,0.5183,0.441936,0.851711
128,0.3638,0.330317,0.893421
192,0.34,0.303116,0.899342
256,0.2064,0.296652,0.901316
320,0.1878,0.309522,0.908816
384,0.1724,0.292556,0.911974


------------------
headlines-naive-7600-dataset.jsonl
------------------




Step,Training Loss,Validation Loss,Accuracy
64,0.0698,1.651127,0.753289
128,0.0225,1.890238,0.679868
192,0.0006,2.422738,0.71
256,0.0002,2.638039,0.710132
320,0.0051,2.49994,0.719737
384,0.0001,2.491539,0.721447


------------------
headlines-processed-human-dataset.jsonl
------------------




Step,Training Loss,Validation Loss,Accuracy
64,0.1075,1.290099,0.845263
128,0.025,1.28469,0.814868
192,0.0099,1.629167,0.776579
256,0.0016,1.665134,0.810132
320,0.0001,1.70569,0.809868
384,0.0001,1.716198,0.81


<a id='pytorch_native'></a>

<a id='additional-resources'></a>