In [None]:
from pathlib import Path
# Data lives in a `data` folder that sits next to the notebook's working directory.
DATA_DIR = Path.cwd().parent.joinpath('data')

import seaborn as sns
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd

# Huggingface
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import RobertaTokenizerFast
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding
from datasets import Dataset, DatasetDict
import evaluate
# Checkpoint shared by the tokenizer and the classifier.
MODEL_NAME = 'roberta-base'
# `use_fast=True` is an AutoTokenizer switch; RobertaTokenizer is always the
# slow Python implementation, so the flag did nothing there. Load the fast
# class explicitly to get what the original call asked for.
TOKENIZER = RobertaTokenizerFast.from_pretrained(MODEL_NAME)

# Sklearn
from sklearn.metrics import precision_recall_fscore_support

# PyTorch
import torch
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')
print(DEVICE)

In [13]:
def tokenize(batch):
    """Encode the ``text`` field of ``batch`` with the module-level tokenizer.

    Inputs are truncated to the length that ``TOKENIZER.max_model_input_sizes``
    lists for ``MODEL_NAME``. No padding argument is passed here.

    Returns whatever ``TOKENIZER(...)`` returns for the given text.
    """
    texts = batch['text']
    length_limit = TOKENIZER.max_model_input_sizes[MODEL_NAME]
    return TOKENIZER(texts, truncation=True, max_length=length_limit)

def columns_to_device(dataset):
    """Ask ``dataset`` to emit torch-formatted columns placed on ``DEVICE``.

    Of ``input_ids``, ``token_type_ids``, ``attention_mask`` and ``label``,
    only the names that also appear in ``dataset.column_names`` are selected.
    The selection is handed to ``dataset.set_format``; nothing is returned.
    """
    wanted = {'input_ids', 'token_type_ids', 'attention_mask', 'label'}
    present = list(wanted.intersection(dataset.column_names))
    dataset.set_format(type='torch', columns=present, device=DEVICE)

In [14]:
# Load the previously saved dataset splits from DATA_DIR; later cells read its
# 'train' and 'dev' splits.
datadict = DatasetDict.load_from_disk(DATA_DIR / 'efcamdat_dataset')

In [None]:
# Tokenize every split. `tokenize` is written against a batch of rows, so run
# the map in batched mode: the tokenizer then receives lists of texts instead
# of being invoked once per example, which yields the same per-example
# encodings with far fewer calls.
datadict = datadict.map(tokenize, batched=True, num_proc=8)

In [30]:
def model_init():
    """Build a fresh RoBERTa sequence classifier from the ``MODEL_NAME`` checkpoint.

    The classification head is created with six output labels. The model is
    moved to ``DEVICE`` rather than unconditionally to CUDA, so the factory
    also works on machines without a GPU (the previous hard-coded ``.cuda()``
    raised there even though ``DEVICE`` already provides a CPU fallback).

    Returns:
        The instantiated model, placed on ``DEVICE``.
    """
    model = RobertaForSequenceClassification.from_pretrained(MODEL_NAME,
                                                             num_labels=6)
    return model.to(DEVICE)

# def compute_metrics(pred):
#     metrics = ['precision', 'recall', 'f1', 'support']
#     results = precision_recall_fscore_support(pred.label_ids,
#                                               pred.predictions.argmax(-1),
#                                               labels=list(l1_codes.values()),
#                                               zero_division=0,
#                                              )
#     return {k: v for k, v in zip(metrics, results)}
# Combined metric bundle; not referenced by the cells shown here.
clf_metrics = evaluate.combine(['accuracy', 'f1', 'recall', 'precision'])
# Plain accuracy, used by compute_metrics below.
accuracy = evaluate.load('accuracy')

def compute_metrics(pred):
    """Score a prediction object with the module-level ``accuracy`` metric.

    ``pred.predictions`` is reduced with ``argmax`` over its last axis to get
    one predicted class per row, which is compared against ``pred.label_ids``.

    Returns whatever ``accuracy.compute`` returns.
    """
    gold_labels = pred.label_ids
    predicted_classes = pred.predictions.argmax(-1)
    return accuracy.compute(predictions=predicted_classes,
                            references=gold_labels)

# Baseline fine-tuning configuration; the hyperparameter search later
# overrides individual fields on this object.
args = TrainingArguments(
    output_dir=Path.cwd().parent / 'bin',
    evaluation_strategy='steps',
    eval_steps=10000,
    learning_rate=3e-05,
    weight_decay=0.01, # Devlin et al. suggested
    num_train_epochs=3,
    seed=42, # fixed a priori
    logging_strategy='epoch',
    save_strategy='no',
    # Mixed precision is only accepted on CUDA devices. Gate it on GPU
    # availability so the notebook still runs on the CPU fallback that DEVICE
    # allows; on a GPU machine this is still True, exactly as before.
    fp16=torch.cuda.is_available(),
    load_best_model_at_end=False,
)

# Assemble the Trainer used for fine-tuning and for predictions on the dev split.
trainer = Trainer(
    # A factory is passed instead of a model instance; per the transformers
    # documentation, each train() call then starts from a newly built model.
    model_init=model_init,
    args=args,
    train_dataset=datadict['train'],
    eval_dataset=datadict['dev'],
    tokenizer=TOKENIZER,
    # tokenize() passes no padding argument, so padding is left to this
    # collator (presumably applied per batch -- confirm against the docs).
    data_collator=DataCollatorWithPadding(tokenizer=TOKENIZER),
    compute_metrics=compute_metrics,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /home/jovyan/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5"
  },
  "initializer_range

In [None]:
# Every trial is written to a log file so that partial results survive if the
# search is interrupted.
import json
import logging
import time
from itertools import product

logfile = 'hyperparameter_search.log'
logging.basicConfig(filename=logfile,
                    level=logging.INFO,
                    format='%(asctime)s -- %(message)s')

# Search space. The learning-rate and epoch ranges are the ones attributed to
# Devlin et al. by the original author.
param_grid = {
    'learning_rate': [2e-5, 3e-5, 5e-5],
    'num_train_epochs': [2, 3, 4],
    'batch_size': [16, 32],
}

logging.info('New Hyperparameter Search')
logging.info(json.dumps(param_grid))

# Exhaustive grid: 3 learning rates x 3 epoch counts x 2 batch sizes gives
# 18 fine-tuning runs, one dict of settings per run.
cartesian_product = [dict(zip(param_grid, combo))
                     for combo in product(*param_grid.values())]

# The grid uses the shorthand `batch_size`, but TrainingArguments has no field
# of that name: setting it only created an unused attribute, so the batch size
# never actually changed. Map the shorthand onto the real field.
arg_aliases = {'batch_size': 'per_device_train_batch_size'}

# Run one fine-tuning trial per grid point, then report and log its dev
# accuracy and wall-clock training time.
for i, params in enumerate(cartesian_product):
    for k, v in params.items():
        arg_name = arg_aliases.get(k, k)
        # Fail loudly instead of silently attaching an attribute nobody reads.
        if not hasattr(trainer.args, arg_name):
            raise AttributeError(f'TrainingArguments has no field {arg_name!r}')
        setattr(trainer.args, arg_name, v)

    start = time.time()
    trainer.train()
    time_elapsed = time.time() - start

    preds = trainer.predict(datadict['dev'])
    # `accuracy` is a metric object, not a function: it has to be invoked via
    # .compute(), and it expects class ids, so reduce the raw scores with
    # argmax first (same recipe as compute_metrics).
    acc = float(accuracy.compute(references=preds.label_ids,
                                 predictions=preds.predictions.argmax(-1),
                                )['accuracy'])

    print(', '.join([f'{k}: {v}' for k, v in params.items()]), end=' ')
    print(f'--> Accuracy: {acc:.3f}')
    print(f'Time elapsed: {time_elapsed:.0f} seconds')

    # Record the outcome next to the settings so the log alone is enough to
    # pick the best configuration.
    params['trial'] = i
    params['accuracy'] = acc
    params['seconds'] = time_elapsed

    logging.info(json.dumps(params))

loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /home/jovyan/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden

Step,Training Loss,Validation Loss
