In [60]:
from pathlib import Path
DATA_DIR = Path.cwd().parent / 'data'

import seaborn as sns
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd

from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding
from datasets import Dataset, DatasetDict
import evaluate
MODEL_NAME = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# Sklearn
from sklearn.metrics import precision_recall_fscore_support

# PyTorch
import torch
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

loading file https://huggingface.co/roberta-base/resolve/main/vocab.json from cache at /home/jovyan/.cache/huggingface/transformers/d3ccdbfeb9aaa747ef20432d4976c32ee3fa69663b379deb253ccfce2bb1fdc5.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab
loading file https://huggingface.co/roberta-base/resolve/main/merges.txt from cache at /home/jovyan/.cache/huggingface/transformers/cafdecc90fcab17011e12ac813dd574b4b3fea39da6dd817813efa010262ff3f.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/roberta-base/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/roberta-base/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/roberta-base/resolve/main/tokenizer_config.json from cache at None
loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /home/jovyan/.cache/huggingface/transformers/733bade19e5f0ce98e6531

In [None]:
def tokenize(batch):
    # Turns text into a series of numbers.
    return tokenizer(batch['text'],
                     truncation=True,
                     max_length=TOKENIZER.max_model_input_sizes[MODEL_NAME])

datadict = DatasetDict.load_from_disk(DATA_DIR / 'efcamdat_dataset')
datadict = datadict.map(tokenize, num_proc=8)

In [83]:
def model_init():
    model = RobertaForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=6,
    ).cuda()    
    
    labels = enumerate(datadict['train'].features['labels'].names)
    model.config.id2label = {idnum: label for idnum, label in labels}
    model.config.label2id = {label: idnum for idnum, label in labels}

    return model

In [81]:
# def compute_metrics(pred):
#     metrics = ['precision', 'recall', 'f1', 'support']
#     results = precision_recall_fscore_support(pred.label_ids,
#                                               pred.predictions.argmax(-1),
#                                               labels=list(l1_codes.values()),
#                                               zero_division=0,
#                                              )
#     return {k: v for k, v in zip(metrics, results)}

# clf_metrics = evaluate.combine(['accuracy', 'f1', 'recall', 'precision'])
accuracy = evaluate.load('accuracy')

def compute_metrics(pred):
    return accuracy.compute(references=pred.label_ids,
                            predictions=pred.predictions.argmax(-1),
                           )

In [82]:
args = TrainingArguments(
    output_dir=Path.cwd().parent / 'bin',
    evaluation_strategy='steps',
    eval_steps=10000,
    learning_rate=3e-05,
    weight_decay=0.01, # Devlin et al. suggested
    num_train_epochs=3,
    seed=42, # apriori
    logging_strategy='epoch',
    save_strategy='no',
    fp16=True,
    load_best_model_at_end=False,
)

trainer = Trainer(
    model_init=model_init,
    args=args,
    train_dataset=datadict['train'],
    eval_dataset=datadict['dev'],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /home/jovyan/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5"
  },
  "initializer_range

In [65]:
import json
import logging
import time
from itertools import product

logfile = 'hyperparameter_search.log'
logging.basicConfig(level=logging.INFO,
                    filename=logfile,
                    format='%(asctime)s -- %(message)s')

param_grid = { # Ranges suggested by Devlin et al.
    'learning_rate': [2e-5, 3e-5, 5e-5],
    'num_train_epochs': [2, 3, 4], 
    'batch_size': [16, 32],
}

logging.info('New Hyperparameter Search')
logging.info(json.dumps(param_grid))

cartesian_product = [dict(zip(param_grid.keys(), values)) for 
                     values in product(*param_grid.values())]

for i, params in enumerate(cartesian_product):
    for k, v in params.items():
        setattr(trainer.args, k, v)

    start = time.time()
    trainer.train()
    time_elapsed = time.time() - start

    preds = trainer.predict(datadict['dev'])
    acc = accuracy.compute(references=preds.label_ids,
                            predictions=preds.predictions.argmax(-1))['accuracy']

    print(', '.join([f'{k}: {v}' for k, v in params.items()]), end=' ')
    print(f'--> Accuracy: {acc:.3f}')
    print(f'Time elapsed: {time_elapsed:.0f} seconds')

    params['trial'] = i
    params['accuracy'] = acc
    params['seconds'] = time_elapsed

    logging.info(json.dumps(params))

loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /home/jovyan/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden

Step,Training Loss,Validation Loss


KeyboardInterrupt: 

# Visualize Hyperparameter Search

In [None]:
sns.set(rc={'figure.figsize':(11,8)})

def get_hp_search_df(logfile):
    with open(logfile) as f:
        param_results = [json.loads(line.split('--')[1].strip())
                         for line in f.readlines()[-18:]]

    return pd.DataFrame.from_records(param_results, index='trial')


def plot_results(results_df, target='accuracy', save_fig=None):
    ax = sns.lineplot(data=results_df, x='num_train_epochs', y=target, hue='learning_rate', style='batch_size')

    Y_SCALE = np.linspace(.75, .85, num=6)
    ax.set_title('Results of Hyperparameter Grid Search')
    ax.set_ylabel('Accuracy')
    ax.set_xlabel('Number of Epochs')
    ax.set(ylim=(.78, .84))
    # ax.set_yticks(np.linspace(.30, .40, num=3))
    ax.set_xticks([2, 3, 4])
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), title='Parameters')
    fig = ax.get_figure()
    plt.tight_layout()
    plt.show()   
    if save_fig:
        fig.savefig(save_fig, dpi=300)
        
plot_results(get_hp_search_df('hyperparameter_search.log'),
             save_fig='../images/hyperparameter_search.png'
            )

# Train with best hyperparameters

In [55]:
import wandb
from transformers import TrainingArguments, Trainer
wandb.init(project="test-project", entity="ai-aloe")

# Choose best hyperparameters, train model
setattr(trainer.args, 'report_to', 'wandb')
setattr(trainer.args, 'learning_rate', 2e-5)
setattr(trainer.args, 'num_train_epochs', 4)
setattr(trainer.args, 'batch_size', 16)

trainer.train()
trainer.save_model('../bin/efcamdat_nli_1')

[34m[1mwandb[0m: Currently logged in as: [33mlangdon[0m ([33mai-aloe[0m). Use [1m`wandb login --relogin`[0m to force relogin


loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /home/jovyan/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden

Step,Training Loss,Validation Loss,Accuracy
10000,No log,0.690911,0.744736
20000,0.800500,0.686268,0.774451
30000,0.800500,0.5786,0.801223
40000,0.545600,0.632001,0.812429
50000,0.545600,0.653051,0.815542
60000,0.405300,0.750267,0.824315
70000,0.405300,0.783191,0.827994


The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: grade, writing_id, text, cefr, learner_id. If grade, writing_id, text, cefr, learner_id are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 17668
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: grade, writing_id, text, cefr, learner_id. If grade, writing_id, text, cefr, learner_id are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 17668
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: grade, writing_id, text, cefr, le