In [1]:
pip install ray[tune]
pip install accelerate

Note: you may need to restart the kernel to use updated packages.




In [2]:
from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM, AutoConfig, DataCollatorForLanguageModeling, TrainingArguments, Trainer, TrainerCallback
from torch.optim import AdamW
from sklearn.metrics import f1_score, accuracy_score, mean_squared_error
from scipy.stats import pearsonr, spearmanr
from ray.tune.search.hyperopt import HyperOptSearch
from ray.tune.schedulers import PopulationBasedTraining
from datasets import load_dataset

import datasets
import torch
import evaluate
import os
import numpy as np
import matplotlib.pyplot as plt

In [3]:
class CFG:
    MODEL_NAME = 'roberta-large'
    
    EPOCHS = 15
    TRAIN_BATCH_SIZE = 1
    VAL_BATCH_SIZE = 2
    WEIGHT_DECAY = 0.001
    LEARNING_RATE_START = 1e-4
    
    #MAX_LEN = 512
    BLOCK_SIZE = 10
    
    SEED = 42
    
def set_seed_(seed):
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

set_seed_(CFG.SEED)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [4]:
dataset = load_dataset('glue', 'stsb')

def get_all_sentences_list(dataset, split):
    return dataset[split]['sentence1'][:] + dataset[split]['sentence2'][:]

all_sentences_train = get_all_sentences_list(dataset, 'train')
all_sentences_validation = get_all_sentences_list(dataset, 'validation')
all_sentences_test = get_all_sentences_list(dataset, 'test')

dataset_mlm = datasets.DatasetDict({
    'train': datasets.Dataset.from_dict({'sentence': all_sentences_train}),
    'validation': datasets.Dataset.from_dict({'sentence': all_sentences_validation}),
    'test': datasets.Dataset.from_dict({'sentence': all_sentences_test})
})

Reusing dataset glue (C:\Users\Ivan\.cache\huggingface\datasets\glue\stsb\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
tokenizer = AutoTokenizer.from_pretrained(
    CFG.MODEL_NAME, 
    #max_length=CFG.MAX_LEN
)

def tokenize_function(examples):
    result = tokenizer(examples['sentence'])
    if tokenizer.is_fast:
        result['word_ids'] = [result.word_ids(i) for i in range(len(result['input_ids']))]
    return result


tokenized_datasets = dataset_mlm.map(
    tokenize_function, 
    batched=True, 
    remove_columns=['sentence']
)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer)

tokenized_datasets



  0%|          | 0/12 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 11498
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 3000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 2758
    })
})

In [6]:
def group_texts(examples):
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop
    total_length = (total_length // CFG.BLOCK_SIZE) * CFG.BLOCK_SIZE
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + CFG.BLOCK_SIZE] for i in range(0, total_length, CFG.BLOCK_SIZE)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

tokenized_labeled_datasets = tokenized_datasets.map(
    group_texts,
    batched=True
)

  0%|          | 0/12 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

In [7]:
model = AutoModelForMaskedLM.from_pretrained(
    CFG.MODEL_NAME
)

In [11]:
training_args = TrainingArguments(
    evaluation_strategy='epoch',
    save_strategy='epoch',
    logging_strategy='epoch',
    learning_rate=CFG.LEARNING_RATE_START,
    num_train_epochs=CFG.EPOCHS,
    weight_decay=CFG.WEIGHT_DECAY,
    output_dir=os.path.join('./masked_lm', CFG.MODEL_NAME),
    fp16=True,
    #auto_find_batch_size=True,
    #gradient_checkpointing=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator,
)

trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using amp half precision backend
NOTE: Redirects are currently not supported in Windows or MacOs.
The following columns in the training set don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `RobertaForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 11498
  Num Epochs = 15
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 21570


Epoch,Training Loss,Validation Loss


The following columns in the training set don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `RobertaForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 11498
  Num Epochs = 15
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 43125
The following columns in the training set don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `RobertaForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 11498
  Num Epochs = 15
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 86235
The followin

RuntimeError: No executable batch size found, reached zero.

In [None]:
test_preds = trainer.predict(tokenized_datasets['test'])