In [1]:
pip install ray[tune]

Note: you may need to restart the kernel to use updated packages.




In [2]:
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, AutoConfig, DataCollatorWithPadding, TrainingArguments, Trainer, TrainerCallback
from torch.optim import AdamW
from datasets import load_dataset
from sklearn.metrics import f1_score, accuracy_score, mean_squared_error
from scipy.stats import pearsonr, spearmanr
from ray.tune.suggest.hyperopt import HyperOptSearch
from ray.tune.schedulers import PopulationBasedTraining

import torch
import evaluate
import os
import numpy as np
import matplotlib.pyplot as plt

  from ray.tune.suggest.hyperopt import HyperOptSearch
  from ray.tune.suggest.hyperopt import HyperOptSearch


In [3]:
class CFG:
    MODEL_NAME = 'roberta-large'
    
    EPOCHS = 15
    TRAIN_BATCH_SIZE = 32
    VAL_BATCH_SIZE = 64
    WEIGHT_DECAY = 0.001
    LEARNING_RATE_START = 1e-4
    
    MAX_LEN = 512
    
    SEED = 42
    
def set_seed_(seed):
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

set_seed_(CFG.SEED)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [20]:
dataset = load_dataset('glue', 'stsb')

dataset.remove_columns(['idx', 'label'])

tokenizer = AutoTokenizer.from_pretrained(
    CFG.MODEL_NAME, 
    max_length=CFG.MAX_LEN
)

def tokenize_function(examples):
    result = tokenizer(examples['sentence1'])
    tmp = tokenizer(examples['sentence2'])
    if tokenizer.is_fast:
        result['word_ids1'] = [result.word_ids(i) for i in range(len(result['input_ids']))]
        result['word_ids2'] = [tmp.word_ids(i) for i in range(len(tmp['input_ids']))]
    return result

tokenized_datasets = dataset.map(
    tokenize_function, 
    batched=True, 
    remove_columns=['sentence1', 'sentence2']
)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Reusing dataset glue (C:\Users\Ivan\.cache\huggingface\datasets\glue\stsb\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [21]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['idx', 'input_ids', 'attention_mask', 'word_ids1', 'word_ids2'],
        num_rows: 5749
    })
    validation: Dataset({
        features: ['idx', 'input_ids', 'attention_mask', 'word_ids1', 'word_ids2'],
        num_rows: 1500
    })
    test: Dataset({
        features: ['idx', 'input_ids', 'attention_mask', 'word_ids1', 'word_ids2'],
        num_rows: 1379
    })
})

In [None]:
model = AutoModelForMaskedLM.from_pretrained(
    CFG.MODEL_NAME
)

model.max_length

In [None]:
printf

In [None]:
tokenized_datasets = tokenized_datasets.remove_columns(['sentence1', 'sentence2', 'idx'])
tokenized_datasets = tokenized_datasets.rename_columns({'label':'labels'})
tokenized_datasets.set_format('torch')

tokenized_datasets

In [None]:
training_args = TrainingArguments(
    evaluation_strategy='epoch',
    save_strategy='epoch',
    logging_strategy='epoch',
    learning_rate=CFG.LEARNING_RATE_START,
    num_train_epochs=CFG.EPOCHS,
    weight_decay=CFG.WEIGHT_DECAY,
    output_dir=os.path.join('./masked_lm', CFG.MODEL_NAME),
    fp16=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
)

trainer.train()