In [12]:
pip install ray[tune]

Note: you may need to restart the kernel to use updated packages.




In [13]:
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, AutoConfig, DataCollatorWithPadding, TrainingArguments, Trainer, TrainerCallback
from torch.optim import AdamW
from datasets import load_dataset
from sklearn.metrics import f1_score, accuracy_score, mean_squared_error
from scipy.stats import pearsonr, spearmanr
from ray.tune.suggest.hyperopt import HyperOptSearch
from ray.tune.schedulers import PopulationBasedTraining

import torch
import evaluate
import os
import numpy as np
import matplotlib.pyplot as plt

In [14]:
class CFG:
    MODEL_NAME = 'roberta-large'
    
    EPOCHS = 10
    TRAIN_BATCH_SIZE = 32
    VAL_BATCH_SIZE = 64
    WEIGHT_DECAY = 0.001
    LEARNING_RATE_START = 1e-4
    WARMUP_RATIO = 0.1
    SCHEDULER_TYPE = 'cosine'
    
    SEED = 42
    
def set_seed_(seed):
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

set_seed_(CFG.SEED)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [15]:
dataset = load_dataset('glue', 'stsb')

tokenizer = AutoTokenizer.from_pretrained(
    CFG.MODEL_NAME, 
    #max_length=CFG.MAX_LEN
)

def tokenize_function(ex):
    return tokenizer(
        ex['sentence1'], 
        ex['sentence2'], 
        truncation=True,
        add_special_tokens=True
    )

tokenized_datasets = dataset.map(tokenize_function, batched=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Reusing dataset glue (C:\Users\Ivan\.cache\huggingface\datasets\glue\stsb\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [16]:
model = AutoModelForSequenceClassification.from_pretrained(
    CFG.MODEL_NAME,
    num_labels=1,
)

def freeze_encoder(model):
    for p in model.base_model.encoder.parameters():
        p.requires_grad = False

def unfreeze_encoder(model):
    for p in model.base_model.encoder.parameters():
        p.requires_grad = True
        

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.weight', 'classif

In [17]:
tokenized_datasets = tokenized_datasets.remove_columns(['sentence1', 'sentence2', 'idx'])
tokenized_datasets = tokenized_datasets.rename_columns({'label':'labels'})
tokenized_datasets.set_format('torch')

tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 5749
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 1500
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 1379
    })
})

In [18]:
## TRAINING WITH FROZEN WEIGHTS ##

def compute_metrics(eval_preds):
    output, labels = eval_preds
    output = output.reshape(-1)
    return {
        'pearson_r': pearsonr(output, labels)[0],
        'spearman_r': spearmanr(output, labels)[0]
    }

training_args = TrainingArguments(
    evaluation_strategy='epoch',
    save_strategy='epoch',
    logging_strategy='epoch',
    learning_rate=CFG.LEARNING_RATE_START,
    per_device_train_batch_size=CFG.TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=CFG.VAL_BATCH_SIZE,
    num_train_epochs=CFG.EPOCHS,
    output_dir=os.path.join('./frozen', CFG.MODEL_NAME),
    weight_decay=CFG.WEIGHT_DECAY,
    lr_scheduler_type=CFG.SCHEDULER_TYPE,
    warmup_ratio=CFG.WARMUP_RATIO,
    fp16=True
)

freeze_encoder(model)
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

ValueError: Mixed precision training with AMP or APEX (`--fp16` or `--bf16`) and half precision evaluation (`--fp16_full_eval` or `--bf16_full_eval`) can only be used on CUDA devices.

In [None]:
best_trial = trainer.hyperparameter_search(
    direction='maximize',
    backend='ray',
    search_alg=HyperOptSearch(metric='objective', mode='max'),
    scheduler=PopulationBasedTraining(metric='objective', mode='max')
)

In [None]:
class LRCallback(TrainerCallback):
    def __init__(self):
        self.lr = []
    
    def on_step_begin(self, args, state, control, **kwargs):
        self.lr.append(kwargs['lr_scheduler'].get_last_lr())

        
# add training with best hyperparams
        
lr_callback = LRCallback()
trainer.add_callback(lr_callback)
train_output = trainer.train()

In [None]:
trainer.predict(tokenized_datasets['test'])