In [10]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer, EarlyStoppingCallback
from ray.tune.schedulers import PopulationBasedTraining
from ray import tune
from utils import *
from parsing import parse_fine_tune
from configs import FROZEN_CFG
import os

class Args:
    """Hard-coded stand-in for parsed arguments, handed to ``FROZEN_CFG.set_args``.

    Every setting is a plain class attribute so an instance can be passed
    around like an argument namespace.
    """

    # Must be a plain string: a trailing comma after the literal would turn
    # this attribute into the one-element tuple ('bert-base',).
    # NOTE(review): 'bert-base' does not look like a complete Hugging Face
    # Hub model id (e.g. 'bert-base-uncased') -- confirm the intended name.
    model_name = 'bert-base'
    num_epochs = 10
    train_batch_size = 32
    val_batch_size = 64
    weight_decay = 0.001
    learning_rate_start = 1e-4
    optim = 'adamw_torch'
    max_len = 512
    seed = 42
    patience = 3
    warmup_ratio = 0.1
    scheduler = 'cosine'
    pretrained_path = None

# Push the notebook-level settings into the shared frozen-encoder config.
args = Args()
FROZEN_CFG.set_args(args)

# Helpers below come from the star-import of `utils`; presumably they seed the
# RNGs and pick the compute device -- confirm against utils.
set_seed_(FROZEN_CFG.SEED)

device = set_device()

dataset = load_dataset_from_huggingface(DATASET_PATH, CONFIG_NAME)
model_name = FROZEN_CFG.MODEL_NAME

# Tokenize every split in batches, forwarding the tokenizer and the maximum
# length to `tokenize_function` as keyword arguments.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer_kwargs = dict(tokenizer=tokenizer, max_len=FROZEN_CFG.MAX_LEN)
tokenized_datasets = dataset.map(tokenize_function, batched=True, fn_kwargs=tokenizer_kwargs)

# Collator that pads each batch using the same tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def model_init():
    """Build a fresh sequence-classification model for the Trainer.

    Loads the checkpoint named by the module-level ``model_name`` with a
    single output label, passes it through ``freeze_encoder`` and returns it.
    """
    fresh_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)
    freeze_encoder(fresh_model)
    return fresh_model


# Training configuration for the frozen-encoder run.
#
# Evaluation, checkpointing and logging all happen once per epoch.
# `load_best_model_at_end=True` requires the evaluation and save strategies to
# match, so evaluation must not be step-based while saving is epoch-based.
# (The previous `eval_steps=len(dataset['train'])` was also an example count,
# not an optimizer-step count, so evaluation would have fired far less often
# than once per epoch.)
training_args = TrainingArguments(
    output_dir=os.path.join('../frozen', model_name),
    evaluation_strategy='epoch',
    save_strategy='epoch',
    logging_strategy='epoch',
    optim=FROZEN_CFG.OPTIM,
    learning_rate=FROZEN_CFG.LEARNING_RATE_START,
    per_device_train_batch_size=FROZEN_CFG.TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=FROZEN_CFG.VAL_BATCH_SIZE,
    num_train_epochs=FROZEN_CFG.EPOCHS,
    weight_decay=FROZEN_CFG.WEIGHT_DECAY,
    lr_scheduler_type=FROZEN_CFG.SCHEDULER,
    warmup_ratio=FROZEN_CFG.WARMUP_RATIO,
    # NOTE(review): fp16 mixed precision presumably needs a CUDA device --
    # confirm this is never run on CPU.
    fp16=True,
    load_best_model_at_end=True,
)


# The Trainer receives `model_init` instead of a concrete model so that a new
# model can be built for each run; early stopping uses the configured patience.
trainer = Trainer(
    model=None,
    model_init=model_init,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    compute_metrics=compute_metrics,
    callbacks=[
        EarlyStoppingCallback(early_stopping_patience=FROZEN_CFG.PATIENCE),
    ],
)

# Candidate values for each tuned hyperparameter. The same lists feed both the
# initial sampling space and the PBT mutation table so the two stay in sync.
hp_choices = {
    'learning_rate': [1e-3, 5e-4, 1e-4, 5e-5, 1e-5],
    'per_device_train_batch_size': [8, 16, 32],
    'weight_decay': [1e-2, 1e-3, 1e-4],
}

# Initial search space: each trial samples uniformly from the candidates.
hp_space = {name: tune.choice(values) for name, values in hp_choices.items()}

# PopulationBasedTraining must be given `hyperparam_mutations` (or a
# `custom_explore_fn`); without one of them the scheduler cannot perturb
# trials and Ray Tune refuses to construct it.
scheduler = PopulationBasedTraining(
    time_attr='training_iteration',
    mode='max',
    metric='objective',
    hyperparam_mutations=hp_choices,
)

# Run the Ray-backed hyperparameter search: 10 trials, maximizing the value
# returned by `compute_objective`, with the scheduler configured above.
best_run = trainer.hyperparameter_search(
    backend='ray',
    direction='maximize',
    n_trials=10,
    hp_space=lambda _: hp_space,
    compute_objective=compute_objective,
    scheduler=scheduler,
    keep_checkpoints_num=1,
    reuse_actors=True,
    verbose=0,
)

print('BEST TRIAL: ')
print(best_run)

print('starting finetuning with frozen encoder . . .')
# Copy the winning hyperparameters onto the trainer's arguments, then train.
for hp_name, hp_value in best_run.hyperparameters.items():
    setattr(trainer.args, hp_name, hp_value)

train_output = trainer.train()


Found cached dataset glue (C:/Users/Ivan/.cache/huggingface/datasets/glue/stsb/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

ValueError: Connection error, and we cannot find the requested files in the cached path. Please try again or make sure your Internet connection is on.