In [1]:
import os
from tqdm import tqdm
import numpy as np
from pathlib import Path
import argparse
import logging

import torch
from torch.utils.data import DataLoader

import wandb
from sklearn.metrics import mean_squared_error

from datasets import load_dataset, Dataset, DatasetDict

from transformers import (Trainer, TrainingArguments, DataCollatorWithPadding,
                          AutoTokenizer, AutoModelForSequenceClassification, LongformerTokenizer)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
assert torch.cuda.is_available(), 'GPU not found. You should fix this.'

In [3]:
model_name_or_path = 'allenai/longformer-base-4096'
dataset_path = '../bin/content_ds.hf'
output_dir = 'results/hp-tuning'
model_max_length = 2056
eval_steps = 1000
eval_accumulation_steps = 2
save_total_limit = 4
batch_size = 8
sweep_id = None
dry_run = False
metric = 'mse'
entity = 'ai-aloe'
project_name = 'summary grader'

In [4]:
def load_dataset(dataset_path):
    ds = DatasetDict.load_from_disk(dataset_path)
    return ds



In [5]:
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    mse = mean_squared_error(labels, logits)

    return {'mse': mse}

In [6]:
def train():
    ''' The main training loop.
    '''
    wandb.init()
    
    config = wandb.config

    model = AutoModelForSequenceClassification.from_pretrained(
            model_name_or_path,
            num_labels=1,
            hidden_dropout_prob=config.dropout,
        )
        
    training_args = TrainingArguments(
        output_dir=output_dir,
        evaluation_strategy='steps',
        save_strategy='steps',
        logging_strategy='steps',
        eval_steps=eval_steps,
        save_steps=eval_steps,
        eval_accumulation_steps=eval_accumulation_steps,
        save_total_limit=save_total_limit,
        optim='adamw_torch',
        gradient_accumulation_steps=4, 
        gradient_checkpointing=True,
        learning_rate=config.learning_rate,
        num_train_epochs=config.epochs,
        weight_decay=config.weight_decay,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        load_best_model_at_end=False,
        disable_tqdm=False,
        report_to='wandb',
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset_dict['train'],
        eval_dataset=dataset_dict['valid'],
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
    )

    trainer.train()

In [7]:
tokenizer = LongformerTokenizer.from_pretrained(
    model_name_or_path,
    max_length=model_max_length,
    )

In [9]:
dataset_dict = DatasetDict.load_from_disk(dataset_path)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=16, return_tensors='pt')

if not sweep_id:
    sweep_goal = 'minimize'
        
    if dry_run:
        sweep_name = 'dry-run'
    else:
        sweep_name = f'{model_name_or_path}'
            
    sweep_config = {
        'name': sweep_name,
        'method': 'bayes',
        'metric': {
            'name': f'eval/{metric}',
            'goal': sweep_goal,
        },
        'parameters':
        {
            'epochs': {
                'values': [2, 3]
            },
            'dropout': {
                'distribution': 'uniform',
                'min': 0,
                'max': 0.2
            },
            'learning_rate': {
                'distribution': 'uniform',
                'min': 1e-5,
                'max': 2e-5,
            },
            'weight_decay': {
                'values': [0.3]
            },
        },
    }

    sweep_id = wandb.sweep(sweep_config,
                            entity=entity,
                            project=project_name)

else:
    sweep_id = sweep_id
        
wandb.agent(sweep_id, train, count=20)

Create sweep with ID: bdpy1h3m
Sweep URL: https://wandb.ai/ai-aloe/summary%20grader/sweeps/bdpy1h3m


[34m[1mwandb[0m: Agent Starting Run: wgog72l5 with config:
[34m[1mwandb[0m: 	dropout: 0.14707222329850578
[34m[1mwandb[0m: 	epochs: 3
[34m[1mwandb[0m: 	learning_rate: 1.7627689085208615e-05
[34m[1mwandb[0m: 	weight_decay: 0.3
[34m[1mwandb[0m: Currently logged in as: [33mtiedaar1[0m ([33mai-aloe[0m). Use [1m`wandb login --relogin`[0m to force relogin


Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerForSequenceClassification: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias'

Step,Training Loss,Validation Loss


0,1
train/epoch,▁
train/global_step,▁
train/total_flos,▁
train/train_loss,▁
train/train_runtime,▁
train/train_samples_per_second,▁
train/train_steps_per_second,▁

0,1
train/epoch,2.98
train/global_step,306.0
train/total_flos,2.571161454128333e+16
train/train_loss,0.42086
train/train_runtime,7895.2788
train/train_samples_per_second,1.248
train/train_steps_per_second,0.039


[34m[1mwandb[0m: Agent Starting Run: e7p60a9t with config:
[34m[1mwandb[0m: 	dropout: 0.11333044665771856
[34m[1mwandb[0m: 	epochs: 3
[34m[1mwandb[0m: 	learning_rate: 1.6966854486891115e-05
[34m[1mwandb[0m: 	weight_decay: 0.3


Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerForSequenceClassification: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias'

Step,Training Loss,Validation Loss


0,1
train/epoch,▁
train/global_step,▁
train/total_flos,▁
train/train_loss,▁
train/train_runtime,▁
train/train_samples_per_second,▁
train/train_steps_per_second,▁

0,1
train/epoch,2.98
train/global_step,306.0
train/total_flos,2.571161454128333e+16
train/train_loss,0.40477
train/train_runtime,7935.3666
train/train_samples_per_second,1.242
train/train_steps_per_second,0.039


[34m[1mwandb[0m: Agent Starting Run: ix8i10ft with config:
[34m[1mwandb[0m: 	dropout: 0.004479389147981206
[34m[1mwandb[0m: 	epochs: 2
[34m[1mwandb[0m: 	learning_rate: 1.43362508169553e-05
[34m[1mwandb[0m: 	weight_decay: 0.3


Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerForSequenceClassification: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias'

Step,Training Loss,Validation Loss


In [None]:
dataset_dict