# Hyperparameter Search

In [1]:
# Import classes for tokenization and model training
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          TrainingArguments, Trainer)

# Import DatasetDict which will help us prepare our own dataset for use in training and evaluating machine learning models
from datasets import DatasetDict

# Import function used to compute the evaluation metric (mean squared error)
from sklearn.metrics import mean_squared_error

# Import library to track our training runs and change settings
import wandb

# Replace the variables below with your own: name, project name, and project directory
# NOTE: keep comments on their own lines here; text after the value on a %env line
# would become part of the value.
%env WANDB_ENTITY = langdon
%env WANDB_PROJECT = ellipse
%env WANDB_DIR = /home/jovyan/active-projects/ellipse-methods-showcase/bin

# Name of the score column the models will be trained to predict
score_to_predict = 'Grammar'
# Pretrained checkpoints to run a hyperparameter sweep for (one sweep per entry)
model_names = ['bert-base-uncased', 'xlm-roberta-base', 'microsoft/deberta-base']

env: WANDB_ENTITY=langdon
env: WANDB_PROJECT=ellipse
env: WANDB_DIR=/home/jovyan/active-projects/ellipse-methods-showcase/bin


In [2]:
def get_datadict(score_to_predict):
    ''' Loads the dataset stored at '../data/ellipse.hf' and prepares it for one target score.

    The chosen score column is renamed to 'label'. All other score columns are removed
    because they are not needed for training.

    Args:
        score_to_predict: Name of the score column to use as the target. Must be one of
            'Overall', 'Cohesion', 'Syntax', 'Vocabulary', 'Phraseology', 'Grammar',
            'Conventions'.

    Returns:
        The loaded DatasetDict without the other score columns and with the target
        column renamed to 'label'.

    Raises:
        ValueError: If score_to_predict is not one of the known score columns.
    '''

    # All score columns; every one except the target will be removed from the dataset
    scores = {
        'Overall',
        'Cohesion',
        'Syntax',
        'Vocabulary',
        'Phraseology',
        'Grammar',
        'Conventions'
    }

    # Fail early with a clear message instead of silently adding an unknown name
    # to the list of columns to remove.
    if score_to_predict not in scores:
        raise ValueError(
            f'Unknown score {score_to_predict!r}; expected one of {sorted(scores)}'
        )

    # Plain set difference: the target is kept, everything else is dropped.
    # sorted() hands remove_columns a list in a deterministic order.
    columns_to_remove = sorted(scores - {score_to_predict})

    # Load the DatasetDict object we created in the previous notebook.
    # We will be removing the columns that we defined above, and renaming the target column (=score_to_predict) into 'label'
    dd = (DatasetDict
          .load_from_disk('../data/ellipse.hf')
          .remove_columns(columns_to_remove)
          .rename_column(score_to_predict, 'label') # Huggingface will look for a column that contains the string 'label' to calculate metrics.
         )

    return dd

# Load the prepared dataset (target column renamed to 'label') for the score chosen above
datadict = get_datadict(score_to_predict)

In [3]:
def compute_metrics(eval_pred):
    ''' Evaluation callback: scores the model's predictions with mean squared error.

    Args:
        eval_pred: A pair that unpacks into (predictions, labels).

    Returns:
        A dict with a single entry, 'mse', holding the mean squared error of the
        predictions against the labels.
    '''
    predictions, targets = eval_pred
    return {'mse': mean_squared_error(targets, predictions)}

In [4]:
def sweep(score_to_predict, model_name):
    ''' Runs a Weights & Biases hyperparameter sweep for one pretrained model.

    The dataset for the requested score is loaded and tokenized once. A sweep is then
    created, and an agent is started that trains and evaluates a freshly initialized
    model for every hyperparameter combination the sweep proposes.

    Args:
        score_to_predict: Name of the score column used as the training target. It is
            also used in the sweep name and in the output and logging directories.
        model_name: Name of the pretrained checkpoint to load the tokenizer and model from.

    Returns:
        None. Results are reported to Weights & Biases and checkpoints are written
        under '../bin/'.
    '''

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    def model_init(trial):
        # A new model is loaded for every run so that no weights carry over between trials.
        return AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)

    def tokenize_inputs(example):
        # Texts longer than 512 tokens are truncated
        return tokenizer(example['text'], max_length=512, truncation=True)

    sweep_config = {
        'name': f'{score_to_predict}-{model_name}-optimization',
        'method': 'bayes',
        'metric': {  # we want to "minimize" the mean squared error.
            'name': 'eval/mse',
            'goal': 'minimize'
        },
        'parameters': {
            'batch_size': {'value': 16},  # fixed, not tuned
            'learning_rate': {'values': [1e-5, 2e-5, 3e-5, 4e-5]},
            'warmup_steps': {'values': [50, 100, 500, 1000]},
            'weight_decay': {'values': [0.01, 0.1]},
            'epochs': {'values': [2, 3, 4, 5]},
        },
        'run_cap': 12,
    }

    # Load the data for the score this sweep is about. Relying on a notebook-level
    # dataset here would silently train on whatever score that dataset was built for,
    # even when a different score_to_predict is passed in.
    dd_tokenized = get_datadict(score_to_predict).map(tokenize_inputs, remove_columns=['text_id', 'text'])

    def train():
        with wandb.init():
            # set sweep configuration
            config = wandb.config

            # Customize the trainer
            training_args = TrainingArguments(
                output_dir = f'../bin/{score_to_predict}-{model_name}',
                optim = 'adamw_torch', # Specify your optimizer
                logging_dir = f'../logs/{score_to_predict}-{model_name}',
                load_best_model_at_end = True,
                metric_for_best_model = 'mse', # We will be using mean squared error to evaluate model performance
                evaluation_strategy='epoch', # Evaluate model performance at the end of each epoch
                save_strategy='epoch',
                save_total_limit=1,
                greater_is_better = False, # lower mean squared error is better
                log_level = 'error',
                disable_tqdm = False,
                report_to='wandb',
                # The hyperparameters we are tuning (warmup steps, weight decay, number of epochs, and learning rate)
                # are read from the sweep configuration; the training batch size is read from it too but has a single fixed value
                warmup_steps=config.warmup_steps,
                weight_decay=config.weight_decay,
                num_train_epochs=config.epochs,
                learning_rate=config.learning_rate,
                per_device_train_batch_size=config.batch_size,
                per_device_eval_batch_size=16,
            )

            # Initialize the trainer
            trainer = Trainer(
                model=None, # this is to emphasize that we are not passing the model directly
                args=training_args,
                train_dataset=dd_tokenized['train'],
                eval_dataset=dd_tokenized['dev'],
                compute_metrics=compute_metrics,
                tokenizer=tokenizer,
                model_init=model_init, # we pass a function that initializes the model afresh at the start of each trial
            )

            # Start training loop
            trainer.train()

    # Register the sweep, then run an agent in this process that calls train() once per run
    sweep_id = wandb.sweep(sweep_config)
    wandb.agent(sweep_id, train)

In [None]:
# Run one hyperparameter sweep per candidate model, all predicting the same target score
for model_name in model_names:
    sweep(score_to_predict, model_name)

Map:   0%|          | 0/4537 [00:00<?, ? examples/s]

Map:   0%|          | 0/972 [00:00<?, ? examples/s]

Map:   0%|          | 0/973 [00:00<?, ? examples/s]

Create sweep with ID: xhklmi1i
Sweep URL: https://wandb.ai/langdon/ellipse/sweeps/xhklmi1i


[34m[1mwandb[0m: Agent Starting Run: dnrdrl61 with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 2
[34m[1mwandb[0m: 	learning_rate: 2e-05
[34m[1mwandb[0m: 	warmup_steps: 100
[34m[1mwandb[0m: 	weight_decay: 0.01
[34m[1mwandb[0m: Currently logged in as: [33mlangdon[0m. Use [1m`wandb login --relogin`[0m to force relogin




Epoch,Training Loss,Validation Loss,Mse
1,No log,0.299717,0.299717
2,0.853600,0.288016,0.288016


VBox(children=(Label(value='0.008 MB of 0.008 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/loss,█▁
eval/mse,█▁
eval/runtime,█▁
eval/samples_per_second,▁█
eval/steps_per_second,▁█
train/epoch,▁▆██
train/global_step,▁▆██
train/learning_rate,▁
train/loss,▁
train/total_flos,▁

0,1
eval/loss,0.28802
eval/mse,0.28802
eval/runtime,8.6431
eval/samples_per_second,112.459
eval/steps_per_second,7.058
train/epoch,2.0
train/global_step,568.0
train/learning_rate,0.0
train/loss,0.8536
train/total_flos,2387448280209408.0


[34m[1mwandb[0m: Agent Starting Run: hgxgcabu with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 3
[34m[1mwandb[0m: 	learning_rate: 3e-05
[34m[1mwandb[0m: 	warmup_steps: 500
[34m[1mwandb[0m: 	weight_decay: 0.1




Epoch,Training Loss,Validation Loss,Mse
1,No log,0.32525,0.32525
2,1.278200,0.243918,0.243918
3,1.278200,0.249291,0.249291


0,1
eval/loss,█▁▁
eval/mse,█▁▁
eval/runtime,█▃▁
eval/samples_per_second,▁▆█
eval/steps_per_second,▁▆█
train/epoch,▁▄▅██
train/global_step,▁▄▄██
train/learning_rate,▁
train/loss,▁
train/total_flos,▁

0,1
eval/loss,0.24929
eval/mse,0.24929
eval/runtime,8.5905
eval/samples_per_second,113.148
eval/steps_per_second,7.101
train/epoch,3.0
train/global_step,852.0
train/learning_rate,3e-05
train/loss,1.2782
train/total_flos,3581172420314112.0


[34m[1mwandb[0m: Agent Starting Run: 6ocj7mnm with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 3
[34m[1mwandb[0m: 	learning_rate: 3e-05
[34m[1mwandb[0m: 	warmup_steps: 500
[34m[1mwandb[0m: 	weight_decay: 0.1




Epoch,Training Loss,Validation Loss,Mse
1,No log,0.32525,0.32525
2,1.278200,0.243918,0.243918
3,1.278200,0.249291,0.249291


VBox(children=(Label(value='0.008 MB of 0.025 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.304581…

0,1
eval/loss,█▁▁
eval/mse,█▁▁
eval/runtime,█▁▇
eval/samples_per_second,▁█▂
eval/steps_per_second,▁█▂
train/epoch,▁▄▅██
train/global_step,▁▄▄██
train/learning_rate,▁
train/loss,▁
train/total_flos,▁

0,1
eval/loss,0.24929
eval/mse,0.24929
eval/runtime,8.6737
eval/samples_per_second,112.063
eval/steps_per_second,7.033
train/epoch,3.0
train/global_step,852.0
train/learning_rate,3e-05
train/loss,1.2782
train/total_flos,3581172420314112.0


[34m[1mwandb[0m: Agent Starting Run: ovqttfp8 with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 4
[34m[1mwandb[0m: 	learning_rate: 3e-05
[34m[1mwandb[0m: 	warmup_steps: 500
[34m[1mwandb[0m: 	weight_decay: 0.1




Epoch,Training Loss,Validation Loss,Mse
1,No log,0.32525,0.32525
2,1.278200,0.245018,0.245018
3,1.278200,0.226023,0.226023
4,0.187300,0.267523,0.267523


VBox(children=(Label(value='0.008 MB of 0.034 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.221792…

0,1
eval/loss,█▂▁▄
eval/mse,█▂▁▄
eval/runtime,▃█▁▄
eval/samples_per_second,▆▁█▅
eval/steps_per_second,▆▁█▅
train/epoch,▁▃▃▆▇██
train/global_step,▁▃▃▆▇██
train/learning_rate,█▁
train/loss,█▁
train/total_flos,▁

0,1
eval/loss,0.26752
eval/mse,0.26752
eval/runtime,8.5915
eval/samples_per_second,113.135
eval/steps_per_second,7.1
train/epoch,4.0
train/global_step,1136.0
train/learning_rate,1e-05
train/loss,0.1873
train/total_flos,4774896560418816.0


[34m[1mwandb[0m: Agent Starting Run: xpdyt3fu with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 4
[34m[1mwandb[0m: 	learning_rate: 2e-05
[34m[1mwandb[0m: 	warmup_steps: 1000
[34m[1mwandb[0m: 	weight_decay: 0.1




Epoch,Training Loss,Validation Loss,Mse
1,No log,0.31809,0.318091
2,1.889700,0.239904,0.239904
3,1.889700,0.242581,0.242581
4,0.228800,0.294178,0.294178


0,1
eval/loss,█▁▁▆
eval/mse,█▁▁▆
eval/runtime,▁▃▆█
eval/samples_per_second,█▆▃▁
eval/steps_per_second,█▆▃▁
train/epoch,▁▃▃▆▇██
train/global_step,▁▃▃▆▇██
train/learning_rate,▁█
train/loss,█▁
train/total_flos,▁

0,1
eval/loss,0.29418
eval/mse,0.29418
eval/runtime,8.6623
eval/samples_per_second,112.21
eval/steps_per_second,7.042
train/epoch,4.0
train/global_step,1136.0
train/learning_rate,2e-05
train/loss,0.2288
train/total_flos,4774896560418816.0


[34m[1mwandb[0m: Agent Starting Run: pk1pphx8 with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 4
[34m[1mwandb[0m: 	learning_rate: 4e-05
[34m[1mwandb[0m: 	warmup_steps: 500
[34m[1mwandb[0m: 	weight_decay: 0.1




Epoch,Training Loss,Validation Loss,Mse
1,No log,0.260739,0.260739
2,1.166400,0.235311,0.235311
3,1.166400,0.228858,0.228858
4,0.185100,0.266664,0.266664


0,1
eval/loss,▇▂▁█
eval/mse,▇▂▁█
eval/runtime,█▅▁▁
eval/samples_per_second,▁▄██
eval/steps_per_second,▁▄██
train/epoch,▁▃▃▆▇██
train/global_step,▁▃▃▆▇██
train/learning_rate,█▁
train/loss,█▁
train/total_flos,▁

0,1
eval/loss,0.26666
eval/mse,0.26666
eval/runtime,8.5821
eval/samples_per_second,113.259
eval/steps_per_second,7.108
train/epoch,4.0
train/global_step,1136.0
train/learning_rate,1e-05
train/loss,0.1851
train/total_flos,4774896560418816.0


[34m[1mwandb[0m: Agent Starting Run: vy3m6mzn with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 3
[34m[1mwandb[0m: 	learning_rate: 4e-05
[34m[1mwandb[0m: 	warmup_steps: 1000
[34m[1mwandb[0m: 	weight_decay: 0.1




Epoch,Training Loss,Validation Loss,Mse
1,No log,0.338499,0.338499
2,1.471100,0.246296,0.246296
3,1.471100,0.340717,0.340717


VBox(children=(Label(value='0.008 MB of 0.008 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/loss,█▁█
eval/mse,█▁█
eval/runtime,█▁▂
eval/samples_per_second,▁█▇
eval/steps_per_second,▁█▇
train/epoch,▁▄▅██
train/global_step,▁▄▄██
train/learning_rate,▁
train/loss,▁
train/total_flos,▁

0,1
eval/loss,0.34072
eval/mse,0.34072
eval/runtime,8.6018
eval/samples_per_second,113.0
eval/steps_per_second,7.092
train/epoch,3.0
train/global_step,852.0
train/learning_rate,2e-05
train/loss,1.4711
train/total_flos,3581172420314112.0


[34m[1mwandb[0m: Agent Starting Run: yit6rm3x with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 3
[34m[1mwandb[0m: 	learning_rate: 4e-05
[34m[1mwandb[0m: 	warmup_steps: 500
[34m[1mwandb[0m: 	weight_decay: 0.1




Epoch,Training Loss,Validation Loss,Mse
1,No log,0.260739,0.260739
2,1.166400,0.265327,0.265327
3,1.166400,0.263716,0.263716


VBox(children=(Label(value='0.008 MB of 0.008 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/loss,▁█▆
eval/mse,▁█▆
eval/runtime,▁█▃
eval/samples_per_second,█▁▆
eval/steps_per_second,█▁▆
train/epoch,▁▄▅██
train/global_step,▁▄▄██
train/learning_rate,▁
train/loss,▁
train/total_flos,▁

0,1
eval/loss,0.26372
eval/mse,0.26372
eval/runtime,8.616
eval/samples_per_second,112.814
eval/steps_per_second,7.08
train/epoch,3.0
train/global_step,852.0
train/learning_rate,4e-05
train/loss,1.1664
train/total_flos,3581172420314112.0


[34m[1mwandb[0m: Agent Starting Run: 5p24tldc with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 4
[34m[1mwandb[0m: 	learning_rate: 3e-05
[34m[1mwandb[0m: 	warmup_steps: 1000
[34m[1mwandb[0m: 	weight_decay: 0.1




Epoch,Training Loss,Validation Loss,Mse
1,No log,0.317762,0.317762
2,1.626400,0.351743,0.351743
3,1.626400,0.234449,0.234449
4,0.222200,0.276499,0.276499


VBox(children=(Label(value='0.008 MB of 0.008 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/loss,▆█▁▄
eval/mse,▆█▁▄
eval/runtime,▆▅█▁
eval/samples_per_second,▃▄▁█
eval/steps_per_second,▃▄▁█
train/epoch,▁▃▃▆▇██
train/global_step,▁▃▃▆▇██
train/learning_rate,▁█
train/loss,█▁
train/total_flos,▁

0,1
eval/loss,0.2765
eval/mse,0.2765
eval/runtime,8.5861
eval/samples_per_second,113.206
eval/steps_per_second,7.104
train/epoch,4.0
train/global_step,1136.0
train/learning_rate,3e-05
train/loss,0.2222
train/total_flos,4774896560418816.0


[34m[1mwandb[0m: Agent Starting Run: 9e7mlxhi with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 4e-05
[34m[1mwandb[0m: 	warmup_steps: 500
[34m[1mwandb[0m: 	weight_decay: 0.1




Epoch,Training Loss,Validation Loss,Mse
1,No log,0.260739,0.260739
2,1.166400,0.241199,0.241199
3,1.166400,0.245228,0.245228
4,0.187300,0.267402,0.267402
5,0.187300,0.269173,0.269173


VBox(children=(Label(value='0.008 MB of 0.024 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.311229…

0,1
eval/loss,▆▁▂██
eval/mse,▆▁▂██
eval/runtime,█▂▆█▁
eval/samples_per_second,▁▇▃▁█
eval/steps_per_second,▁▇▃▁█
train/epoch,▁▂▃▅▅▆██
train/global_step,▁▂▃▄▅▆██
train/learning_rate,█▁
train/loss,█▁
train/total_flos,▁

0,1
eval/loss,0.26917
eval/mse,0.26917
eval/runtime,8.5752
eval/samples_per_second,113.35
eval/steps_per_second,7.114
train/epoch,5.0
train/global_step,1420.0
train/learning_rate,2e-05
train/loss,0.1873
train/total_flos,5968612478376864.0


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: jwgfemyf with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 4e-05
[34m[1mwandb[0m: 	warmup_steps: 1000
[34m[1mwandb[0m: 	weight_decay: 0.1


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011113411222403455, max=1.0…



Epoch,Training Loss,Validation Loss,Mse
1,No log,0.338499,0.338499
2,1.471100,0.246296,0.246296
3,1.471100,0.340717,0.340717
4,0.226300,0.333902,0.333902
5,0.226300,0.273716,0.273716


0,1
eval/loss,█▁█▇▃
eval/mse,█▁█▇▃
eval/runtime,▄▂█▁▄
eval/samples_per_second,▅▇▁█▅
eval/steps_per_second,▅▇▁█▆
train/epoch,▁▂▃▅▅▆██
train/global_step,▁▂▃▄▅▆██
train/learning_rate,▁█
train/loss,█▁
train/total_flos,▁

0,1
eval/loss,0.27372
eval/mse,0.27372
eval/runtime,8.5884
eval/samples_per_second,113.176
eval/steps_per_second,7.103
train/epoch,5.0
train/global_step,1420.0
train/learning_rate,4e-05
train/loss,0.2263
train/total_flos,5968612478376864.0


[34m[1mwandb[0m: Agent Starting Run: 1ej236kf with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 4
[34m[1mwandb[0m: 	learning_rate: 3e-05
[34m[1mwandb[0m: 	warmup_steps: 500
[34m[1mwandb[0m: 	weight_decay: 0.1




Epoch,Training Loss,Validation Loss,Mse
1,No log,0.32525,0.32525
2,1.278200,0.245018,0.245018
3,1.278200,0.226023,0.226023
4,0.187300,0.267523,0.267523


VBox(children=(Label(value='0.008 MB of 0.008 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/loss,█▂▁▄
eval/mse,█▂▁▄
eval/runtime,▃▁▂█
eval/samples_per_second,▆█▇▁
eval/steps_per_second,▆█▇▁
train/epoch,▁▃▃▆▇██
train/global_step,▁▃▃▆▇██
train/learning_rate,█▁
train/loss,█▁
train/total_flos,▁

0,1
eval/loss,0.26752
eval/mse,0.26752
eval/runtime,8.6436
eval/samples_per_second,112.453
eval/steps_per_second,7.057
train/epoch,4.0
train/global_step,1136.0
train/learning_rate,1e-05
train/loss,0.1873
train/total_flos,4774896560418816.0


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.


Map:   0%|          | 0/4537 [00:00<?, ? examples/s]

Map:   0%|          | 0/972 [00:00<?, ? examples/s]

Map:   0%|          | 0/973 [00:00<?, ? examples/s]

Create sweep with ID: ouo8swcb
Sweep URL: https://wandb.ai/langdon/ellipse/sweeps/ouo8swcb


[34m[1mwandb[0m: Agent Starting Run: 4fgapib4 with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 2
[34m[1mwandb[0m: 	learning_rate: 2e-05
[34m[1mwandb[0m: 	warmup_steps: 100
[34m[1mwandb[0m: 	weight_decay: 0.1


Downloading model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]



Epoch,Training Loss,Validation Loss
