# Model Training

In this notebook, we fine-tune one `microsoft/deberta-v3-large` regression model for each of the 14 ELLIPSE score columns, using the same hyperparameter configuration for every model.

In [1]:
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          TrainingArguments, Trainer, logging)

from datasets import DatasetDict, Value

from sklearn.metrics import mean_squared_error

import pandas as pd

import wandb

# Weights & Biases settings, exported as environment variables for this kernel.
# Keep comments on their own lines: text after the value on a %env line becomes part of the value.
# NOTE(review): WANDB_DIR is an absolute, machine-specific path — adjust it when running elsewhere.
%env WANDB_ENTITY = langdon
%env WANDB_PROJECT = ellipse
%env WANDB_DIR = /home/jovyan/active-projects/ellipse-methods-showcase/bin

# Restrict the transformers library's own logging to error-level messages.
logging.set_verbosity_error()

# Checkpoint identifier shared by the tokenizer and by every model trained in this notebook.
model_name='microsoft/deberta-v3-large'

env: WANDB_ENTITY=langdon
env: WANDB_PROJECT=ellipse
env: WANDB_DIR=/home/jovyan/active-projects/ellipse-methods-showcase/bin


## Load DatasetDict and Tokenize


In [2]:
# Same setup as in the previous notebooks: a slow (non-fast) tokenizer for the
# checkpoint named by `model_name`, plus a small helper to apply it to one example.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)


def tokenize_inputs(example):
    """Tokenize the ``'text'`` field of a single example.

    Sequences longer than 512 tokens are truncated. Returns whatever the
    module-level ``tokenizer`` returns for that text.
    """
    return tokenizer(
        example['text'],
        truncation=True,
        max_length=512,
    )

In [3]:
def get_datadict():
    '''Load the raw DatasetDict from disk and tokenize it.

    The dataset saved at ``../data/raw_ellipse.hf`` is mapped through
    ``tokenize_inputs``; the raw ``text`` column is dropped in the same step.
    All other columns are kept as they are: choosing the target score and
    renaming it to ``label`` is done later, inside ``train``.
    '''
    # Load the DatasetDict object we created in the previous notebook.
    raw = DatasetDict.load_from_disk('../data/raw_ellipse.hf')

    # Once tokenized, the transformer no longer needs the raw text column.
    tokenized = raw.map(tokenize_inputs, remove_columns=['text'])

    return tokenized

In [4]:
datadict = get_datadict()

Map:   0%|          | 0/6216 [00:00<?, ? examples/s]

Map:   0%|          | 0/1332 [00:00<?, ? examples/s]

Map:   0%|          | 0/1332 [00:00<?, ? examples/s]

In [5]:
def compute_metrics(eval_pred):
    """Compute the mean squared error for a ``(predictions, labels)`` pair.

    Args:
        eval_pred: Two-item sequence unpacked as ``(predictions, labels)``.

    Returns:
        A dict with a single key, ``'mse'``.
    """
    predictions, targets = eval_pred
    return {'mse': mean_squared_error(targets, predictions)}

In [6]:
datadict

DatasetDict({
    train: Dataset({
        features: ['id', 'overall_1', 'cohesion_1', 'syntax_1', 'vocabulary_1', 'phraseology_1', 'grammar_1', 'conventions_1', 'overall_2', 'cohesion_2', 'syntax_2', 'vocabulary_2', 'phraseology_2', 'grammar_2', 'conventions_2', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 6216
    })
    dev: Dataset({
        features: ['id', 'overall_1', 'cohesion_1', 'syntax_1', 'vocabulary_1', 'phraseology_1', 'grammar_1', 'conventions_1', 'overall_2', 'cohesion_2', 'syntax_2', 'vocabulary_2', 'phraseology_2', 'grammar_2', 'conventions_2', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1332
    })
    test: Dataset({
        features: ['id', 'overall_1', 'cohesion_1', 'syntax_1', 'vocabulary_1', 'phraseology_1', 'grammar_1', 'conventions_1', 'overall_2', 'cohesion_2', 'syntax_2', 'vocabulary_2', 'phraseology_2', 'grammar_2', 'conventions_2', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1332
    })

## Train Function


In [7]:
# Hyperparameters shared by every model trained in this notebook.
config = dict(
    batch_size=16,
    learning_rate=1e-5,
    num_train_epochs=4,
    pooler_dropout=0.30,
    weight_decay=0.01,
    adam_beta1=0.900,
    adam_beta2=0.999,
    # Microsoft recommends an epsilon of 1e-6, but the hyperparameter search was
    # run with the default of 1e-8 (the recommended value was not set at the time).
    # The other values above were tuned against 1e-8, so we keep 1e-8 here.
    adam_epsilon=1e-8,
    warmup_steps=1000,
)

In [8]:
# Every score column in the dataset: seven score names, each present with a
# `_1` and a `_2` suffix. All `_1` columns come first, then all `_2` columns.
scores_to_predict = [
    f'{score}_{suffix}'
    for suffix in (1, 2)
    for score in ('overall', 'cohesion', 'syntax', 'vocabulary',
                  'phraseology', 'grammar', 'conventions')
]

In [9]:
def train(score_to_predict):
    """Fine-tune one regression model for a single score column and save its outputs.

    A fresh ``model_name`` checkpoint with a single-output head is trained on the
    ``train`` split of the module-level ``datadict`` using the hyperparameters in
    ``config``. The trained model then predicts the ``test`` split; the
    predictions are written to ``../results/deberta-v3-large/<score>.csv``
    (indexed by the dataset's ``id`` column) and the model is saved under
    ``../bin/deberta-v3-large-models/<score>``.

    Args:
        score_to_predict: Name of the score column in ``datadict`` to use as the
            regression target. It is renamed to ``label`` and cast to float32.

    Returns:
        None. The function is used for its side effects (files on disk and
        logging to Weights & Biases).
    """
    print(score_to_predict)

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=1, pooler_dropout=config['pooler_dropout']
    )

    # Drop every score column except the target, plus the id column.
    ignored_columns = list(set(scores_to_predict) - {score_to_predict}) + ['id']

    dd = (
        datadict
        .remove_columns(ignored_columns)
        .rename_column(score_to_predict, 'label')
        .cast_column('label', Value("float32"))
    )

    training_args = TrainingArguments(
        optim='adamw_torch',
        # Was '..bin/...' (missing slash), which created a stray '..bin' directory
        # in the working directory instead of writing next to '../bin'.
        output_dir=f'../bin/{model_name}',
        logging_dir=f'../logs/{score_to_predict}',
        load_best_model_at_end=False,
        evaluation_strategy='no',
        save_strategy='no',
        greater_is_better=False,
        log_level='error',
        disable_tqdm=False,
        run_name=f'deberta-v3-large-{score_to_predict}',
        report_to='wandb',
        adam_beta1=config['adam_beta1'],
        adam_beta2=config['adam_beta2'],
        adam_epsilon=config['adam_epsilon'],
        # `weight_decay` is part of the shared config but was never forwarded,
        # so the TrainingArguments default was silently used instead.
        weight_decay=config['weight_decay'],
        num_train_epochs=config['num_train_epochs'],
        warmup_steps=config['warmup_steps'],
        learning_rate=config['learning_rate'],
        per_device_train_batch_size=config['batch_size'],
        per_device_eval_batch_size=16,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dd['train'],
        tokenizer=tokenizer,
    )

    trainer.train()

    predictions = trainer.predict(test_dataset=dd['test']).predictions
    # A single-column DataFrame needs a 1-D array. Flattening is a no-op when the
    # predictions are already 1-D and guards against an (n, 1) logits array.
    predictions = predictions.reshape(-1)

    results = pd.DataFrame({f'deberta_{score_to_predict}': predictions})
    results = results.set_index(pd.Index(datadict['test']['id']))
    results.to_csv(f'../results/deberta-v3-large/{score_to_predict}.csv')

    trainer.save_model(output_dir=f'../bin/deberta-v3-large-models/{score_to_predict}')

    # Close the active Weights & Biases run so the next call to `train` starts a
    # new run under its own `run_name` rather than logging into this one.
    wandb.finish()

## Train Loop


In [10]:
# NOTE(review): this repeats, verbatim, the `scores_to_predict` list defined before `train`.
# Re-running this cell rebinds the same global that `train` reads when it decides
# which score columns to drop, so keep the two definitions in sync (or keep only one).
scores_to_predict = [
    'overall_1', 'cohesion_1', 'syntax_1', 'vocabulary_1', 'phraseology_1', 'grammar_1', 'conventions_1',
    'overall_2', 'cohesion_2', 'syntax_2', 'vocabulary_2', 'phraseology_2', 'grammar_2', 'conventions_2'
]

In [None]:
# Train one model per score, one after another. Each call to `train` writes the
# test-set predictions (CSV) and the trained model to disk before the next one starts.
for score_to_predict in scores_to_predict:
    train(score_to_predict)

overall_1


Casting the dataset:   0%|          | 0/6216 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1332 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1332 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Currently logged in as: [33mlangdon[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
500,3.0232
1000,0.3531
1500,0.303


cohesion_1


Casting the dataset:   0%|          | 0/6216 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1332 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1332 [00:00<?, ? examples/s]

Step,Training Loss
500,2.462
1000,0.4507
1500,0.3893


syntax_1


Casting the dataset:   0%|          | 0/6216 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1332 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1332 [00:00<?, ? examples/s]

Step,Training Loss
500,2.3022
1000,0.4338
1500,0.3551


vocabulary_1


Casting the dataset:   0%|          | 0/6216 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1332 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1332 [00:00<?, ? examples/s]

Step,Training Loss
500,2.515
1000,0.3744
1500,0.3209


phraseology_1


Casting the dataset:   0%|          | 0/6216 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1332 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1332 [00:00<?, ? examples/s]

Step,Training Loss
500,2.371
1000,0.4127
1500,0.3433


grammar_1


Casting the dataset:   0%|          | 0/6216 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1332 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1332 [00:00<?, ? examples/s]

Step,Training Loss
500,2.4058


phraseology_2


Casting the dataset:   0%|          | 0/6216 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1332 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1332 [00:00<?, ? examples/s]

Step,Training Loss
500,2.4297
1000,0.4229
