### Fine-Tune Language Models

- Joel Stremmel
- 04-24-23

##### About

Fine-tune pretrained language models on the formatted data using K-fold cross-validation and save the resulting scores and labels.

##### Set Parameters
Pick a model size and provide a list of models and parameters to train within that size.

In [1]:
# Which model group to train: selects the params[f"{size}_models"] section below.
size = "small"

# Single source of configuration for the whole notebook.
params = {
    # Runtime environment switches
    "env": {"colab": False, "require_high_ram": True},
    # Input data variant
    "data": {"add_summaries": False},
    # Optimizer and schedule settings shared by every model
    "training": {
        "lr": 5e-6,
        "weight_decay": 0.01,
        "adam_beta1": 0.9,
        "adam_beta2": 0.999,
        "adam_epsilon": 1e-8,
        "warmup_steps": 50,
        "num_workers": 2,
        "epochs": 500,
        "early_stopping_patience": 10,
        "logging_strategy": "epoch",
    },
    # Evaluation and checkpointing settings shared by every model
    "evaluation": {
        "evaluation_strategy": "epoch",
        "save_strategy": "epoch",
        "save_total_limit": 1,
        "fp16_full_eval": False,
        "eval_accumulation_steps": 100,
    },
    # Per-model settings used when size == "small"
    "small_models": {
        "roberta_base": {
            "path": "roberta-base",
            "fp16": True,
            "max_seq_len": 512,
            "batch_size": 1,
            "accumulation_steps": 16,
            "gradient_checkpointing": True,
            "type": "mlm",
        },
    },
    # Per-model settings used when size == "large"
    "large_models": {
        "roberta_large": {
            "path": "roberta-large",
            "fp16": True,
            "max_seq_len": 512,
            "batch_size": 16,
            "accumulation_steps": 1,
            "gradient_checkpointing": False,
            "type": "mlm",
        },
    },
    # Directories for inputs, saved scores, and model checkpoints
    "io": {
        "results_dir": "./results",
        "input_dir": "./data",
        "model_output_dir": "./model_output",
    },
    # Data augmentation settings (not read by the cells in this notebook section)
    "augmentation": {
        "add_synthetic": False,
        "aug_p": 0.2,
        "glove_file": "data/glove.6B.50d.txt",
        "glove_zip": "data/glove.6B.zip",
        "glove_url": "http://nlp.stanford.edu/data/glove.6B.zip",
    },
    # Seed forwarded to the training arguments
    "random": {"seed": 42},
}

#         'db_base': {
#             'path': 'distilbert-base-uncased',
#             'fp16': True,
#             'max_seq_len': 512,
#             'batch_size': 4,
#             'accumulation_steps': 4,
#             'gradient_checkpointing': False,
#             'type': 'mlm'
#         },

        # 'gpt2': {
        #     'path': 'gpt2',
        #     'fp16': False,
        #     'max_seq_len': 1024,
        #     'batch_size': 1,
        #     'accumulation_steps': 16,
        #     'gradient_checkpointing': True,
        #     'type': 'gpt'
        # },
        # 'roberta_base': {
        #     'path': 'roberta-base',
        #     'fp16': True,
        #     'max_seq_len': 512,
        #     'batch_size': 16,
        #     'accumulation_steps': 1,
        #     'gradient_checkpointing': False,
        #     'type': 'mlm'
        # },
        # 'lf_mini': {
        #     'path': 'kiddothe2b/longformer-mini-1024',
        #     'max_seq_len': 1024,
        #     'fp16': True,
        #     'batch_size': 1,
        #     'accumulation_steps': 16,
        #     'gradient_checkpointing': False,
        #     'type': 'mlm'
        # },
        # "flan_t5_small": {
        #     "path": "google/flan-t5-small",
        #     "max_seq_len": 1024,
        #     "output_max_seq_len": 5,
        #     "fp16": False,
        #     "batch_size": 1,
        #     "accumulation_steps": 16,
        #     "gradient_checkpointing": False,
        #     "type": "seq2seq",
        # }

#         "lf_base": {
#             "path": "allenai/longformer-base-4096",
#             "max_seq_len": 4096,
#             "fp16": True,
#             "batch_size": 4,
#             "accumulation_steps": 4,
#             "gradient_checkpointing": False,
#             "type": "mlm",
#         },
#         "bb_base": {
#             "max_seq_len": 4096,
#             "fp16": True,
#             "batch_size": 4,
#             "accumulation_steps": 4,
#             "gradient_checkpointing": False,
#             "type": "mlm",
#         },
#         "bb_large": {
#           "path": "google/bigbird-roberta-large",
#           "max_seq_len": 4096,
#           "fp16": True,
#           "batch_size": 16,
#           "accumulation_steps": 1,
#           "gradient_checkpointing": True,
#           "type": "mlm",
#         },
#         "gpt_neo_1_3b": {
#             "path": "EleutherAI/gpt-neo-1.3B",
#             "max_seq_len": 2048,
#             "fp16": False,
#             "batch_size": 1,
#             "accumulation_steps": 16,
#             "gradient_checkpointing": False,
#             "type": "gpt",
#         },
#         "flan_t5_large": {
#             "path": "google/flan-t5-large",
#             "max_seq_len": 1024,
#             "output_max_seq_len": 6,
#             "fp16": False,
#             "batch_size": 1,
#             "accumulation_steps": 16,
#             "gradient_checkpointing": False,
#             "type": "seq2seq",
#         },

##### Mount Google Drive, Install Requirements, and Set Cache if Using Colab

In [2]:
# Colab-only setup; the whole cell is a no-op when params["env"]["colab"] is False.
if params["env"]["colab"]:

    import os
    from google.colab import drive

    # Mount Google Drive at /content/drive so the paths below resolve
    drive.mount("/content/drive")

    # Install the project's requirements from the file stored on Drive
    # (IPython shell escape; this line is not valid plain Python)
    !pip install -q -r "/content/drive/MyDrive/nlp4psychotherapy/requirements.txt"

    # Point both Hugging Face caches (models and datasets) at the same Drive folder
    # NOTE(review): presumably so downloads persist across Colab sessions - confirm
    os.environ['TRANSFORMERS_CACHE'] = '/content/drive/MyDrive/hf_cache'
    os.environ['HF_DATASETS_CACHE'] = '/content/drive/MyDrive/hf_cache'

##### Check Colab Runtime

In [3]:
# Report the GPU attached to the Colab runtime (skipped outside Colab).
if params["env"]["colab"]:
  
    # IPython capture syntax: runs nvidia-smi and stores its output lines in a list
    gpu_info = !nvidia-smi
    gpu_info = "\n".join(gpu_info)
    # nvidia-smi output containing "failed" is treated as "no GPU attached"
    if gpu_info.find("failed") >= 0:
        print("Not connected to a GPU")
    else:
        print(gpu_info)

# Report system memory; this only prints a message and never stops execution.
if params["env"]["require_high_ram"]:

    from psutil import virtual_memory
    # NOTE: this reads the *total* memory field (in decimal gigabytes), although
    # the message below words it as "available" RAM
    ram_gb = virtual_memory().total / 1e9
    print("Your runtime has {:.1f} gigabytes of available RAM\n".format(ram_gb))

    # 20 GB is the threshold this notebook uses to call a runtime "high-RAM"
    if ram_gb < 20:
        print("Not using a high-RAM runtime")
    else:
        print("You are using a high-RAM runtime!")

Your runtime has 33.6 gigabytes of available RAM

You are using a high-RAM runtime!


##### Imports

In [4]:
import os
import re
import glob
import pickle
import torch
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    GPT2ForSequenceClassification,
    GPTNeoForSequenceClassification,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback,
)

##### Disable Tokenizer Parallelism
This is mostly to avoid warnings.

In [5]:
# Set before any tokenizer is created; combined with dataloader worker processes
# (params["training"]["num_workers"]) the tokenizers library otherwise emits warnings
os.environ["TOKENIZERS_PARALLELISM"] = "false"

##### Load Formatted Data

In [6]:
# Choose the feature file: "Xwsum_folds.pkl" when summaries are requested
# (presumably X with summaries added - confirm against the formatting notebook),
# otherwise the plain "X_folds.pkl".
X_file = "Xwsum_folds.pkl" if params["data"]["add_summaries"] else "X_folds.pkl"

# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
with open(os.path.join(params["io"]["input_dir"], X_file), "rb") as f:
    X_folds = pickle.load(f)

# The labels are needed for both feature variants, so load them unconditionally.
# Previously this load lived inside the `else` branch, which left y_folds undefined
# (NameError in the next cell) whenever add_summaries was True.
with open(os.path.join(params["io"]["input_dir"], "y_folds.pkl"), "rb") as f:
    y_folds = pickle.load(f)

##### Check Data Shape

In [7]:
# Features and labels must have been split into the same number of folds.
assert len(X_folds) == len(y_folds), "Expected the same number of folds in X and y."

# Drop the fold keys and keep the per-fold collections as two parallel lists,
# in each mapping's own iteration order.
X, y = [*X_folds.values()], [*y_folds.values()]

##### Check Target Prevalence

In [8]:
# Mean label value over all folds pooled together, rounded to three decimals
all_labels = np.concatenate(y)
print(f"Target prevalance: {round(np.mean(all_labels), 3)}.")

Target prevalance: 0.492.


##### Check that GPU is Available

In [9]:
# Fail fast here rather than partway through training when no CUDA device is visible.
cuda_ready = torch.cuda.is_available()
assert cuda_ready, "Run this script on a GPU."

# Record the PyTorch build in the notebook output.
print(torch.__version__)

1.8.1+cu101


##### Select and Preprocess Text and Fit Model to Each Data Fold

In [10]:
# K-fold cross-validation loop.
#
# For every model configured under params[f"{size}_models"] and every fold i:
#   * fold i is held out as the test set,
#   * one of the remaining folds is drawn at random as the validation set
#     (used for early stopping / best-checkpoint selection),
#   * all other folds are concatenated and shuffled to form the training set,
#   * the model is fine-tuned and scored on the held-out fold.
# The per-fold scores and true labels are collected in y_probs / y_trues
# (dict: model name -> list with one entry per fold) and pickled at the end.

# Dedicated generator for fold selection and shuffling, seeded from the config so
# the splits are reproducible and independent of the global NumPy random state.
rng = np.random.default_rng(params["random"]["seed"])

# Create the results directory up front so a missing folder cannot cause the
# final save to fail after all training has finished.
os.makedirs(params["io"]["results_dir"], exist_ok=True)

y_probs, y_trues = {}, {}
for model, model_params in params[f"{size}_models"].items():

    model_type = model_params["type"]
    is_seq2seq = model_type == "seq2seq"

    y_probs[model], y_trues[model] = [], []
    for i in range(len(X)):

        # Print model and fold
        print(f"Fitting model: {model} using fold {i} as out of fold test data.")

        # Identify train and test folds
        X_train_temp, y_train_temp = X[0:i] + X[i + 1 :], y[0:i] + y[i + 1 :]
        X_test, y_test = X[i], y[i]

        # Select a validation fold at random
        val_index = int(rng.integers(len(y_train_temp)))
        X_val, y_val = X_train_temp[val_index], y_train_temp[val_index]

        # Concatenate all examples in the remaining folds to form the training set.
        # The folds are filtered with a list comprehension instead of np.delete:
        # np.delete first converts the list of folds to an array, which is ragged
        # when folds differ in length (deprecated, and an error in newer NumPy).
        X_train = np.concatenate(
            [fold for j, fold in enumerate(X_train_temp) if j != val_index], axis=0
        )
        y_train = np.concatenate(
            [fold for j, fold in enumerate(y_train_temp) if j != val_index], axis=0
        )

        # Shuffle training data (same permutation for texts and labels)
        indices = rng.permutation(len(y_train))
        X_train, y_train = X_train[indices], y_train[indices]

        # Print data shapes
        print(f"Train data sizes: {len(X_train), len(y_train)}.")
        print(f"Val data sizes: {len(X_val), len(y_val)}.")
        print(f"Test data sizes: {len(X_test), len(y_test)}.")

        # Format text and label data as HuggingFace dataset
        if is_seq2seq:
            # Seq2seq models generate the label as text, so targets are strings
            train_dataset = Dataset.from_dict(
                {"text": X_train, "label_ids": [str(label) for label in y_train]}
            )
            val_dataset = Dataset.from_dict(
                {"text": X_val, "label_ids": [str(label) for label in y_val]}
            )
            test_dataset = Dataset.from_dict(
                {"text": X_test, "label_ids": [str(label) for label in y_test]}
            )

        else:
            train_dataset = Dataset.from_dict({"text": X_train, "label": y_train})
            val_dataset = Dataset.from_dict({"text": X_val, "label": y_val})
            test_dataset = Dataset.from_dict({"text": X_test, "label": y_test})

        # Load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_params["path"])

        # Load model by model type
        if model_type == "mlm":

            # Load masked language model with a sequence classification head
            lm = AutoModelForSequenceClassification.from_pretrained(
                model_params["path"],
                num_labels=2,
                return_dict=True,
                problem_type="single_label_classification",
            )

        elif model_type == "gpt":

            # Use the end of sentence token as a pad token for GPT models
            tokenizer.pad_token = tokenizer.eos_token

            if model == "gpt2":

                # Load GPT-2
                lm = GPT2ForSequenceClassification.from_pretrained(
                    model_params["path"],
                    num_labels=2,
                    return_dict=True,
                    problem_type="single_label_classification",
                )

            elif "gpt_neo" in model:

                # Load a GPT Neo version
                lm = GPTNeoForSequenceClassification.from_pretrained(
                    model_params["path"],
                    num_labels=2,
                    return_dict=True,
                    problem_type="single_label_classification",
                )

            else:
                raise ValueError("Expected GPT model to be gpt2 or a gpt_neo version.")

        elif is_seq2seq:
            lm = AutoModelForSeq2SeqLM.from_pretrained(model_params["path"])

        else:
            # Report the offending type (the message previously printed the path)
            raise ValueError(f"Unexpected model type: {model_type}.")

        # Define function to preprocess and tokenize text
        if is_seq2seq:

            def preprocess_function(sample, padding="max_length"):
                """Tokenize prompted inputs and text targets for a seq2seq model."""

                # Add prefix to the input for t5
                inputs = [
                    "Classify this text as either 1 or 0: " + item
                    for item in sample["text"]
                ]

                # Tokenize inputs
                model_inputs = tokenizer(
                    inputs,
                    max_length=model_params["max_seq_len"],
                    padding=padding,
                    truncation=True,
                )

                # Tokenize targets with the `text_target` keyword argument
                labels = tokenizer(
                    text_target=sample["label_ids"],
                    max_length=model_params["output_max_seq_len"],
                    padding=padding,
                    truncation=True,
                )

                # If we are padding here, replace all tokenizer.pad_token_id in the
                # labels by -100 so that padding is ignored in the loss.
                if padding == "max_length":
                    labels["input_ids"] = [
                        [
                            (token_id if token_id != tokenizer.pad_token_id else -100)
                            for token_id in label
                        ]
                        for label in labels["input_ids"]
                    ]

                # Overwrites the string "label_ids" column with token ids
                model_inputs["label_ids"] = labels["input_ids"]

                return model_inputs

        else:

            def preprocess_function(batch):
                """Tokenize a batch of texts, padded/truncated to max_seq_len."""
                return tokenizer(
                    batch["text"],
                    padding="max_length",
                    truncation=True,
                    max_length=model_params["max_seq_len"],
                )

        def tokenize_dataset(dataset):
            """Apply preprocess_function, drop the raw text, return torch tensors."""
            dataset = dataset.map(
                preprocess_function,
                batched=True,
                remove_columns=["text"],
                batch_size=model_params["batch_size"],
            )
            dataset.set_format("pt")
            return dataset

        # Preprocess datasets
        train_dataset = tokenize_dataset(train_dataset)
        val_dataset = tokenize_dataset(val_dataset)
        test_dataset = tokenize_dataset(test_dataset)

        # Define training arguments shared by all model types
        training_kwargs = dict(
            output_dir=params["io"]["model_output_dir"],
            num_train_epochs=params["training"]["epochs"],
            warmup_steps=params["training"]["warmup_steps"],
            weight_decay=params["training"]["weight_decay"],
            learning_rate=params["training"]["lr"],
            adam_beta1=params["training"]["adam_beta1"],
            adam_beta2=params["training"]["adam_beta2"],
            adam_epsilon=params["training"]["adam_epsilon"],
            dataloader_num_workers=params["training"]["num_workers"],
            logging_strategy=params["training"]["logging_strategy"],
            seed=params["random"]["seed"],
            # run_name must be a string; it was previously given the whole config dict
            run_name=f"{model}_fold_{i}",
            fp16=model_params["fp16"],
            gradient_checkpointing=model_params["gradient_checkpointing"],
            per_device_train_batch_size=model_params["batch_size"],
            per_device_eval_batch_size=model_params["batch_size"],
            gradient_accumulation_steps=model_params["accumulation_steps"],
            evaluation_strategy=params["evaluation"]["evaluation_strategy"],
            save_strategy=params["evaluation"]["save_strategy"],
            fp16_full_eval=params["evaluation"]["fp16_full_eval"],
            eval_accumulation_steps=params["evaluation"]["eval_accumulation_steps"],
            save_total_limit=params["evaluation"]["save_total_limit"],
            lr_scheduler_type="linear",
            optim="adamw_torch",
            sharded_ddp=False,
            prediction_loss_only=False,
            load_best_model_at_end=True,
            disable_tqdm=True,
            logging_dir=None,
        )

        # Seq2seq models need the generation-aware arguments class so that
        # evaluation/prediction use generate(); the settings are passed to the
        # constructor instead of being patched onto a plain TrainingArguments.
        if is_seq2seq:
            training_args = Seq2SeqTrainingArguments(
                **training_kwargs,
                predict_with_generate=True,
                generation_max_length=model_params["output_max_seq_len"],
            )
        else:
            training_args = TrainingArguments(**training_kwargs)

        # Define early stopping callback
        early_stopping = EarlyStoppingCallback(
            early_stopping_patience=params["training"]["early_stopping_patience"]
        )

        # Define trainer
        trainer_class = Seq2SeqTrainer if is_seq2seq else Trainer
        trainer = trainer_class(
            model=lm,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            callbacks=[early_stopping],
        )

        # Train model
        trainer.train()

        # Predict on test dataset for seq2seq models
        if is_seq2seq:

            # Predict on test dataset with greedy generation
            output = trainer.predict(
                test_dataset,
                do_sample=False,
                max_length=model_params["output_max_seq_len"],
                early_stopping=True,
            )
            preds_decoded = tokenizer.batch_decode(
                output.predictions, skip_special_tokens=True
            )

            # Restore the pad token where labels were masked with -100 so that
            # they can be decoded
            labels = np.where(
                output.label_ids != -100, output.label_ids, tokenizer.pad_token_id
            )
            labels_decoded = tokenizer.batch_decode(labels, skip_special_tokens=False)

            # Convert preds to ints
            # We allow additional characters to be generated, but check
            # that the first one is a 1 or 0
            preds = []
            for pred in preds_decoded:
                # Slicing (rather than pred[0]) keeps an empty generation from
                # raising IndexError; it falls through to the random fallback.
                first_char = pred[:1]
                if first_char == "1":
                    preds.append(1)
                elif first_char == "0":
                    preds.append(0)
                else:
                    print(f"Got unexpected pred: {pred}.")
                    preds.append(int(rng.choice([0, 1])))

            # Save scores and labels
            # The labels may contain additional characters, but the first should be
            # a 1 or 0
            y_probs[model].append(preds)
            y_trues[model].append([int(label[0]) for label in labels_decoded])

        # Predict on test set for other model types
        else:
            # Generate scores
            output = trainer.predict(test_dataset)
            labels = output.label_ids

            # The classification head emits two logits for a single-label problem
            # (problem_type="single_label_classification"), so the probability of
            # class 1 is the softmax over both logits. An element-wise sigmoid
            # ignores the class-0 logit and does not yield that probability.
            logits = torch.tensor(output.predictions).double()
            y_prob = torch.softmax(logits, dim=-1).numpy()[:, 1]

            # Save scores and labels
            y_probs[model].append(y_prob)
            y_trues[model].append(labels)

        # Drop references to this fold's model before emptying the cuda cache;
        # memory that is still referenced cannot be released.
        del trainer, lm
        torch.cuda.empty_cache()

# Save results
with open(os.path.join(params["io"]["results_dir"], "lm_y_trues.pkl"), "wb") as f:
    pickle.dump(y_trues, f)

with open(os.path.join(params["io"]["results_dir"], "lm_y_probs.pkl"), "wb") as f:
    pickle.dump(y_probs, f)

Fitting model: roberta_base using fold 0 as out of fold test data.
Train data sizes: (35, 35).
Val data sizes: (12, 12).
Test data sizes: (12, 12).


  arr = asarray(arr)
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_

Map:   0%|          | 0/35 [00:00<?, ? examples/s]

Map:   0%|          | 0/12 [00:00<?, ? examples/s]

Map:   0%|          | 0/12 [00:00<?, ? examples/s]

{'loss': 0.7743, 'learning_rate': 2.0000000000000002e-07, 'epoch': 0.91}
{'eval_loss': 0.6817099452018738, 'eval_runtime': 2.0077, 'eval_samples_per_second': 5.977, 'eval_steps_per_second': 5.977, 'epoch': 0.91}
{'loss': 0.7544, 'learning_rate': 4.0000000000000003e-07, 'epoch': 1.83}
{'eval_loss': 0.6817140579223633, 'eval_runtime': 2.0114, 'eval_samples_per_second': 5.966, 'eval_steps_per_second': 5.966, 'epoch': 1.83}
{'loss': 0.7639, 'learning_rate': 6.000000000000001e-07, 'epoch': 2.74}
{'eval_loss': 0.6817214488983154, 'eval_runtime': 2.0126, 'eval_samples_per_second': 5.962, 'eval_steps_per_second': 5.962, 'epoch': 2.74}
{'loss': 0.7709, 'learning_rate': 8.000000000000001e-07, 'epoch': 3.66}
{'eval_loss': 0.6817379593849182, 'eval_runtime': 2.0173, 'eval_samples_per_second': 5.949, 'eval_steps_per_second': 5.949, 'epoch': 3.66}
{'loss': 0.7662, 'learning_rate': 1.0000000000000002e-06, 'epoch': 4.57}
{'eval_loss': 0.6817670464515686, 'eval_runtime': 2.0176, 'eval_samples_per_secon

  arr = asarray(arr)
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_

Map:   0%|          | 0/37 [00:00<?, ? examples/s]

Map:   0%|          | 0/9 [00:00<?, ? examples/s]

Map:   0%|          | 0/13 [00:00<?, ? examples/s]

{'loss': 0.8104, 'learning_rate': 2.0000000000000002e-07, 'epoch': 0.86}
{'eval_loss': 0.7031537294387817, 'eval_runtime': 1.5306, 'eval_samples_per_second': 5.88, 'eval_steps_per_second': 5.88, 'epoch': 0.86}
{'loss': 0.8038, 'learning_rate': 4.0000000000000003e-07, 'epoch': 1.73}
{'eval_loss': 0.7031249403953552, 'eval_runtime': 1.5348, 'eval_samples_per_second': 5.864, 'eval_steps_per_second': 5.864, 'epoch': 1.73}
{'loss': 0.7929, 'learning_rate': 6.000000000000001e-07, 'epoch': 2.59}
{'eval_loss': 0.7031204700469971, 'eval_runtime': 1.5318, 'eval_samples_per_second': 5.876, 'eval_steps_per_second': 5.876, 'epoch': 2.59}
{'loss': 0.5379, 'learning_rate': 9.000000000000001e-07, 'epoch': 3.89}
{'eval_loss': 0.7030941247940063, 'eval_runtime': 1.5332, 'eval_samples_per_second': 5.87, 'eval_steps_per_second': 5.87, 'epoch': 3.89}
{'loss': 0.7923, 'learning_rate': 1.1e-06, 'epoch': 4.76}
{'eval_loss': 0.7031098008155823, 'eval_runtime': 1.5362, 'eval_samples_per_second': 5.859, 'eval_st

  arr = asarray(arr)
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_

Map:   0%|          | 0/38 [00:00<?, ? examples/s]

Map:   0%|          | 0/9 [00:00<?, ? examples/s]

Map:   0%|          | 0/12 [00:00<?, ? examples/s]

{'loss': 0.8199, 'learning_rate': 2.0000000000000002e-07, 'epoch': 0.84}
{'eval_loss': 0.6897609233856201, 'eval_runtime': 1.531, 'eval_samples_per_second': 5.878, 'eval_steps_per_second': 5.878, 'epoch': 0.84}
{'loss': 0.8396, 'learning_rate': 4.0000000000000003e-07, 'epoch': 1.68}
{'eval_loss': 0.6897531151771545, 'eval_runtime': 1.5328, 'eval_samples_per_second': 5.872, 'eval_steps_per_second': 5.872, 'epoch': 1.68}
{'loss': 0.5424, 'learning_rate': 7.000000000000001e-07, 'epoch': 2.95}
{'eval_loss': 0.6897610425949097, 'eval_runtime': 1.5349, 'eval_samples_per_second': 5.863, 'eval_steps_per_second': 5.863, 'epoch': 2.95}
{'loss': 0.8238, 'learning_rate': 9.000000000000001e-07, 'epoch': 3.79}
{'eval_loss': 0.6897469758987427, 'eval_runtime': 1.5362, 'eval_samples_per_second': 5.859, 'eval_steps_per_second': 5.859, 'epoch': 3.79}
{'loss': 0.8362, 'learning_rate': 1.1e-06, 'epoch': 4.63}
{'eval_loss': 0.6897364854812622, 'eval_runtime': 1.5322, 'eval_samples_per_second': 5.874, 'eval

  arr = asarray(arr)
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_

Map:   0%|          | 0/38 [00:00<?, ? examples/s]

Map:   0%|          | 0/12 [00:00<?, ? examples/s]

Map:   0%|          | 0/9 [00:00<?, ? examples/s]

{'loss': 0.8455, 'learning_rate': 2.0000000000000002e-07, 'epoch': 0.84}
{'eval_loss': 0.7106687426567078, 'eval_runtime': 2.0254, 'eval_samples_per_second': 5.925, 'eval_steps_per_second': 5.925, 'epoch': 0.84}
{'loss': 0.8346, 'learning_rate': 4.0000000000000003e-07, 'epoch': 1.68}
{'eval_loss': 0.7106759548187256, 'eval_runtime': 2.0273, 'eval_samples_per_second': 5.919, 'eval_steps_per_second': 5.919, 'epoch': 1.68}
{'loss': 0.5472, 'learning_rate': 7.000000000000001e-07, 'epoch': 2.95}
{'eval_loss': 0.710677444934845, 'eval_runtime': 2.0279, 'eval_samples_per_second': 5.917, 'eval_steps_per_second': 5.917, 'epoch': 2.95}
{'loss': 0.8117, 'learning_rate': 9.000000000000001e-07, 'epoch': 3.79}
{'eval_loss': 0.7106509804725647, 'eval_runtime': 2.0245, 'eval_samples_per_second': 5.927, 'eval_steps_per_second': 5.927, 'epoch': 3.79}
{'loss': 0.8252, 'learning_rate': 1.1e-06, 'epoch': 4.63}
{'eval_loss': 0.7106249928474426, 'eval_runtime': 2.025, 'eval_samples_per_second': 5.926, 'eval_

  arr = asarray(arr)
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_

Map:   0%|          | 0/34 [00:00<?, ? examples/s]

Map:   0%|          | 0/12 [00:00<?, ? examples/s]

Map:   0%|          | 0/13 [00:00<?, ? examples/s]

{'loss': 0.7241, 'learning_rate': 2.0000000000000002e-07, 'epoch': 0.94}
{'eval_loss': 0.694706916809082, 'eval_runtime': 2.0212, 'eval_samples_per_second': 5.937, 'eval_steps_per_second': 5.937, 'epoch': 0.94}
{'loss': 0.7394, 'learning_rate': 4.0000000000000003e-07, 'epoch': 1.88}
{'eval_loss': 0.6946821212768555, 'eval_runtime': 2.0205, 'eval_samples_per_second': 5.939, 'eval_steps_per_second': 5.939, 'epoch': 1.88}
{'loss': 0.7208, 'learning_rate': 6.000000000000001e-07, 'epoch': 2.82}
{'eval_loss': 0.6946447491645813, 'eval_runtime': 2.0234, 'eval_samples_per_second': 5.931, 'eval_steps_per_second': 5.931, 'epoch': 2.82}
{'loss': 0.7433, 'learning_rate': 8.000000000000001e-07, 'epoch': 3.76}
{'eval_loss': 0.6945950984954834, 'eval_runtime': 2.0272, 'eval_samples_per_second': 5.919, 'eval_steps_per_second': 5.919, 'epoch': 3.76}
{'loss': 0.7442, 'learning_rate': 1.0000000000000002e-06, 'epoch': 4.71}
{'eval_loss': 0.694478452205658, 'eval_runtime': 2.0424, 'eval_samples_per_second'

{'loss': 0.4456, 'learning_rate': 4.815789473684211e-06, 'epoch': 40.0}
{'eval_loss': 0.6901324391365051, 'eval_runtime': 2.0235, 'eval_samples_per_second': 5.93, 'eval_steps_per_second': 5.93, 'epoch': 40.0}
{'loss': 0.6316, 'learning_rate': 4.8052631578947375e-06, 'epoch': 40.94}
{'eval_loss': 0.6930401921272278, 'eval_runtime': 2.0551, 'eval_samples_per_second': 5.839, 'eval_steps_per_second': 5.839, 'epoch': 40.94}
{'loss': 0.6328, 'learning_rate': 4.794736842105264e-06, 'epoch': 41.88}
{'eval_loss': 0.7011608481407166, 'eval_runtime': 2.034, 'eval_samples_per_second': 5.9, 'eval_steps_per_second': 5.9, 'epoch': 41.88}
{'loss': 0.5856, 'learning_rate': 4.78421052631579e-06, 'epoch': 42.82}
{'eval_loss': 0.7151849269866943, 'eval_runtime': 2.0403, 'eval_samples_per_second': 5.881, 'eval_steps_per_second': 5.881, 'epoch': 42.82}
{'loss': 0.589, 'learning_rate': 4.773684210526316e-06, 'epoch': 43.76}
{'eval_loss': 0.7217443585395813, 'eval_runtime': 2.0437, 'eval_samples_per_second': 

##### Unassign Runtime if Running on Colab

In [11]:
# Colab-only teardown, run after the results have been saved by the previous cell.
if params["env"]["colab"]:

    from google.colab import runtime
    # Release the Colab runtime now that the notebook has finished
    runtime.unassign()