### Fine-Tune Language Models

- Joel Stremmel
- 01-11-24

##### About

Fine-Tune pretrained language models on the formatted data using K-Fold Cross-Validation and save the scores.

##### Set Parameters
Choose the outcome to predict and list the models to fine-tune along with their training parameters.

In [1]:
# Warmup note from the original runs: 20 warmup steps were used for the 5 and 10 epoch
# models and 10 for the 3 epoch models.
# Outcome whose fold files are loaded; the author's inline note lists 'cohesion' as the alternative.
outcome = 'Alliance'


def _roberta_settings(path):
    """Build the per-model settings shared by every RoBERTa-style checkpoint listed below.

    Only the checkpoint location differs between the configured models, so it is the
    single argument; a fresh dict is returned on every call.
    """
    return dict(
        path=path,
        max_seq_len=512,
        fp16=True,
        batch_size=1,
        accumulation_steps=16,
        gradient_checkpointing=True,
        type="mlm",
    )


params = dict(
    # Runtime environment switches
    env=dict(colab=False, require_high_ram=True),
    # Data options
    data=dict(add_summaries=False),
    # Optimizer and schedule settings shared by every model
    training=dict(
        lr=5e-6,
        weight_decay=0.01,
        adam_beta1=0.9,
        adam_beta2=0.999,
        adam_epsilon=1e-8,
        warmup_steps=20,
        num_workers=2,
        epochs=5,
        logging_strategy="epoch",
    ),
    # Checkpointing settings
    evaluation=dict(save_strategy="epoch", save_total_limit=1),
    # Models to fine-tune, keyed by the name used in the saved results
    models=dict(
        mental_roberta_base=_roberta_settings("./models/mental-roberta-base"),
        roberta_base=_roberta_settings("roberta-base"),
        roberta_pysch=_roberta_settings("mlaricheva/roberta-psych"),
    ),
    # Input and output locations
    io=dict(
        results_dir="./results_fixed_epochs_no_val_5",
        input_dir="./data",
        model_output_dir="./model_output",
    ),
    # Text augmentation settings
    augmentation=dict(
        add_synthetic=False,
        aug_p=0.2,
        glove_file="data/glove.6B.50d.txt",
        glove_zip="data/glove.6B.zip",
        glove_url="http://nlp.stanford.edu/data/glove.6B.zip",
    ),
    # Random seed
    random=dict(seed=42),
)

In [2]:
# # Could use PEFT to save memory

# from peft import LoraConfig, get_peft_model 

# config = LoraConfig(
#     r=16,
#     lora_alpha=32,
#     lora_dropout=0.05,
#     bias="none",
#     task_type="CAUSAL_LM",
#     target_modules=["q_lin", "v_lin"],
    
# )

# model = get_peft_model(model, config)

##### Mount Google Drive, Install Requirements, and set Cache if Using Colab

In [3]:
# Colab-only setup: everything below is skipped unless params["env"]["colab"] is True.
if params["env"]["colab"]:

    import os
    from google.colab import drive

    # Mount Google Drive at /content/drive so the paths used below exist
    drive.mount("/content/drive")

    # Install packages listed in the requirements file stored on Drive
    # (IPython shell escape; this line is not plain Python)
    !pip install -q -r "/content/drive/MyDrive/nlp4psychotherapy/requirements.txt"

    # Set HF cache: both cache environment variables point at the same Drive folder.
    # They are set here, before the transformers/datasets imports in the Imports cell.
    os.environ['TRANSFORMERS_CACHE'] = '/content/drive/MyDrive/hf_cache'
    os.environ['HF_DATASETS_CACHE'] = '/content/drive/MyDrive/hf_cache'

##### Check Colab Runtime

In [4]:
# Report the GPU attached to the Colab runtime (only when running on Colab).
if params["env"]["colab"]:
  
    # IPython shell escape: captures the output lines of `nvidia-smi`
    gpu_info = !nvidia-smi
    gpu_info = "\n".join(gpu_info)
    # The check looks for the word "failed" in the command output to detect a missing GPU
    if gpu_info.find("failed") >= 0:
        print("Not connected to a GPU")
    else:
        print(gpu_info)

# Report system memory when a high-RAM runtime is requested; this only prints, it never aborts.
if params["env"]["require_high_ram"]:

    from psutil import virtual_memory
    # Total system memory converted from bytes to decimal gigabytes
    ram_gb = virtual_memory().total / 1e9
    print("Your runtime has {:.1f} gigabytes of available RAM\n".format(ram_gb))

    # 20 GB is the cutoff this notebook uses to call a runtime "high-RAM"
    if ram_gb < 20:
        print("Not using a high-RAM runtime")
    else:
        print("You are using a high-RAM runtime!")

Your runtime has 33.6 gigabytes of available RAM

You are using a high-RAM runtime!


##### Imports

In [5]:
import os
import re
import glob
import pickle
import torch
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    GPT2ForSequenceClassification,
    GPTNeoForSequenceClassification,
    AutoModelForSeq2SeqLM,
    AutoModelForCausalLM,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback,
)

##### Disable Tokenizer Parallelism
This is mostly to avoid warnings.

In [6]:
# Switch off tokenizer parallelism through its environment variable
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

##### Load Formatted Data

In [7]:
# Load the cross-validation folds for the selected outcome from the input directory.
# NOTE: pickle.load can execute arbitrary code; only load fold files produced by this project.

# Pick the feature file by the add_summaries flag (the "Xwsum" file presumably holds
# the texts with summaries added -- confirm against the data formatting notebook).
if params["data"]["add_summaries"]:
    X_file = f"{outcome}_Xwsum_folds.pkl"
else:
    X_file = f"{outcome}_X_folds.pkl"

with open(os.path.join(params["io"]["input_dir"], X_file), "rb") as f:
    X_folds = pickle.load(f)

# Labels are loaded for both settings of the flag. Previously this load lived inside the
# else branch, so add_summaries=True left y_folds undefined and the next cell failed.
with open(os.path.join(params["io"]["input_dir"], f"{outcome}_y_folds.pkl"), "rb") as f:
    y_folds = pickle.load(f)

##### Check Data Shape

In [8]:
# The feature and label mappings must hold the same number of folds
n_feature_folds, n_label_folds = len(X_folds), len(y_folds)
assert n_feature_folds == n_label_folds, "Expected the same number of folds in X and y."

# Unpack the fold mappings into lists, keeping each mapping's own value order
X, y = [*X_folds.values()], [*y_folds.values()]

##### Check Target Prevalence

In [9]:
# Mean label value pooled over all folds (the fraction of 1s when labels are 0/1)
pooled_labels = np.concatenate(y)
prevalence = round(np.mean(pooled_labels), 3)
print(f"Target prevalance: {prevalence}.")

Target prevalance: 0.593.


##### Check that GPU is Available

In [10]:
# Stop early when no CUDA device is visible, then report the installed PyTorch version
cuda_ready = torch.cuda.is_available()
assert cuda_ready, "Run this script on a GPU."
print(torch.__version__)

1.8.1+cu101


##### Select and Preprocess Text and Fit Model to Each Data Fold

In [11]:
# K-fold cross-validation: fine-tune every configured model once per fold, holding that
# fold out as test data, and collect the out-of-fold scores and labels.
#
# Reads:    params, outcome, X, y (lists with one entry per fold)
# Produces: y_probs / y_trues -- dicts keyed by model name, each holding one entry per fold
# Writes:   two pickle files in params["io"]["results_dir"]
#
# Models of type "seq2seq" additionally require an "output_max_seq_len" entry in their
# settings; none of the currently configured models use that path.

# Create the results directory up front so that a missing folder cannot cause the final
# save to fail after all training has already been done.
os.makedirs(params["io"]["results_dir"], exist_ok=True)

y_probs, y_trues = {}, {}
for model in params["models"].keys():

    # Settings of the model currently being fine-tuned
    model_cfg = params["models"][model]

    y_probs[model], y_trues[model] = [], []
    for i in range(len(X)):

        # Print model and fold
        print(f"Fitting model: {model} using fold {i} as out of fold test data.")

        # Identify train and test folds
        X_train_temp, y_train_temp = X[0:i] + X[i + 1 :], y[0:i] + y[i + 1 :]
        X_test, y_test = X[i], y[i]

        X_train = np.concatenate(X_train_temp, axis=0)
        y_train = np.concatenate(y_train_temp, axis=0)

        # Shuffle training data with a generator seeded from the configured seed and the
        # fold index, so the order is reproducible across runs and identical across models
        # (the global NumPy RNG used before was not seeded at this point)
        rng = np.random.default_rng(params["random"]["seed"] + i)
        indices = rng.permutation(len(y_train))
        X_train, y_train = X_train[indices], y_train[indices]

        # Print data shapes
        print(f"Train data sizes: {len(X_train), len(y_train)}.")
        print(f"Test data sizes: {len(X_test), len(y_test)}.")

        # Format text and label data as HuggingFace dataset
        # (seq2seq models are trained to generate the label as text)
        if model_cfg["type"] == "seq2seq":
            train_dataset = Dataset.from_dict(
                {"text": X_train, "label_ids": [str(label) for label in y_train]}
            )
            test_dataset = Dataset.from_dict(
                {"text": X_test, "label_ids": [str(label) for label in y_test]}
            )

        else:
            train_dataset = Dataset.from_dict({"text": X_train, "label": y_train})
            test_dataset = Dataset.from_dict({"text": X_test, "label": y_test})

        # Load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_cfg["path"])

        # Load model by model type
        if model_cfg["type"] == "mlm":

            # Load masked language model with a sequence classification head
            lm = AutoModelForSequenceClassification.from_pretrained(
                model_cfg["path"],
                num_labels=2,
                return_dict=True,
                problem_type="single_label_classification",
            )

        elif model_cfg["type"] == "gpt":

            # Use the end of sentence token as a pad token for GPT models
            tokenizer.pad_token = tokenizer.eos_token

            if model == "gpt2":

                # Load GPT-2
                lm = GPT2ForSequenceClassification.from_pretrained(
                    model_cfg["path"],
                    num_labels=2,
                    return_dict=True,
                    problem_type="single_label_classification",
                )

            elif "gpt_neo" in model:

                # Load a GPT Neo version
                lm = GPTNeoForSequenceClassification.from_pretrained(
                    model_cfg["path"],
                    num_labels=2,
                    return_dict=True,
                    problem_type="single_label_classification",
                )

            else:
                raise ValueError("Expected GPT model to be gpt2 or a gpt_neo version.")

        elif model_cfg["type"] == "seq2seq":
            lm = AutoModelForSeq2SeqLM.from_pretrained(model_cfg["path"])
        elif model_cfg["type"] == "causal":
            lm = AutoModelForCausalLM.from_pretrained(model_cfg["path"])
        else:
            # Report the offending type (this message used to reference an undefined
            # variable, which raised NameError instead of this ValueError)
            raise ValueError(f"Unexpected model type: {model_cfg['type']}.")

        # Define function to preprocess and tokenize text
        if model_cfg["type"] == "seq2seq":

            def preprocess_function(
                sample, padding="max_length", output_max_seq_len=20
            ):
                """Tokenize prompted inputs and text labels for a seq2seq model.

                The target length is taken from the model settings; the
                `output_max_seq_len` argument is kept for compatibility but is not read.
                """

                # Add prefix to the input for t5
                inputs = [
                    "Classify this text as either 1 or 0: " + item
                    for item in sample["text"]
                ]

                # tokenize inputs
                model_inputs = tokenizer(
                    inputs,
                    max_length=model_cfg["max_seq_len"],
                    padding=padding,
                    truncation=True,
                )

                # Tokenize targets with the `text_target` keyword argument
                labels = tokenizer(
                    text_target=sample["label_ids"],
                    max_length=model_cfg["output_max_seq_len"],
                    padding=padding,
                    truncation=True,
                )

                # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
                # padding in the loss.
                if padding == "max_length":
                    labels["input_ids"] = [
                        [(l if l != tokenizer.pad_token_id else -100) for l in label]
                        for label in labels["input_ids"]
                    ]

                model_inputs["label_ids"] = labels["input_ids"]

                return model_inputs

        else:

            def preprocess_function(batch):
                """Tokenize a batch of texts, padded and truncated to the model's max length."""
                return tokenizer(
                    batch["text"],
                    padding="max_length",
                    truncation=True,
                    max_length=model_cfg["max_seq_len"],
                )

        # Preprocess datasets
        train_dataset = train_dataset.map(
            preprocess_function,
            batched=True,
            remove_columns=["text"],
            batch_size=model_cfg["batch_size"],
        )
        train_dataset.set_format("pt")
        test_dataset = test_dataset.map(
            preprocess_function,
            batched=True,
            remove_columns=["text"],
            batch_size=model_cfg["batch_size"],
        )
        test_dataset.set_format("pt")

        # Define training arguments
        training_args = TrainingArguments(
            output_dir=params["io"]["model_output_dir"],
            num_train_epochs=params["training"]["epochs"],
            warmup_steps=params["training"]["warmup_steps"],
            weight_decay=params["training"]["weight_decay"],
            learning_rate=params["training"]["lr"],
            adam_beta1=params["training"]["adam_beta1"],
            adam_beta2=params["training"]["adam_beta2"],
            adam_epsilon=params["training"]["adam_epsilon"],
            dataloader_num_workers=params["training"]["num_workers"],
            logging_strategy=params["training"]["logging_strategy"],
            seed=params["random"]["seed"],
            # run_name expects a string; the whole settings dict was passed here before
            run_name=f"{model}_fold_{i}",
            fp16=model_cfg["fp16"],
            gradient_checkpointing=model_cfg["gradient_checkpointing"],
            per_device_train_batch_size=model_cfg["batch_size"],
            gradient_accumulation_steps=model_cfg["accumulation_steps"],
            evaluation_strategy='no',
            save_strategy=params["evaluation"]["save_strategy"],
            save_total_limit=params["evaluation"]["save_total_limit"],
            lr_scheduler_type="linear",
            optim="adamw_torch",
            prediction_loss_only=False,
            load_best_model_at_end=False,
            disable_tqdm=True,
            logging_dir=None,
        )

        # Define special training arguments
        if model_cfg["type"] == "seq2seq":
            training_args.generation_max_length = model_cfg["output_max_seq_len"]
            training_args.predict_with_generate = True
            training_args.generation_num_beams = None

        # Define trainer (no evaluation dataset: training runs for a fixed number of epochs)
        if model_cfg["type"] == "seq2seq":
            trainer = Seq2SeqTrainer(
                model=lm,
                args=training_args,
                train_dataset=train_dataset,
                callbacks=[],
            )
        else:
            trainer = Trainer(
                model=lm,
                args=training_args,
                train_dataset=train_dataset,
                callbacks=[],
            )

        # Train model
        trainer.train()

        # Predict on test dataset for seq2seq models
        if model_cfg["type"] == "seq2seq":

            # Predict on test dataset with greedy generation
            output = trainer.predict(
                test_dataset,
                do_sample=False,
                max_length=model_cfg["output_max_seq_len"],
                early_stopping=True,
            )
            preds_decoded = tokenizer.batch_decode(
                output.predictions, skip_special_tokens=True
            )
            # Put the pad token back where -100 masked the labels so they can be decoded
            labels = np.where(
                output.label_ids != -100, output.label_ids, tokenizer.pad_token_id
            )
            labels_decoded = tokenizer.batch_decode(labels, skip_special_tokens=False)

            # Convert preds to ints
            # We allow additional characters to be generated but check
            # that the first one is a 1 or 0
            preds = []
            for pred in preds_decoded:
                if pred[0] == "1":
                    preds.append(1)
                elif pred[0] == "0":
                    preds.append(0)
                else:
                    # Fall back to a random class, drawn from the seeded fold generator
                    print(f"Got unexpected pred: {pred}.")
                    preds.append(int(rng.integers(0, 2)))

            # Save scores and labels
            # The labels may contain additional characters, but the first should be
            # a 1 or 0
            y_probs[model].append(preds)
            y_trues[model].append([int(label[0]) for label in labels_decoded])

        # Predict on test set for other model types
        else:
            # Generate scores
            output = trainer.predict(test_dataset)
            labels = output.label_ids

            # The classification heads above are trained with cross-entropy over two
            # logits, so the class-1 probability is the softmax over both logits; an
            # element-wise sigmoid ignores the class-0 logit.
            y_prob = torch.softmax(
                torch.tensor(output.predictions).double(), dim=-1
            ).numpy()[:, 1]

            # Save scores and labels
            y_probs[model].append(y_prob)
            y_trues[model].append(labels)

        # Empty cuda cache
        torch.cuda.empty_cache()

# Tag the result files when summaries were added to the inputs
if params['data']['add_summaries']:
    sums = 'sum_'
else:
    sums = ''

# Save results
with open(os.path.join(params["io"]["results_dir"], f"{outcome}_{sums}lm_y_trues.pkl"), "wb") as f:
    pickle.dump(y_trues, f)

with open(os.path.join(params["io"]["results_dir"], f"{outcome}_{sums}lm_y_probs.pkl"), "wb") as f:
    pickle.dump(y_probs, f)

Fitting model: mental_roberta_base using fold 0 as out of fold test data.
Train data sizes: (99, 99).
Test data sizes: (9, 9).


Some weights of the model checkpoint at ./models/mental-roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ./models/mental-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifi

Map:   0%|          | 0/99 [00:00<?, ? examples/s]

Map:   0%|          | 0/9 [00:00<?, ? examples/s]

Using cuda_amp half precision backend
***** Running training *****
  Num examples = 99
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.7175, 'learning_rate': 1.5e-06, 'epoch': 0.97}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.7105, 'learning_rate': 3e-06, 'epoch': 1.97}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.7122, 'learning_rate': 4.5e-06, 'epoch': 2.97}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.6997, 'learning_rate': 3e-06, 'epoch': 3.97}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.6837, 'learning_rate': 0.0, 'epoch': 4.97}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 9
  Batch size = 8


{'train_runtime': 358.1893, 'train_samples_per_second': 1.382, 'train_steps_per_second': 0.084, 'train_loss': 0.7047098954518636, 'epoch': 4.97}


loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file ./models/mental-roberta-base/config.json
Model config RobertaConfig {
  "_name_or_path": "./models/mental-roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,

Fitting model: mental_roberta_base using fold 1 as out of fold test data.
Train data sizes: (96, 96).
Test data sizes: (12, 12).


Some weights of the model checkpoint at ./models/mental-roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ./models/mental-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifi

Map:   0%|          | 0/96 [00:00<?, ? examples/s]

Map:   0%|          | 0/12 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 96
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.6941, 'learning_rate': 1.5e-06, 'epoch': 1.0}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.6907, 'learning_rate': 3e-06, 'epoch': 2.0}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.7017, 'learning_rate': 4.5e-06, 'epoch': 3.0}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.68, 'learning_rate': 3e-06, 'epoch': 4.0}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.6734, 'learning_rate': 5.000000000000001e-07, 'epoch': 5.0}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 12
  Batch size = 8


{'train_runtime': 349.6399, 'train_samples_per_second': 1.373, 'train_steps_per_second': 0.086, 'train_loss': 0.6879819711049397, 'epoch': 5.0}


loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file ./models/mental-roberta-base/config.json
Model config RobertaConfig {
  "_name_or_path": "./models/mental-roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,

Fitting model: mental_roberta_base using fold 2 as out of fold test data.
Train data sizes: (94, 94).
Test data sizes: (14, 14).


Some weights of the model checkpoint at ./models/mental-roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ./models/mental-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifi

Map:   0%|          | 0/94 [00:00<?, ? examples/s]

Map:   0%|          | 0/14 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 94
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 25
Saving model checkpoint to ./model_output/checkpoint-5
Configuration saved in ./model_output/checkpoint-5/config.json


{'loss': 0.8102, 'learning_rate': 1.25e-06, 'epoch': 0.85}


Model weights saved in ./model_output/checkpoint-5/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-10
Configuration saved in ./model_output/checkpoint-10/config.json


{'loss': 0.8074, 'learning_rate': 2.5e-06, 'epoch': 1.85}


Model weights saved in ./model_output/checkpoint-10/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-5] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-15
Configuration saved in ./model_output/checkpoint-15/config.json


{'loss': 0.8114, 'learning_rate': 3.7500000000000005e-06, 'epoch': 2.85}


Model weights saved in ./model_output/checkpoint-15/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-10] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-20
Configuration saved in ./model_output/checkpoint-20/config.json


{'loss': 0.7868, 'learning_rate': 5e-06, 'epoch': 3.85}


Model weights saved in ./model_output/checkpoint-20/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-15] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-25
Configuration saved in ./model_output/checkpoint-25/config.json


{'loss': 0.6804, 'learning_rate': 0.0, 'epoch': 4.85}


Model weights saved in ./model_output/checkpoint-25/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-20] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 14
  Batch size = 8


{'train_runtime': 300.0518, 'train_samples_per_second': 1.566, 'train_steps_per_second': 0.083, 'train_loss': 0.7792343425750733, 'epoch': 4.85}


loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file ./models/mental-roberta-base/config.json
Model config RobertaConfig {
  "_name_or_path": "./models/mental-roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,

Fitting model: mental_roberta_base using fold 3 as out of fold test data.
Train data sizes: (99, 99).
Test data sizes: (9, 9).


Some weights of the model checkpoint at ./models/mental-roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ./models/mental-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifi

Map:   0%|          | 0/99 [00:00<?, ? examples/s]

Map:   0%|          | 0/9 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 99
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.7074, 'learning_rate': 1.5e-06, 'epoch': 0.97}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-25] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.7175, 'learning_rate': 3e-06, 'epoch': 1.97}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.7088, 'learning_rate': 4.5e-06, 'epoch': 2.97}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.7077, 'learning_rate': 3e-06, 'epoch': 3.97}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.679, 'learning_rate': 0.0, 'epoch': 4.97}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 9
  Batch size = 8


{'train_runtime': 312.0606, 'train_samples_per_second': 1.586, 'train_steps_per_second': 0.096, 'train_loss': 0.7040801525115967, 'epoch': 4.97}


loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file ./models/mental-roberta-base/config.json
Model config RobertaConfig {
  "_name_or_path": "./models/mental-roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,

Fitting model: mental_roberta_base using fold 4 as out of fold test data.
Train data sizes: (95, 95).
Test data sizes: (13, 13).


Some weights of the model checkpoint at ./models/mental-roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ./models/mental-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifi

Map:   0%|          | 0/95 [00:00<?, ? examples/s]

Map:   0%|          | 0/13 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 95
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 25
Saving model checkpoint to ./model_output/checkpoint-5
Configuration saved in ./model_output/checkpoint-5/config.json


{'loss': 0.8082, 'learning_rate': 1.25e-06, 'epoch': 0.84}


Model weights saved in ./model_output/checkpoint-5/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-10
Configuration saved in ./model_output/checkpoint-10/config.json


{'loss': 0.8067, 'learning_rate': 2.5e-06, 'epoch': 1.84}


Model weights saved in ./model_output/checkpoint-10/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-5] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-15
Configuration saved in ./model_output/checkpoint-15/config.json


{'loss': 0.8181, 'learning_rate': 3.7500000000000005e-06, 'epoch': 2.84}


Model weights saved in ./model_output/checkpoint-15/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-10] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-20
Configuration saved in ./model_output/checkpoint-20/config.json


{'loss': 0.802, 'learning_rate': 5e-06, 'epoch': 3.84}


Model weights saved in ./model_output/checkpoint-20/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-15] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-25
Configuration saved in ./model_output/checkpoint-25/config.json


{'loss': 0.6743, 'learning_rate': 0.0, 'epoch': 4.84}


Model weights saved in ./model_output/checkpoint-25/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-20] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 13
  Batch size = 8


{'train_runtime': 291.418, 'train_samples_per_second': 1.63, 'train_steps_per_second': 0.086, 'train_loss': 0.7818522453308105, 'epoch': 4.84}


loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file ./models/mental-roberta-base/config.json
Model config RobertaConfig {
  "_name_or_path": "./models/mental-roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,

Fitting model: mental_roberta_base using fold 5 as out of fold test data.
Train data sizes: (101, 101).
Test data sizes: (7, 7).


Some weights of the model checkpoint at ./models/mental-roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ./models/mental-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifi

Map:   0%|          | 0/101 [00:00<?, ? examples/s]

Map:   0%|          | 0/7 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 101
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.7207, 'learning_rate': 1.5e-06, 'epoch': 0.95}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-25] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.7176, 'learning_rate': 3e-06, 'epoch': 1.95}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.7126, 'learning_rate': 4.5e-06, 'epoch': 2.95}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.7214, 'learning_rate': 3e-06, 'epoch': 3.95}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.6682, 'learning_rate': 0.0, 'epoch': 4.95}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 7
  Batch size = 8


{'train_runtime': 316.3479, 'train_samples_per_second': 1.596, 'train_steps_per_second': 0.095, 'train_loss': 0.7081167221069335, 'epoch': 4.95}


loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file ./models/mental-roberta-base/config.json
Model config RobertaConfig {
  "_name_or_path": "./models/mental-roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,

Fitting model: mental_roberta_base using fold 6 as out of fold test data.
Train data sizes: (96, 96).
Test data sizes: (12, 12).


Some weights of the model checkpoint at ./models/mental-roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ./models/mental-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifi

Map:   0%|          | 0/96 [00:00<?, ? examples/s]

Map:   0%|          | 0/12 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 96
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.6892, 'learning_rate': 1.5e-06, 'epoch': 1.0}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.6964, 'learning_rate': 3e-06, 'epoch': 2.0}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.7001, 'learning_rate': 4.5e-06, 'epoch': 3.0}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.6791, 'learning_rate': 3e-06, 'epoch': 4.0}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.6801, 'learning_rate': 0.0, 'epoch': 5.0}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 12
  Batch size = 8


{'train_runtime': 304.2399, 'train_samples_per_second': 1.578, 'train_steps_per_second': 0.099, 'train_loss': 0.6889907519022623, 'epoch': 5.0}


loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file ./models/mental-roberta-base/config.json
Model config RobertaConfig {
  "_name_or_path": "./models/mental-roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,

Fitting model: mental_roberta_base using fold 7 as out of fold test data.
Train data sizes: (98, 98).
Test data sizes: (10, 10).


Some weights of the model checkpoint at ./models/mental-roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ./models/mental-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifi

Map:   0%|          | 0/98 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 98
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.7079, 'learning_rate': 1.5e-06, 'epoch': 0.98}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.6992, 'learning_rate': 3e-06, 'epoch': 1.98}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.7076, 'learning_rate': 4.5e-06, 'epoch': 2.98}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.7043, 'learning_rate': 3e-06, 'epoch': 3.98}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.6833, 'learning_rate': 0.0, 'epoch': 4.98}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 10
  Batch size = 8


{'train_runtime': 309.8922, 'train_samples_per_second': 1.581, 'train_steps_per_second': 0.097, 'train_loss': 0.7004625002543131, 'epoch': 4.98}


loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file ./models/mental-roberta-base/config.json
Model config RobertaConfig {
  "_name_or_path": "./models/mental-roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,

Fitting model: mental_roberta_base using fold 8 as out of fold test data.
Train data sizes: (104, 104).
Test data sizes: (4, 4).


Some weights of the model checkpoint at ./models/mental-roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ./models/mental-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifi

Map:   0%|          | 0/104 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 104
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.756, 'learning_rate': 1.5e-06, 'epoch': 0.92}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.7595, 'learning_rate': 3e-06, 'epoch': 1.92}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.7576, 'learning_rate': 4.5e-06, 'epoch': 2.92}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.746, 'learning_rate': 3e-06, 'epoch': 3.92}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.6732, 'learning_rate': 0.0, 'epoch': 4.92}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 4
  Batch size = 8


{'train_runtime': 324.3814, 'train_samples_per_second': 1.603, 'train_steps_per_second': 0.092, 'train_loss': 0.73844788869222, 'epoch': 4.92}


loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file ./models/mental-roberta-base/config.json
Model config RobertaConfig {
  "_name_or_path": "./models/mental-roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,

Fitting model: mental_roberta_base using fold 9 as out of fold test data.
Train data sizes: (97, 97).
Test data sizes: (11, 11).


Some weights of the model checkpoint at ./models/mental-roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ./models/mental-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifi

Map:   0%|          | 0/97 [00:00<?, ? examples/s]

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 97
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.7027, 'learning_rate': 1.5e-06, 'epoch': 0.99}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.6936, 'learning_rate': 3e-06, 'epoch': 1.99}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.6834, 'learning_rate': 4.5e-06, 'epoch': 2.99}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.694, 'learning_rate': 3e-06, 'epoch': 3.99}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.6713, 'learning_rate': 0.0, 'epoch': 4.99}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 11
  Batch size = 8


{'train_runtime': 306.834, 'train_samples_per_second': 1.581, 'train_steps_per_second': 0.098, 'train_loss': 0.6890042304992676, 'epoch': 4.99}


loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file ./models/mental-roberta-base/config.json
Model config RobertaConfig {
  "_name_or_path": "./models/mental-roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,

Fitting model: mental_roberta_base using fold 10 as out of fold test data.
Train data sizes: (101, 101).
Test data sizes: (7, 7).


Some weights of the model checkpoint at ./models/mental-roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ./models/mental-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifi

Map:   0%|          | 0/101 [00:00<?, ? examples/s]

Map:   0%|          | 0/7 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 101
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.7218, 'learning_rate': 1.5e-06, 'epoch': 0.95}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.7272, 'learning_rate': 3e-06, 'epoch': 1.95}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.7201, 'learning_rate': 4.5e-06, 'epoch': 2.95}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.7245, 'learning_rate': 3e-06, 'epoch': 3.95}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.6916, 'learning_rate': 0.0, 'epoch': 4.95}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 7
  Batch size = 8


{'train_runtime': 317.9844, 'train_samples_per_second': 1.588, 'train_steps_per_second': 0.094, 'train_loss': 0.7170421759287516, 'epoch': 4.95}
Fitting model: roberta_base using fold 0 as out of fold test data.
Train data sizes: (99, 99).
Test data sizes: (9, 9).


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /home/joel_stremmel/.cache/huggingface/hub/models--roberta-base/snapshots/bc2764f8af2e92b6eb5679868df33e224075ca68/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from cache at /home/joel_stremm

Map:   0%|          | 0/99 [00:00<?, ? examples/s]

Map:   0%|          | 0/9 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 99
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.705, 'learning_rate': 1.5e-06, 'epoch': 0.97}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.7151, 'learning_rate': 3e-06, 'epoch': 1.97}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.6998, 'learning_rate': 4.5e-06, 'epoch': 2.97}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.7005, 'learning_rate': 3e-06, 'epoch': 3.97}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.6646, 'learning_rate': 0.0, 'epoch': 4.97}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 9
  Batch size = 8


{'train_runtime': 311.2323, 'train_samples_per_second': 1.59, 'train_steps_per_second': 0.096, 'train_loss': 0.6969884157180786, 'epoch': 4.97}
Fitting model: roberta_base using fold 1 as out of fold test data.
Train data sizes: (96, 96).
Test data sizes: (12, 12).


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /home/joel_stremmel/.cache/huggingface/hub/models--roberta-base/snapshots/bc2764f8af2e92b6eb5679868df33e224075ca68/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from cache at /home/joel_stremm

Map:   0%|          | 0/96 [00:00<?, ? examples/s]

Map:   0%|          | 0/12 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 96
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.6931, 'learning_rate': 1.5e-06, 'epoch': 1.0}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.6902, 'learning_rate': 3e-06, 'epoch': 2.0}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.6975, 'learning_rate': 4.5e-06, 'epoch': 3.0}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.6808, 'learning_rate': 3e-06, 'epoch': 4.0}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.6736, 'learning_rate': 0.0, 'epoch': 5.0}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 12
  Batch size = 8


{'train_runtime': 304.9105, 'train_samples_per_second': 1.574, 'train_steps_per_second': 0.098, 'train_loss': 0.6870275974273682, 'epoch': 5.0}
Fitting model: roberta_base using fold 2 as out of fold test data.
Train data sizes: (94, 94).
Test data sizes: (14, 14).


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /home/joel_stremmel/.cache/huggingface/hub/models--roberta-base/snapshots/bc2764f8af2e92b6eb5679868df33e224075ca68/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from cache at /home/joel_stremm

Map:   0%|          | 0/94 [00:00<?, ? examples/s]

Map:   0%|          | 0/14 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 94
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 25
Saving model checkpoint to ./model_output/checkpoint-5
Configuration saved in ./model_output/checkpoint-5/config.json


{'loss': 0.8102, 'learning_rate': 1.25e-06, 'epoch': 0.85}


Model weights saved in ./model_output/checkpoint-5/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-10
Configuration saved in ./model_output/checkpoint-10/config.json


{'loss': 0.8082, 'learning_rate': 2.5e-06, 'epoch': 1.85}


Model weights saved in ./model_output/checkpoint-10/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-5] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-15
Configuration saved in ./model_output/checkpoint-15/config.json


{'loss': 0.8141, 'learning_rate': 3.7500000000000005e-06, 'epoch': 2.85}


Model weights saved in ./model_output/checkpoint-15/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-10] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-20
Configuration saved in ./model_output/checkpoint-20/config.json


{'loss': 0.7896, 'learning_rate': 5e-06, 'epoch': 3.85}


Model weights saved in ./model_output/checkpoint-20/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-15] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-25
Configuration saved in ./model_output/checkpoint-25/config.json


{'loss': 0.6803, 'learning_rate': 0.0, 'epoch': 4.85}


Model weights saved in ./model_output/checkpoint-25/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-20] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 14
  Batch size = 8


{'train_runtime': 288.9058, 'train_samples_per_second': 1.627, 'train_steps_per_second': 0.087, 'train_loss': 0.7804847240447998, 'epoch': 4.85}
Fitting model: roberta_base using fold 3 as out of fold test data.
Train data sizes: (99, 99).
Test data sizes: (9, 9).


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /home/joel_stremmel/.cache/huggingface/hub/models--roberta-base/snapshots/bc2764f8af2e92b6eb5679868df33e224075ca68/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from cache at /home/joel_stremm

Map:   0%|          | 0/99 [00:00<?, ? examples/s]

Map:   0%|          | 0/9 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 99
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.7066, 'learning_rate': 1.5e-06, 'epoch': 0.97}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-25] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.7179, 'learning_rate': 3e-06, 'epoch': 1.97}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.7111, 'learning_rate': 4.5e-06, 'epoch': 2.97}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.7068, 'learning_rate': 3e-06, 'epoch': 3.97}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.6779, 'learning_rate': 0.0, 'epoch': 4.97}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 9
  Batch size = 8


{'train_runtime': 311.5971, 'train_samples_per_second': 1.589, 'train_steps_per_second': 0.096, 'train_loss': 0.7040547370910645, 'epoch': 4.97}
Fitting model: roberta_base using fold 4 as out of fold test data.
Train data sizes: (95, 95).
Test data sizes: (13, 13).


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /home/joel_stremmel/.cache/huggingface/hub/models--roberta-base/snapshots/bc2764f8af2e92b6eb5679868df33e224075ca68/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from cache at /home/joel_stremm

Map:   0%|          | 0/95 [00:00<?, ? examples/s]

Map:   0%|          | 0/13 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 95
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 25
Saving model checkpoint to ./model_output/checkpoint-5
Configuration saved in ./model_output/checkpoint-5/config.json


{'loss': 0.8105, 'learning_rate': 1.25e-06, 'epoch': 0.84}


Model weights saved in ./model_output/checkpoint-5/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-10
Configuration saved in ./model_output/checkpoint-10/config.json


{'loss': 0.8092, 'learning_rate': 2.5e-06, 'epoch': 1.84}


Model weights saved in ./model_output/checkpoint-10/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-5] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-15
Configuration saved in ./model_output/checkpoint-15/config.json


{'loss': 0.8222, 'learning_rate': 3.7500000000000005e-06, 'epoch': 2.84}


Model weights saved in ./model_output/checkpoint-15/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-10] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-20
Configuration saved in ./model_output/checkpoint-20/config.json


{'loss': 0.804, 'learning_rate': 5e-06, 'epoch': 3.84}


Model weights saved in ./model_output/checkpoint-20/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-15] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-25
Configuration saved in ./model_output/checkpoint-25/config.json


{'loss': 0.674, 'learning_rate': 0.0, 'epoch': 4.84}


Model weights saved in ./model_output/checkpoint-25/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-20] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 13
  Batch size = 8


{'train_runtime': 291.7579, 'train_samples_per_second': 1.628, 'train_steps_per_second': 0.086, 'train_loss': 0.7839832305908203, 'epoch': 4.84}
Fitting model: roberta_base using fold 5 as out of fold test data.
Train data sizes: (101, 101).
Test data sizes: (7, 7).


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /home/joel_stremmel/.cache/huggingface/hub/models--roberta-base/snapshots/bc2764f8af2e92b6eb5679868df33e224075ca68/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from cache at /home/joel_stremm

Map:   0%|          | 0/101 [00:00<?, ? examples/s]

Map:   0%|          | 0/7 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 101
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.7198, 'learning_rate': 1.5e-06, 'epoch': 0.95}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-25] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.7206, 'learning_rate': 3e-06, 'epoch': 1.95}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.7179, 'learning_rate': 4.5e-06, 'epoch': 2.95}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.7245, 'learning_rate': 3e-06, 'epoch': 3.95}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.6731, 'learning_rate': 0.0, 'epoch': 4.95}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 7
  Batch size = 8


{'train_runtime': 316.4393, 'train_samples_per_second': 1.596, 'train_steps_per_second': 0.095, 'train_loss': 0.7111618041992187, 'epoch': 4.95}
Fitting model: roberta_base using fold 6 as out of fold test data.
Train data sizes: (96, 96).
Test data sizes: (12, 12).


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /home/joel_stremmel/.cache/huggingface/hub/models--roberta-base/snapshots/bc2764f8af2e92b6eb5679868df33e224075ca68/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from cache at /home/joel_stremm

Map:   0%|          | 0/96 [00:00<?, ? examples/s]

Map:   0%|          | 0/12 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 96
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.6904, 'learning_rate': 1.5e-06, 'epoch': 1.0}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.6957, 'learning_rate': 3e-06, 'epoch': 2.0}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.6997, 'learning_rate': 4.5e-06, 'epoch': 3.0}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.681, 'learning_rate': 3e-06, 'epoch': 4.0}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.6833, 'learning_rate': 0.0, 'epoch': 5.0}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 12
  Batch size = 8


{'train_runtime': 304.6435, 'train_samples_per_second': 1.576, 'train_steps_per_second': 0.098, 'train_loss': 0.6900257587432861, 'epoch': 5.0}
Fitting model: roberta_base using fold 7 as out of fold test data.
Train data sizes: (98, 98).
Test data sizes: (10, 10).


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /home/joel_stremmel/.cache/huggingface/hub/models--roberta-base/snapshots/bc2764f8af2e92b6eb5679868df33e224075ca68/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from cache at /home/joel_stremm

Map:   0%|          | 0/98 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 98
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.7072, 'learning_rate': 1.5e-06, 'epoch': 0.98}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.6987, 'learning_rate': 3e-06, 'epoch': 1.98}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.7069, 'learning_rate': 4.5e-06, 'epoch': 2.98}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.7028, 'learning_rate': 3e-06, 'epoch': 3.98}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.6834, 'learning_rate': 0.0, 'epoch': 4.98}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 10
  Batch size = 8


{'train_runtime': 309.6605, 'train_samples_per_second': 1.582, 'train_steps_per_second': 0.097, 'train_loss': 0.6997810999552408, 'epoch': 4.98}
Fitting model: roberta_base using fold 8 as out of fold test data.
Train data sizes: (104, 104).
Test data sizes: (4, 4).


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /home/joel_stremmel/.cache/huggingface/hub/models--roberta-base/snapshots/bc2764f8af2e92b6eb5679868df33e224075ca68/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from cache at /home/joel_stremm

Map:   0%|          | 0/104 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 104
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.7537, 'learning_rate': 1.5e-06, 'epoch': 0.92}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.7591, 'learning_rate': 3e-06, 'epoch': 1.92}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.755, 'learning_rate': 4.5e-06, 'epoch': 2.92}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.7459, 'learning_rate': 3e-06, 'epoch': 3.92}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.6771, 'learning_rate': 0.0, 'epoch': 4.92}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 4
  Batch size = 8


{'train_runtime': 324.5992, 'train_samples_per_second': 1.602, 'train_steps_per_second': 0.092, 'train_loss': 0.7381697813669841, 'epoch': 4.92}
Fitting model: roberta_base using fold 9 as out of fold test data.
Train data sizes: (97, 97).
Test data sizes: (11, 11).


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /home/joel_stremmel/.cache/huggingface/hub/models--roberta-base/snapshots/bc2764f8af2e92b6eb5679868df33e224075ca68/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from cache at /home/joel_stremm

Map:   0%|          | 0/97 [00:00<?, ? examples/s]

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 97
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.702, 'learning_rate': 1.5e-06, 'epoch': 0.99}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.6929, 'learning_rate': 3e-06, 'epoch': 1.99}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.6819, 'learning_rate': 4.5e-06, 'epoch': 2.99}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.6908, 'learning_rate': 3e-06, 'epoch': 3.99}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.6758, 'learning_rate': 0.0, 'epoch': 4.99}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 11
  Batch size = 8


{'train_runtime': 307.3984, 'train_samples_per_second': 1.578, 'train_steps_per_second': 0.098, 'train_loss': 0.6886796792348225, 'epoch': 4.99}


Could not locate the tokenizer configuration file, will try to use the model config instead.


Fitting model: roberta_base using fold 10 as out of fold test data.
Train data sizes: (101, 101).
Test data sizes: (7, 7).


loading configuration file config.json from cache at /home/joel_stremmel/.cache/huggingface/hub/models--roberta-base/snapshots/bc2764f8af2e92b6eb5679868df33e224075ca68/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from cache at /home/joel_stremmel/.cache/huggingface/hub/models--roberta-base/snapshots/bc2764f8af2e92b6eb5679868df33e224075

Map:   0%|          | 0/101 [00:00<?, ? examples/s]

Map:   0%|          | 0/7 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 101
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.7206, 'learning_rate': 1.5e-06, 'epoch': 0.95}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.7304, 'learning_rate': 3e-06, 'epoch': 1.95}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.7228, 'learning_rate': 4.5e-06, 'epoch': 2.95}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.7245, 'learning_rate': 3e-06, 'epoch': 3.95}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.69, 'learning_rate': 0.0, 'epoch': 4.95}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 7
  Batch size = 8


{'train_runtime': 317.6231, 'train_samples_per_second': 1.59, 'train_steps_per_second': 0.094, 'train_loss': 0.7176705837249756, 'epoch': 4.95}
Fitting model: roberta_pysch using fold 0 as out of fold test data.
Train data sizes: (99, 99).
Test data sizes: (9, 9).


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /home/joel_stremmel/.cache/huggingface/hub/models--mlaricheva--roberta-psych/snapshots/18f4eb3ec5e26053f262d6e19f98ede75673ff33/config.json
Model config RobertaConfig {
  "_name_or_path": "mlaricheva/roberta-psych",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from c

Map:   0%|          | 0/99 [00:00<?, ? examples/s]

Map:   0%|          | 0/9 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 99
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.7063, 'learning_rate': 1.5e-06, 'epoch': 0.97}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.7147, 'learning_rate': 2.7500000000000004e-06, 'epoch': 1.97}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.7004, 'learning_rate': 4.25e-06, 'epoch': 2.97}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.7025, 'learning_rate': 3.5e-06, 'epoch': 3.97}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.6661, 'learning_rate': 5.000000000000001e-07, 'epoch': 4.97}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 9
  Batch size = 8


{'train_runtime': 311.4403, 'train_samples_per_second': 1.589, 'train_steps_per_second': 0.096, 'train_loss': 0.6980064153671265, 'epoch': 4.97}
Fitting model: roberta_pysch using fold 1 as out of fold test data.
Train data sizes: (96, 96).
Test data sizes: (12, 12).


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /home/joel_stremmel/.cache/huggingface/hub/models--mlaricheva--roberta-psych/snapshots/18f4eb3ec5e26053f262d6e19f98ede75673ff33/config.json
Model config RobertaConfig {
  "_name_or_path": "mlaricheva/roberta-psych",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from c

Map:   0%|          | 0/96 [00:00<?, ? examples/s]

Map:   0%|          | 0/12 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 96
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.6942, 'learning_rate': 1.25e-06, 'epoch': 1.0}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.6911, 'learning_rate': 2.7500000000000004e-06, 'epoch': 2.0}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.6981, 'learning_rate': 4.25e-06, 'epoch': 3.0}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.6822, 'learning_rate': 3.5e-06, 'epoch': 4.0}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.6765, 'learning_rate': 5.000000000000001e-07, 'epoch': 5.0}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 12
  Batch size = 8


{'train_runtime': 304.733, 'train_samples_per_second': 1.575, 'train_steps_per_second': 0.098, 'train_loss': 0.6884315490722657, 'epoch': 5.0}


Could not locate the tokenizer configuration file, will try to use the model config instead.


Fitting model: roberta_pysch using fold 2 as out of fold test data.
Train data sizes: (94, 94).
Test data sizes: (14, 14).


loading configuration file config.json from cache at /home/joel_stremmel/.cache/huggingface/hub/models--mlaricheva--roberta-psych/snapshots/18f4eb3ec5e26053f262d6e19f98ede75673ff33/config.json
Model config RobertaConfig {
  "_name_or_path": "mlaricheva/roberta-psych",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from cache at /home/joel_stremmel/.cache/huggingface/hub/models--mlaricheva--roberta-psych/snapshot

Map:   0%|          | 0/94 [00:00<?, ? examples/s]

Map:   0%|          | 0/14 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 94
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 25
Saving model checkpoint to ./model_output/checkpoint-5
Configuration saved in ./model_output/checkpoint-5/config.json


{'loss': 0.8113, 'learning_rate': 1.25e-06, 'epoch': 0.85}


Model weights saved in ./model_output/checkpoint-5/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-10
Configuration saved in ./model_output/checkpoint-10/config.json


{'loss': 0.808, 'learning_rate': 2.5e-06, 'epoch': 1.85}


Model weights saved in ./model_output/checkpoint-10/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-5] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-15
Configuration saved in ./model_output/checkpoint-15/config.json


{'loss': 0.8126, 'learning_rate': 3.7500000000000005e-06, 'epoch': 2.85}


Model weights saved in ./model_output/checkpoint-15/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-10] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-20
Configuration saved in ./model_output/checkpoint-20/config.json


{'loss': 0.7886, 'learning_rate': 5e-06, 'epoch': 3.85}


Model weights saved in ./model_output/checkpoint-20/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-15] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-25
Configuration saved in ./model_output/checkpoint-25/config.json


{'loss': 0.68, 'learning_rate': 1.0000000000000002e-06, 'epoch': 4.85}


Model weights saved in ./model_output/checkpoint-25/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-20] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 14
  Batch size = 8


{'train_runtime': 288.6112, 'train_samples_per_second': 1.628, 'train_steps_per_second': 0.087, 'train_loss': 0.780085563659668, 'epoch': 4.85}
Fitting model: roberta_pysch using fold 3 as out of fold test data.
Train data sizes: (99, 99).
Test data sizes: (9, 9).


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /home/joel_stremmel/.cache/huggingface/hub/models--mlaricheva--roberta-psych/snapshots/18f4eb3ec5e26053f262d6e19f98ede75673ff33/config.json
Model config RobertaConfig {
  "_name_or_path": "mlaricheva/roberta-psych",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from c

Map:   0%|          | 0/99 [00:00<?, ? examples/s]

Map:   0%|          | 0/9 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 99
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.7074, 'learning_rate': 1.5e-06, 'epoch': 0.97}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-25] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.7166, 'learning_rate': 3e-06, 'epoch': 1.97}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.7109, 'learning_rate': 4.5e-06, 'epoch': 2.97}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.7073, 'learning_rate': 3e-06, 'epoch': 3.97}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.6782, 'learning_rate': 0.0, 'epoch': 4.97}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 9
  Batch size = 8


{'train_runtime': 311.8874, 'train_samples_per_second': 1.587, 'train_steps_per_second': 0.096, 'train_loss': 0.7040635108947754, 'epoch': 4.97}


Could not locate the tokenizer configuration file, will try to use the model config instead.


Fitting model: roberta_pysch using fold 4 as out of fold test data.
Train data sizes: (95, 95).
Test data sizes: (13, 13).


loading configuration file config.json from cache at /home/joel_stremmel/.cache/huggingface/hub/models--mlaricheva--roberta-psych/snapshots/18f4eb3ec5e26053f262d6e19f98ede75673ff33/config.json
Model config RobertaConfig {
  "_name_or_path": "mlaricheva/roberta-psych",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from cache at /home/joel_stremmel/.cache/huggingface/hub/models--mlaricheva--roberta-psych/snapshot

Map:   0%|          | 0/95 [00:00<?, ? examples/s]

Map:   0%|          | 0/13 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 95
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 25
Saving model checkpoint to ./model_output/checkpoint-5
Configuration saved in ./model_output/checkpoint-5/config.json


{'loss': 0.8119, 'learning_rate': 1.25e-06, 'epoch': 0.84}


Model weights saved in ./model_output/checkpoint-5/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-10
Configuration saved in ./model_output/checkpoint-10/config.json


{'loss': 0.8085, 'learning_rate': 2.5e-06, 'epoch': 1.84}


Model weights saved in ./model_output/checkpoint-10/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-5] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-15
Configuration saved in ./model_output/checkpoint-15/config.json


{'loss': 0.8229, 'learning_rate': 3.7500000000000005e-06, 'epoch': 2.84}


Model weights saved in ./model_output/checkpoint-15/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-10] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-20
Configuration saved in ./model_output/checkpoint-20/config.json


{'loss': 0.8043, 'learning_rate': 5e-06, 'epoch': 3.84}


Model weights saved in ./model_output/checkpoint-20/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-15] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-25
Configuration saved in ./model_output/checkpoint-25/config.json


{'loss': 0.6755, 'learning_rate': 0.0, 'epoch': 4.84}


Model weights saved in ./model_output/checkpoint-25/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-20] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 13
  Batch size = 8


{'train_runtime': 291.1296, 'train_samples_per_second': 1.632, 'train_steps_per_second': 0.086, 'train_loss': 0.784617395401001, 'epoch': 4.84}
Fitting model: roberta_pysch using fold 5 as out of fold test data.
Train data sizes: (101, 101).
Test data sizes: (7, 7).


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /home/joel_stremmel/.cache/huggingface/hub/models--mlaricheva--roberta-psych/snapshots/18f4eb3ec5e26053f262d6e19f98ede75673ff33/config.json
Model config RobertaConfig {
  "_name_or_path": "mlaricheva/roberta-psych",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from c

Map:   0%|          | 0/101 [00:00<?, ? examples/s]

Map:   0%|          | 0/7 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 101
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.7212, 'learning_rate': 1.5e-06, 'epoch': 0.95}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-25] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.7203, 'learning_rate': 3e-06, 'epoch': 1.95}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.7174, 'learning_rate': 4.5e-06, 'epoch': 2.95}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.7242, 'learning_rate': 3e-06, 'epoch': 3.95}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.6725, 'learning_rate': 0.0, 'epoch': 4.95}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 7
  Batch size = 8


{'train_runtime': 316.6862, 'train_samples_per_second': 1.595, 'train_steps_per_second': 0.095, 'train_loss': 0.7110920270284017, 'epoch': 4.95}
Fitting model: roberta_pysch using fold 6 as out of fold test data.
Train data sizes: (96, 96).
Test data sizes: (12, 12).


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /home/joel_stremmel/.cache/huggingface/hub/models--mlaricheva--roberta-psych/snapshots/18f4eb3ec5e26053f262d6e19f98ede75673ff33/config.json
Model config RobertaConfig {
  "_name_or_path": "mlaricheva/roberta-psych",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from c

Map:   0%|          | 0/96 [00:00<?, ? examples/s]

Map:   0%|          | 0/12 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 96
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.6914, 'learning_rate': 1.25e-06, 'epoch': 1.0}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.6962, 'learning_rate': 2.7500000000000004e-06, 'epoch': 2.0}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.6982, 'learning_rate': 4.25e-06, 'epoch': 3.0}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.6819, 'learning_rate': 3.5e-06, 'epoch': 4.0}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.685, 'learning_rate': 5.000000000000001e-07, 'epoch': 5.0}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 12
  Batch size = 8


{'train_runtime': 303.9639, 'train_samples_per_second': 1.579, 'train_steps_per_second': 0.099, 'train_loss': 0.6905320962270101, 'epoch': 5.0}


Could not locate the tokenizer configuration file, will try to use the model config instead.


Fitting model: roberta_pysch using fold 7 as out of fold test data.
Train data sizes: (98, 98).
Test data sizes: (10, 10).


loading configuration file config.json from cache at /home/joel_stremmel/.cache/huggingface/hub/models--mlaricheva--roberta-psych/snapshots/18f4eb3ec5e26053f262d6e19f98ede75673ff33/config.json
Model config RobertaConfig {
  "_name_or_path": "mlaricheva/roberta-psych",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from cache at /home/joel_stremmel/.cache/huggingface/hub/models--mlaricheva--roberta-psych/snapshot

Map:   0%|          | 0/98 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 98
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.7079, 'learning_rate': 1.5e-06, 'epoch': 0.98}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.6989, 'learning_rate': 3e-06, 'epoch': 1.98}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.7077, 'learning_rate': 4.5e-06, 'epoch': 2.98}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.702, 'learning_rate': 3e-06, 'epoch': 3.98}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.6841, 'learning_rate': 0.0, 'epoch': 4.98}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 10
  Batch size = 8


{'train_runtime': 309.7643, 'train_samples_per_second': 1.582, 'train_steps_per_second': 0.097, 'train_loss': 0.7001181443532308, 'epoch': 4.98}
Fitting model: roberta_pysch using fold 8 as out of fold test data.
Train data sizes: (104, 104).
Test data sizes: (4, 4).


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /home/joel_stremmel/.cache/huggingface/hub/models--mlaricheva--roberta-psych/snapshots/18f4eb3ec5e26053f262d6e19f98ede75673ff33/config.json
Model config RobertaConfig {
  "_name_or_path": "mlaricheva/roberta-psych",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from c

Map:   0%|          | 0/104 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 104
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.7535, 'learning_rate': 1.5e-06, 'epoch': 0.92}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.7583, 'learning_rate': 3e-06, 'epoch': 1.92}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.7541, 'learning_rate': 4.5e-06, 'epoch': 2.92}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.7461, 'learning_rate': 3e-06, 'epoch': 3.92}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.679, 'learning_rate': 0.0, 'epoch': 4.92}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 4
  Batch size = 8


{'train_runtime': 324.0905, 'train_samples_per_second': 1.604, 'train_steps_per_second': 0.093, 'train_loss': 0.7381976127624512, 'epoch': 4.92}
Fitting model: roberta_pysch using fold 9 as out of fold test data.
Train data sizes: (97, 97).
Test data sizes: (11, 11).


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /home/joel_stremmel/.cache/huggingface/hub/models--mlaricheva--roberta-psych/snapshots/18f4eb3ec5e26053f262d6e19f98ede75673ff33/config.json
Model config RobertaConfig {
  "_name_or_path": "mlaricheva/roberta-psych",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from c

Map:   0%|          | 0/97 [00:00<?, ? examples/s]

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 97
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.7018, 'learning_rate': 1.5e-06, 'epoch': 0.99}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.6936, 'learning_rate': 3e-06, 'epoch': 1.99}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.682, 'learning_rate': 4.5e-06, 'epoch': 2.99}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.6908, 'learning_rate': 3e-06, 'epoch': 3.99}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.6751, 'learning_rate': 0.0, 'epoch': 4.99}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 11
  Batch size = 8


{'train_runtime': 306.7392, 'train_samples_per_second': 1.581, 'train_steps_per_second': 0.098, 'train_loss': 0.6886349519093832, 'epoch': 4.99}


Could not locate the tokenizer configuration file, will try to use the model config instead.


Fitting model: roberta_pysch using fold 10 as out of fold test data.
Train data sizes: (101, 101).
Test data sizes: (7, 7).


loading configuration file config.json from cache at /home/joel_stremmel/.cache/huggingface/hub/models--mlaricheva--roberta-psych/snapshots/18f4eb3ec5e26053f262d6e19f98ede75673ff33/config.json
Model config RobertaConfig {
  "_name_or_path": "mlaricheva/roberta-psych",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from cache at /home/joel_stremmel/.cache/huggingface/hub/models--mlaricheva--roberta-psych/snapshot

Map:   0%|          | 0/101 [00:00<?, ? examples/s]

Map:   0%|          | 0/7 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 101
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.7218, 'learning_rate': 1.5e-06, 'epoch': 0.95}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.7295, 'learning_rate': 3e-06, 'epoch': 1.95}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.7227, 'learning_rate': 4.5e-06, 'epoch': 2.95}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.7249, 'learning_rate': 3e-06, 'epoch': 3.95}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.69, 'learning_rate': 5.000000000000001e-07, 'epoch': 4.95}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 7
  Batch size = 8


{'train_runtime': 316.5682, 'train_samples_per_second': 1.595, 'train_steps_per_second': 0.095, 'train_loss': 0.7178015073140462, 'epoch': 4.95}


##### Unassign Runtime if Running on Colab

In [12]:
# Final step: release the Colab runtime, but only when the notebook was
# configured to run on Colab via params["env"]["colab"]. Otherwise this cell does nothing.
on_colab = params["env"]["colab"]
if on_colab:
    # Imported inside the guard so the import is never attempted off Colab
    # (presumably google.colab is only installed there; verify if this changes).
    from google.colab import runtime as colab_runtime

    colab_runtime.unassign()