### Fine-Tune Language Models

- Joel Stremmel
- 01-11-24

##### About

Fine-tune pretrained language models on the formatted data using K-fold cross-validation and save the out-of-fold scores and labels.

##### Set Parameters
Choose the outcome to predict, list the models to fine-tune (with their per-model settings), and set the training parameters shared by all models.

In [1]:
# Warmup steps by experiment: 20 for the 5- and 10-epoch models, 10 for the 3-epoch models
outcome = "cohesion"  # other option: "Alliance"

# Settings common to every RoBERTa-base encoder listed under "models";
# each model entry gets its own copy via ** unpacking
_roberta_base_settings = {
    "max_seq_len": 512,
    "fp16": True,
    "batch_size": 1,
    "accumulation_steps": 16,
    "gradient_checkpointing": True,
    "type": "mlm",
}

params = dict(
    # Runtime environment switches
    env=dict(colab=False, require_high_ram=True),
    # Whether to load the "Xwsum" input folds instead of the plain "X" folds
    data=dict(add_summaries=True),
    # Optimizer and schedule settings shared by all models
    training=dict(
        lr=5e-6,
        weight_decay=0.01,
        adam_beta1=0.9,
        adam_beta2=0.999,
        adam_epsilon=1e-8,
        warmup_steps=20,
        num_workers=2,
        epochs=5,
        logging_strategy="epoch",
    ),
    # Checkpointing settings
    evaluation=dict(save_strategy="epoch", save_total_limit=1),
    # Models to fine-tune, keyed by the name used in the saved results
    models={
        "mental_roberta_base": {
            "path": "./models/mental-roberta-base",
            **_roberta_base_settings,
        },
        "roberta_base": {
            "path": "roberta-base",
            **_roberta_base_settings,
        },
        "roberta_pysch": {
            "path": "mlaricheva/roberta-psych",
            **_roberta_base_settings,
        },
    },
    # Input and output locations
    io=dict(
        results_dir="./results_fixed_epochs_no_val_5_summaries",
        input_dir="./data",
        model_output_dir="./model_output",
    ),
    # Data augmentation settings
    augmentation=dict(
        add_synthetic=False,
        aug_p=0.2,
        glove_file="data/glove.6B.50d.txt",
        glove_zip="data/glove.6B.zip",
        glove_url="http://nlp.stanford.edu/data/glove.6B.zip",
    ),
    random=dict(seed=42),
)

In [2]:
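# Optional idea: PEFT/LoRA could reduce GPU memory use. The sketch below is NOT used by the training loop in this notebook; its task_type and target_modules would need to be adapted to the model being fine-tuned (e.g. task_type="SEQ_CLS" and target_modules=["query", "value"] for a RoBERTa sequence classifier).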

# from peft import LoraConfig, get_peft_model 

# config = LoraConfig(
#     r=16,
#     lora_alpha=32,
#     lora_dropout=0.05,
#     bias="none",
#     task_type="CAUSAL_LM",
#     target_modules=["q_lin", "v_lin"],
    
# )

# model = get_peft_model(model, config)

##### Mount Google Drive, Install Requirements, and set Cache if Using Colab

In [3]:
if params["env"]["colab"]:

    import os
    from google.colab import drive

    # Mount
    drive.mount("/content/drive")

    # Install packages
    !pip install -q -r "/content/drive/MyDrive/nlp4psychotherapy/requirements.txt"

    # Set HF cache
    os.environ['TRANSFORMERS_CACHE'] = '/content/drive/MyDrive/hf_cache'
    os.environ['HF_DATASETS_CACHE'] = '/content/drive/MyDrive/hf_cache'

##### Check Colab Runtime

In [4]:
if params["env"]["colab"]:
  
    gpu_info = !nvidia-smi
    gpu_info = "\n".join(gpu_info)
    if gpu_info.find("failed") >= 0:
        print("Not connected to a GPU")
    else:
        print(gpu_info)

if params["env"]["require_high_ram"]:

    from psutil import virtual_memory
    ram_gb = virtual_memory().total / 1e9
    print("Your runtime has {:.1f} gigabytes of available RAM\n".format(ram_gb))

    if ram_gb < 20:
        print("Not using a high-RAM runtime")
    else:
        print("You are using a high-RAM runtime!")

Your runtime has 33.6 gigabytes of available RAM

You are using a high-RAM runtime!


##### Imports

In [5]:
import os
import re
import glob
import pickle
import torch
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    GPT2ForSequenceClassification,
    GPTNeoForSequenceClassification,
    AutoModelForSeq2SeqLM,
    AutoModelForCausalLM,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback,
)

##### Disable Tokenizer Parallelism
This is mostly to avoid warnings.

In [6]:
# Set TOKENIZERS_PARALLELISM to "false" for this process and any child processes.
# NOTE(review): presumably this silences the tokenizers fork/parallelism warning
# that appears when DataLoader workers are used — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

##### Load Formatted Data

In [7]:
# The "add_summaries" flag selects the "_Xwsum_" feature file over the plain "_X_" file
if params["data"]["add_summaries"]:
    X_filename = f"{outcome}_Xwsum_folds.pkl"
else:
    X_filename = f"{outcome}_X_folds.pkl"
y_filename = f"{outcome}_y_folds.pkl"

# Load the pickled folds from the input directory.
# NOTE: pickle can execute arbitrary code on load; only read files you created yourself.
X_path = os.path.join(params["io"]["input_dir"], X_filename)
with open(X_path, "rb") as f:
    X_folds = pickle.load(f)

y_path = os.path.join(params["io"]["input_dir"], y_filename)
with open(y_path, "rb") as f:
    y_folds = pickle.load(f)

##### Check Data Shape

In [8]:
# Both fold containers must hold the same number of folds
n_X_folds, n_y_folds = len(X_folds), len(y_folds)
assert n_X_folds == n_y_folds, "Expected the same number of folds in X and y."

# Drop the keys; from here on folds are paired by their position in these lists
X = [fold for fold in X_folds.values()]
y = [fold for fold in y_folds.values()]

##### Check Target Prevalence

In [9]:
# Mean of the labels pooled over all folds, rounded to three decimals
target_prevalence = round(np.mean(np.concatenate(y)), 3)
print(f"Target prevalance: {target_prevalence}.")

Target prevalance: 0.607.


##### Check that GPU is Available

In [10]:
# Stop here unless PyTorch can see a CUDA device
cuda_ok = torch.cuda.is_available()
assert cuda_ok, "Run this script on a GPU."

# Record the PyTorch build used for this run
print(torch.__version__)

1.8.1+cu101


##### Select and Preprocess Text and Fit Model to Each Data Fold

In [11]:
# Fine-tune every configured model once per fold, holding that fold out as test
# data, and collect the out-of-fold scores and labels:
#   y_probs[model][i] -> scores predicted for fold i
#   y_trues[model][i] -> true labels of fold i
# Both dictionaries are pickled to the results directory at the end.
y_probs, y_trues = {}, {}
for model in params["models"].keys():

    # Settings of the model currently being trained
    model_cfg = params["models"][model]

    # Maximum target length for seq2seq models; none of the configs is required to
    # define it, so fall back to 20 instead of failing with a KeyError
    max_target_len = model_cfg.get("output_max_seq_len", 20)

    # Seeded generator so that the fold shuffles (and the random fallback
    # prediction for seq2seq models) are reproducible and independent of which
    # other models are listed in the config
    fold_rng = np.random.RandomState(params["random"]["seed"])

    y_probs[model], y_trues[model] = [], []
    for i in range(len(X)):

        # Print model and fold
        print(f"Fitting model: {model} using fold {i} as out of fold test data.")

        # Identify train and test folds: fold i is held out, the rest is training data
        X_train_temp, y_train_temp = X[0:i] + X[i + 1 :], y[0:i] + y[i + 1 :]
        X_test, y_test = X[i], y[i]

        X_train = np.concatenate(X_train_temp, axis=0)
        y_train = np.concatenate(y_train_temp, axis=0)

        # Shuffle training data (texts and labels with the same permutation)
        indices = np.arange(len(y_train))
        fold_rng.shuffle(indices)
        X_train, y_train = X_train[indices], y_train[indices]

        # Print data shapes
        print(f"Train data sizes: {len(X_train), len(y_train)}.")
        print(f"Test data sizes: {len(X_test), len(y_test)}.")

        # Format text and label data as HuggingFace dataset
        if model_cfg["type"] == "seq2seq":
            # Seq2seq models predict the label as text, so labels are stringified
            train_dataset = Dataset.from_dict(
                {"text": X_train, "label_ids": [str(label) for label in y_train]}
            )
            test_dataset = Dataset.from_dict(
                {"text": X_test, "label_ids": [str(label) for label in y_test]}
            )

        else:
            train_dataset = Dataset.from_dict({"text": X_train, "label": y_train})
            test_dataset = Dataset.from_dict({"text": X_test, "label": y_test})

        # Load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_cfg["path"])

        # Load model by model type
        if model_cfg["type"] == "mlm":

            # Load masked language model with a sequence classification head
            lm = AutoModelForSequenceClassification.from_pretrained(
                model_cfg["path"],
                num_labels=2,
                return_dict=True,
                problem_type="single_label_classification",
            )

        elif model_cfg["type"] == "gpt":

            # Use the end of sentence token as a pad token for GPT models
            tokenizer.pad_token = tokenizer.eos_token

            if model == "gpt2":

                # Load GPT-2
                lm = GPT2ForSequenceClassification.from_pretrained(
                    model_cfg["path"],
                    num_labels=2,
                    return_dict=True,
                    problem_type="single_label_classification",
                )

            elif "gpt_neo" in model:

                # Load a GPT Neo version
                lm = GPTNeoForSequenceClassification.from_pretrained(
                    model_cfg["path"],
                    num_labels=2,
                    return_dict=True,
                    problem_type="single_label_classification",
                )

            else:
                raise ValueError("Expected GPT model to be gpt2 or a gpt_neo version.")

            # The classification head locates the last non-pad token through the
            # model config, so the pad token must be set there as well; otherwise
            # batches with more than one example are rejected
            lm.config.pad_token_id = tokenizer.pad_token_id

        elif model_cfg["type"] == "seq2seq":
            lm = AutoModelForSeq2SeqLM.from_pretrained(model_cfg["path"])
        elif model_cfg["type"] == "causal":
            # NOTE(review): "causal" models go through the classification
            # prediction path below; confirm that this is intended before use.
            lm = AutoModelForCausalLM.from_pretrained(model_cfg["path"])
        else:
            # Previously this message referenced an undefined name (`size`) and
            # raised NameError instead of the intended ValueError
            raise ValueError(f"Unexpected model type: {model_cfg['type']}.")

        # Define function to preprocess and tokenize text
        if model_cfg["type"] == "seq2seq":

            def preprocess_function(
                sample, padding="max_length", output_max_seq_len=max_target_len
            ):
                """Tokenize prompted inputs and text targets for a seq2seq model.

                Args:
                    sample: batch with "text" (inputs) and "label_ids" (labels as strings).
                    padding: padding strategy passed to the tokenizer.
                    output_max_seq_len: maximum number of target tokens.

                Returns:
                    The tokenized inputs with the tokenized targets stored under
                    "label_ids"; pad positions in the targets are replaced by -100
                    when padding to max length.
                """

                # Add prefix to the input for t5
                inputs = [
                    "Classify this text as either 1 or 0: " + item
                    for item in sample["text"]
                ]

                # tokenize inputs
                model_inputs = tokenizer(
                    inputs,
                    max_length=model_cfg["max_seq_len"],
                    padding=padding,
                    truncation=True,
                )

                # Tokenize targets with the `text_target` keyword argument
                labels = tokenizer(
                    text_target=sample["label_ids"],
                    max_length=output_max_seq_len,
                    padding=padding,
                    truncation=True,
                )

                # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
                # padding in the loss.
                if padding == "max_length":
                    labels["input_ids"] = [
                        [(l if l != tokenizer.pad_token_id else -100) for l in label]
                        for label in labels["input_ids"]
                    ]

                model_inputs["label_ids"] = labels["input_ids"]

                return model_inputs

        else:

            def preprocess_function(batch):
                """Tokenize a batch of texts, padded/truncated to the model's max length."""
                return tokenizer(
                    batch["text"],
                    padding="max_length",
                    truncation=True,
                    max_length=model_cfg["max_seq_len"],
                )

        # Preprocess datasets
        train_dataset = train_dataset.map(
            preprocess_function,
            batched=True,
            remove_columns=["text"],
            batch_size=model_cfg["batch_size"],
        )
        train_dataset.set_format("pt")
        test_dataset = test_dataset.map(
            preprocess_function,
            batched=True,
            remove_columns=["text"],
            batch_size=model_cfg["batch_size"],
        )
        test_dataset.set_format("pt")

        # Define training arguments
        training_args = TrainingArguments(
            output_dir=params["io"]["model_output_dir"],
            num_train_epochs=params["training"]["epochs"],
            warmup_steps=params["training"]["warmup_steps"],
            weight_decay=params["training"]["weight_decay"],
            learning_rate=params["training"]["lr"],
            adam_beta1=params["training"]["adam_beta1"],
            adam_beta2=params["training"]["adam_beta2"],
            adam_epsilon=params["training"]["adam_epsilon"],
            dataloader_num_workers=params["training"]["num_workers"],
            logging_strategy=params["training"]["logging_strategy"],
            seed=params["random"]["seed"],
            # run_name is a descriptive string; use the model's name rather than
            # its whole config dictionary
            run_name=model,
            fp16=model_cfg["fp16"],
            gradient_checkpointing=model_cfg["gradient_checkpointing"],
            per_device_train_batch_size=model_cfg["batch_size"],
            gradient_accumulation_steps=model_cfg["accumulation_steps"],
            evaluation_strategy='no',
            save_strategy=params["evaluation"]["save_strategy"],
            save_total_limit=params["evaluation"]["save_total_limit"],
            lr_scheduler_type="linear",
            optim="adamw_torch",
            prediction_loss_only=False,
            load_best_model_at_end=False,
            disable_tqdm=True,
            logging_dir=None,
        )

        # Define special training arguments
        if model_cfg["type"] == "seq2seq":
            training_args.generation_max_length = max_target_len
            training_args.predict_with_generate = True
            training_args.generation_num_beams = None

        # Define trainer (no validation set and no callbacks: fixed number of epochs)
        if model_cfg["type"] == "seq2seq":
            trainer = Seq2SeqTrainer(
                model=lm,
                args=training_args,
                train_dataset=train_dataset,
                callbacks=[],
            )
        else:
            trainer = Trainer(
                model=lm,
                args=training_args,
                train_dataset=train_dataset,
                callbacks=[],
            )

        # Train model
        trainer.train()

        # Predict on test dataset for seq2seq models
        if model_cfg["type"] == "seq2seq":

            # Predict on test dataset with greedy generation
            output = trainer.predict(
                test_dataset,
                do_sample=False,
                max_length=max_target_len,
                early_stopping=True,
            )
            preds_decoded = tokenizer.batch_decode(
                output.predictions, skip_special_tokens=True
            )
            # Restore pad tokens where -100 was used so the labels can be decoded
            labels = np.where(
                output.label_ids != -100, output.label_ids, tokenizer.pad_token_id
            )
            labels_decoded = tokenizer.batch_decode(labels, skip_special_tokens=False)

            # Convert preds to ints
            # We allow additional characters to be generated but check
            # that the first one is a 1 or 0
            preds = []
            for pred in preds_decoded:
                # Slicing (instead of indexing) keeps an empty generation from
                # raising IndexError; it falls through to the random fallback
                first_char = pred[:1]
                if first_char == "1":
                    preds.append(1)
                elif first_char == "0":
                    preds.append(0)
                else:
                    print(f"Got unexpected pred: {pred}.")
                    preds.append(fold_rng.choice([0, 1]))

            # Save scores and labels
            # The labels may contain additional characters, but the first should be
            # a 1 or 0
            y_probs[model].append(preds)
            y_trues[model].append([int(label[0]) for label in labels_decoded])

        # Predict on test set for other model types
        else:
            # Generate scores
            output = trainer.predict(test_dataset)
            labels = output.label_ids

            # The classification heads above are trained as single-label
            # classifiers over two classes, so the probability of class 1 is the
            # softmax over both logits (an element-wise sigmoid ignores the
            # class-0 logit and is not a calibrated class probability)
            logits = torch.tensor(output.predictions).double()
            y_prob = torch.softmax(logits, dim=-1).numpy()[:, 1]

            # Save scores and labels
            y_probs[model].append(y_prob)
            y_trues[model].append(labels)

        # Empty cuda cache
        torch.cuda.empty_cache()

# Result files are tagged with "sum_" when the inputs included summaries
if params['data']['add_summaries']:
    sums = 'sum_'
else:
    sums = ''

# Make sure the results directory exists before writing to it
os.makedirs(params["io"]["results_dir"], exist_ok=True)

# Save results
with open(os.path.join(params["io"]["results_dir"], f"{outcome}_{sums}lm_y_trues.pkl"), "wb") as f:
    pickle.dump(y_trues, f)

with open(os.path.join(params["io"]["results_dir"], f"{outcome}_{sums}lm_y_probs.pkl"), "wb") as f:
    pickle.dump(y_probs, f)

Fitting model: mental_roberta_base using fold 0 as out of fold test data.
Train data sizes: (106, 106).
Test data sizes: (11, 11).


Some weights of the model checkpoint at ./models/mental-roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ./models/mental-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifi

Map:   0%|          | 0/106 [00:00<?, ? examples/s]

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

Using cuda_amp half precision backend
***** Running training *****
  Num examples = 106
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.7413, 'learning_rate': 1.5e-06, 'epoch': 0.91}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.742, 'learning_rate': 3e-06, 'epoch': 1.91}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.7524, 'learning_rate': 4.5e-06, 'epoch': 2.91}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.7473, 'learning_rate': 3e-06, 'epoch': 3.91}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.6489, 'learning_rate': 0.0, 'epoch': 4.91}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 11
  Batch size = 8


{'train_runtime': 371.439, 'train_samples_per_second': 1.427, 'train_steps_per_second': 0.081, 'train_loss': 0.7263916015625, 'epoch': 4.91}


loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file ./models/mental-roberta-base/config.json
Model config RobertaConfig {
  "_name_or_path": "./models/mental-roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,

Fitting model: mental_roberta_base using fold 1 as out of fold test data.
Train data sizes: (102, 102).
Test data sizes: (15, 15).


Some weights of the model checkpoint at ./models/mental-roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ./models/mental-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifi

Map:   0%|          | 0/102 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 102
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.7364, 'learning_rate': 1.5e-06, 'epoch': 0.94}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.7103, 'learning_rate': 3e-06, 'epoch': 1.94}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.7225, 'learning_rate': 4.5e-06, 'epoch': 2.94}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.716, 'learning_rate': 3e-06, 'epoch': 3.94}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.6571, 'learning_rate': 0.0, 'epoch': 4.94}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 15
  Batch size = 8


{'train_runtime': 353.7548, 'train_samples_per_second': 1.442, 'train_steps_per_second': 0.085, 'train_loss': 0.7084547201792399, 'epoch': 4.94}


loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file ./models/mental-roberta-base/config.json
Model config RobertaConfig {
  "_name_or_path": "./models/mental-roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,

Fitting model: mental_roberta_base using fold 2 as out of fold test data.
Train data sizes: (103, 103).
Test data sizes: (14, 14).


Some weights of the model checkpoint at ./models/mental-roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ./models/mental-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifi

Map:   0%|          | 0/103 [00:00<?, ? examples/s]

Map:   0%|          | 0/14 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 103
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.7472, 'learning_rate': 1.5e-06, 'epoch': 0.93}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.7495, 'learning_rate': 3e-06, 'epoch': 1.93}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.7353, 'learning_rate': 4.5e-06, 'epoch': 2.93}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.7334, 'learning_rate': 3e-06, 'epoch': 3.93}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.6658, 'learning_rate': 0.0, 'epoch': 4.93}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 14
  Batch size = 8


{'train_runtime': 322.0986, 'train_samples_per_second': 1.599, 'train_steps_per_second': 0.093, 'train_loss': 0.7262441873550415, 'epoch': 4.93}


loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file ./models/mental-roberta-base/config.json
Model config RobertaConfig {
  "_name_or_path": "./models/mental-roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,

Fitting model: mental_roberta_base using fold 3 as out of fold test data.
Train data sizes: (107, 107).
Test data sizes: (10, 10).


Some weights of the model checkpoint at ./models/mental-roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ./models/mental-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifi

Map:   0%|          | 0/107 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 107
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.7809, 'learning_rate': 1.5e-06, 'epoch': 0.9}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.7734, 'learning_rate': 3e-06, 'epoch': 1.9}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.7626, 'learning_rate': 4.5e-06, 'epoch': 2.9}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.7566, 'learning_rate': 3e-06, 'epoch': 3.9}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.6811, 'learning_rate': 0.0, 'epoch': 4.9}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 10
  Batch size = 8


{'train_runtime': 333.0063, 'train_samples_per_second': 1.607, 'train_steps_per_second': 0.09, 'train_loss': 0.7509049892425537, 'epoch': 4.9}


loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file ./models/mental-roberta-base/config.json
Model config RobertaConfig {
  "_name_or_path": "./models/mental-roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,

Fitting model: mental_roberta_base using fold 4 as out of fold test data.
Train data sizes: (104, 104).
Test data sizes: (13, 13).


Some weights of the model checkpoint at ./models/mental-roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ./models/mental-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifi

Map:   0%|          | 0/104 [00:00<?, ? examples/s]

Map:   0%|          | 0/13 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 104
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.7331, 'learning_rate': 1.5e-06, 'epoch': 0.92}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.7324, 'learning_rate': 3e-06, 'epoch': 1.92}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.7406, 'learning_rate': 4.5e-06, 'epoch': 2.92}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.7268, 'learning_rate': 3e-06, 'epoch': 3.92}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.6688, 'learning_rate': 0.0, 'epoch': 4.92}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 13
  Batch size = 8


{'train_runtime': 325.4123, 'train_samples_per_second': 1.598, 'train_steps_per_second': 0.092, 'train_loss': 0.7203264554341634, 'epoch': 4.92}


loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file ./models/mental-roberta-base/config.json
Model config RobertaConfig {
  "_name_or_path": "./models/mental-roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,

Fitting model: mental_roberta_base using fold 5 as out of fold test data.
Train data sizes: (108, 108).
Test data sizes: (9, 9).


Some weights of the model checkpoint at ./models/mental-roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ./models/mental-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifi

Map:   0%|          | 0/108 [00:00<?, ? examples/s]

Map:   0%|          | 0/9 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 108
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.7633, 'learning_rate': 1.5e-06, 'epoch': 0.89}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.7586, 'learning_rate': 3e-06, 'epoch': 1.89}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.7629, 'learning_rate': 4.5e-06, 'epoch': 2.89}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.7825, 'learning_rate': 3e-06, 'epoch': 3.89}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.6761, 'learning_rate': 0.0, 'epoch': 4.89}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 9
  Batch size = 8


{'train_runtime': 335.0552, 'train_samples_per_second': 1.612, 'train_steps_per_second': 0.09, 'train_loss': 0.7486798127492269, 'epoch': 4.89}


loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file ./models/mental-roberta-base/config.json
Model config RobertaConfig {
  "_name_or_path": "./models/mental-roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,

Fitting model: mental_roberta_base using fold 6 as out of fold test data.
Train data sizes: (104, 104).
Test data sizes: (13, 13).


Some weights of the model checkpoint at ./models/mental-roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ./models/mental-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifi

Map:   0%|          | 0/104 [00:00<?, ? examples/s]

Map:   0%|          | 0/13 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 104
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.7407, 'learning_rate': 1.5e-06, 'epoch': 0.92}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.734, 'learning_rate': 3e-06, 'epoch': 1.92}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.7302, 'learning_rate': 4.5e-06, 'epoch': 2.92}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.7203, 'learning_rate': 3e-06, 'epoch': 3.92}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.6655, 'learning_rate': 0.0, 'epoch': 4.92}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 13
  Batch size = 8


{'train_runtime': 325.9619, 'train_samples_per_second': 1.595, 'train_steps_per_second': 0.092, 'train_loss': 0.718153174718221, 'epoch': 4.92}


loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file ./models/mental-roberta-base/config.json
Model config RobertaConfig {
  "_name_or_path": "./models/mental-roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,

Fitting model: mental_roberta_base using fold 7 as out of fold test data.
Train data sizes: (107, 107).
Test data sizes: (10, 10).


Some weights of the model checkpoint at ./models/mental-roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ./models/mental-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifi

Map:   0%|          | 0/107 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 107
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.7658, 'learning_rate': 1.5e-06, 'epoch': 0.9}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.7718, 'learning_rate': 3e-06, 'epoch': 1.9}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.7714, 'learning_rate': 4.5e-06, 'epoch': 2.9}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.7518, 'learning_rate': 3e-06, 'epoch': 3.9}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.6777, 'learning_rate': 5.000000000000001e-07, 'epoch': 4.9}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 10
  Batch size = 8


{'train_runtime': 332.5715, 'train_samples_per_second': 1.609, 'train_steps_per_second': 0.09, 'train_loss': 0.7476903279622396, 'epoch': 4.9}


loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file ./models/mental-roberta-base/config.json
Model config RobertaConfig {
  "_name_or_path": "./models/mental-roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,

Fitting model: mental_roberta_base using fold 8 as out of fold test data.
Train data sizes: (113, 113).
Test data sizes: (4, 4).


Some weights of the model checkpoint at ./models/mental-roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ./models/mental-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifi

Map:   0%|          | 0/113 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 113
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 35
Saving model checkpoint to ./model_output/checkpoint-7
Configuration saved in ./model_output/checkpoint-7/config.json


{'loss': 0.7014, 'learning_rate': 1.75e-06, 'epoch': 0.99}


Model weights saved in ./model_output/checkpoint-7/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-14
Configuration saved in ./model_output/checkpoint-14/config.json


{'loss': 0.6993, 'learning_rate': 3.5e-06, 'epoch': 1.99}


Model weights saved in ./model_output/checkpoint-14/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-7] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-21
Configuration saved in ./model_output/checkpoint-21/config.json


{'loss': 0.6907, 'learning_rate': 4.666666666666667e-06, 'epoch': 2.99}


Model weights saved in ./model_output/checkpoint-21/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-14] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-28
Configuration saved in ./model_output/checkpoint-28/config.json


{'loss': 0.682, 'learning_rate': 2.3333333333333336e-06, 'epoch': 3.99}


Model weights saved in ./model_output/checkpoint-28/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-21] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-35
Configuration saved in ./model_output/checkpoint-35/config.json


{'loss': 0.6744, 'learning_rate': 0.0, 'epoch': 4.99}


Model weights saved in ./model_output/checkpoint-35/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-28] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 4
  Batch size = 8


{'train_runtime': 359.676, 'train_samples_per_second': 1.571, 'train_steps_per_second': 0.097, 'train_loss': 0.6895504406520299, 'epoch': 4.99}


loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file ./models/mental-roberta-base/config.json
Model config RobertaConfig {
  "_name_or_path": "./models/mental-roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,

Fitting model: mental_roberta_base using fold 9 as out of fold test data.
Train data sizes: (106, 106).
Test data sizes: (11, 11).


Some weights of the model checkpoint at ./models/mental-roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ./models/mental-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifi

Map:   0%|          | 0/106 [00:00<?, ? examples/s]

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 106
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.7503, 'learning_rate': 1.5e-06, 'epoch': 0.91}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-35] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.7731, 'learning_rate': 3e-06, 'epoch': 1.91}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.7451, 'learning_rate': 4.5e-06, 'epoch': 2.91}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.7528, 'learning_rate': 3e-06, 'epoch': 3.91}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.6929, 'learning_rate': 0.0, 'epoch': 4.91}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 11
  Batch size = 8


{'train_runtime': 330.6464, 'train_samples_per_second': 1.603, 'train_steps_per_second': 0.091, 'train_loss': 0.7428407192230224, 'epoch': 4.91}


loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file ./models/mental-roberta-base/config.json
Model config RobertaConfig {
  "_name_or_path": "./models/mental-roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,

Fitting model: mental_roberta_base using fold 10 as out of fold test data.
Train data sizes: (110, 110).
Test data sizes: (7, 7).


Some weights of the model checkpoint at ./models/mental-roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ./models/mental-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifi

Map:   0%|          | 0/110 [00:00<?, ? examples/s]

Map:   0%|          | 0/7 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 110
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.7906, 'learning_rate': 1.5e-06, 'epoch': 0.87}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.7973, 'learning_rate': 3e-06, 'epoch': 1.87}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.7745, 'learning_rate': 4.5e-06, 'epoch': 2.87}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.7797, 'learning_rate': 3e-06, 'epoch': 3.87}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.6699, 'learning_rate': 0.0, 'epoch': 4.87}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 7
  Batch size = 8


{'train_runtime': 340.9852, 'train_samples_per_second': 1.613, 'train_steps_per_second': 0.088, 'train_loss': 0.7624256769816081, 'epoch': 4.87}
Fitting model: roberta_base using fold 0 as out of fold test data.
Train data sizes: (106, 106).
Test data sizes: (11, 11).


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /home/joel_stremmel/.cache/huggingface/hub/models--roberta-base/snapshots/bc2764f8af2e92b6eb5679868df33e224075ca68/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from cache at /home/joel_stremm

Map:   0%|          | 0/106 [00:00<?, ? examples/s]

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 106
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.7658, 'learning_rate': 1.5e-06, 'epoch': 0.91}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.7685, 'learning_rate': 3e-06, 'epoch': 1.91}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.749, 'learning_rate': 4.5e-06, 'epoch': 2.91}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.7397, 'learning_rate': 3e-06, 'epoch': 3.91}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.6585, 'learning_rate': 0.0, 'epoch': 4.91}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 11
  Batch size = 8


{'train_runtime': 331.4751, 'train_samples_per_second': 1.599, 'train_steps_per_second': 0.091, 'train_loss': 0.7362998485565185, 'epoch': 4.91}
Fitting model: roberta_base using fold 1 as out of fold test data.
Train data sizes: (102, 102).
Test data sizes: (15, 15).


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /home/joel_stremmel/.cache/huggingface/hub/models--roberta-base/snapshots/bc2764f8af2e92b6eb5679868df33e224075ca68/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from cache at /home/joel_stremm

Map:   0%|          | 0/102 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 102
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.7348, 'learning_rate': 1.5e-06, 'epoch': 0.94}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.7107, 'learning_rate': 3e-06, 'epoch': 1.94}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.7224, 'learning_rate': 4.5e-06, 'epoch': 2.94}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.7151, 'learning_rate': 3e-06, 'epoch': 3.94}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.6552, 'learning_rate': 0.0, 'epoch': 4.94}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 15
  Batch size = 8


{'train_runtime': 321.755, 'train_samples_per_second': 1.585, 'train_steps_per_second': 0.093, 'train_loss': 0.7076404968897502, 'epoch': 4.94}
Fitting model: roberta_base using fold 2 as out of fold test data.
Train data sizes: (103, 103).
Test data sizes: (14, 14).


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /home/joel_stremmel/.cache/huggingface/hub/models--roberta-base/snapshots/bc2764f8af2e92b6eb5679868df33e224075ca68/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from cache at /home/joel_stremm

Map:   0%|          | 0/103 [00:00<?, ? examples/s]

Map:   0%|          | 0/14 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 103
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.7469, 'learning_rate': 1.5e-06, 'epoch': 0.93}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.7482, 'learning_rate': 3e-06, 'epoch': 1.93}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.7355, 'learning_rate': 4.5e-06, 'epoch': 2.93}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.7342, 'learning_rate': 3e-06, 'epoch': 3.93}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.6664, 'learning_rate': 0.0, 'epoch': 4.93}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 14
  Batch size = 8


{'train_runtime': 324.4793, 'train_samples_per_second': 1.587, 'train_steps_per_second': 0.092, 'train_loss': 0.7262330770492553, 'epoch': 4.93}
Fitting model: roberta_base using fold 3 as out of fold test data.
Train data sizes: (107, 107).
Test data sizes: (10, 10).


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /home/joel_stremmel/.cache/huggingface/hub/models--roberta-base/snapshots/bc2764f8af2e92b6eb5679868df33e224075ca68/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from cache at /home/joel_stremm

Map:   0%|          | 0/107 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 107
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.7782, 'learning_rate': 1.5e-06, 'epoch': 0.9}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.7737, 'learning_rate': 3e-06, 'epoch': 1.9}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.7634, 'learning_rate': 4.5e-06, 'epoch': 2.9}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.7564, 'learning_rate': 3e-06, 'epoch': 3.9}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.6813, 'learning_rate': 0.0, 'epoch': 4.9}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 10
  Batch size = 8


{'train_runtime': 334.2878, 'train_samples_per_second': 1.6, 'train_steps_per_second': 0.09, 'train_loss': 0.750579818089803, 'epoch': 4.9}
Fitting model: roberta_base using fold 4 as out of fold test data.
Train data sizes: (104, 104).
Test data sizes: (13, 13).


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /home/joel_stremmel/.cache/huggingface/hub/models--roberta-base/snapshots/bc2764f8af2e92b6eb5679868df33e224075ca68/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from cache at /home/joel_stremm

Map:   0%|          | 0/104 [00:00<?, ? examples/s]

Map:   0%|          | 0/13 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 104
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.733, 'learning_rate': 1.5e-06, 'epoch': 0.92}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.7317, 'learning_rate': 3e-06, 'epoch': 1.92}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.7419, 'learning_rate': 4.5e-06, 'epoch': 2.92}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.7285, 'learning_rate': 3e-06, 'epoch': 3.92}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.6699, 'learning_rate': 0.0, 'epoch': 4.92}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 13
  Batch size = 8


{'train_runtime': 325.9965, 'train_samples_per_second': 1.595, 'train_steps_per_second': 0.092, 'train_loss': 0.7209977308909098, 'epoch': 4.92}
Fitting model: roberta_base using fold 5 as out of fold test data.
Train data sizes: (108, 108).
Test data sizes: (9, 9).


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /home/joel_stremmel/.cache/huggingface/hub/models--roberta-base/snapshots/bc2764f8af2e92b6eb5679868df33e224075ca68/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from cache at /home/joel_stremm

Map:   0%|          | 0/108 [00:00<?, ? examples/s]

Map:   0%|          | 0/9 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 108
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.7616, 'learning_rate': 1.25e-06, 'epoch': 0.89}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.7596, 'learning_rate': 2.7500000000000004e-06, 'epoch': 1.89}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.7665, 'learning_rate': 4.25e-06, 'epoch': 2.89}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.7795, 'learning_rate': 3.5e-06, 'epoch': 3.89}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.6739, 'learning_rate': 5.000000000000001e-07, 'epoch': 4.89}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 9
  Batch size = 8


{'train_runtime': 335.2966, 'train_samples_per_second': 1.611, 'train_steps_per_second': 0.089, 'train_loss': 0.7482221444447835, 'epoch': 4.89}
Fitting model: roberta_base using fold 6 as out of fold test data.
Train data sizes: (104, 104).
Test data sizes: (13, 13).


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /home/joel_stremmel/.cache/huggingface/hub/models--roberta-base/snapshots/bc2764f8af2e92b6eb5679868df33e224075ca68/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from cache at /home/joel_stremm

Map:   0%|          | 0/104 [00:00<?, ? examples/s]

Map:   0%|          | 0/13 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 104
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.7402, 'learning_rate': 1.5e-06, 'epoch': 0.92}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.7326, 'learning_rate': 3e-06, 'epoch': 1.92}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.731, 'learning_rate': 4.5e-06, 'epoch': 2.92}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.724, 'learning_rate': 3e-06, 'epoch': 3.92}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.6654, 'learning_rate': 0.0, 'epoch': 4.92}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 13
  Batch size = 8


{'train_runtime': 326.6299, 'train_samples_per_second': 1.592, 'train_steps_per_second': 0.092, 'train_loss': 0.7186389128367107, 'epoch': 4.92}
Fitting model: roberta_base using fold 7 as out of fold test data.
Train data sizes: (107, 107).
Test data sizes: (10, 10).


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /home/joel_stremmel/.cache/huggingface/hub/models--roberta-base/snapshots/bc2764f8af2e92b6eb5679868df33e224075ca68/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from cache at /home/joel_stremm

Map:   0%|          | 0/107 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 107
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.7692, 'learning_rate': 1.5e-06, 'epoch': 0.9}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.7734, 'learning_rate': 3e-06, 'epoch': 1.9}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.7754, 'learning_rate': 4.5e-06, 'epoch': 2.9}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.7501, 'learning_rate': 3e-06, 'epoch': 3.9}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.6788, 'learning_rate': 5.000000000000001e-07, 'epoch': 4.9}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 10
  Batch size = 8


{'train_runtime': 333.033, 'train_samples_per_second': 1.606, 'train_steps_per_second': 0.09, 'train_loss': 0.7493694464365641, 'epoch': 4.9}
Fitting model: roberta_base using fold 8 as out of fold test data.
Train data sizes: (113, 113).
Test data sizes: (4, 4).


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /home/joel_stremmel/.cache/huggingface/hub/models--roberta-base/snapshots/bc2764f8af2e92b6eb5679868df33e224075ca68/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from cache at /home/joel_stremm

Map:   0%|          | 0/113 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 113
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 35
Saving model checkpoint to ./model_output/checkpoint-7
Configuration saved in ./model_output/checkpoint-7/config.json


{'loss': 0.6986, 'learning_rate': 1.75e-06, 'epoch': 0.99}


Model weights saved in ./model_output/checkpoint-7/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-14
Configuration saved in ./model_output/checkpoint-14/config.json


{'loss': 0.7006, 'learning_rate': 3.5e-06, 'epoch': 1.99}


Model weights saved in ./model_output/checkpoint-14/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-7] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-21
Configuration saved in ./model_output/checkpoint-21/config.json


{'loss': 0.6898, 'learning_rate': 4.666666666666667e-06, 'epoch': 2.99}


Model weights saved in ./model_output/checkpoint-21/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-14] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-28
Configuration saved in ./model_output/checkpoint-28/config.json


{'loss': 0.6833, 'learning_rate': 2.3333333333333336e-06, 'epoch': 3.99}


Model weights saved in ./model_output/checkpoint-28/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-21] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-35
Configuration saved in ./model_output/checkpoint-35/config.json


{'loss': 0.6749, 'learning_rate': 0.0, 'epoch': 4.99}


Model weights saved in ./model_output/checkpoint-35/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-28] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 4
  Batch size = 8


{'train_runtime': 359.0246, 'train_samples_per_second': 1.574, 'train_steps_per_second': 0.097, 'train_loss': 0.689446599142892, 'epoch': 4.99}
Fitting model: roberta_base using fold 9 as out of fold test data.
Train data sizes: (106, 106).
Test data sizes: (11, 11).


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /home/joel_stremmel/.cache/huggingface/hub/models--roberta-base/snapshots/bc2764f8af2e92b6eb5679868df33e224075ca68/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from cache at /home/joel_stremm

Map:   0%|          | 0/106 [00:00<?, ? examples/s]

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 106
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.7534, 'learning_rate': 1.5e-06, 'epoch': 0.91}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-35] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.7735, 'learning_rate': 3e-06, 'epoch': 1.91}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.7467, 'learning_rate': 4.5e-06, 'epoch': 2.91}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.7549, 'learning_rate': 3e-06, 'epoch': 3.91}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.6925, 'learning_rate': 0.0, 'epoch': 4.91}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 11
  Batch size = 8


{'train_runtime': 331.3222, 'train_samples_per_second': 1.6, 'train_steps_per_second': 0.091, 'train_loss': 0.744187879562378, 'epoch': 4.91}
Fitting model: roberta_base using fold 10 as out of fold test data.
Train data sizes: (110, 110).
Test data sizes: (7, 7).


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /home/joel_stremmel/.cache/huggingface/hub/models--roberta-base/snapshots/bc2764f8af2e92b6eb5679868df33e224075ca68/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from cache at /home/joel_stremm

Map:   0%|          | 0/110 [00:00<?, ? examples/s]

Map:   0%|          | 0/7 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 110
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.7924, 'learning_rate': 1.5e-06, 'epoch': 0.87}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.7968, 'learning_rate': 3e-06, 'epoch': 1.87}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.7783, 'learning_rate': 4.5e-06, 'epoch': 2.87}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.7796, 'learning_rate': 3e-06, 'epoch': 3.87}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.6699, 'learning_rate': 0.0, 'epoch': 4.87}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 7
  Batch size = 8


{'train_runtime': 340.6469, 'train_samples_per_second': 1.615, 'train_steps_per_second': 0.088, 'train_loss': 0.7633827686309814, 'epoch': 4.87}
Fitting model: roberta_pysch using fold 0 as out of fold test data.
Train data sizes: (106, 106).
Test data sizes: (11, 11).


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /home/joel_stremmel/.cache/huggingface/hub/models--mlaricheva--roberta-psych/snapshots/18f4eb3ec5e26053f262d6e19f98ede75673ff33/config.json
Model config RobertaConfig {
  "_name_or_path": "mlaricheva/roberta-psych",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from c

Map:   0%|          | 0/106 [00:00<?, ? examples/s]

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 106
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.7659, 'learning_rate': 1.5e-06, 'epoch': 0.91}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.7674, 'learning_rate': 3e-06, 'epoch': 1.91}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.7469, 'learning_rate': 4.5e-06, 'epoch': 2.91}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.7376, 'learning_rate': 3e-06, 'epoch': 3.91}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.6581, 'learning_rate': 0.0, 'epoch': 4.91}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 11
  Batch size = 8


{'train_runtime': 331.5526, 'train_samples_per_second': 1.599, 'train_steps_per_second': 0.09, 'train_loss': 0.7351715485254924, 'epoch': 4.91}
Fitting model: roberta_pysch using fold 1 as out of fold test data.
Train data sizes: (102, 102).
Test data sizes: (15, 15).


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /home/joel_stremmel/.cache/huggingface/hub/models--mlaricheva--roberta-psych/snapshots/18f4eb3ec5e26053f262d6e19f98ede75673ff33/config.json
Model config RobertaConfig {
  "_name_or_path": "mlaricheva/roberta-psych",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from c

Map:   0%|          | 0/102 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 102
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.7366, 'learning_rate': 1.5e-06, 'epoch': 0.94}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.7103, 'learning_rate': 3e-06, 'epoch': 1.94}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.7196, 'learning_rate': 4.5e-06, 'epoch': 2.94}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.7132, 'learning_rate': 3e-06, 'epoch': 3.94}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.6528, 'learning_rate': 0.0, 'epoch': 4.94}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 15
  Batch size = 8


{'train_runtime': 321.2204, 'train_samples_per_second': 1.588, 'train_steps_per_second': 0.093, 'train_loss': 0.7065075079600016, 'epoch': 4.94}
Fitting model: roberta_pysch using fold 2 as out of fold test data.
Train data sizes: (103, 103).
Test data sizes: (14, 14).


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /home/joel_stremmel/.cache/huggingface/hub/models--mlaricheva--roberta-psych/snapshots/18f4eb3ec5e26053f262d6e19f98ede75673ff33/config.json
Model config RobertaConfig {
  "_name_or_path": "mlaricheva/roberta-psych",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from c

Map:   0%|          | 0/103 [00:00<?, ? examples/s]

Map:   0%|          | 0/14 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 103
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.7476, 'learning_rate': 1.5e-06, 'epoch': 0.93}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.7472, 'learning_rate': 3e-06, 'epoch': 1.93}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.7351, 'learning_rate': 4.5e-06, 'epoch': 2.93}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.7334, 'learning_rate': 3e-06, 'epoch': 3.93}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.6646, 'learning_rate': 0.0, 'epoch': 4.93}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 14
  Batch size = 8


{'train_runtime': 324.1729, 'train_samples_per_second': 1.589, 'train_steps_per_second': 0.093, 'train_loss': 0.7255936384201049, 'epoch': 4.93}
Fitting model: roberta_pysch using fold 3 as out of fold test data.
Train data sizes: (107, 107).
Test data sizes: (10, 10).


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /home/joel_stremmel/.cache/huggingface/hub/models--mlaricheva--roberta-psych/snapshots/18f4eb3ec5e26053f262d6e19f98ede75673ff33/config.json
Model config RobertaConfig {
  "_name_or_path": "mlaricheva/roberta-psych",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from c

Map:   0%|          | 0/107 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 107
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.779, 'learning_rate': 1.5e-06, 'epoch': 0.9}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.7725, 'learning_rate': 3e-06, 'epoch': 1.9}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.7618, 'learning_rate': 4.5e-06, 'epoch': 2.9}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.755, 'learning_rate': 3e-06, 'epoch': 3.9}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.6811, 'learning_rate': 0.0, 'epoch': 4.9}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 10
  Batch size = 8


{'train_runtime': 333.2243, 'train_samples_per_second': 1.606, 'train_steps_per_second': 0.09, 'train_loss': 0.7498809973398844, 'epoch': 4.9}
Fitting model: roberta_pysch using fold 4 as out of fold test data.
Train data sizes: (104, 104).
Test data sizes: (13, 13).


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /home/joel_stremmel/.cache/huggingface/hub/models--mlaricheva--roberta-psych/snapshots/18f4eb3ec5e26053f262d6e19f98ede75673ff33/config.json
Model config RobertaConfig {
  "_name_or_path": "mlaricheva/roberta-psych",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from c

Map:   0%|          | 0/104 [00:00<?, ? examples/s]

Map:   0%|          | 0/13 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 104
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.7355, 'learning_rate': 1.5e-06, 'epoch': 0.92}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.7325, 'learning_rate': 3e-06, 'epoch': 1.92}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.7403, 'learning_rate': 4.5e-06, 'epoch': 2.92}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.7291, 'learning_rate': 3e-06, 'epoch': 3.92}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.6698, 'learning_rate': 0.0, 'epoch': 4.92}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 13
  Batch size = 8


{'train_runtime': 326.8034, 'train_samples_per_second': 1.591, 'train_steps_per_second': 0.092, 'train_loss': 0.7214337666829427, 'epoch': 4.92}
Fitting model: roberta_pysch using fold 5 as out of fold test data.
Train data sizes: (108, 108).
Test data sizes: (9, 9).


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /home/joel_stremmel/.cache/huggingface/hub/models--mlaricheva--roberta-psych/snapshots/18f4eb3ec5e26053f262d6e19f98ede75673ff33/config.json
Model config RobertaConfig {
  "_name_or_path": "mlaricheva/roberta-psych",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from c

Map:   0%|          | 0/108 [00:00<?, ? examples/s]

Map:   0%|          | 0/9 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 108
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.7631, 'learning_rate': 1.5e-06, 'epoch': 0.89}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.7601, 'learning_rate': 3e-06, 'epoch': 1.89}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.7645, 'learning_rate': 4.5e-06, 'epoch': 2.89}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.7783, 'learning_rate': 3e-06, 'epoch': 3.89}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.6742, 'learning_rate': 0.0, 'epoch': 4.89}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 9
  Batch size = 8


{'train_runtime': 336.4529, 'train_samples_per_second': 1.605, 'train_steps_per_second': 0.089, 'train_loss': 0.748039436340332, 'epoch': 4.89}
Fitting model: roberta_pysch using fold 6 as out of fold test data.
Train data sizes: (104, 104).
Test data sizes: (13, 13).


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /home/joel_stremmel/.cache/huggingface/hub/models--mlaricheva--roberta-psych/snapshots/18f4eb3ec5e26053f262d6e19f98ede75673ff33/config.json
Model config RobertaConfig {
  "_name_or_path": "mlaricheva/roberta-psych",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from c

Map:   0%|          | 0/104 [00:00<?, ? examples/s]

Map:   0%|          | 0/13 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 104
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.743, 'learning_rate': 1.5e-06, 'epoch': 0.92}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.732, 'learning_rate': 3e-06, 'epoch': 1.92}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.7286, 'learning_rate': 4.5e-06, 'epoch': 2.92}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.7249, 'learning_rate': 3.5e-06, 'epoch': 3.92}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.6655, 'learning_rate': 5.000000000000001e-07, 'epoch': 4.92}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 13
  Batch size = 8


{'train_runtime': 326.8483, 'train_samples_per_second': 1.591, 'train_steps_per_second': 0.092, 'train_loss': 0.7188148975372315, 'epoch': 4.92}
Fitting model: roberta_pysch using fold 7 as out of fold test data.
Train data sizes: (107, 107).
Test data sizes: (10, 10).


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /home/joel_stremmel/.cache/huggingface/hub/models--mlaricheva--roberta-psych/snapshots/18f4eb3ec5e26053f262d6e19f98ede75673ff33/config.json
Model config RobertaConfig {
  "_name_or_path": "mlaricheva/roberta-psych",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from c

Map:   0%|          | 0/107 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 107
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.7683, 'learning_rate': 1.5e-06, 'epoch': 0.9}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.7736, 'learning_rate': 3e-06, 'epoch': 1.9}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.7728, 'learning_rate': 4.5e-06, 'epoch': 2.9}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.7499, 'learning_rate': 3e-06, 'epoch': 3.9}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.6782, 'learning_rate': 0.0, 'epoch': 4.9}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 10
  Batch size = 8


{'train_runtime': 334.1214, 'train_samples_per_second': 1.601, 'train_steps_per_second': 0.09, 'train_loss': 0.7485512733459473, 'epoch': 4.9}
Fitting model: roberta_pysch using fold 8 as out of fold test data.
Train data sizes: (113, 113).
Test data sizes: (4, 4).


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /home/joel_stremmel/.cache/huggingface/hub/models--mlaricheva--roberta-psych/snapshots/18f4eb3ec5e26053f262d6e19f98ede75673ff33/config.json
Model config RobertaConfig {
  "_name_or_path": "mlaricheva/roberta-psych",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from c

Map:   0%|          | 0/113 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 113
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 35
Saving model checkpoint to ./model_output/checkpoint-7
Configuration saved in ./model_output/checkpoint-7/config.json


{'loss': 0.6988, 'learning_rate': 1.75e-06, 'epoch': 0.99}


Model weights saved in ./model_output/checkpoint-7/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-14
Configuration saved in ./model_output/checkpoint-14/config.json


{'loss': 0.6994, 'learning_rate': 3.5e-06, 'epoch': 1.99}


Model weights saved in ./model_output/checkpoint-14/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-7] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-21
Configuration saved in ./model_output/checkpoint-21/config.json


{'loss': 0.689, 'learning_rate': 4.666666666666667e-06, 'epoch': 2.99}


Model weights saved in ./model_output/checkpoint-21/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-14] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-28
Configuration saved in ./model_output/checkpoint-28/config.json


{'loss': 0.6839, 'learning_rate': 2.3333333333333336e-06, 'epoch': 3.99}


Model weights saved in ./model_output/checkpoint-28/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-21] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-35
Configuration saved in ./model_output/checkpoint-35/config.json


{'loss': 0.6746, 'learning_rate': 0.0, 'epoch': 4.99}


Model weights saved in ./model_output/checkpoint-35/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-28] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 4
  Batch size = 8


{'train_runtime': 358.6564, 'train_samples_per_second': 1.575, 'train_steps_per_second': 0.098, 'train_loss': 0.6891368593488421, 'epoch': 4.99}
Fitting model: roberta_pysch using fold 9 as out of fold test data.
Train data sizes: (106, 106).
Test data sizes: (11, 11).


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /home/joel_stremmel/.cache/huggingface/hub/models--mlaricheva--roberta-psych/snapshots/18f4eb3ec5e26053f262d6e19f98ede75673ff33/config.json
Model config RobertaConfig {
  "_name_or_path": "mlaricheva/roberta-psych",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from c

Map:   0%|          | 0/106 [00:00<?, ? examples/s]

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 106
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.7535, 'learning_rate': 1.5e-06, 'epoch': 0.91}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-35] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.773, 'learning_rate': 3e-06, 'epoch': 1.91}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.7472, 'learning_rate': 4.5e-06, 'epoch': 2.91}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.7549, 'learning_rate': 3e-06, 'epoch': 3.91}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.6913, 'learning_rate': 0.0, 'epoch': 4.91}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 11
  Batch size = 8


{'train_runtime': 331.6364, 'train_samples_per_second': 1.598, 'train_steps_per_second': 0.09, 'train_loss': 0.7439727783203125, 'epoch': 4.91}
Fitting model: roberta_pysch using fold 10 as out of fold test data.
Train data sizes: (110, 110).
Test data sizes: (7, 7).


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /home/joel_stremmel/.cache/huggingface/hub/models--mlaricheva--roberta-psych/snapshots/18f4eb3ec5e26053f262d6e19f98ede75673ff33/config.json
Model config RobertaConfig {
  "_name_or_path": "mlaricheva/roberta-psych",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from c

Map:   0%|          | 0/110 [00:00<?, ? examples/s]

Map:   0%|          | 0/7 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 110
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 30
Saving model checkpoint to ./model_output/checkpoint-6
Configuration saved in ./model_output/checkpoint-6/config.json


{'loss': 0.7918, 'learning_rate': 1.5e-06, 'epoch': 0.87}


Model weights saved in ./model_output/checkpoint-6/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-30] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-12
Configuration saved in ./model_output/checkpoint-12/config.json


{'loss': 0.7967, 'learning_rate': 3e-06, 'epoch': 1.87}


Model weights saved in ./model_output/checkpoint-12/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-6] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-18
Configuration saved in ./model_output/checkpoint-18/config.json


{'loss': 0.7771, 'learning_rate': 4.5e-06, 'epoch': 2.87}


Model weights saved in ./model_output/checkpoint-18/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-12] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-24
Configuration saved in ./model_output/checkpoint-24/config.json


{'loss': 0.7794, 'learning_rate': 3e-06, 'epoch': 3.87}


Model weights saved in ./model_output/checkpoint-24/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-18] due to args.save_total_limit
Saving model checkpoint to ./model_output/checkpoint-30
Configuration saved in ./model_output/checkpoint-30/config.json


{'loss': 0.67, 'learning_rate': 0.0, 'epoch': 4.87}


Model weights saved in ./model_output/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [model_output/checkpoint-24] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 7
  Batch size = 8


{'train_runtime': 340.6487, 'train_samples_per_second': 1.615, 'train_steps_per_second': 0.088, 'train_loss': 0.7629999955495198, 'epoch': 4.87}


##### Unassign Runtime if Running on Colab

In [12]:
# Unassign the Colab runtime when the notebook is configured to run on Colab.
# NOTE(review): presumably this releases the hosted VM once the run has finished — confirm.
on_colab = params["env"]["colab"]
if on_colab:
    # Imported only on this branch, so the google.colab package is needed
    # only when the colab flag is set.
    from google.colab import runtime

    runtime.unassign()