In [1]:
import evaluate
import numpy as np
import torch
import time
import pandas as pd

from copy import deepcopy
from ast import literal_eval
from datasets import load_from_disk
from peft import get_peft_model, LoraConfig, TaskType
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer, TrainerCallback
from sklearn.utils.class_weight import compute_class_weight
from pathlib import Path
from datasets import Dataset, DatasetDict, load_dataset
from sklearn.metrics import accuracy_score, f1_score



In [2]:
data_path = ""
# Path to Huggingface pre-processed dataset

experiment_path = Path(f"exps/experiment_{int(time.time())}")
experiment_path.mkdir(parents=True, exist_ok=True)
output_path=experiment_path
# "Path to store the fine-tuned model",

model_name=""
# "Name of the pre-trained LLM to fine-tune",

max_length=""
# "Maximum length of the input sequences",

set_pad_id="" 
# "Set the id for the padding token, needed by models such as Mistral-7B",

lr=1e-5
# Learning rate for training"

train_batch_size=36
# Train batch size

eval_batch_size=36
# Eval batch size"

num_epochs=5
# Number of epochs"

weight_decay=0.1
# Weight decay"

lora_rank=4,
# help="Lora rank"


lora_alpha=0.0
# Lora alpha"

lora_dropout=0.2
# Lora dropout"

lora_bias=None
# choices={"lora_only", "none", 'all'},
# help="Layers to add learnable bias"

In [3]:
def compute_metrics(eval_pred):

    logits, labels = eval_pred
    labels = labels.astype(np.int32) 
    
    logits_tensor = torch.from_numpy(logits)
    pred_labels_cpu = torch.sigmoid(logits_tensor).detach().cpu().numpy()
    predictions = np.where(pred_labels_cpu<0.5, 0, 1).astype(np.int32) 


    print(f"predictions = {predictions} {predictions.shape}")
    accuracy = accuracy_score(y_true=labels, y_pred=predictions)
    f1 = f1_score(y_true=labels, y_pred=predictions, average="micro")
    f1_macro = f1_score(y_true=labels, y_pred=predictions, average="macro")
    f1_weighted = f1_score(y_true=labels, y_pred=predictions, average="weighted")

    return {"accuracy": accuracy, "f1": f1, "f1_macro": f1_macro, 'f1_weighted': f1_weighted}

In [4]:
class CustomCallback(TrainerCallback):
    def __init__(self, trainer) -> None:
        super().__init__()
        self._trainer = trainer

    def on_epoch_end(self, args, state, control, **kwargs):
        if control.should_evaluate:
            control_copy = deepcopy(control)
            self._trainer.evaluate(eval_dataset=self._trainer.train_dataset, metric_key_prefix="train")
            return control_copy

In [5]:
def compute_class_weights(df):
    # Flatten the 'keys' column
    all_labels = [label for sublist in df['RGNTI_L1'] for label in sublist]

    # Get unique classes
    unique_classes = np.unique(all_labels)

    # Calculate class weights
    class_weights = compute_class_weight(
        class_weight='balanced',
        classes=unique_classes,
        y=all_labels
    )

    # Create a dictionary mapping from class to weight
    class_weights_dict = dict(zip(unique_classes, class_weights))

    print("Class weights:", class_weights_dict)
    return class_weights, class_weights_dict

In [6]:
def get_dataset_and_collator(teach_df,
                             test_df,
                             model_checkpoints,
                             add_prefix_space=True,
                             max_length=512,
                             truncation=True,
                             set_pad_id=False
):
    """
    Load the preprocessed HF dataset with train, valid and test objects
    
    Paramters:
    ---------
    data_path: str 
        Path to the pre-processed HuggingFace dataset 
    model_checkpoints: 
        Name of the pre-trained model to use for tokenization
    """

    train_dataset = Dataset.from_pandas(teach_df)
    test_dataset = Dataset.from_pandas(test_df)
    data = DatasetDict({"train":train_dataset,"test":test_dataset})
    
    # data = Dataset.from_pandas(df)
    # data = data.train_test_split(test_size=0.2, shuffle=False)

    tokenizer = AutoTokenizer.from_pretrained(
        model_checkpoints,
        add_prefix_space=add_prefix_space
    )

    if set_pad_id:
        tokenizer.pad_token = tokenizer.eos_token

    def _preprocesscing_function(examples):
        return tokenizer(examples['data'], truncation=truncation, max_length=max_length)

    col_to_delete = ['data', 'RGNTI_L1', 'RGNTI_L2']
    tokenized_datasets = data.map(_preprocesscing_function, batched=False)
    tokenized_datasets = tokenized_datasets.remove_columns(col_to_delete)
    tokenized_datasets = tokenized_datasets.rename_column("onehot_level1", "label")
    tokenized_datasets.set_format("torch")

    padding_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    return tokenized_datasets, padding_collator

In [7]:
def get_lora_model(model_checkpoints, id2label, label2id, num_labels=46, rank=4, alpha=16, lora_dropout=0.1, bias='none'):
    """
    TODO
    """
    model = AutoModelForSequenceClassification.from_pretrained(
            pretrained_model_name_or_path=model_checkpoints,
            problem_type="multi_label_classification",
            num_labels=num_labels,
            # device_map="auto",
            offload_folder="offload",
            trust_remote_code=True,
            id2label=id2label,
            label2id=label2id
        )

    peft_config = LoraConfig(
        task_type=TaskType.SEQ_CLS, r=rank, lora_alpha=alpha, lora_dropout=lora_dropout, bias=bias,
    )
    model = get_peft_model(model, peft_config)
    print(model.print_trainable_parameters())

    return model

In [8]:
from sklearn.preprocessing import MultiLabelBinarizer

def get_one_level_labels(df, field):
    # new_field_title = field + "_proc"
    df[field] = df[field].apply(literal_eval)
    mlb = MultiLabelBinarizer()
    sparce_labels = mlb.fit_transform(df[field])
    level_classes = mlb.classes_
    print(f"Unique {field} classes: {len(level_classes)}")

    return level_classes, sparce_labels

In [9]:
df_train = pd.read_csv("C:\\PowerfullProject\\teach_slice_80_l2_drop_wasted_v2.csv", sep="\t")
df_test = pd.read_csv("C:\\PowerfullProject\\test_slice_20_l2_drop_wasted_v2.csv", sep="\t")
result = pd.concat([df_train, df_test])
l1_classes, l1_labels = get_one_level_labels(result, 'RGNTI_L1')
l1_labels = l1_labels.astype(float) 
result['onehot_level1'] = l1_labels.tolist()
id2label = {idx:label for idx, label in enumerate(l1_classes)}
label2id = {label:idx for idx, label in enumerate(l1_classes)}
class_weights, class_weights_dict = compute_class_weights(result)


Unique RGNTI_L1 classes: 46
Class weights: {'00': 240.37649653434153, '06': 0.7880447693671101, '12': 20.25149970802145, '15': 2.8780111505933657, '19': 16.29270949004869, '20': 2.3456340349129636, '27': 1.7132505175983437, '28': 1.9728771572343957, '29': 0.44950886933897677, '30': 0.9502680337383732, '31': 0.3033557980954653, '34': 0.17008643040424104, '36': 3.0851894086438922, '37': 1.6454343512767426, '38': 0.5185548932583888, '39': 0.6788906823654196, '41': 3.3588453343194744, '44': 0.6615075284517037, '45': 1.7644657724329325, '47': 0.8376756697408871, '49': 1.6667649744618194, '50': 0.9887908823697129, '52': 0.5179395515994618, '53': 0.5805179468996383, '55': 0.3665733603162614, '58': 1842.8864734299516, '59': 1507.8162055335968, '60': 4.196856847386024, '61': 0.33185894597470067, '62': 3.4360841642572124, '64': 4.85253898796652, '65': 1.979469896272773, '66': 10.770115753811407, '67': 21.9391246836899, '68': 1.0558937013540595, '69': 30.432987634623057, '70': 2.6352046807863942,

In [10]:
l1_labels

array([[0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [93]:
result

Unnamed: 0,data,RGNTI_L1,RGNTI_L2,onehot_level1
0,Распределение ртути в компонентах окружающей с...,"[87, 38, 52]","['52.01', '38.63', '52.45', '87.23', '87.03']","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
1,Химический состав фракций обломочного материал...,"[87, 38, 52]","['87.19', '87.53', '52.13', '87.23', '38.33']","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
2,НАНОРАЗМЕРНЫЕ ЧАСТИЦЫ ПРИ ОСВОЕНИИ НЕДР: ОБРАЗ...,"[61, 52, 87]","['61.53', '87.23', '52.45', '52.01']","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,Анализ условий восстановления растительности н...,"[87, 38, 52]","['38.63', '38.33', '87.23', '52.01']","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
4,Обобщенная оценка влияния горного предприятия ...,"[87, 38, 52]","['38.33', '87.23', '52.45', '52.01']","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
...,...,...,...,...
84448,Неравенства Харди в предельном случае на сфере...,[27],['27.25'],"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
84449,Неравенство Пуанкаре и $p$-связность стратифиц...,[27],['27.25'],"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
84450,Неулучшаемость теоремы Дирихле для многочленов...,[27],['27.15'],"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
84451,Основные направления реализации программы импо...,[59],['59.01'],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [10]:
# df_test = pd.read_csv("C:\\PowerfullProject\\test_slice_20_l2_drop_wasted_v2.csv", sep="\t")
embedding_model_name = "miemBertProject/miem-scibert-linguistic"
dataset, collator =  get_dataset_and_collator(result[:514795],
                                              result[514795:],
                                              embedding_model_name,
                                              max_length=256,
                                              set_pad_id=False,
                                              add_prefix_space=True,
                                              truncation=True,)


Map:   0%|          | 0/514795 [00:00<?, ? examples/s]

Map:   0%|          | 0/84453 [00:00<?, ? examples/s]

In [53]:
from sklearn.utils.class_weight import compute_class_weight

import numpy as np

df = result
# df['RGNTI_L1'] = df['label'].apply(literal_eval)

# Flatten the 'keys' column
all_labels = [label for sublist in df['RGNTI_L1'] for label in sublist]

# Get unique classes
unique_classes = np.unique(all_labels)

# Calculate class weights
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=unique_classes,
    y=all_labels
)

# Create a dictionary mapping from class to weight
class_weights_dict = dict(zip(unique_classes, class_weights))

print("Class weights:", class_weights_dict)
print(class_weights)
class_weights = torch.tensor(list(class_weights_dict.values()))
print(class_weights)


Class weights: {'00': 240.37649653434153, '06': 0.7880447693671101, '12': 20.25149970802145, '15': 2.8780111505933657, '19': 16.29270949004869, '20': 2.3456340349129636, '27': 1.7132505175983437, '28': 1.9728771572343957, '29': 0.44950886933897677, '30': 0.9502680337383732, '31': 0.3033557980954653, '34': 0.17008643040424104, '36': 3.0851894086438922, '37': 1.6454343512767426, '38': 0.5185548932583888, '39': 0.6788906823654196, '41': 3.3588453343194744, '44': 0.6615075284517037, '45': 1.7644657724329325, '47': 0.8376756697408871, '49': 1.6667649744618194, '50': 0.9887908823697129, '52': 0.5179395515994618, '53': 0.5805179468996383, '55': 0.3665733603162614, '58': 1842.8864734299516, '59': 1507.8162055335968, '60': 4.196856847386024, '61': 0.33185894597470067, '62': 3.4360841642572124, '64': 4.85253898796652, '65': 1.979469896272773, '66': 10.770115753811407, '67': 21.9391246836899, '68': 1.0558937013540595, '69': 30.432987634623057, '70': 2.6352046807863942, '73': 0.5803960618983646, '

In [11]:
def get_weighted_trainer(class_weights):
    
    class _WeightedBCELossTrainer(Trainer):
        def __init__(self, **kwargs):
            super().__init__(**kwargs)
            # self.loss_fct = torch.nn.BCEWithLogitsLoss(weight=torch.tensor(class_weights, device='cuda', dtype=torch.float))
            self.loss_fct = torch.nn.BCEWithLogitsLoss()

        def compute_loss(self, model, inputs, return_outputs=False):
            labels = inputs.pop("labels")
            # forward pass
            outputs = model(**inputs)
            logits = outputs.get("logits")
            labels = labels.float() 
            # compute custom loss (suppose one has 3 labels with different weights)
            # loss_fct = torch.nn.CrossEntropyLoss(weight=torch.tensor([0.5, 1.5], device=labels.device, dtype=logits.dtype))
            loss = self.loss_fct(logits, labels)
            return (loss, outputs) if return_outputs else loss
    return _WeightedBCELossTrainer

In [12]:
dataset

DatasetDict({
    train: Dataset({
        features: ['label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 514795
    })
    test: Dataset({
        features: ['label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 84453
    })
})

In [12]:
model = get_lora_model(
    embedding_model_name,
    id2label=id2label,
    label2id=label2id
)
# rank=lora_rank,
#     alpha=lora_alpha,
#     lora_dropout=lora_dropout,
#     bias='none'
# )

# if args.set_pad_id: 
    # model.config.pad_token_id = model.config.eos_token_id

# move model to GPU device
if model.device.type != 'cuda':
    model=model.to('cuda')

  return self.fget.__get__(instance, owner)()
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at miemBertProject/miem-scibert-linguistic and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 182,830 || all params: 178,071,644 || trainable%: 0.1027
None


In [13]:
"""
Training function
"""
training_args = TrainingArguments(
    output_dir=output_path,
    learning_rate=lr,
    lr_scheduler_type="cosine",
    warmup_ratio= 0.1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=num_epochs,
    weight_decay=weight_decay,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    gradient_checkpointing=True,
    fp16=True,
    report_to="tensorboard",
    logging_steps=5000,
    max_grad_norm= 0.3,
)


weighted_trainer = get_weighted_trainer(list(class_weights_dict.values()))

trainer = weighted_trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset["test"],
    data_collator=collator,
    compute_metrics=compute_metrics
)
trainer.add_callback(CustomCallback(trainer))


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [14]:
trainer.train()


  0%|          | 0/80440 [00:00<?, ?it/s]



{'loss': 0.5001, 'grad_norm': 0.13222931325435638, 'learning_rate': 6.215813028344107e-06, 'epoch': 0.31}
{'loss': 0.1537, 'grad_norm': 0.033327579498291016, 'learning_rate': 9.981999376249983e-06, 'epoch': 0.62}
{'loss': 0.1032, 'grad_norm': 0.027253005653619766, 'learning_rate': 9.773936894297323e-06, 'epoch': 0.93}


  0%|          | 0/16088 [00:00<?, ?it/s]

predictions = [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]] (514795, 46)
{'train_loss': 0.09452324360609055, 'train_accuracy': 0.0005069979312153382, 'train_f1': 0.0011516760596548845, 'train_f1_macro': 0.0001977681730803016, 'train_f1_weighted': 0.001146305407391708, 'train_runtime': 2254.7027, 'train_samples_per_second': 228.321, 'train_steps_per_second': 7.135, 'epoch': 1.0}


  0%|          | 0/2640 [00:00<?, ?it/s]

predictions = [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]] (84453, 46)
{'eval_loss': 0.1275978982448578, 'eval_accuracy': 0.0005446816572531467, 'eval_f1': 0.0013094931286437691, 'eval_f1_macro': 0.0002539858632549303, 'eval_f1_weighted': 0.0013031400674927175, 'eval_runtime': 373.4936, 'eval_samples_per_second': 226.116, 'eval_steps_per_second': 7.068, 'epoch': 1.0}




{'loss': 0.0953, 'grad_norm': 0.026362020522356033, 'learning_rate': 9.342226306376689e-06, 'epoch': 1.24}
{'loss': 0.0904, 'grad_norm': 0.02458866313099861, 'learning_rate': 8.707204390479127e-06, 'epoch': 1.55}
{'loss': 0.0865, 'grad_norm': 0.021311307325959206, 'learning_rate': 7.89819541128146e-06, 'epoch': 1.86}


  0%|          | 0/16088 [00:00<?, ?it/s]

predictions = [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]] (514795, 46)
{'train_loss': 0.0800972729921341, 'train_accuracy': 0.08401985256267058, 'train_f1': 0.1477398112628172, 'train_f1_macro': 0.02499738004377408, 'train_f1_weighted': 0.11126139330932752, 'train_runtime': 2237.4184, 'train_samples_per_second': 230.084, 'train_steps_per_second': 7.19, 'epoch': 2.0}


  0%|          | 0/2640 [00:00<?, ?it/s]

predictions = [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]] (84453, 46)
{'eval_loss': 0.10971024632453918, 'eval_accuracy': 0.07590020484766676, 'eval_f1': 0.1281696813930697, 'eval_f1_macro': 0.02132969714070727, 'eval_f1_weighted': 0.09667334720638457, 'eval_runtime': 366.1311, 'eval_samples_per_second': 230.663, 'eval_steps_per_second': 7.211, 'epoch': 2.0}




{'loss': 0.0835, 'grad_norm': 0.024259725585579872, 'learning_rate': 6.953735531960473e-06, 'epoch': 2.18}
{'loss': 0.0812, 'grad_norm': 0.025461765006184578, 'learning_rate': 5.917555712021059e-06, 'epoch': 2.49}
{'loss': 0.0793, 'grad_norm': 0.027708139270544052, 'learning_rate': 4.83838341546949e-06, 'epoch': 2.8}


  0%|          | 0/16088 [00:00<?, ?it/s]

predictions = [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]] (514795, 46)
{'train_loss': 0.073717400431633, 'train_accuracy': 0.11893860662982352, 'train_f1': 0.20426966610474426, 'train_f1_macro': 0.0374985526451056, 'train_f1_weighted': 0.14797684721116974, 'train_runtime': 2235.3136, 'train_samples_per_second': 230.301, 'train_steps_per_second': 7.197, 'epoch': 3.0}


  0%|          | 0/2640 [00:00<?, ?it/s]

predictions = [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]] (84453, 46)
{'eval_loss': 0.10182597488164902, 'eval_accuracy': 0.1009555610813115, 'eval_f1': 0.17019310493237214, 'eval_f1_macro': 0.030234829015452255, 'eval_f1_weighted': 0.12385600054330675, 'eval_runtime': 366.0317, 'eval_samples_per_second': 230.726, 'eval_steps_per_second': 7.212, 'epoch': 3.0}




{'loss': 0.0779, 'grad_norm': 0.024034636095166206, 'learning_rate': 3.7667837350975524e-06, 'epoch': 3.11}
{'loss': 0.0769, 'grad_norm': 0.027668477967381477, 'learning_rate': 2.7529669454339756e-06, 'epoch': 3.42}
{'loss': 0.0763, 'grad_norm': 0.025287026539444923, 'learning_rate': 1.844435878238826e-06, 'epoch': 3.73}


  0%|          | 0/16088 [00:00<?, ?it/s]

predictions = [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]] (514795, 46)
{'train_loss': 0.07149530202150345, 'train_accuracy': 0.1325576200235045, 'train_f1': 0.22675597701454245, 'train_f1_macro': 0.045804461926193946, 'train_f1_weighted': 0.16510007205464594, 'train_runtime': 2279.8729, 'train_samples_per_second': 225.8, 'train_steps_per_second': 7.057, 'epoch': 4.0}


  0%|          | 0/2640 [00:00<?, ?it/s]

predictions = [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]] (84453, 46)
{'eval_loss': 0.09906462579965591, 'eval_accuracy': 0.1108308763454229, 'eval_f1': 0.18715787100249276, 'eval_f1_macro': 0.036200677077417025, 'eval_f1_weighted': 0.13627970435301434, 'eval_runtime': 372.6552, 'eval_samples_per_second': 226.625, 'eval_steps_per_second': 7.084, 'epoch': 4.0}




{'loss': 0.0759, 'grad_norm': 0.02331838198006153, 'learning_rate': 1.0837601564200822e-06, 'epoch': 4.04}
{'loss': 0.0755, 'grad_norm': 0.022498829290270805, 'learning_rate': 5.065815756216214e-07, 'epoch': 4.35}
{'loss': 0.0754, 'grad_norm': 0.021427739411592484, 'learning_rate': 1.39893122364011e-07, 'epoch': 4.66}
{'loss': 0.0755, 'grad_norm': 0.02103545144200325, 'learning_rate': 1.0222747765559204e-09, 'epoch': 4.97}


  0%|          | 0/16088 [00:00<?, ?it/s]

predictions = [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]] (514795, 46)
{'train_loss': 0.07117845863103867, 'train_accuracy': 0.13442826756281626, 'train_f1': 0.22988089484174798, 'train_f1_macro': 0.04721275371259912, 'train_f1_weighted': 0.16770386538391518, 'train_runtime': 2280.5209, 'train_samples_per_second': 225.736, 'train_steps_per_second': 7.055, 'epoch': 5.0}


  0%|          | 0/2640 [00:00<?, ?it/s]

predictions = [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]] (84453, 46)
{'eval_loss': 0.09867217391729355, 'eval_accuracy': 0.11205048962144624, 'eval_f1': 0.18938375227504423, 'eval_f1_macro': 0.03723334856105703, 'eval_f1_weighted': 0.13815616834101277, 'eval_runtime': 373.1646, 'eval_samples_per_second': 226.316, 'eval_steps_per_second': 7.075, 'epoch': 5.0}
{'train_runtime': 26471.8524, 'train_samples_per_second': 97.234, 'train_steps_per_second': 3.039, 'train_loss': 0.11272009233525, 'epoch': 5.0}


TrainOutput(global_step=80440, training_loss=0.11272009233525, metrics={'train_runtime': 26471.8524, 'train_samples_per_second': 97.234, 'train_steps_per_second': 3.039, 'train_loss': 0.11272009233525, 'epoch': 5.0})