# Klasyfikacja tekstu za pomocą BERT i GPT2

In [1]:
from transformers import BertTokenizer, \
    BertForSequenceClassification, \
    Trainer, \
    TrainingArguments, \
    DataCollatorWithPadding, \
    pipeline
from peft import PeftModel, \
    PeftConfig, \
    LoraConfig, \
    TaskType, \
    get_peft_model
from datasets import load_dataset
from sklearn.metrics import accuracy_score, \
    f1_score, \
    precision_score, \
    recall_score
from sklearn.manifold import TSNE
import plotly.express as px
import numpy as np
import pandas as pd
import torch

## BertForSequenceClassification

In [2]:
# Read the JSONL corpus through the generic 'json' loader; the result is
# indexed by split name, and the rest of the notebook uses dataset['train'].
dataset = load_dataset(path = 'json', data_files = '../task_1/data/full_text_classification.jsonl')
# Bare expression: the notebook echoes the split summary (features, row count).
dataset['train']

Dataset({
    features: ['text', 'label'],
    num_rows: 4441
})

In [None]:
class BertSequence():
    """Fine-tune a BERT sequence classifier on the four-class sentiment data.

    The object drives the whole workflow step by step: load the model and
    tokenizer, attach a dataset, split it, tokenize it, build a
    ``transformers.Trainer``, train/evaluate, and finally plot a t-SNE
    projection of the validation-set embeddings.

    Each stage stores its result on the instance and raises ``ValueError``
    when the stage it depends on has not been run yet.
    """

    def __init__(self, model_name, use_lora=False):
        """Load the model/tokenizer and reset all pipeline state.

        Args:
            model_name: Checkpoint name or path passed to ``from_pretrained``.
            use_lora: When true, wrap the classifier in a LoRA adapter.
        """
        # Text label -> integer class id used as the training target.
        self.convert_dict = {
            'pozytywny wydźwięk': 0,
            'neutralny wydźwięk': 1,
            'negatywny wydźwięk': 2,
            'mowa nienawiści': 3
        }

        self.model_name = model_name
        self.use_lora = use_lora
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = self._load_model()
        self.tokenizer = self._load_tokenizer()

        # Filled in by the later pipeline stages.
        self.dataset = None
        self.train_dataset = None
        self.val_dataset = None
        self.train_tokenized = None
        self.val_tokenized = None
        self.trainer = None
        self.sentence_embeddings = None
        self.tsne_df = None

    def _load_model(self):
        """Return the classifier moved to ``self.device``.

        The classification head gets one output per entry of
        ``self.convert_dict``. With ``use_lora`` the model is wrapped by
        ``get_peft_model``. On any loading error the error is printed and
        ``None`` is returned (best-effort behaviour kept from the original).
        """
        try:
            base_model = BertForSequenceClassification.from_pretrained(
                self.model_name, num_labels=len(self.convert_dict)
            )
            if self.use_lora:
                lora_config = LoraConfig(task_type=TaskType.SEQ_CLS,
                                         r=64,
                                         lora_alpha=1,
                                         lora_dropout=0.1)
                base_model = get_peft_model(model=base_model, peft_config=lora_config)
            return base_model.to(self.device)

        except Exception as e:
            print("Error loading model:", e)
            return None

    def _load_tokenizer(self):
        """Return the tokenizer for ``self.model_name``; print and return ``None`` on error."""
        try:
            return BertTokenizer.from_pretrained(self.model_name)
        except Exception as e:
            print("Error loading tokenizer:", e)
            return None

    def load_data(self, dataset):
        """Attach ``dataset`` (indexed by split name, ``'train'`` is used) and return it."""
        self.dataset = dataset
        return self.dataset

    def _convert_labels(self, text):
        """Replace the textual ``'label'`` of one example with its class id.

        Labels missing from ``self.convert_dict`` become ``-1``. The example
        dict is modified in place and returned.
        """
        text['label'] = self.convert_dict.get(text['label'], -1)
        return text

    def split_data(self, test_size=0.2, seed=None):
        """Split ``dataset['train']`` into train/validation parts with numeric labels.

        Args:
            test_size: Fraction (or count) of examples held out for validation.
            seed: Optional seed forwarded to ``train_test_split`` so the split
                can be reproduced; ``None`` keeps the original random split.

        Returns:
            Tuple ``(train_dataset, val_dataset)``.

        Raises:
            ValueError: If ``load_data`` has not been called.
        """
        if self.dataset is None:
            raise ValueError("Dataset is not loaded.")

        dataset_to_split = self.dataset['train'].train_test_split(test_size=test_size, seed=seed)
        self.train_dataset = dataset_to_split['train'].map(self._convert_labels)
        self.val_dataset = dataset_to_split['test'].map(self._convert_labels)

        return self.train_dataset, self.val_dataset

    def _tokenize(self, batch):
        """Tokenize a batch of texts, padded/truncated to 128 tokens."""
        return self.tokenizer(batch['text'], padding='max_length', truncation=True, max_length=128)

    def tokenize_data(self):
        """Tokenize both splits and expose them as torch tensors.

        Returns:
            Tuple ``(train_tokenized, val_tokenized)``.

        Raises:
            ValueError: If ``split_data`` has not been called.
        """
        # Compare with None explicitly: an empty dataset is falsy as well and
        # would otherwise be misreported as "not initialized".
        if self.train_dataset is None or self.val_dataset is None:
            raise ValueError("Train/validation datasets are not initialized.")

        self.train_tokenized = self.train_dataset.map(self._tokenize, batched=True)
        self.val_tokenized = self.val_dataset.map(self._tokenize, batched=True)

        columns = ['input_ids', 'attention_mask', 'label']
        self.train_tokenized.set_format('torch', columns=columns)
        self.val_tokenized.set_format('torch', columns=columns)

        return self.train_tokenized, self.val_tokenized

    def _compute_metrics(self, pred):
        """Return accuracy and weighted F1/precision/recall for a prediction object.

        ``pred`` must expose ``label_ids`` and ``predictions`` (class scores);
        the predicted class is the argmax over axis 1.
        """
        labels = pred.label_ids
        preds = np.argmax(pred.predictions, axis=1)
        accuracy = accuracy_score(labels, preds)
        f1 = f1_score(labels, preds, average='weighted')
        precision = precision_score(labels, preds, average='weighted', zero_division=np.nan)
        recall = recall_score(labels, preds, average='weighted', zero_division=np.nan)
        return {'accuracy': accuracy, 'f1': f1, 'precision': precision, 'recall': recall}

    def create_trainer(self, output_dir='./results'):
        """Build the ``Trainer`` (evaluation after every epoch) and return ``self``.

        Returning ``self`` lets callers chain ``create_trainer().train()``.

        Raises:
            ValueError: If ``tokenize_data`` has not been called.
        """
        if self.train_tokenized is None or self.val_tokenized is None:
            raise ValueError("Tokenized datasets are not initialized.")

        training_args = TrainingArguments(
            output_dir=output_dir,
            eval_strategy='epoch',
            learning_rate=2e-3,
            per_device_train_batch_size=4,
            per_device_eval_batch_size=4,
            num_train_epochs=10,
            weight_decay=0.01,
            logging_dir='./logs',
        )

        self.trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=self.train_tokenized,
            eval_dataset=self.val_tokenized,
            data_collator=DataCollatorWithPadding(tokenizer=self.tokenizer),
            compute_metrics=self._compute_metrics,
        )

        return self

    def train(self):
        """Run ``Trainer.train`` and return its result.

        Raises:
            ValueError: If ``create_trainer`` has not been called.
        """
        if self.trainer is None:
            raise ValueError("Trainer is not initialized.")
        return self.trainer.train()

    def evaluate(self):
        """Run ``Trainer.evaluate`` and return the metrics.

        Raises:
            ValueError: If ``create_trainer`` has not been called.
        """
        if self.trainer is None:
            raise ValueError("Trainer is not initialized.")
        return self.trainer.evaluate()

    def _create_embeddings(self, batch_size=32):
        """Store one final-layer vector per validation text in ``sentence_embeddings``.

        The vector is the hidden state at position 0 (the ``[CLS]`` slot of
        BERT tokenizers). Texts are processed in mini-batches so the whole
        validation set is never pushed through the model at once, and the
        model is switched to eval mode for the duration so dropout does not
        perturb the embeddings; the previous mode is restored afterwards.

        Args:
            batch_size: Number of texts per forward pass.

        Raises:
            ValueError: If the validation split is missing or empty.
        """
        if self.val_dataset is None:
            raise ValueError("Validation dataset is not initialized.")

        texts = list(self.val_dataset['text'])
        if not texts:
            raise ValueError("Validation dataset is empty.")

        was_training = self.model.training
        self.model.eval()
        chunks = []
        try:
            with torch.no_grad():
                for start in range(0, len(texts), batch_size):
                    inputs = self.tokenizer(texts[start:start + batch_size],
                                            padding=True,
                                            truncation=True,
                                            max_length=128,
                                            return_tensors='pt').to(self.device)
                    outputs = self.model(**inputs, output_hidden_states=True)
                    # Move each chunk off the device immediately to cap memory use.
                    chunks.append(outputs.hidden_states[-1][:, 0, :].cpu())
        finally:
            self.model.train(was_training)

        self.sentence_embeddings = torch.cat(chunks, dim=0)

    def plot_embeddings(self, perplexity=2):
        """Project validation embeddings to 2-D with t-SNE and show a scatter plot.

        The projected points, their textual labels and the source texts are
        kept in ``self.tsne_df``.

        Args:
            perplexity: t-SNE perplexity (the previously hard-coded value is the default).

        Raises:
            ValueError: If the validation split is missing or empty.
        """
        self._create_embeddings()

        tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
        tsne_results = tsne.fit_transform(self.sentence_embeddings.numpy())

        self.tsne_df = pd.DataFrame(tsne_results, columns=['x', 'y'])
        # Invert the mapping to show human-readable class names in the legend.
        label_mapping = {v: k for k, v in self.convert_dict.items()}
        self.tsne_df['label'] = [label_mapping[label] for label in self.val_dataset['label']]
        self.tsne_df['text'] = self.val_dataset['text']

        color_map = px.colors.qualitative.Vivid[:len(self.convert_dict)]

        fig = px.scatter(
            self.tsne_df, x='x', y='y', color='label',
            title='Wizualizacja osadzeń SentenceBERT przy użyciu t-SNE',
            labels={'label': 'Wydźwięk'},
            hover_name=self.tsne_df['text'],
            color_discrete_sequence=color_map
        )

        fig.update_traces(marker=dict(size=10), selector=dict(mode='markers'))
        fig.show()

In [10]:
def run_model(model_name, dataset, use_lora = False):
    """Train a ``BertSequence`` classifier end to end and plot its embeddings.

    Runs the stages in order: attach the dataset, split, tokenize, build the
    trainer, train, then show the t-SNE plot. No separate evaluation call is
    made here. Always returns ``None``.
    """
    classifier = BertSequence(model_name, use_lora)
    classifier.load_data(dataset)
    classifier.split_data()
    classifier.tokenize_data()
    # create_trainer() hands back the wrapper itself, so train() chains on it.
    classifier.create_trainer().train()
    classifier.plot_embeddings()

In [11]:
# Fine-tune the Polish uncased BERT checkpoint with LoRA adapters enabled.
run_model('dkleczek/bert-base-polish-uncased-v1', dataset, use_lora = True)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dkleczek/bert-base-polish-uncased-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/531M [00:00<?, ?B/s]

Map:   0%|          | 0/3552 [00:00<?, ? examples/s]

Map:   0%|          | 0/889 [00:00<?, ? examples/s]

Map:   0%|          | 0/3552 [00:00<?, ? examples/s]

Map:   0%|          | 0/889 [00:00<?, ? examples/s]

  0%|          | 0/8880 [00:00<?, ?it/s]

{'loss': 1.1038, 'grad_norm': 4.716344356536865, 'learning_rate': 0.0018873873873873875, 'epoch': 0.56}


  0%|          | 0/223 [00:00<?, ?it/s]

{'eval_loss': 0.897403359413147, 'eval_accuracy': 0.6659167604049494, 'eval_f1': 0.6196321972033542, 'eval_precision': 0.6575931534758528, 'eval_recall': 0.6659167604049494, 'eval_runtime': 5.2934, 'eval_samples_per_second': 167.944, 'eval_steps_per_second': 42.128, 'epoch': 1.0}
{'loss': 0.9533, 'grad_norm': 4.466608047485352, 'learning_rate': 0.001774774774774775, 'epoch': 1.13}
{'loss': 0.8656, 'grad_norm': 4.799319267272949, 'learning_rate': 0.0016621621621621622, 'epoch': 1.69}


  0%|          | 0/223 [00:00<?, ?it/s]

{'eval_loss': 0.9895046353340149, 'eval_accuracy': 0.6231721034870641, 'eval_f1': 0.6254349754450558, 'eval_precision': 0.7107210259566566, 'eval_recall': 0.6231721034870641, 'eval_runtime': 5.3517, 'eval_samples_per_second': 166.116, 'eval_steps_per_second': 41.669, 'epoch': 2.0}
{'loss': 0.8125, 'grad_norm': 3.8702425956726074, 'learning_rate': 0.0015495495495495494, 'epoch': 2.25}
{'loss': 0.7427, 'grad_norm': 7.279901504516602, 'learning_rate': 0.0014369369369369369, 'epoch': 2.82}


  0%|          | 0/223 [00:00<?, ?it/s]

{'eval_loss': 1.0003235340118408, 'eval_accuracy': 0.6749156355455568, 'eval_f1': 0.6761648688755918, 'eval_precision': 0.7009551758663655, 'eval_recall': 0.6749156355455568, 'eval_runtime': 5.2265, 'eval_samples_per_second': 170.096, 'eval_steps_per_second': 42.667, 'epoch': 3.0}
{'loss': 0.666, 'grad_norm': 5.679377555847168, 'learning_rate': 0.0013243243243243243, 'epoch': 3.38}
{'loss': 0.6437, 'grad_norm': 1.9778218269348145, 'learning_rate': 0.0012117117117117118, 'epoch': 3.94}


  0%|          | 0/223 [00:00<?, ?it/s]

{'eval_loss': 1.0592702627182007, 'eval_accuracy': 0.6782902137232846, 'eval_f1': 0.6765662713707826, 'eval_precision': 0.681030878698811, 'eval_recall': 0.6782902137232846, 'eval_runtime': 5.2404, 'eval_samples_per_second': 169.643, 'eval_steps_per_second': 42.554, 'epoch': 4.0}
{'loss': 0.5078, 'grad_norm': 4.285781383514404, 'learning_rate': 0.0010990990990990992, 'epoch': 4.5}


  0%|          | 0/223 [00:00<?, ?it/s]

{'eval_loss': 1.1986024379730225, 'eval_accuracy': 0.6816647919010124, 'eval_f1': 0.6778679155256576, 'eval_precision': 0.678930480804181, 'eval_recall': 0.6816647919010124, 'eval_runtime': 5.2429, 'eval_samples_per_second': 169.561, 'eval_steps_per_second': 42.533, 'epoch': 5.0}
{'loss': 0.5052, 'grad_norm': 0.9894145727157593, 'learning_rate': 0.0009864864864864865, 'epoch': 5.07}
{'loss': 0.3915, 'grad_norm': 4.474998474121094, 'learning_rate': 0.0008738738738738738, 'epoch': 5.63}


  0%|          | 0/223 [00:00<?, ?it/s]

{'eval_loss': 1.3959490060806274, 'eval_accuracy': 0.6681664791901012, 'eval_f1': 0.6709587775376253, 'eval_precision': 0.6790642209370453, 'eval_recall': 0.6681664791901012, 'eval_runtime': 5.2368, 'eval_samples_per_second': 169.76, 'eval_steps_per_second': 42.583, 'epoch': 6.0}
{'loss': 0.3658, 'grad_norm': 0.08579394221305847, 'learning_rate': 0.0007612612612612613, 'epoch': 6.19}
{'loss': 0.3271, 'grad_norm': 4.710804462432861, 'learning_rate': 0.0006486486486486487, 'epoch': 6.76}


  0%|          | 0/223 [00:00<?, ?it/s]

{'eval_loss': 1.5685820579528809, 'eval_accuracy': 0.6816647919010124, 'eval_f1': 0.684939817894089, 'eval_precision': 0.6902690211558329, 'eval_recall': 0.6816647919010124, 'eval_runtime': 5.2401, 'eval_samples_per_second': 169.655, 'eval_steps_per_second': 42.557, 'epoch': 7.0}
{'loss': 0.2519, 'grad_norm': 0.055339112877845764, 'learning_rate': 0.000536036036036036, 'epoch': 7.32}
{'loss': 0.2371, 'grad_norm': 0.1736263483762741, 'learning_rate': 0.0004234234234234234, 'epoch': 7.88}


  0%|          | 0/223 [00:00<?, ?it/s]

{'eval_loss': 1.658768892288208, 'eval_accuracy': 0.6794150731158605, 'eval_f1': 0.6808398636603972, 'eval_precision': 0.6837773292938014, 'eval_recall': 0.6794150731158605, 'eval_runtime': 5.2417, 'eval_samples_per_second': 169.602, 'eval_steps_per_second': 42.543, 'epoch': 8.0}
{'loss': 0.1804, 'grad_norm': 0.2809048295021057, 'learning_rate': 0.0003108108108108108, 'epoch': 8.45}


  0%|          | 0/223 [00:00<?, ?it/s]

{'eval_loss': 1.786131739616394, 'eval_accuracy': 0.6850393700787402, 'eval_f1': 0.6856838653630779, 'eval_precision': 0.6871661056190472, 'eval_recall': 0.6850393700787402, 'eval_runtime': 5.2394, 'eval_samples_per_second': 169.675, 'eval_steps_per_second': 42.562, 'epoch': 9.0}
{'loss': 0.1866, 'grad_norm': 0.6911212205886841, 'learning_rate': 0.0001981981981981982, 'epoch': 9.01}
{'loss': 0.1471, 'grad_norm': 7.388421535491943, 'learning_rate': 8.558558558558558e-05, 'epoch': 9.57}


  0%|          | 0/223 [00:00<?, ?it/s]

{'eval_loss': 1.8064097166061401, 'eval_accuracy': 0.6827896512935883, 'eval_f1': 0.6843783742826413, 'eval_precision': 0.6869898535708214, 'eval_recall': 0.6827896512935883, 'eval_runtime': 5.2259, 'eval_samples_per_second': 170.115, 'eval_steps_per_second': 42.672, 'epoch': 10.0}
{'train_runtime': 576.8707, 'train_samples_per_second': 61.574, 'train_steps_per_second': 15.393, 'train_loss': 0.5065217576585375, 'epoch': 10.0}


In [None]:
import torch
from transformers import AutoTokenizer, GPT2ForSequenceClassification, GPT2Tokenizer

class GPT2Sequence():
    """Fine-tune a GPT-2 sequence classifier on the four-class sentiment data.

    The object drives the whole workflow step by step: load the model and
    tokenizer, attach a dataset, split it, tokenize it, build a
    ``transformers.Trainer``, train/evaluate, and finally plot a t-SNE
    projection of the validation-set embeddings.

    Each stage stores its result on the instance and raises ``ValueError``
    when the stage it depends on has not been run yet.
    """

    def __init__(self, model_name, use_lora=False):
        """Load the model/tokenizer and reset all pipeline state.

        Args:
            model_name: Checkpoint name or path passed to ``from_pretrained``.
            use_lora: When true, wrap the classifier in a LoRA adapter.
        """
        # Text label -> integer class id used as the training target.
        self.convert_dict = {
            'pozytywny wydźwięk': 0,
            'neutralny wydźwięk': 1,
            'negatywny wydźwięk': 2,
            'mowa nienawiści': 3
        }

        self.model_name = model_name
        self.use_lora = use_lora
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = self._load_model()
        self.tokenizer = self._load_tokenizer()
        self._ensure_pad_token()

        # Filled in by the later pipeline stages.
        self.dataset = None
        self.train_dataset = None
        self.val_dataset = None
        self.train_tokenized = None
        self.val_tokenized = None
        self.trainer = None
        self.sentence_embeddings = None
        self.tsne_df = None

    def _load_model(self):
        """Return the classifier moved to ``self.device``.

        The classification head gets one output per entry of
        ``self.convert_dict``. With ``use_lora`` the model is wrapped by
        ``get_peft_model``. On any loading error the error is printed and
        ``None`` is returned (best-effort behaviour kept from the original).
        """
        try:
            base_model = GPT2ForSequenceClassification.from_pretrained(
                self.model_name, num_labels=len(self.convert_dict)
            )
            if self.use_lora:
                lora_config = LoraConfig(task_type=TaskType.SEQ_CLS,
                                         r=64,
                                         lora_alpha=1,
                                         lora_dropout=0.1)
                base_model = get_peft_model(model=base_model, peft_config=lora_config)
            return base_model.to(self.device)

        except Exception as e:
            print("Error loading model:", e)
            return None

    def _load_tokenizer(self):
        """Return the tokenizer for ``self.model_name``; print and return ``None`` on error."""
        try:
            return GPT2Tokenizer.from_pretrained(self.model_name)
        except Exception as e:
            print("Error loading tokenizer:", e)
            return None

    def _ensure_pad_token(self):
        """Give tokenizer and model a padding token when the checkpoint has none.

        Padding in ``_tokenize`` needs a pad token, and the classifier needs
        ``config.pad_token_id`` to locate the last real token of each row.
        Values that are already set are left untouched; the end-of-sequence
        token is used only as a fallback. Does nothing if loading failed.
        """
        if self.model is None or self.tokenizer is None:
            return
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        if self.model.config.pad_token_id is None:
            self.model.config.pad_token_id = self.tokenizer.pad_token_id

    def load_data(self, dataset):
        """Attach ``dataset`` (indexed by split name, ``'train'`` is used) and return it."""
        self.dataset = dataset
        return self.dataset

    def _convert_labels(self, text):
        """Replace the textual ``'label'`` of one example with its class id.

        Labels missing from ``self.convert_dict`` become ``-1``. The example
        dict is modified in place and returned.
        """
        text['label'] = self.convert_dict.get(text['label'], -1)
        return text

    def split_data(self, test_size=0.2, seed=None):
        """Split ``dataset['train']`` into train/validation parts with numeric labels.

        Args:
            test_size: Fraction (or count) of examples held out for validation.
            seed: Optional seed forwarded to ``train_test_split`` so the split
                can be reproduced; ``None`` keeps the original random split.

        Returns:
            Tuple ``(train_dataset, val_dataset)``.

        Raises:
            ValueError: If ``load_data`` has not been called.
        """
        if self.dataset is None:
            raise ValueError("Dataset is not loaded.")

        dataset_to_split = self.dataset['train'].train_test_split(test_size=test_size, seed=seed)
        self.train_dataset = dataset_to_split['train'].map(self._convert_labels)
        self.val_dataset = dataset_to_split['test'].map(self._convert_labels)

        return self.train_dataset, self.val_dataset

    def _tokenize(self, batch):
        """Tokenize a batch of texts, padded/truncated to 128 tokens."""
        return self.tokenizer(batch['text'], padding='max_length', truncation=True, max_length=128)

    def tokenize_data(self):
        """Tokenize both splits and expose them as torch tensors.

        Returns:
            Tuple ``(train_tokenized, val_tokenized)``.

        Raises:
            ValueError: If ``split_data`` has not been called.
        """
        # Compare with None explicitly: an empty dataset is falsy as well and
        # would otherwise be misreported as "not initialized".
        if self.train_dataset is None or self.val_dataset is None:
            raise ValueError("Train/validation datasets are not initialized.")

        self.train_tokenized = self.train_dataset.map(self._tokenize, batched=True)
        self.val_tokenized = self.val_dataset.map(self._tokenize, batched=True)

        columns = ['input_ids', 'attention_mask', 'label']
        self.train_tokenized.set_format('torch', columns=columns)
        self.val_tokenized.set_format('torch', columns=columns)

        return self.train_tokenized, self.val_tokenized

    def _compute_metrics(self, pred):
        """Return accuracy and weighted F1/precision/recall for a prediction object.

        ``pred`` must expose ``label_ids`` and ``predictions`` (class scores);
        the predicted class is the argmax over axis 1.
        """
        labels = pred.label_ids
        preds = np.argmax(pred.predictions, axis=1)
        accuracy = accuracy_score(labels, preds)
        f1 = f1_score(labels, preds, average='weighted')
        precision = precision_score(labels, preds, average='weighted', zero_division=np.nan)
        recall = recall_score(labels, preds, average='weighted', zero_division=np.nan)
        return {'accuracy': accuracy, 'f1': f1, 'precision': precision, 'recall': recall}

    def create_trainer(self, output_dir='./results_gpt'):
        """Build the ``Trainer`` (evaluation after every epoch) and return ``self``.

        Returning ``self`` lets callers chain ``create_trainer().train()``.

        Raises:
            ValueError: If ``tokenize_data`` has not been called.
        """
        if self.train_tokenized is None or self.val_tokenized is None:
            raise ValueError("Tokenized datasets are not initialized.")

        training_args = TrainingArguments(
            output_dir=output_dir,
            eval_strategy='epoch',
            learning_rate=2e-3,
            per_device_train_batch_size=4,
            per_device_eval_batch_size=4,
            num_train_epochs=10,
            weight_decay=0.01,
            logging_dir='./logs',
        )

        self.trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=self.train_tokenized,
            eval_dataset=self.val_tokenized,
            data_collator=DataCollatorWithPadding(tokenizer=self.tokenizer),
            compute_metrics=self._compute_metrics,
        )

        return self

    def train(self):
        """Run ``Trainer.train`` and return its result.

        Raises:
            ValueError: If ``create_trainer`` has not been called.
        """
        if self.trainer is None:
            raise ValueError("Trainer is not initialized.")
        return self.trainer.train()

    def evaluate(self):
        """Run ``Trainer.evaluate`` and return the metrics.

        Raises:
            ValueError: If ``create_trainer`` has not been called.
        """
        if self.trainer is None:
            raise ValueError("Trainer is not initialized.")
        return self.trainer.evaluate()

    def _create_embeddings(self, batch_size=32):
        """Store one final-layer vector per validation text in ``sentence_embeddings``.

        GPT-2 attends left to right, so the hidden state at position 0 has
        seen only the first token and cannot represent the sentence. The
        vector of the last non-padding token is used instead, located through
        the attention mask so it works for right- and left-padded batches.

        Texts are processed in mini-batches so the whole validation set is
        never pushed through the model at once, and the model is switched to
        eval mode for the duration so dropout does not perturb the embeddings;
        the previous mode is restored afterwards.

        Args:
            batch_size: Number of texts per forward pass.

        Raises:
            ValueError: If the validation split is missing or empty.
        """
        if self.val_dataset is None:
            raise ValueError("Validation dataset is not initialized.")

        texts = list(self.val_dataset['text'])
        if not texts:
            raise ValueError("Validation dataset is empty.")

        was_training = self.model.training
        self.model.eval()
        chunks = []
        try:
            with torch.no_grad():
                for start in range(0, len(texts), batch_size):
                    inputs = self.tokenizer(texts[start:start + batch_size],
                                            padding=True,
                                            truncation=True,
                                            max_length=128,
                                            return_tensors='pt').to(self.device)
                    outputs = self.model(**inputs, output_hidden_states=True)
                    hidden = outputs.hidden_states[-1]

                    mask = inputs['attention_mask']
                    # (position + 1) * mask is largest at the last real token
                    # and is free of ties, so argmax picks it unambiguously.
                    positions = torch.arange(mask.shape[1], device=mask.device)
                    last_index = ((positions + 1) * mask).argmax(dim=1)
                    rows = torch.arange(hidden.shape[0], device=hidden.device)
                    # Move each chunk off the device immediately to cap memory use.
                    chunks.append(hidden[rows, last_index].cpu())
        finally:
            self.model.train(was_training)

        self.sentence_embeddings = torch.cat(chunks, dim=0)

    def plot_embeddings(self, perplexity=2):
        """Project validation embeddings to 2-D with t-SNE and show a scatter plot.

        The projected points, their textual labels and the source texts are
        kept in ``self.tsne_df``.

        Args:
            perplexity: t-SNE perplexity (the previously hard-coded value is the default).

        Raises:
            ValueError: If the validation split is missing or empty.
        """
        self._create_embeddings()

        tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
        tsne_results = tsne.fit_transform(self.sentence_embeddings.numpy())

        self.tsne_df = pd.DataFrame(tsne_results, columns=['x', 'y'])
        # Invert the mapping to show human-readable class names in the legend.
        label_mapping = {v: k for k, v in self.convert_dict.items()}
        self.tsne_df['label'] = [label_mapping[label] for label in self.val_dataset['label']]
        self.tsne_df['text'] = self.val_dataset['text']

        color_map = px.colors.qualitative.Vivid[:len(self.convert_dict)]

        fig = px.scatter(
            self.tsne_df, x='x', y='y', color='label',
            title='Wizualizacja osadzeń SentenceGPT2 przy użyciu t-SNE',
            labels={'label': 'Wydźwięk'},
            hover_name=self.tsne_df['text'],
            color_discrete_sequence=color_map
        )

        fig.update_traces(marker=dict(size=10), selector=dict(mode='markers'))
        fig.show()

In [4]:
def run_model2(model_name, dataset, use_lora = False):
    """Train a ``GPT2Sequence`` classifier end to end and plot its embeddings.

    Runs the stages in order: attach the dataset, split, tokenize, build the
    trainer, train, then show the t-SNE plot. No separate evaluation call is
    made here. Always returns ``None``.
    """
    classifier = GPT2Sequence(model_name, use_lora)
    classifier.load_data(dataset)
    classifier.split_data()
    classifier.tokenize_data()
    # create_trainer() hands back the wrapper itself, so train() chains on it.
    classifier.create_trainer().train()
    classifier.plot_embeddings()

In [5]:
# Ask PyTorch to release its cached, currently unused CUDA memory before the next run.
torch.cuda.empty_cache()

In [6]:
# Bare expression: the notebook echoes which device (CUDA or CPU) will be selected.
torch.device('cuda' if torch.cuda.is_available() else 'cpu')

device(type='cuda')

In [7]:
# Fine-tune the Polish GPT-2 sentiment checkpoint with LoRA adapters enabled.
run_model2('nie3e/sentiment-polish-gpt2-small', dataset, use_lora = True)



Map:   0%|          | 0/3552 [00:00<?, ? examples/s]

Map:   0%|          | 0/889 [00:00<?, ? examples/s]

Map:   0%|          | 0/3552 [00:00<?, ? examples/s]

Map:   0%|          | 0/889 [00:00<?, ? examples/s]

  0%|          | 0/17760 [00:00<?, ?it/s]

{'loss': 1.4975, 'grad_norm': 5.716457366943359, 'learning_rate': 0.0019436936936936937, 'epoch': 0.28}
{'loss': 1.2258, 'grad_norm': 2.6070051193237305, 'learning_rate': 0.0018873873873873875, 'epoch': 0.56}
{'loss': 1.2752, 'grad_norm': 14.797423362731934, 'learning_rate': 0.0018310810810810811, 'epoch': 0.84}


  0%|          | 0/445 [00:00<?, ?it/s]

{'eval_loss': 1.2143986225128174, 'eval_accuracy': 0.6197975253093363, 'eval_f1': 0.6019116341570533, 'eval_precision': 0.6498644058861676, 'eval_recall': 0.6197975253093363, 'eval_runtime': 7.2922, 'eval_samples_per_second': 121.911, 'eval_steps_per_second': 61.024, 'epoch': 1.0}
{'loss': 1.1787, 'grad_norm': 12.69900894165039, 'learning_rate': 0.001774774774774775, 'epoch': 1.13}
{'loss': 1.1346, 'grad_norm': 5.0469207763671875, 'learning_rate': 0.0017184684684684686, 'epoch': 1.41}
{'loss': 1.0993, 'grad_norm': 1.8780643939971924, 'learning_rate': 0.0016621621621621622, 'epoch': 1.69}
{'loss': 1.1079, 'grad_norm': 13.272486686706543, 'learning_rate': 0.0016058558558558558, 'epoch': 1.97}


  0%|          | 0/445 [00:00<?, ?it/s]

{'eval_loss': 1.177727222442627, 'eval_accuracy': 0.6524184476940382, 'eval_f1': 0.6323884820554004, 'eval_precision': 0.639809183118959, 'eval_recall': 0.6524184476940382, 'eval_runtime': 7.4568, 'eval_samples_per_second': 119.22, 'eval_steps_per_second': 59.677, 'epoch': 2.0}
{'loss': 1.0295, 'grad_norm': 14.296765327453613, 'learning_rate': 0.0015495495495495494, 'epoch': 2.25}
{'loss': 1.0086, 'grad_norm': 0.08649056404829025, 'learning_rate': 0.0014932432432432433, 'epoch': 2.53}
{'loss': 1.0499, 'grad_norm': 14.359973907470703, 'learning_rate': 0.0014369369369369369, 'epoch': 2.82}


  0%|          | 0/445 [00:00<?, ?it/s]

{'eval_loss': 1.0979920625686646, 'eval_accuracy': 0.6670416197975253, 'eval_f1': 0.6349533173882673, 'eval_precision': 0.6649527624510556, 'eval_recall': 0.6670416197975253, 'eval_runtime': 7.0626, 'eval_samples_per_second': 125.874, 'eval_steps_per_second': 63.008, 'epoch': 3.0}
{'loss': 0.9613, 'grad_norm': 0.6425528526306152, 'learning_rate': 0.0013806306306306307, 'epoch': 3.1}
{'loss': 0.9622, 'grad_norm': 0.7175297737121582, 'learning_rate': 0.0013243243243243243, 'epoch': 3.38}
{'loss': 1.0186, 'grad_norm': 13.689398765563965, 'learning_rate': 0.0012680180180180182, 'epoch': 3.66}
{'loss': 0.9303, 'grad_norm': 1.7547354698181152, 'learning_rate': 0.0012117117117117118, 'epoch': 3.94}


  0%|          | 0/445 [00:00<?, ?it/s]

{'eval_loss': 1.4888936281204224, 'eval_accuracy': 0.6647919010123734, 'eval_f1': 0.66077260035795, 'eval_precision': 0.675388642486584, 'eval_recall': 0.6647919010123734, 'eval_runtime': 7.4681, 'eval_samples_per_second': 119.04, 'eval_steps_per_second': 59.587, 'epoch': 4.0}
{'loss': 0.7829, 'grad_norm': 16.040124893188477, 'learning_rate': 0.0011554054054054054, 'epoch': 4.22}
{'loss': 0.8378, 'grad_norm': 0.5160068273544312, 'learning_rate': 0.0010990990990990992, 'epoch': 4.5}
{'loss': 0.7974, 'grad_norm': 13.343104362487793, 'learning_rate': 0.0010427927927927926, 'epoch': 4.79}


  0%|          | 0/445 [00:00<?, ?it/s]

{'eval_loss': 1.489951252937317, 'eval_accuracy': 0.6647919010123734, 'eval_f1': 0.6606585319040607, 'eval_precision': 0.6636594953296536, 'eval_recall': 0.6647919010123734, 'eval_runtime': 7.3163, 'eval_samples_per_second': 121.51, 'eval_steps_per_second': 60.823, 'epoch': 5.0}
{'loss': 0.8085, 'grad_norm': 0.5473710894584656, 'learning_rate': 0.0009864864864864865, 'epoch': 5.07}
{'loss': 0.6941, 'grad_norm': 0.2703695595264435, 'learning_rate': 0.0009301801801801802, 'epoch': 5.35}
{'loss': 0.7438, 'grad_norm': 0.15790234506130219, 'learning_rate': 0.0008738738738738738, 'epoch': 5.63}
{'loss': 0.7969, 'grad_norm': 0.012339428067207336, 'learning_rate': 0.0008175675675675675, 'epoch': 5.91}


  0%|          | 0/445 [00:00<?, ?it/s]

{'eval_loss': 1.3958334922790527, 'eval_accuracy': 0.6692913385826772, 'eval_f1': 0.6682472876231125, 'eval_precision': 0.6718663316877177, 'eval_recall': 0.6692913385826772, 'eval_runtime': 7.261, 'eval_samples_per_second': 122.434, 'eval_steps_per_second': 61.286, 'epoch': 6.0}
{'loss': 0.5798, 'grad_norm': 0.05864034965634346, 'learning_rate': 0.0007612612612612613, 'epoch': 6.19}
{'loss': 0.6563, 'grad_norm': 0.4908212423324585, 'learning_rate': 0.000704954954954955, 'epoch': 6.48}
{'loss': 0.5281, 'grad_norm': 0.00815130490809679, 'learning_rate': 0.0006486486486486487, 'epoch': 6.76}


  0%|          | 0/445 [00:00<?, ?it/s]

{'eval_loss': 1.8457720279693604, 'eval_accuracy': 0.6692913385826772, 'eval_f1': 0.6648587520541915, 'eval_precision': 0.661917724130867, 'eval_recall': 0.6692913385826772, 'eval_runtime': 7.2068, 'eval_samples_per_second': 123.356, 'eval_steps_per_second': 61.747, 'epoch': 7.0}
{'loss': 0.5747, 'grad_norm': 0.7912495732307434, 'learning_rate': 0.0005923423423423423, 'epoch': 7.04}
{'loss': 0.4101, 'grad_norm': 6.486070156097412, 'learning_rate': 0.000536036036036036, 'epoch': 7.32}
{'loss': 0.5143, 'grad_norm': 7.916805744171143, 'learning_rate': 0.00047972972972972974, 'epoch': 7.6}
{'loss': 0.5061, 'grad_norm': 16.732786178588867, 'learning_rate': 0.0004234234234234234, 'epoch': 7.88}


  0%|          | 0/445 [00:00<?, ?it/s]

{'eval_loss': 2.154465436935425, 'eval_accuracy': 0.6591676040494938, 'eval_f1': 0.6601210023854296, 'eval_precision': 0.6666453581072639, 'eval_recall': 0.6591676040494938, 'eval_runtime': 7.3117, 'eval_samples_per_second': 121.587, 'eval_steps_per_second': 60.862, 'epoch': 8.0}
{'loss': 0.3873, 'grad_norm': 0.002278523053973913, 'learning_rate': 0.00036711711711711714, 'epoch': 8.16}
{'loss': 0.358, 'grad_norm': 0.00018282837118022144, 'learning_rate': 0.0003108108108108108, 'epoch': 8.45}
{'loss': 0.3667, 'grad_norm': 3.6958768367767334, 'learning_rate': 0.00025450450450450454, 'epoch': 8.73}


  0%|          | 0/445 [00:00<?, ?it/s]

{'eval_loss': 2.6419312953948975, 'eval_accuracy': 0.6659167604049494, 'eval_f1': 0.6684250070769266, 'eval_precision': 0.6789349671420387, 'eval_recall': 0.6659167604049494, 'eval_runtime': 7.2279, 'eval_samples_per_second': 122.996, 'eval_steps_per_second': 61.567, 'epoch': 9.0}
{'loss': 0.3624, 'grad_norm': 0.01617749035358429, 'learning_rate': 0.0001981981981981982, 'epoch': 9.01}
{'loss': 0.3263, 'grad_norm': 0.00013825162022840232, 'learning_rate': 0.00014189189189189188, 'epoch': 9.29}
{'loss': 0.2924, 'grad_norm': 18.87890625, 'learning_rate': 8.558558558558558e-05, 'epoch': 9.57}
{'loss': 0.2645, 'grad_norm': 0.0017349281115457416, 'learning_rate': 2.927927927927928e-05, 'epoch': 9.85}


  0%|          | 0/445 [00:00<?, ?it/s]

{'eval_loss': 2.5819807052612305, 'eval_accuracy': 0.672665916760405, 'eval_f1': 0.6725826501568952, 'eval_precision': 0.6746002486912401, 'eval_recall': 0.672665916760405, 'eval_runtime': 7.1948, 'eval_samples_per_second': 123.561, 'eval_steps_per_second': 61.85, 'epoch': 10.0}
{'train_runtime': 863.5818, 'train_samples_per_second': 41.131, 'train_steps_per_second': 20.566, 'train_loss': 0.7655113729270728, 'epoch': 10.0}
