#### Imports


In [26]:
import pandas as pd
import torch
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from datasets import Dataset
from huggingface_hub import login
import logging
from transformers import (
    TrainingArguments,
    Trainer,
    AutoModelForSequenceClassification,
    # tokenizers
    AutoTokenizer,
    DebertaV2Tokenizer,
    DistilBertTokenizer,
    BertTokenizer,
    RobertaTokenizer,
    ElectraTokenizer,
    AlbertTokenizer,
    XLNetTokenizer,
    MobileBertTokenizer,
    # models
    DebertaV2ForSequenceClassification,
    DistilBertForSequenceClassification,
    BertForSequenceClassification,
    RobertaForSequenceClassification,
    ElectraForSequenceClassification,
    AlbertForSequenceClassification,
    XLNetForSequenceClassification,
    MobileBertForSequenceClassification,
)
from torch.nn import CrossEntropyLoss
# evaluation metrics
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from collections import Counter

import transformers

# Record the installed transformers version and the TrainingArguments class in
# use, one per line, so the run can be matched to a library release later.
print(transformers.__version__, transformers.TrainingArguments, sep="\n")

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

4.29.2
<class 'transformers.training_args.TrainingArguments'>


In [27]:
# Send INFO-and-above records to classification.log and note the compute device.
logging.basicConfig(level=logging.INFO, filename='classification.log')
logging.info("Running on device: %s", device)

In [28]:
# Authenticate against the Hugging Face Hub using the token from the environment.
login(token=os.getenv("HUGGINGFACE_TOKEN"))

# Point all three Hugging Face cache variables at the same directory.
# NOTE(review): transformers and huggingface_hub are imported before this cell
# runs, so they may already have resolved their cache locations - confirm that
# these variables actually take effect in this process.
os.environ.update({
    "HF_HOME": "D:/huggingface_cache",
    "TRANSFORMERS_CACHE": "D:/huggingface_cache",
    "HUGGINGFACE_HUB_CACHE": "D:/huggingface_cache",
})

# Echo the effective values to the cell output ...
print(
    f"HF_HOME: {os.getenv('HF_HOME')}",
    f"TRANSFORMERS_CACHE: {os.getenv('TRANSFORMERS_CACHE')}",
    f"HUGGINGFACE_HUB_CACHE: {os.getenv('HUGGINGFACE_HUB_CACHE')}",
    sep="\n",
)

# ... and to the log file.
logging.info("HF_HOME: %s", os.getenv("HF_HOME"))
logging.info("TRANSFORMERS_CACHE: %s", os.getenv("TRANSFORMERS_CACHE"))
logging.info("HUGGINGFACE_HUB_CACHE: %s", os.getenv("HUGGINGFACE_HUB_CACHE"))

# Also override the module-level cache constant inside transformers directly.
transformers.utils.hub.TRANSFORMERS_CACHE = os.environ["TRANSFORMERS_CACHE"]

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

HF_HOME: D:/huggingface_cache
TRANSFORMERS_CACHE: D:/huggingface_cache
HUGGINGFACE_HUB_CACHE: D:/huggingface_cache


### LOADING SQLITE DB WITH RECORDS



In [29]:
import sqlite3
import json
import pandas as pd

DB_FILE = "chunks.db"
OUTPUT_FILE = "exported_chunks.jsonl"

# Read every (text, label) pair from the chunks table. The try/finally ensures
# the connection is closed even when the query raises, and it is released
# before the (unrelated) file write below starts.
conn = sqlite3.connect(DB_FILE)
try:
    cur = conn.cursor()
    cur.execute("SELECT text, label FROM chunks")
    rows = cur.fetchall()
finally:
    conn.close()

# Write one JSON object per line; the "label" key is omitted for rows whose
# label is NULL in the database.
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    for text, label in rows:
        obj = {"text": text}
        if label is not None:
            obj["label"] = label
        f.write(json.dumps(obj) + "\n")

print("Data exported to JSONL file.")


Data exported to JSONL file.


In [30]:
# Filtering parameters (previously inline magic numbers).
MAX_ROWS = 9000             # only the first MAX_ROWS exported chunks are used
EXCLUDED_LABEL = 11         # rows carrying this label are dropped
MIN_LABEL1_TEXT_LEN = 100   # label-1 rows with shorter text are dropped

# Load the labeled chunks from the JSONL file written by the export cell
with open(OUTPUT_FILE, "r", encoding="utf-8") as f:
    labeled_chunks = [json.loads(line) for line in f]

# Keep the first MAX_ROWS rows
data = pd.DataFrame(labeled_chunks).head(MAX_ROWS)

# Remove rows with the excluded label
data = data[data['label'] != EXCLUDED_LABEL]

# Print labeled count after removing the excluded label
labeled_count = data['label'].value_counts().to_dict()
print(f"Labeled chunks after removing label {EXCLUDED_LABEL}: {labeled_count}")

# Remove rows where label == 1 and the text is shorter than the minimum length
is_short_label1 = (data['label'] == 1) & (data['text'].str.len() < MIN_LABEL1_TEXT_LEN)
data = data[~is_short_label1]

# Print final labeled count
labeled_count = data['label'].value_counts().to_dict()
print(f"Final labeled chunks: {labeled_count}")


Labeled chunks after removing label 11: {1: 8199, 0: 800}
Final labeled chunks: {1: 5384, 0: 800}


##### Splitting data


In [31]:
# Cast labels to int. `data` is the result of boolean filtering, so build a new
# frame with assign() instead of writing a column into it in place, which can
# trigger pandas' SettingWithCopyWarning.
data = data.assign(label=data['label'].astype(int))

# Train-test split using stratified sampling so both splits keep the class ratio
train_df, test_df = train_test_split(data, test_size=0.2, stratify=data['label'], random_state=42)

# The classes are imbalanced, so compute "balanced" class weights from the
# training labels; they are used later in the weighted loss function.
labels = train_df["label"].values
classes = np.unique(labels)
weights = compute_class_weight(class_weight="balanced",
                               classes=classes,
                               y=labels)
# weights[i] belongs to classes[i] (sorted unique labels); it is used as the
# per-class weight vector of the loss, so it is created on the training device.
class_weights = torch.tensor(weights, dtype=torch.float, device=device)
print("Class weights:", class_weights)

# Convert the DataFrames to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_df.reset_index(drop=True))

Class weights: tensor([3.8648, 0.5743], device='cuda:0')


In [32]:

# ---- Tuning parameters ----
# Hyperparameters shared by every run; "max_length" lists the sequence lengths
# that the sweep iterates over.
CONFIG = dict(
    epochs=2,
    batch_size=16,
    max_length=[128, 256, 512],  # max length of input sequences
    learning_rate=5e-5,  # learning rate for the optimizer
    weight_decay=0.01,  # weight decay for regularization
    output_dir="D:/huggingface_cache/classification_models",
)

# ---- Model configurations ----
# One row per candidate: (key, tokenizer class, checkpoint name, model class).
# Row order is the order in which the sweep trains the models.
_MODEL_TABLE = [
    ("deberta", DebertaV2Tokenizer, "microsoft/deberta-v3-small", DebertaV2ForSequenceClassification),  # params 55M
    ("distilbert", DistilBertTokenizer, "distilbert-base-uncased", DistilBertForSequenceClassification),  # params 66M
    ("bert", BertTokenizer, "bert-base-uncased", BertForSequenceClassification),  # params 110M
    ("roberta", RobertaTokenizer, "roberta-base", RobertaForSequenceClassification),  # params 125M
    ("electra", ElectraTokenizer, "google/electra-small-discriminator", ElectraForSequenceClassification),  # params 14M
    ("albert", AlbertTokenizer, "albert-base-v2", AlbertForSequenceClassification),  # params 11M
    ("xlnet", XLNetTokenizer, "xlnet-base-cased", XLNetForSequenceClassification),  # params 110M
    ("mobilebert", AutoTokenizer, "google/mobilebert-uncased", MobileBertForSequenceClassification),  # params 25M
    ("albert-base-v1", AlbertTokenizer, "albert-base-v1", AlbertForSequenceClassification),  # params 12M
    ("albert-large-v2", AlbertTokenizer, "albert-large-v2", AlbertForSequenceClassification),  # params 18M
    ("albert-xlarge-v2", AlbertTokenizer, "albert-xlarge-v2", AlbertForSequenceClassification),  # params 60M
    ("albert-xxlarge-v2", AlbertTokenizer, "albert-xxlarge-v2", AlbertForSequenceClassification),  # params 235M
    ("bert-large-uncased", BertTokenizer, "bert-large-uncased", BertForSequenceClassification),  # params 340M
]

# Expand the table into the dict-of-dicts layout the training code looks up.
MODEL_CONFIGS = {
    key: {
        "tokenizer_class": tokenizer_cls,
        "pretrained_model_name": checkpoint,
        "model_class": model_cls,
    }
    for key, tokenizer_cls, checkpoint, model_cls in _MODEL_TABLE
}


In [33]:

# ---- Metric function ----
def compute_metrics(pred):
    """Compute accuracy and weighted precision / recall / F1 for one evaluation pass.

    Args:
        pred: object exposing `label_ids` (true labels) and `predictions`
            (per-class scores); the arg-max over the last axis is taken as the
            predicted class.

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    # Index 0 = precision, 1 = recall, 2 = F1 (the 4th element is unused).
    weighted = precision_recall_fscore_support(y_true, y_hat, average='weighted')
    return {
        'accuracy': accuracy_score(y_true, y_hat),
        'f1': weighted[2],
        'precision': weighted[0],
        'recall': weighted[1],
    }




In [34]:
# ---- Weighted Trainer ----
class WeightedLossTrainer(Trainer):
    """Trainer whose loss is class-weighted cross-entropy over the model's logits.

    Args:
        class_weights: optional tensor passed as `weight` to CrossEntropyLoss;
            None gives the unweighted loss.
        *args, **kwargs: forwarded unchanged to `Trainer.__init__`.

    Note:
        `class_weights` is the first positional parameter, so pass the regular
        Trainer arguments by keyword to avoid binding one of them to it.
    """

    def __init__(self, class_weights=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the weighted cross-entropy loss (and the model outputs on request).

        Args:
            model: the model being trained or evaluated.
            inputs: batch mapping; must contain "labels".
            return_outputs: when True, return `(loss, outputs)` instead of `loss`.
            num_items_in_batch: accepted for compatibility with newer Trainer
                versions that pass this keyword; not used by this loss.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        weight = self.class_weights
        if weight is not None:
            # Make sure the weights live on the same device as the logits.
            weight = weight.to(logits.device)
        loss_fct = CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss



In [35]:

# ---- Function to train and evaluate ----
def _timed_call(fn, *args, **kwargs):
    """Call `fn(*args, **kwargs)` and return `(result, elapsed_seconds)`.

    Uses a wall-clock timer so it also works on CPU-only machines; when CUDA is
    available the device is synchronized before and after the call so pending
    GPU work is included in the measurement.
    """
    import time

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        torch.cuda.synchronize()
    started = time.perf_counter()
    result = fn(*args, **kwargs)
    if use_cuda:
        torch.cuda.synchronize()
    return result, time.perf_counter() - started


def train_and_evaluate(model_name, train_dataset, test_dataset, data, max_length):
    """Fine-tune one configured model and report its metrics on the test split.

    Args:
        model_name: key into MODEL_CONFIGS selecting tokenizer, checkpoint and model class.
        train_dataset: dataset with "text" and "label" columns used for training.
        test_dataset: dataset with "text" and "label" columns used for evaluation
            during training and for the final predictions.
        data: DataFrame whose distinct 'label' values determine `num_labels`.
        max_length: sequences are truncated / padded to this many tokens.

    Returns:
        dict with the keys model, max_length, train_time_sec, pred_time_sec,
        accuracy, f1, precision, recall (the last three are weighted averages).

    Notes:
        Reads the module-level CONFIG, MODEL_CONFIGS and class_weights.
        NOTE(review): the test split is also the `eval_dataset` used by
        `load_best_model_at_end`, so the checkpoint is selected on the same
        data it is scored on - consider a separate validation split.
    """
    print(f"\n===== Training {model_name} =====")

    # Model + tokenizer
    cfg = MODEL_CONFIGS[model_name]
    tokenizer = cfg["tokenizer_class"].from_pretrained(cfg["pretrained_model_name"])
    model = cfg["model_class"].from_pretrained(
        cfg["pretrained_model_name"],
        num_labels=len(data['label'].unique())
    )

    # Tokenization: every example is padded to exactly max_length tokens
    def tokenize_fn(batch):
        return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=max_length)

    train_enc = train_dataset.map(tokenize_fn, batched=True)
    test_enc = test_dataset.map(tokenize_fn, batched=True)
    # Rename "label" to "labels", the key the weighted loss reads from the batch
    train_enc = train_enc.rename_column("label", "labels")
    test_enc = test_enc.rename_column("label", "labels")
    train_enc.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
    test_enc.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

    # Training args
    training_args = TrainingArguments(
        output_dir=f"{CONFIG['output_dir']}/{model_name}",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=CONFIG["learning_rate"],
        per_device_train_batch_size=CONFIG["batch_size"],
        per_device_eval_batch_size=CONFIG["batch_size"],
        num_train_epochs=CONFIG["epochs"],
        weight_decay=CONFIG["weight_decay"],
        logging_dir=f"./logs_{model_name}",
        report_to="none",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        seed=42
    )

    # Trainer
    trainer = WeightedLossTrainer(
        model=model,
        args=training_args,
        train_dataset=train_enc,
        eval_dataset=test_enc,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        class_weights=class_weights
    )

    # Train (timed)
    _, t_train_time = _timed_call(trainer.train)
    print(f"Training time for {model_name}: {t_train_time:.2f} seconds")

    # Predictions (timed)
    preds, t_pred_time = _timed_call(trainer.predict, test_enc)
    print(f"Prediction time for {model_name}: {t_pred_time:.2f} seconds")
    y_true = preds.label_ids
    y_pred = np.argmax(preds.predictions, axis=1)

    print(f"\nClassification Report for {model_name}:")
    print(classification_report(y_true, y_pred, digits=4))

    # Compute the weighted scores once instead of once per reported metric
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted')

    performance = {
        "model": model_name,
        "max_length": max_length,
        "train_time_sec": t_train_time,
        "pred_time_sec": t_pred_time,
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1,
        "precision": precision,
        "recall": recall,
    }
    return performance

In [36]:
import csv

# Results file and its column order. The order is fixed here so the header and
# every row stay aligned regardless of how the performance dict is built.
PERFORMANCE_CSV = "model_performance.csv"
PERFORMANCE_FIELDS = [
    "model", "max_length", "train_time_sec", "pred_time_sec",
    "accuracy", "f1", "precision", "recall",
]

# Sweep every configured model over every configured max_length.
for model_name in MODEL_CONFIGS:
    for max_len in CONFIG["max_length"]:
        print(f"\n--- Training {model_name} with max_length={max_len} ---")
        performance = train_and_evaluate(model_name, train_dataset, test_dataset, data, max_len)
        print(performance)
        # Append the result right away so finished runs are kept if a later
        # run fails. The header is written only when the file is new or empty.
        needs_header = (
            not os.path.exists(PERFORMANCE_CSV)
            or os.path.getsize(PERFORMANCE_CSV) == 0
        )
        # newline="" is what the csv module expects for files it writes to.
        with open(PERFORMANCE_CSV, "a", newline="") as f:
            writer = csv.DictWriter(f, fieldnames=PERFORMANCE_FIELDS)
            if needs_header:
                writer.writeheader()
            # Raises ValueError if `performance` has a key outside
            # PERFORMANCE_FIELDS, instead of silently misaligning columns.
            writer.writerow(performance)





--- Training deberta with max_length=128 ---

===== Training deberta =====


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at microsoft/deberta-v3-small were not used when initializing DebertaV2ForSequenceClassification: ['lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.weight']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the c

Map:   0%|          | 0/4947 [00:00<?, ? examples/s]

Map:   0%|          | 0/1237 [00:00<?, ? examples/s]



  0%|          | 0/620 [00:00<?, ?it/s]

  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.380266010761261, 'eval_accuracy': 0.973322554567502, 'eval_f1': 0.9719901040015049, 'eval_precision': 0.9741156678100898, 'eval_recall': 0.973322554567502, 'eval_runtime': 1.3136, 'eval_samples_per_second': 941.674, 'eval_steps_per_second': 59.378, 'epoch': 1.0}
{'loss': 0.2723, 'learning_rate': 9.67741935483871e-06, 'epoch': 1.61}


  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.23579472303390503, 'eval_accuracy': 0.9765561843168957, 'eval_f1': 0.9763972689014709, 'eval_precision': 0.9763067594594299, 'eval_recall': 0.9765561843168957, 'eval_runtime': 1.3274, 'eval_samples_per_second': 931.909, 'eval_steps_per_second': 58.762, 'epoch': 2.0}
{'train_runtime': 70.8153, 'train_samples_per_second': 139.716, 'train_steps_per_second': 8.755, 'train_loss': 0.24663084399315618, 'epoch': 2.0}
Training time for deberta: 70.82 seconds


  0%|          | 0/78 [00:00<?, ?it/s]

Prediction time for deberta: 1.41 seconds

Classification Report for deberta:
              precision    recall  f1-score   support

           0     0.9226    0.8938    0.9079       160
           1     0.9843    0.9889    0.9866      1077

    accuracy                         0.9766      1237
   macro avg     0.9534    0.9413    0.9473      1237
weighted avg     0.9763    0.9766    0.9764      1237

{'model': 'deberta', 'max_length': 128, 'train_time_sec': 70.8225390625, 'pred_time_sec': 1.4123701171875, 'accuracy': 0.9765561843168957, 'f1': 0.9763972689014709, 'precision': 0.9763067594594299, 'recall': 0.9765561843168957}

--- Training deberta with max_length=256 ---

===== Training deberta =====


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at microsoft/deberta-v3-small were not used when initializing DebertaV2ForSequenceClassification: ['lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.weight']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the c

Map:   0%|          | 0/4947 [00:00<?, ? examples/s]

Map:   0%|          | 0/1237 [00:00<?, ? examples/s]



  0%|          | 0/620 [00:00<?, ?it/s]

  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.3137158155441284, 'eval_accuracy': 0.9717057396928052, 'eval_f1': 0.9713552018317003, 'eval_precision': 0.9712267469724357, 'eval_recall': 0.9717057396928052, 'eval_runtime': 2.9978, 'eval_samples_per_second': 412.638, 'eval_steps_per_second': 26.019, 'epoch': 1.0}
{'loss': 0.2672, 'learning_rate': 9.67741935483871e-06, 'epoch': 1.61}


  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.2729227542877197, 'eval_accuracy': 0.973322554567502, 'eval_f1': 0.9729154851070881, 'eval_precision': 0.9728458738348225, 'eval_recall': 0.973322554567502, 'eval_runtime': 3.0165, 'eval_samples_per_second': 410.077, 'eval_steps_per_second': 25.858, 'epoch': 2.0}
{'train_runtime': 109.709, 'train_samples_per_second': 90.184, 'train_steps_per_second': 5.651, 'train_loss': 0.2453546400993101, 'epoch': 2.0}
Training time for deberta: 109.72 seconds


  0%|          | 0/78 [00:00<?, ?it/s]

Prediction time for deberta: 2.99 seconds

Classification Report for deberta:
              precision    recall  f1-score   support

           0     0.9262    0.8625    0.8932       160
           1     0.9798    0.9898    0.9848      1077

    accuracy                         0.9733      1237
   macro avg     0.9530    0.9261    0.9390      1237
weighted avg     0.9728    0.9733    0.9729      1237

{'model': 'deberta', 'max_length': 256, 'train_time_sec': 109.7162265625, 'pred_time_sec': 2.991547119140625, 'accuracy': 0.973322554567502, 'f1': 0.9729154851070881, 'precision': 0.9728458738348225, 'recall': 0.973322554567502}

--- Training deberta with max_length=512 ---

===== Training deberta =====


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at microsoft/deberta-v3-small were not used when initializing DebertaV2ForSequenceClassification: ['lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.weight']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the c

Map:   0%|          | 0/4947 [00:00<?, ? examples/s]

Map:   0%|          | 0/1237 [00:00<?, ? examples/s]



  0%|          | 0/620 [00:00<?, ?it/s]

  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.285459041595459, 'eval_accuracy': 0.9708973322554567, 'eval_f1': 0.9701506253747769, 'eval_precision': 0.9703338646289537, 'eval_recall': 0.9708973322554567, 'eval_runtime': 8.9638, 'eval_samples_per_second': 137.999, 'eval_steps_per_second': 8.702, 'epoch': 1.0}
{'loss': 0.294, 'learning_rate': 9.67741935483871e-06, 'epoch': 1.61}


  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.25055503845214844, 'eval_accuracy': 0.973322554567502, 'eval_f1': 0.9729920474413175, 'eval_precision': 0.9728850455627258, 'eval_recall': 0.973322554567502, 'eval_runtime': 8.968, 'eval_samples_per_second': 137.934, 'eval_steps_per_second': 8.698, 'epoch': 2.0}
{'train_runtime': 241.1898, 'train_samples_per_second': 41.022, 'train_steps_per_second': 2.571, 'train_loss': 0.2680034452869046, 'epoch': 2.0}
Training time for deberta: 241.20 seconds


  0%|          | 0/78 [00:00<?, ?it/s]

Prediction time for deberta: 8.90 seconds

Classification Report for deberta:
              precision    recall  f1-score   support

           0     0.9205    0.8688    0.8939       160
           1     0.9807    0.9889    0.9847      1077

    accuracy                         0.9733      1237
   macro avg     0.9506    0.9288    0.9393      1237
weighted avg     0.9729    0.9733    0.9730      1237

{'model': 'deberta', 'max_length': 512, 'train_time_sec': 241.19775, 'pred_time_sec': 8.9031904296875, 'accuracy': 0.973322554567502, 'f1': 0.9729920474413175, 'precision': 0.9728850455627258, 'recall': 0.973322554567502}

--- Training distilbert with max_length=128 ---

===== Training distilbert =====


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'pre_classifier.we

Map:   0%|          | 0/4947 [00:00<?, ? examples/s]

Map:   0%|          | 0/1237 [00:00<?, ? examples/s]



  0%|          | 0/620 [00:00<?, ?it/s]

  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.27784067392349243, 'eval_accuracy': 0.978981406628941, 'eval_f1': 0.978505773652584, 'eval_precision': 0.9787808045597567, 'eval_recall': 0.978981406628941, 'eval_runtime': 0.9318, 'eval_samples_per_second': 1327.522, 'eval_steps_per_second': 83.708, 'epoch': 1.0}
{'loss': 0.2407, 'learning_rate': 9.67741935483871e-06, 'epoch': 1.61}


  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.2534365952014923, 'eval_accuracy': 0.978981406628941, 'eval_f1': 0.9786909612760256, 'eval_precision': 0.9786841229126367, 'eval_recall': 0.978981406628941, 'eval_runtime': 0.9423, 'eval_samples_per_second': 1312.69, 'eval_steps_per_second': 82.773, 'epoch': 2.0}
{'train_runtime': 34.9018, 'train_samples_per_second': 283.481, 'train_steps_per_second': 17.764, 'train_loss': 0.21823013213373, 'epoch': 2.0}
Training time for distilbert: 34.91 seconds


  0%|          | 0/78 [00:00<?, ?it/s]

Prediction time for distilbert: 1.09 seconds

Classification Report for distilbert:
              precision    recall  f1-score   support

           0     0.9467    0.8875    0.9161       160
           1     0.9834    0.9926    0.9880      1077

    accuracy                         0.9790      1237
   macro avg     0.9651    0.9400    0.9521      1237
weighted avg     0.9787    0.9790    0.9787      1237

{'model': 'distilbert', 'max_length': 128, 'train_time_sec': 34.908421875, 'pred_time_sec': 1.0913233642578124, 'accuracy': 0.978981406628941, 'f1': 0.9786909612760256, 'precision': 0.9786841229126367, 'recall': 0.978981406628941}

--- Training distilbert with max_length=256 ---

===== Training distilbert =====


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'pre_classifier.we

Map:   0%|          | 0/4947 [00:00<?, ? examples/s]

Map:   0%|          | 0/1237 [00:00<?, ? examples/s]



  0%|          | 0/620 [00:00<?, ?it/s]

  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.2508147656917572, 'eval_accuracy': 0.973322554567502, 'eval_f1': 0.9729154851070881, 'eval_precision': 0.9728458738348225, 'eval_recall': 0.973322554567502, 'eval_runtime': 1.8243, 'eval_samples_per_second': 678.065, 'eval_steps_per_second': 42.756, 'epoch': 1.0}
{'loss': 0.241, 'learning_rate': 9.67741935483871e-06, 'epoch': 1.61}


  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.23054678738117218, 'eval_accuracy': 0.9765561843168957, 'eval_f1': 0.9762657386605518, 'eval_precision': 0.9762016427433062, 'eval_recall': 0.9765561843168957, 'eval_runtime': 1.8402, 'eval_samples_per_second': 672.206, 'eval_steps_per_second': 42.386, 'epoch': 2.0}
{'train_runtime': 55.4104, 'train_samples_per_second': 178.559, 'train_steps_per_second': 11.189, 'train_loss': 0.21550348343387726, 'epoch': 2.0}
Training time for distilbert: 55.42 seconds


  0%|          | 0/78 [00:00<?, ?it/s]

Prediction time for distilbert: 2.01 seconds

Classification Report for distilbert:
              precision    recall  f1-score   support

           0     0.9338    0.8812    0.9068       160
           1     0.9825    0.9907    0.9866      1077

    accuracy                         0.9766      1237
   macro avg     0.9581    0.9360    0.9467      1237
weighted avg     0.9762    0.9766    0.9763      1237

{'model': 'distilbert', 'max_length': 256, 'train_time_sec': 55.41671484375, 'pred_time_sec': 2.009479248046875, 'accuracy': 0.9765561843168957, 'f1': 0.9762657386605518, 'precision': 0.9762016427433062, 'recall': 0.9765561843168957}

--- Training distilbert with max_length=512 ---

===== Training distilbert =====


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'pre_classifier.we

Map:   0%|          | 0/4947 [00:00<?, ? examples/s]

Map:   0%|          | 0/1237 [00:00<?, ? examples/s]



  0%|          | 0/620 [00:00<?, ?it/s]

  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.26923197507858276, 'eval_accuracy': 0.9741309620048505, 'eval_f1': 0.9734672225553573, 'eval_precision': 0.9737458649639834, 'eval_recall': 0.9741309620048505, 'eval_runtime': 4.7885, 'eval_samples_per_second': 258.325, 'eval_steps_per_second': 16.289, 'epoch': 1.0}
{'loss': 0.2293, 'learning_rate': 9.67741935483871e-06, 'epoch': 1.61}


  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.22618120908737183, 'eval_accuracy': 0.9757477768795473, 'eval_f1': 0.9756167583702973, 'eval_precision': 0.9755291603755218, 'eval_recall': 0.9757477768795473, 'eval_runtime': 4.6798, 'eval_samples_per_second': 264.329, 'eval_steps_per_second': 16.668, 'epoch': 2.0}
{'train_runtime': 122.7312, 'train_samples_per_second': 80.615, 'train_steps_per_second': 5.052, 'train_loss': 0.2063112074329007, 'epoch': 2.0}
Training time for distilbert: 122.74 seconds


  0%|          | 0/78 [00:00<?, ?it/s]

Prediction time for distilbert: 4.66 seconds

Classification Report for distilbert:
              precision    recall  f1-score   support

           0     0.9167    0.8938    0.9051       160
           1     0.9843    0.9879    0.9861      1077

    accuracy                         0.9757      1237
   macro avg     0.9505    0.9408    0.9456      1237
weighted avg     0.9755    0.9757    0.9756      1237

{'model': 'distilbert', 'max_length': 512, 'train_time_sec': 122.737953125, 'pred_time_sec': 4.6627470703125, 'accuracy': 0.9757477768795473, 'f1': 0.9756167583702973, 'precision': 0.9755291603755218, 'recall': 0.9757477768795473}

--- Training bert with max_length=128 ---

===== Training bert =====


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

Map:   0%|          | 0/4947 [00:00<?, ? examples/s]

Map:   0%|          | 0/1237 [00:00<?, ? examples/s]



  0%|          | 0/620 [00:00<?, ?it/s]

  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.2551727294921875, 'eval_accuracy': 0.9765561843168957, 'eval_f1': 0.9759903078361688, 'eval_precision': 0.9762684861023856, 'eval_recall': 0.9765561843168957, 'eval_runtime': 1.7192, 'eval_samples_per_second': 719.516, 'eval_steps_per_second': 45.37, 'epoch': 1.0}
{'loss': 0.2524, 'learning_rate': 9.67741935483871e-06, 'epoch': 1.61}


  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.15620189905166626, 'eval_accuracy': 0.9797898140662894, 'eval_f1': 0.9797628257971375, 'eval_precision': 0.9797385510679781, 'eval_recall': 0.9797898140662894, 'eval_runtime': 1.7177, 'eval_samples_per_second': 720.155, 'eval_steps_per_second': 45.41, 'epoch': 2.0}
{'train_runtime': 60.2796, 'train_samples_per_second': 164.135, 'train_steps_per_second': 10.285, 'train_loss': 0.22457566876565258, 'epoch': 2.0}
Training time for bert: 60.29 seconds


  0%|          | 0/78 [00:00<?, ?it/s]

Prediction time for bert: 1.89 seconds

Classification Report for bert:
              precision    recall  f1-score   support

           0     0.9245    0.9187    0.9216       160
           1     0.9879    0.9889    0.9884      1077

    accuracy                         0.9798      1237
   macro avg     0.9562    0.9538    0.9550      1237
weighted avg     0.9797    0.9798    0.9798      1237

{'model': 'bert', 'max_length': 128, 'train_time_sec': 60.2867421875, 'pred_time_sec': 1.89179541015625, 'accuracy': 0.9797898140662894, 'f1': 0.9797628257971375, 'precision': 0.9797385510679781, 'recall': 0.9797898140662894}

--- Training bert with max_length=256 ---

===== Training bert =====


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

Map:   0%|          | 0/4947 [00:00<?, ? examples/s]

Map:   0%|          | 0/1237 [00:00<?, ? examples/s]



  0%|          | 0/620 [00:00<?, ?it/s]

  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.20972925424575806, 'eval_accuracy': 0.9822150363783346, 'eval_f1': 0.9818656145808211, 'eval_precision': 0.9820916888502574, 'eval_recall': 0.9822150363783346, 'eval_runtime': 3.4909, 'eval_samples_per_second': 354.348, 'eval_steps_per_second': 22.344, 'epoch': 1.0}
{'loss': 0.2578, 'learning_rate': 9.67741935483871e-06, 'epoch': 1.61}


  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.16365188360214233, 'eval_accuracy': 0.9830234438156831, 'eval_f1': 0.9827643996136015, 'eval_precision': 0.9828558101771253, 'eval_recall': 0.9830234438156831, 'eval_runtime': 3.5279, 'eval_samples_per_second': 350.636, 'eval_steps_per_second': 22.11, 'epoch': 2.0}
{'train_runtime': 104.0068, 'train_samples_per_second': 95.128, 'train_steps_per_second': 5.961, 'train_loss': 0.2365403575281943, 'epoch': 2.0}
Training time for bert: 104.01 seconds


  0%|          | 0/78 [00:00<?, ?it/s]

Prediction time for bert: 3.71 seconds

Classification Report for bert:
              precision    recall  f1-score   support

           0     0.9664    0.9000    0.9320       160
           1     0.9853    0.9954    0.9903      1077

    accuracy                         0.9830      1237
   macro avg     0.9759    0.9477    0.9612      1237
weighted avg     0.9829    0.9830    0.9828      1237

{'model': 'bert', 'max_length': 256, 'train_time_sec': 104.014390625, 'pred_time_sec': 3.7053486328125, 'accuracy': 0.9830234438156831, 'f1': 0.9827643996136015, 'precision': 0.9828558101771253, 'recall': 0.9830234438156831}

--- Training bert with max_length=512 ---

===== Training bert =====


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

Map:   0%|          | 0/4947 [00:00<?, ? examples/s]

Map:   0%|          | 0/1237 [00:00<?, ? examples/s]



  0%|          | 0/620 [00:00<?, ?it/s]

  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.24555543065071106, 'eval_accuracy': 0.9773645917542442, 'eval_f1': 0.9768523716258597, 'eval_precision': 0.9770860005971218, 'eval_recall': 0.9773645917542442, 'eval_runtime': 8.9603, 'eval_samples_per_second': 138.053, 'eval_steps_per_second': 8.705, 'epoch': 1.0}
{'loss': 0.2526, 'learning_rate': 9.67741935483871e-06, 'epoch': 1.61}


  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.2109844982624054, 'eval_accuracy': 0.9757477768795473, 'eval_f1': 0.9752712926102107, 'eval_precision': 0.975355850056463, 'eval_recall': 0.9757477768795473, 'eval_runtime': 8.9293, 'eval_samples_per_second': 138.533, 'eval_steps_per_second': 8.735, 'epoch': 2.0}
{'train_runtime': 230.205, 'train_samples_per_second': 42.979, 'train_steps_per_second': 2.693, 'train_loss': 0.22169009485552388, 'epoch': 2.0}
Training time for bert: 230.21 seconds


  0%|          | 0/78 [00:00<?, ?it/s]

Prediction time for bert: 9.05 seconds

Classification Report for bert:
              precision    recall  f1-score   support

           0     0.9583    0.8625    0.9079       160
           1     0.9799    0.9944    0.9871      1077

    accuracy                         0.9774      1237
   macro avg     0.9691    0.9285    0.9475      1237
weighted avg     0.9771    0.9774    0.9769      1237

{'model': 'bert', 'max_length': 512, 'train_time_sec': 230.21215625, 'pred_time_sec': 9.0454580078125, 'accuracy': 0.9773645917542442, 'f1': 0.9768523716258597, 'precision': 0.9770860005971218, 'recall': 0.9773645917542442}

--- Training roberta with max_length=128 ---

===== Training roberta =====


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.dense.bias']
You should pr

Map:   0%|          | 0/4947 [00:00<?, ? examples/s]

Map:   0%|          | 0/1237 [00:00<?, ? examples/s]



  0%|          | 0/620 [00:00<?, ?it/s]

  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.42896267771720886, 'eval_accuracy': 0.9717057396928052, 'eval_f1': 0.9703889418941817, 'eval_precision': 0.9720635934062385, 'eval_recall': 0.9717057396928052, 'eval_runtime': 1.7101, 'eval_samples_per_second': 723.36, 'eval_steps_per_second': 45.612, 'epoch': 1.0}
{'loss': 0.3148, 'learning_rate': 9.67741935483871e-06, 'epoch': 1.61}


  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.21346116065979004, 'eval_accuracy': 0.9830234438156831, 'eval_f1': 0.9827643996136015, 'eval_precision': 0.9828558101771253, 'eval_recall': 0.9830234438156831, 'eval_runtime': 1.6672, 'eval_samples_per_second': 741.942, 'eval_steps_per_second': 46.784, 'epoch': 2.0}
{'train_runtime': 70.3441, 'train_samples_per_second': 140.651, 'train_steps_per_second': 8.814, 'train_loss': 0.28170710225259105, 'epoch': 2.0}
Training time for roberta: 70.35 seconds


  0%|          | 0/78 [00:00<?, ?it/s]

Prediction time for roberta: 1.80 seconds

Classification Report for roberta:
              precision    recall  f1-score   support

           0     0.9664    0.9000    0.9320       160
           1     0.9853    0.9954    0.9903      1077

    accuracy                         0.9830      1237
   macro avg     0.9759    0.9477    0.9612      1237
weighted avg     0.9829    0.9830    0.9828      1237

{'model': 'roberta', 'max_length': 128, 'train_time_sec': 70.35253125, 'pred_time_sec': 1.798475830078125, 'accuracy': 0.9830234438156831, 'f1': 0.9827643996136015, 'precision': 0.9828558101771253, 'recall': 0.9830234438156831}

--- Training roberta with max_length=256 ---

===== Training roberta =====


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.dense.bias']
You should pr

Map:   0%|          | 0/4947 [00:00<?, ? examples/s]

Map:   0%|          | 0/1237 [00:00<?, ? examples/s]



  0%|          | 0/620 [00:00<?, ?it/s]

  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.43413907289505005, 'eval_accuracy': 0.9708973322554567, 'eval_f1': 0.9693935710230959, 'eval_precision': 0.9715450815205092, 'eval_recall': 0.9708973322554567, 'eval_runtime': 3.3577, 'eval_samples_per_second': 368.409, 'eval_steps_per_second': 23.23, 'epoch': 1.0}
{'loss': 0.3597, 'learning_rate': 9.67741935483871e-06, 'epoch': 1.61}


  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.25350719690322876, 'eval_accuracy': 0.9805982215036378, 'eval_f1': 0.9802170340881687, 'eval_precision': 0.9804077291518087, 'eval_recall': 0.9805982215036378, 'eval_runtime': 3.3625, 'eval_samples_per_second': 367.876, 'eval_steps_per_second': 23.197, 'epoch': 2.0}
{'train_runtime': 111.6651, 'train_samples_per_second': 88.604, 'train_steps_per_second': 5.552, 'train_loss': 0.3209612815610824, 'epoch': 2.0}
Training time for roberta: 111.67 seconds


  0%|          | 0/78 [00:00<?, ?it/s]

Prediction time for roberta: 3.42 seconds

Classification Report for roberta:
              precision    recall  f1-score   support

           0     0.9658    0.8812    0.9216       160
           1     0.9826    0.9954    0.9889      1077

    accuracy                         0.9806      1237
   macro avg     0.9742    0.9383    0.9552      1237
weighted avg     0.9804    0.9806    0.9802      1237

{'model': 'roberta', 'max_length': 256, 'train_time_sec': 111.674484375, 'pred_time_sec': 3.4168330078125, 'accuracy': 0.9805982215036378, 'f1': 0.9802170340881687, 'precision': 0.9804077291518087, 'recall': 0.9805982215036378}

--- Training roberta with max_length=512 ---

===== Training roberta =====


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.dense.bias']
You should pr

Map:   0%|          | 0/4947 [00:00<?, ? examples/s]

Map:   0%|          | 0/1237 [00:00<?, ? examples/s]



  0%|          | 0/620 [00:00<?, ?it/s]

  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.306217759847641, 'eval_accuracy': 0.9773645917542442, 'eval_f1': 0.9765716530616949, 'eval_precision': 0.9775374424549051, 'eval_recall': 0.9773645917542442, 'eval_runtime': 8.5258, 'eval_samples_per_second': 145.089, 'eval_steps_per_second': 9.149, 'epoch': 1.0}
{'loss': 0.3362, 'learning_rate': 9.67741935483871e-06, 'epoch': 1.61}


  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.20447534322738647, 'eval_accuracy': 0.978981406628941, 'eval_f1': 0.9787508247974432, 'eval_precision': 0.9786998071233577, 'eval_recall': 0.978981406628941, 'eval_runtime': 8.7918, 'eval_samples_per_second': 140.699, 'eval_steps_per_second': 8.872, 'epoch': 2.0}
{'train_runtime': 240.2127, 'train_samples_per_second': 41.189, 'train_steps_per_second': 2.581, 'train_loss': 0.3002931840958134, 'epoch': 2.0}
Training time for roberta: 240.22 seconds


  0%|          | 0/78 [00:00<?, ?it/s]

Prediction time for roberta: 8.61 seconds

Classification Report for roberta:
              precision    recall  f1-score   support

           0     0.9408    0.8938    0.9167       160
           1     0.9843    0.9916    0.9880      1077

    accuracy                         0.9790      1237
   macro avg     0.9626    0.9427    0.9523      1237
weighted avg     0.9787    0.9790    0.9788      1237

{'model': 'roberta', 'max_length': 512, 'train_time_sec': 240.22140625, 'pred_time_sec': 8.612994140625, 'accuracy': 0.978981406628941, 'f1': 0.9787508247974432, 'precision': 0.9786998071233577, 'recall': 0.978981406628941}

--- Training electra with max_length=128 ---

===== Training electra =====


Some weights of the model checkpoint at google/electra-small-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier

Map:   0%|          | 0/4947 [00:00<?, ? examples/s]

Map:   0%|          | 0/1237 [00:00<?, ? examples/s]



  0%|          | 0/620 [00:00<?, ?it/s]

  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.295249879360199, 'eval_accuracy': 0.9749393694421988, 'eval_f1': 0.9738571024734121, 'eval_precision': 0.9753963172183064, 'eval_recall': 0.9749393694421988, 'eval_runtime': 0.8716, 'eval_samples_per_second': 1419.148, 'eval_steps_per_second': 89.485, 'epoch': 1.0}
{'loss': 0.2905, 'learning_rate': 9.67741935483871e-06, 'epoch': 1.61}


  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.2568901777267456, 'eval_accuracy': 0.9708973322554567, 'eval_f1': 0.9705780651041521, 'eval_precision': 0.9704327969897362, 'eval_recall': 0.9708973322554567, 'eval_runtime': 0.9521, 'eval_samples_per_second': 1299.299, 'eval_steps_per_second': 81.928, 'epoch': 2.0}
{'train_runtime': 32.7042, 'train_samples_per_second': 302.53, 'train_steps_per_second': 18.958, 'train_loss': 0.26939038615072924, 'epoch': 2.0}
Training time for electra: 32.71 seconds


  0%|          | 0/78 [00:00<?, ?it/s]

Prediction time for electra: 0.94 seconds

Classification Report for electra:
              precision    recall  f1-score   support

           0     0.9924    0.8125    0.8935       160
           1     0.9729    0.9991    0.9858      1077

    accuracy                         0.9749      1237
   macro avg     0.9826    0.9058    0.9396      1237
weighted avg     0.9754    0.9749    0.9739      1237

{'model': 'electra', 'max_length': 128, 'train_time_sec': 32.711224609375, 'pred_time_sec': 0.9381182861328125, 'accuracy': 0.9749393694421988, 'f1': 0.9738571024734121, 'precision': 0.9753963172183064, 'recall': 0.9749393694421988}

--- Training electra with max_length=256 ---

===== Training electra =====


Some weights of the model checkpoint at google/electra-small-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier

Map:   0%|          | 0/4947 [00:00<?, ? examples/s]

Map:   0%|          | 0/1237 [00:00<?, ? examples/s]



  0%|          | 0/620 [00:00<?, ?it/s]

  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.3352561891078949, 'eval_accuracy': 0.9692805173807599, 'eval_f1': 0.9677987247099427, 'eval_precision': 0.9694906377088174, 'eval_recall': 0.9692805173807599, 'eval_runtime': 0.8202, 'eval_samples_per_second': 1508.238, 'eval_steps_per_second': 95.103, 'epoch': 1.0}
{'loss': 0.2899, 'learning_rate': 9.67741935483871e-06, 'epoch': 1.61}


  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.23326022922992706, 'eval_accuracy': 0.9741309620048505, 'eval_f1': 0.9739197348103219, 'eval_precision': 0.9738066044069531, 'eval_recall': 0.9741309620048505, 'eval_runtime': 0.936, 'eval_samples_per_second': 1321.623, 'eval_steps_per_second': 83.336, 'epoch': 2.0}
{'train_runtime': 40.2549, 'train_samples_per_second': 245.784, 'train_steps_per_second': 15.402, 'train_loss': 0.2682643675035046, 'epoch': 2.0}
Training time for electra: 40.26 seconds


  0%|          | 0/78 [00:00<?, ?it/s]

Prediction time for electra: 0.86 seconds

Classification Report for electra:
              precision    recall  f1-score   support

           0     0.9156    0.8812    0.8981       160
           1     0.9825    0.9879    0.9852      1077

    accuracy                         0.9741      1237
   macro avg     0.9490    0.9346    0.9416      1237
weighted avg     0.9738    0.9741    0.9739      1237

{'model': 'electra', 'max_length': 256, 'train_time_sec': 40.26189453125, 'pred_time_sec': 0.864918701171875, 'accuracy': 0.9741309620048505, 'f1': 0.9739197348103219, 'precision': 0.9738066044069531, 'recall': 0.9741309620048505}

--- Training electra with max_length=512 ---

===== Training electra =====


Some weights of the model checkpoint at google/electra-small-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier

Map:   0%|          | 0/4947 [00:00<?, ? examples/s]

Map:   0%|          | 0/1237 [00:00<?, ? examples/s]



  0%|          | 0/620 [00:00<?, ?it/s]

  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.3093465566635132, 'eval_accuracy': 0.9708973322554567, 'eval_f1': 0.9696887254279255, 'eval_precision': 0.9708669158286354, 'eval_recall': 0.9708973322554567, 'eval_runtime': 2.4145, 'eval_samples_per_second': 512.33, 'eval_steps_per_second': 32.305, 'epoch': 1.0}
{'loss': 0.2831, 'learning_rate': 9.67741935483871e-06, 'epoch': 1.61}


  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.26695767045021057, 'eval_accuracy': 0.9692805173807599, 'eval_f1': 0.9688560203264991, 'eval_precision': 0.9687044929951657, 'eval_recall': 0.9692805173807599, 'eval_runtime': 2.428, 'eval_samples_per_second': 509.464, 'eval_steps_per_second': 32.125, 'epoch': 2.0}
{'train_runtime': 76.542, 'train_samples_per_second': 129.262, 'train_steps_per_second': 8.1, 'train_loss': 0.26500001107492754, 'epoch': 2.0}
Training time for electra: 76.55 seconds


  0%|          | 0/78 [00:00<?, ?it/s]

Prediction time for electra: 2.43 seconds

Classification Report for electra:
              precision    recall  f1-score   support

           0     0.9697    0.8000    0.8767       160
           1     0.9710    0.9963    0.9835      1077

    accuracy                         0.9709      1237
   macro avg     0.9704    0.8981    0.9301      1237
weighted avg     0.9709    0.9709    0.9697      1237

{'model': 'electra', 'max_length': 512, 'train_time_sec': 76.5481953125, 'pred_time_sec': 2.4304140625, 'accuracy': 0.9708973322554567, 'f1': 0.9696887254279255, 'precision': 0.9708669158286354, 'recall': 0.9708973322554567}

--- Training albert with max_length=128 ---

===== Training albert =====


Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.dense.weight', 'predictions.decoder.bias', 'predictions.dense.bias', 'predictions.LayerNorm.weight', 'predictions.bias', 'predictions.LayerNorm.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model 

Map:   0%|          | 0/4947 [00:00<?, ? examples/s]

Map:   0%|          | 0/1237 [00:00<?, ? examples/s]



  0%|          | 0/620 [00:00<?, ?it/s]

  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.5847723484039307, 'eval_accuracy': 0.9571544058205336, 'eval_f1': 0.9537574054957654, 'eval_precision': 0.9582348243213428, 'eval_recall': 0.9571544058205336, 'eval_runtime': 2.058, 'eval_samples_per_second': 601.066, 'eval_steps_per_second': 37.901, 'epoch': 1.0}
{'loss': 0.4122, 'learning_rate': 9.67741935483871e-06, 'epoch': 1.61}


  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.4008103013038635, 'eval_accuracy': 0.9684721099434115, 'eval_f1': 0.9673170445490343, 'eval_precision': 0.9680110400297258, 'eval_recall': 0.9684721099434115, 'eval_runtime': 2.1177, 'eval_samples_per_second': 584.116, 'eval_steps_per_second': 36.832, 'epoch': 2.0}
{'train_runtime': 52.3993, 'train_samples_per_second': 188.819, 'train_steps_per_second': 11.832, 'train_loss': 0.3993192857311618, 'epoch': 2.0}
Training time for albert: 52.40 seconds


  0%|          | 0/78 [00:00<?, ?it/s]

Prediction time for albert: 2.10 seconds

Classification Report for albert:
              precision    recall  f1-score   support

           0     0.9481    0.8000    0.8678       160
           1     0.9710    0.9935    0.9821      1077

    accuracy                         0.9685      1237
   macro avg     0.9596    0.8968    0.9249      1237
weighted avg     0.9680    0.9685    0.9673      1237

{'model': 'albert', 'max_length': 128, 'train_time_sec': 52.4049140625, 'pred_time_sec': 2.095880615234375, 'accuracy': 0.9684721099434115, 'f1': 0.9673170445490343, 'precision': 0.9680110400297258, 'recall': 0.9684721099434115}

--- Training albert with max_length=256 ---

===== Training albert =====


Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.dense.weight', 'predictions.decoder.bias', 'predictions.dense.bias', 'predictions.LayerNorm.weight', 'predictions.bias', 'predictions.LayerNorm.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model 

Map:   0%|          | 0/4947 [00:00<?, ? examples/s]

Map:   0%|          | 0/1237 [00:00<?, ? examples/s]



  0%|          | 0/620 [00:00<?, ?it/s]

  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.48803243041038513, 'eval_accuracy': 0.9636216653193209, 'eval_f1': 0.9615507416118981, 'eval_precision': 0.9637498038746753, 'eval_recall': 0.9636216653193209, 'eval_runtime': 4.9767, 'eval_samples_per_second': 248.56, 'eval_steps_per_second': 15.673, 'epoch': 1.0}
{'loss': 0.4027, 'learning_rate': 9.67741935483871e-06, 'epoch': 1.61}


  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.3633476495742798, 'eval_accuracy': 0.957962813257882, 'eval_f1': 0.9578501186142488, 'eval_precision': 0.9577474459523042, 'eval_recall': 0.957962813257882, 'eval_runtime': 4.9936, 'eval_samples_per_second': 247.716, 'eval_steps_per_second': 15.62, 'epoch': 2.0}
{'train_runtime': 114.1407, 'train_samples_per_second': 86.683, 'train_steps_per_second': 5.432, 'train_loss': 0.3797174576790102, 'epoch': 2.0}
Training time for albert: 114.15 seconds


  0%|          | 0/78 [00:00<?, ?it/s]

Prediction time for albert: 5.00 seconds

Classification Report for albert:
              precision    recall  f1-score   support

           0     0.9675    0.7438    0.8410       160
           1     0.9632    0.9963    0.9795      1077

    accuracy                         0.9636      1237
   macro avg     0.9653    0.8700    0.9102      1237
weighted avg     0.9637    0.9636    0.9616      1237

{'model': 'albert', 'max_length': 256, 'train_time_sec': 114.1465625, 'pred_time_sec': 5.001201171875, 'accuracy': 0.9636216653193209, 'f1': 0.9615507416118981, 'precision': 0.9637498038746753, 'recall': 0.9636216653193209}

--- Training albert with max_length=512 ---

===== Training albert =====


Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.dense.weight', 'predictions.decoder.bias', 'predictions.dense.bias', 'predictions.LayerNorm.weight', 'predictions.bias', 'predictions.LayerNorm.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model 

Map:   0%|          | 0/4947 [00:00<?, ? examples/s]

Map:   0%|          | 0/1237 [00:00<?, ? examples/s]



  0%|          | 0/620 [00:00<?, ?it/s]

  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.5117623805999756, 'eval_accuracy': 0.788197251414713, 'eval_f1': 0.8166650858902188, 'eval_precision': 0.87612981022703, 'eval_recall': 0.788197251414713, 'eval_runtime': 12.1886, 'eval_samples_per_second': 101.488, 'eval_steps_per_second': 6.399, 'epoch': 1.0}
{'loss': 0.5416, 'learning_rate': 9.67741935483871e-06, 'epoch': 1.61}


  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.472799152135849, 'eval_accuracy': 0.9620048504446241, 'eval_f1': 0.9603639940725923, 'eval_precision': 0.9611996903405482, 'eval_recall': 0.9620048504446241, 'eval_runtime': 12.134, 'eval_samples_per_second': 101.945, 'eval_steps_per_second': 6.428, 'epoch': 2.0}
{'train_runtime': 270.341, 'train_samples_per_second': 36.598, 'train_steps_per_second': 2.293, 'train_loss': 0.5097725099132907, 'epoch': 2.0}
Training time for albert: 270.35 seconds


  0%|          | 0/78 [00:00<?, ?it/s]

Prediction time for albert: 12.12 seconds

Classification Report for albert:
              precision    recall  f1-score   support

           0     0.9313    0.7625    0.8385       160
           1     0.9656    0.9916    0.9785      1077

    accuracy                         0.9620      1237
   macro avg     0.9485    0.8771    0.9085      1237
weighted avg     0.9612    0.9620    0.9604      1237

{'model': 'albert', 'max_length': 512, 'train_time_sec': 270.346375, 'pred_time_sec': 12.1168916015625, 'accuracy': 0.9620048504446241, 'f1': 0.9603639940725923, 'precision': 0.9611996903405482, 'recall': 0.9620048504446241}

--- Training xlnet with max_length=128 ---

===== Training xlnet =====


Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

Map:   0%|          | 0/4947 [00:00<?, ? examples/s]

Map:   0%|          | 0/1237 [00:00<?, ? examples/s]



  0%|          | 0/620 [00:00<?, ?it/s]

  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.38627681136131287, 'eval_accuracy': 0.9749393694421988, 'eval_f1': 0.9741013910784672, 'eval_precision': 0.9748528800386662, 'eval_recall': 0.9749393694421988, 'eval_runtime': 2.3303, 'eval_samples_per_second': 530.822, 'eval_steps_per_second': 33.471, 'epoch': 1.0}
{'loss': 0.3283, 'learning_rate': 9.67741935483871e-06, 'epoch': 1.61}


  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.260641872882843, 'eval_accuracy': 0.9652384801940178, 'eval_f1': 0.96528455305351, 'eval_precision': 0.965333168338002, 'eval_recall': 0.9652384801940178, 'eval_runtime': 2.3355, 'eval_samples_per_second': 529.658, 'eval_steps_per_second': 33.398, 'epoch': 2.0}
{'train_runtime': 79.5791, 'train_samples_per_second': 124.329, 'train_steps_per_second': 7.791, 'train_loss': 0.3019749826000583, 'epoch': 2.0}
Training time for xlnet: 79.59 seconds


  0%|          | 0/78 [00:00<?, ?it/s]

Prediction time for xlnet: 2.43 seconds

Classification Report for xlnet:
              precision    recall  f1-score   support

           0     0.9708    0.8313    0.8956       160
           1     0.9755    0.9963    0.9858      1077

    accuracy                         0.9749      1237
   macro avg     0.9731    0.9138    0.9407      1237
weighted avg     0.9749    0.9749    0.9741      1237

{'model': 'xlnet', 'max_length': 128, 'train_time_sec': 79.58709375, 'pred_time_sec': 2.426048828125, 'accuracy': 0.9749393694421988, 'f1': 0.9741013910784672, 'precision': 0.9748528800386662, 'recall': 0.9749393694421988}

--- Training xlnet with max_length=256 ---

===== Training xlnet =====


Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

Map:   0%|          | 0/4947 [00:00<?, ? examples/s]

Map:   0%|          | 0/1237 [00:00<?, ? examples/s]



  0%|          | 0/620 [00:00<?, ?it/s]

  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.5054289102554321, 'eval_accuracy': 0.9684721099434115, 'eval_f1': 0.9668973956381423, 'eval_precision': 0.9687071485135125, 'eval_recall': 0.9684721099434115, 'eval_runtime': 5.5305, 'eval_samples_per_second': 223.667, 'eval_steps_per_second': 14.104, 'epoch': 1.0}
{'loss': 0.357, 'learning_rate': 9.67741935483871e-06, 'epoch': 1.61}


  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.31602728366851807, 'eval_accuracy': 0.9684721099434115, 'eval_f1': 0.9678991504476525, 'eval_precision': 0.9677933968303749, 'eval_recall': 0.9684721099434115, 'eval_runtime': 5.5423, 'eval_samples_per_second': 223.194, 'eval_steps_per_second': 14.074, 'epoch': 2.0}
{'train_runtime': 160.509, 'train_samples_per_second': 61.641, 'train_steps_per_second': 3.863, 'train_loss': 0.32652853381249214, 'epoch': 2.0}
Training time for xlnet: 160.52 seconds


  0%|          | 0/78 [00:00<?, ?it/s]

Prediction time for xlnet: 5.64 seconds

Classification Report for xlnet:
              precision    recall  f1-score   support

           0     0.9116    0.8375    0.8730       160
           1     0.9761    0.9879    0.9820      1077

    accuracy                         0.9685      1237
   macro avg     0.9439    0.9127    0.9275      1237
weighted avg     0.9678    0.9685    0.9679      1237

{'model': 'xlnet', 'max_length': 256, 'train_time_sec': 160.51875, 'pred_time_sec': 5.63979296875, 'accuracy': 0.9684721099434115, 'f1': 0.9678991504476525, 'precision': 0.9677933968303749, 'recall': 0.9684721099434115}

--- Training xlnet with max_length=512 ---

===== Training xlnet =====


Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

Map:   0%|          | 0/4947 [00:00<?, ? examples/s]

Map:   0%|          | 0/1237 [00:00<?, ? examples/s]



  0%|          | 0/620 [00:00<?, ?it/s]

  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.36374807357788086, 'eval_accuracy': 0.9644300727566694, 'eval_f1': 0.9637312291616426, 'eval_precision': 0.9635681321673231, 'eval_recall': 0.9644300727566694, 'eval_runtime': 16.5635, 'eval_samples_per_second': 74.682, 'eval_steps_per_second': 4.709, 'epoch': 1.0}
{'loss': 0.3291, 'learning_rate': 9.67741935483871e-06, 'epoch': 1.61}


  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.23734351992607117, 'eval_accuracy': 0.9749393694421988, 'eval_f1': 0.9745569708581736, 'eval_precision': 0.9745141965585394, 'eval_recall': 0.9749393694421988, 'eval_runtime': 16.5359, 'eval_samples_per_second': 74.807, 'eval_steps_per_second': 4.717, 'epoch': 2.0}
{'train_runtime': 427.9047, 'train_samples_per_second': 23.122, 'train_steps_per_second': 1.449, 'train_loss': 0.3002438083771736, 'epoch': 2.0}
Training time for xlnet: 427.91 seconds


  0%|          | 0/78 [00:00<?, ?it/s]

Prediction time for xlnet: 16.48 seconds

Classification Report for xlnet:
              precision    recall  f1-score   support

           0     0.9329    0.8688    0.8997       160
           1     0.9807    0.9907    0.9857      1077

    accuracy                         0.9749      1237
   macro avg     0.9568    0.9297    0.9427      1237
weighted avg     0.9745    0.9749    0.9746      1237

{'model': 'xlnet', 'max_length': 512, 'train_time_sec': 427.9133125, 'pred_time_sec': 16.477611328125, 'accuracy': 0.9749393694421988, 'f1': 0.9745569708581736, 'precision': 0.9745141965585394, 'recall': 0.9749393694421988}

--- Training mobilebert with max_length=128 ---

===== Training mobilebert =====


Some weights of the model checkpoint at google/mobilebert-uncased were not used when initializing MobileBertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.dense.weight']
- This IS expected if you are initializing MobileBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MobileBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some 

Map:   0%|          | 0/4947 [00:00<?, ? examples/s]

Map:   0%|          | 0/1237 [00:00<?, ? examples/s]



  0%|          | 0/620 [00:00<?, ?it/s]

You're using a MobileBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 7.775018215179443, 'eval_accuracy': 0.9620048504446241, 'eval_f1': 0.9602365791150439, 'eval_precision': 0.9613328008201311, 'eval_recall': 0.9620048504446241, 'eval_runtime': 3.2395, 'eval_samples_per_second': 381.846, 'eval_steps_per_second': 24.078, 'epoch': 1.0}
{'loss': 62947.676, 'learning_rate': 9.67741935483871e-06, 'epoch': 1.61}


  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 1.36194908618927, 'eval_accuracy': 0.9660468876313663, 'eval_f1': 0.9652785574387894, 'eval_precision': 0.9652223728586796, 'eval_recall': 0.9660468876313663, 'eval_runtime': 2.6125, 'eval_samples_per_second': 473.495, 'eval_steps_per_second': 29.857, 'epoch': 2.0}
{'train_runtime': 146.1009, 'train_samples_per_second': 67.72, 'train_steps_per_second': 4.244, 'train_loss': 50764.41195059745, 'epoch': 2.0}
Training time for mobilebert: 146.13 seconds


  0%|          | 0/78 [00:00<?, ?it/s]

Prediction time for mobilebert: 2.66 seconds

Classification Report for mobilebert:
              precision    recall  f1-score   support

           0     0.9097    0.8187    0.8618       160
           1     0.9735    0.9879    0.9806      1077

    accuracy                         0.9660      1237
   macro avg     0.9416    0.9033    0.9212      1237
weighted avg     0.9652    0.9660    0.9653      1237

{'model': 'mobilebert', 'max_length': 128, 'train_time_sec': 146.126796875, 'pred_time_sec': 2.658034912109375, 'accuracy': 0.9660468876313663, 'f1': 0.9652785574387894, 'precision': 0.9652223728586796, 'recall': 0.9660468876313663}

--- Training mobilebert with max_length=256 ---

===== Training mobilebert =====


Some weights of the model checkpoint at google/mobilebert-uncased were not used when initializing MobileBertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.dense.weight']
- This IS expected if you are initializing MobileBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MobileBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some 

Map:   0%|          | 0/4947 [00:00<?, ? examples/s]

Map:   0%|          | 0/1237 [00:00<?, ? examples/s]



  0%|          | 0/620 [00:00<?, ?it/s]

You're using a MobileBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.5493566989898682, 'eval_accuracy': 0.9700889248181084, 'eval_f1': 0.9686968814309919, 'eval_precision': 0.9702751279752208, 'eval_recall': 0.9700889248181084, 'eval_runtime': 2.6448, 'eval_samples_per_second': 467.703, 'eval_steps_per_second': 29.491, 'epoch': 1.0}
{'loss': 60764.664, 'learning_rate': 9.67741935483871e-06, 'epoch': 1.61}


  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.3112986981868744, 'eval_accuracy': 0.973322554567502, 'eval_f1': 0.9725145088670764, 'eval_precision': 0.9730072098807431, 'eval_recall': 0.973322554567502, 'eval_runtime': 2.4611, 'eval_samples_per_second': 502.628, 'eval_steps_per_second': 31.694, 'epoch': 2.0}
{'train_runtime': 142.6492, 'train_samples_per_second': 69.359, 'train_steps_per_second': 4.346, 'train_loss': 49003.797861394574, 'epoch': 2.0}
Training time for mobilebert: 142.67 seconds


  0%|          | 0/78 [00:00<?, ?it/s]

Prediction time for mobilebert: 2.50 seconds

Classification Report for mobilebert:
              precision    recall  f1-score   support

           0     0.9568    0.8313    0.8896       160
           1     0.9754    0.9944    0.9848      1077

    accuracy                         0.9733      1237
   macro avg     0.9661    0.9128    0.9372      1237
weighted avg     0.9730    0.9733    0.9725      1237

{'model': 'mobilebert', 'max_length': 256, 'train_time_sec': 142.673921875, 'pred_time_sec': 2.50087890625, 'accuracy': 0.973322554567502, 'f1': 0.9725145088670764, 'precision': 0.9730072098807431, 'recall': 0.973322554567502}

--- Training mobilebert with max_length=512 ---

===== Training mobilebert =====


Some weights of the model checkpoint at google/mobilebert-uncased were not used when initializing MobileBertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.dense.weight']
- This IS expected if you are initializing MobileBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MobileBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some 

Map:   0%|          | 0/4947 [00:00<?, ? examples/s]

Map:   0%|          | 0/1237 [00:00<?, ? examples/s]



  0%|          | 0/620 [00:00<?, ?it/s]

You're using a MobileBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.3945624828338623, 'eval_accuracy': 0.9700889248181084, 'eval_f1': 0.9689930935465195, 'eval_precision': 0.9697592204337492, 'eval_recall': 0.9700889248181084, 'eval_runtime': 4.9037, 'eval_samples_per_second': 252.26, 'eval_steps_per_second': 15.906, 'epoch': 1.0}
{'loss': 58091.616, 'learning_rate': 9.67741935483871e-06, 'epoch': 1.61}


  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.26719382405281067, 'eval_accuracy': 0.973322554567502, 'eval_f1': 0.9725972048136188, 'eval_precision': 0.9729243876440257, 'eval_recall': 0.973322554567502, 'eval_runtime': 4.9235, 'eval_samples_per_second': 251.246, 'eval_steps_per_second': 15.843, 'epoch': 2.0}
{'train_runtime': 209.8274, 'train_samples_per_second': 47.153, 'train_steps_per_second': 2.955, 'train_loss': 46848.108150854416, 'epoch': 2.0}
Training time for mobilebert: 209.85 seconds


  0%|          | 0/78 [00:00<?, ?it/s]

Prediction time for mobilebert: 4.94 seconds

Classification Report for mobilebert:
              precision    recall  f1-score   support

           0     0.9504    0.8375    0.8904       160
           1     0.9763    0.9935    0.9848      1077

    accuracy                         0.9733      1237
   macro avg     0.9633    0.9155    0.9376      1237
weighted avg     0.9729    0.9733    0.9726      1237

{'model': 'mobilebert', 'max_length': 512, 'train_time_sec': 209.85103125, 'pred_time_sec': 4.9377783203125, 'accuracy': 0.973322554567502, 'f1': 0.9725972048136188, 'precision': 0.9729243876440257, 'recall': 0.973322554567502}

--- Training albert-base-v1 with max_length=128 ---

===== Training albert-base-v1 =====


Some weights of the model checkpoint at albert-base-v1 were not used when initializing AlbertForSequenceClassification: ['predictions.dense.weight', 'predictions.decoder.bias', 'predictions.dense.bias', 'predictions.LayerNorm.weight', 'predictions.bias', 'predictions.LayerNorm.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v1 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model 

Map:   0%|          | 0/4947 [00:00<?, ? examples/s]

Map:   0%|          | 0/1237 [00:00<?, ? examples/s]



  0%|          | 0/620 [00:00<?, ?it/s]

  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.3856833875179291, 'eval_accuracy': 0.9708973322554567, 'eval_f1': 0.9696887254279255, 'eval_precision': 0.9708669158286354, 'eval_recall': 0.9708973322554567, 'eval_runtime': 1.7541, 'eval_samples_per_second': 705.189, 'eval_steps_per_second': 44.466, 'epoch': 1.0}
{'loss': 0.3129, 'learning_rate': 9.67741935483871e-06, 'epoch': 1.61}


  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.2440875768661499, 'eval_accuracy': 0.9717057396928052, 'eval_f1': 0.9711078155015639, 'eval_precision': 0.9711470685810719, 'eval_recall': 0.9717057396928052, 'eval_runtime': 1.7601, 'eval_samples_per_second': 702.804, 'eval_steps_per_second': 44.316, 'epoch': 2.0}
{'train_runtime': 43.5767, 'train_samples_per_second': 227.048, 'train_steps_per_second': 14.228, 'train_loss': 0.28822908401489256, 'epoch': 2.0}
Training time for albert-base-v1: 43.58 seconds


  0%|          | 0/78 [00:00<?, ?it/s]

Prediction time for albert-base-v1: 1.76 seconds

Classification Report for albert-base-v1:
              precision    recall  f1-score   support

           0     0.9310    0.8438    0.8852       160
           1     0.9771    0.9907    0.9839      1077

    accuracy                         0.9717      1237
   macro avg     0.9541    0.9172    0.9346      1237
weighted avg     0.9711    0.9717    0.9711      1237

{'model': 'albert-base-v1', 'max_length': 128, 'train_time_sec': 43.58189453125, 'pred_time_sec': 1.757774169921875, 'accuracy': 0.9717057396928052, 'f1': 0.9711078155015639, 'precision': 0.9711470685810719, 'recall': 0.9717057396928052}

--- Training albert-base-v1 with max_length=256 ---

===== Training albert-base-v1 =====


Some weights of the model checkpoint at albert-base-v1 were not used when initializing AlbertForSequenceClassification: ['predictions.dense.weight', 'predictions.decoder.bias', 'predictions.dense.bias', 'predictions.LayerNorm.weight', 'predictions.bias', 'predictions.LayerNorm.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v1 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model 

Map:   0%|          | 0/4947 [00:00<?, ? examples/s]

Map:   0%|          | 0/1237 [00:00<?, ? examples/s]



  0%|          | 0/620 [00:00<?, ?it/s]

  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.38635721802711487, 'eval_accuracy': 0.973322554567502, 'eval_f1': 0.9723451915414905, 'eval_precision': 0.9732555812417957, 'eval_recall': 0.973322554567502, 'eval_runtime': 3.6432, 'eval_samples_per_second': 339.541, 'eval_steps_per_second': 21.41, 'epoch': 1.0}
{'loss': 0.3056, 'learning_rate': 9.67741935483871e-06, 'epoch': 1.61}


  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.22760340571403503, 'eval_accuracy': 0.9797898140662894, 'eval_f1': 0.9794225323382386, 'eval_precision': 0.9795440527564181, 'eval_recall': 0.9797898140662894, 'eval_runtime': 3.6572, 'eval_samples_per_second': 338.241, 'eval_steps_per_second': 21.328, 'epoch': 2.0}
{'train_runtime': 86.6825, 'train_samples_per_second': 114.141, 'train_steps_per_second': 7.153, 'train_loss': 0.27678232192993163, 'epoch': 2.0}
Training time for albert-base-v1: 86.69 seconds


  0%|          | 0/78 [00:00<?, ?it/s]

Prediction time for albert-base-v1: 3.69 seconds

Classification Report for albert-base-v1:
              precision    recall  f1-score   support

           0     0.9592    0.8812    0.9186       160
           1     0.9826    0.9944    0.9885      1077

    accuracy                         0.9798      1237
   macro avg     0.9709    0.9378    0.9535      1237
weighted avg     0.9795    0.9798    0.9794      1237

{'model': 'albert-base-v1', 'max_length': 256, 'train_time_sec': 86.6874296875, 'pred_time_sec': 3.68730712890625, 'accuracy': 0.9797898140662894, 'f1': 0.9794225323382386, 'precision': 0.9795440527564181, 'recall': 0.9797898140662894}

--- Training albert-base-v1 with max_length=512 ---

===== Training albert-base-v1 =====


Some weights of the model checkpoint at albert-base-v1 were not used when initializing AlbertForSequenceClassification: ['predictions.dense.weight', 'predictions.decoder.bias', 'predictions.dense.bias', 'predictions.LayerNorm.weight', 'predictions.bias', 'predictions.LayerNorm.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v1 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model 

Map:   0%|          | 0/4947 [00:00<?, ? examples/s]

Map:   0%|          | 0/1237 [00:00<?, ? examples/s]



  0%|          | 0/620 [00:00<?, ?it/s]

  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.4071013331413269, 'eval_accuracy': 0.9708973322554567, 'eval_f1': 0.9696887254279255, 'eval_precision': 0.9708669158286354, 'eval_recall': 0.9708973322554567, 'eval_runtime': 9.0622, 'eval_samples_per_second': 136.501, 'eval_steps_per_second': 8.607, 'epoch': 1.0}
{'loss': 0.293, 'learning_rate': 9.67741935483871e-06, 'epoch': 1.61}


  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.22431425750255585, 'eval_accuracy': 0.9741309620048505, 'eval_f1': 0.9739197348103219, 'eval_precision': 0.9738066044069531, 'eval_recall': 0.9741309620048505, 'eval_runtime': 9.0207, 'eval_samples_per_second': 137.129, 'eval_steps_per_second': 8.647, 'epoch': 2.0}
{'train_runtime': 215.2808, 'train_samples_per_second': 45.959, 'train_steps_per_second': 2.88, 'train_loss': 0.2732754891918552, 'epoch': 2.0}
Training time for albert-base-v1: 215.29 seconds


  0%|          | 0/78 [00:00<?, ?it/s]

Prediction time for albert-base-v1: 9.02 seconds

Classification Report for albert-base-v1:
              precision    recall  f1-score   support

           0     0.9156    0.8812    0.8981       160
           1     0.9825    0.9879    0.9852      1077

    accuracy                         0.9741      1237
   macro avg     0.9490    0.9346    0.9416      1237
weighted avg     0.9738    0.9741    0.9739      1237

{'model': 'albert-base-v1', 'max_length': 512, 'train_time_sec': 215.28553125, 'pred_time_sec': 9.0176826171875, 'accuracy': 0.9741309620048505, 'f1': 0.9739197348103219, 'precision': 0.9738066044069531, 'recall': 0.9741309620048505}

--- Training albert-large-v2 with max_length=128 ---

===== Training albert-large-v2 =====


Some weights of the model checkpoint at albert-large-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.dense.weight', 'predictions.decoder.bias', 'predictions.dense.bias', 'predictions.LayerNorm.weight', 'predictions.bias', 'predictions.LayerNorm.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-large-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this mode

Map:   0%|          | 0/4947 [00:00<?, ? examples/s]

Map:   0%|          | 0/1237 [00:00<?, ? examples/s]



  0%|          | 0/620 [00:00<?, ?it/s]

  0%|          | 0/78 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.689713180065155, 'eval_accuracy': 0.8706548100242523, 'eval_f1': 0.8104539588557645, 'eval_precision': 0.7580397982183668, 'eval_recall': 0.8706548100242523, 'eval_runtime': 6.621, 'eval_samples_per_second': 186.829, 'eval_steps_per_second': 11.781, 'epoch': 1.0}
{'loss': 0.6534, 'learning_rate': 9.67741935483871e-06, 'epoch': 1.61}


  0%|          | 0/78 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.692589282989502, 'eval_accuracy': 0.8706548100242523, 'eval_f1': 0.8104539588557645, 'eval_precision': 0.7580397982183668, 'eval_recall': 0.8706548100242523, 'eval_runtime': 6.6115, 'eval_samples_per_second': 187.098, 'eval_steps_per_second': 11.798, 'epoch': 2.0}
{'train_runtime': 155.6488, 'train_samples_per_second': 63.566, 'train_steps_per_second': 3.983, 'train_loss': 0.6623490118211316, 'epoch': 2.0}
Training time for albert-large-v2: 155.66 seconds


  0%|          | 0/78 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Prediction time for albert-large-v2: 6.45 seconds

Classification Report for albert-large-v2:
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000       160
           1     0.8707    1.0000    0.9309      1077

    accuracy                         0.8707      1237
   macro avg     0.4353    0.5000    0.4654      1237
weighted avg     0.7580    0.8707    0.8105      1237

{'model': 'albert-large-v2', 'max_length': 128, 'train_time_sec': 155.655171875, 'pred_time_sec': 6.45188037109375, 'accuracy': 0.8706548100242523, 'f1': 0.8104539588557645, 'precision': 0.7580397982183668, 'recall': 0.8706548100242523}

--- Training albert-large-v2 with max_length=256 ---

===== Training albert-large-v2 =====


Some weights of the model checkpoint at albert-large-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.dense.weight', 'predictions.decoder.bias', 'predictions.dense.bias', 'predictions.LayerNorm.weight', 'predictions.bias', 'predictions.LayerNorm.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-large-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this mode

Map:   0%|          | 0/4947 [00:00<?, ? examples/s]

Map:   0%|          | 0/1237 [00:00<?, ? examples/s]



  0%|          | 0/620 [00:00<?, ?it/s]

  0%|          | 0/78 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.692845344543457, 'eval_accuracy': 0.8706548100242523, 'eval_f1': 0.8104539588557645, 'eval_precision': 0.7580397982183668, 'eval_recall': 0.8706548100242523, 'eval_runtime': 15.3258, 'eval_samples_per_second': 80.713, 'eval_steps_per_second': 5.089, 'epoch': 1.0}
{'loss': 0.7083, 'learning_rate': 9.67741935483871e-06, 'epoch': 1.61}


  0%|          | 0/78 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.69178706407547, 'eval_accuracy': 0.8706548100242523, 'eval_f1': 0.8104539588557645, 'eval_precision': 0.7580397982183668, 'eval_recall': 0.8706548100242523, 'eval_runtime': 15.346, 'eval_samples_per_second': 80.607, 'eval_steps_per_second': 5.083, 'epoch': 2.0}
{'train_runtime': 342.6419, 'train_samples_per_second': 28.876, 'train_steps_per_second': 1.809, 'train_loss': 0.7065734124952747, 'epoch': 2.0}
Training time for albert-large-v2: 342.65 seconds


  0%|          | 0/78 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Prediction time for albert-large-v2: 15.10 seconds

Classification Report for albert-large-v2:
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000       160
           1     0.8707    1.0000    0.9309      1077

    accuracy                         0.8707      1237
   macro avg     0.4353    0.5000    0.4654      1237
weighted avg     0.7580    0.8707    0.8105      1237

{'model': 'albert-large-v2', 'max_length': 256, 'train_time_sec': 342.648125, 'pred_time_sec': 15.096859375, 'accuracy': 0.8706548100242523, 'f1': 0.8104539588557645, 'precision': 0.7580397982183668, 'recall': 0.8706548100242523}

--- Training albert-large-v2 with max_length=512 ---

===== Training albert-large-v2 =====


Some weights of the model checkpoint at albert-large-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.dense.weight', 'predictions.decoder.bias', 'predictions.dense.bias', 'predictions.LayerNorm.weight', 'predictions.bias', 'predictions.LayerNorm.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-large-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this mode

Map:   0%|          | 0/4947 [00:00<?, ? examples/s]

Map:   0%|          | 0/1237 [00:00<?, ? examples/s]



  0%|          | 0/620 [00:00<?, ?it/s]

  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.4396832585334778, 'eval_accuracy': 0.8900565885206144, 'eval_f1': 0.8977182843313056, 'eval_precision': 0.9120539278515735, 'eval_recall': 0.8900565885206144, 'eval_runtime': 395.4489, 'eval_samples_per_second': 3.128, 'eval_steps_per_second': 0.197, 'epoch': 1.0}
{'loss': 0.5494, 'learning_rate': 9.67741935483871e-06, 'epoch': 1.61}


  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.3511715531349182, 'eval_accuracy': 0.9708973322554567, 'eval_f1': 0.9698778396507508, 'eval_precision': 0.9705700302540556, 'eval_recall': 0.9708973322554567, 'eval_runtime': 371.4645, 'eval_samples_per_second': 3.33, 'eval_steps_per_second': 0.21, 'epoch': 2.0}
{'train_runtime': 4661.7777, 'train_samples_per_second': 2.122, 'train_steps_per_second': 0.133, 'train_loss': 0.5064292415495841, 'epoch': 2.0}
Training time for albert-large-v2: 4661.78 seconds


  0%|          | 0/78 [00:00<?, ?it/s]

Prediction time for albert-large-v2: 371.64 seconds

Classification Report for albert-large-v2:
              precision    recall  f1-score   support

           0     0.9559    0.8125    0.8784       160
           1     0.9728    0.9944    0.9835      1077

    accuracy                         0.9709      1237
   macro avg     0.9643    0.9035    0.9309      1237
weighted avg     0.9706    0.9709    0.9699      1237

{'model': 'albert-large-v2', 'max_length': 512, 'train_time_sec': 4661.7805, 'pred_time_sec': 371.643625, 'accuracy': 0.9708973322554567, 'f1': 0.9698778396507508, 'precision': 0.9705700302540556, 'recall': 0.9708973322554567}

--- Training albert-xlarge-v2 with max_length=128 ---

===== Training albert-xlarge-v2 =====


Some weights of the model checkpoint at albert-xlarge-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.dense.weight', 'predictions.decoder.bias', 'predictions.dense.bias', 'predictions.LayerNorm.weight', 'predictions.bias', 'predictions.LayerNorm.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-xlarge-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this mo

Map:   0%|          | 0/4947 [00:00<?, ? examples/s]

Map:   0%|          | 0/1237 [00:00<?, ? examples/s]



  0%|          | 0/620 [00:00<?, ?it/s]

  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.34922975301742554, 'eval_accuracy': 0.9506871463217461, 'eval_f1': 0.9502155927462494, 'eval_precision': 0.9498660170395512, 'eval_recall': 0.9506871463217461, 'eval_runtime': 22.6741, 'eval_samples_per_second': 54.556, 'eval_steps_per_second': 3.44, 'epoch': 1.0}
{'loss': 0.4891, 'learning_rate': 9.67741935483871e-06, 'epoch': 1.61}


  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.38900822401046753, 'eval_accuracy': 0.9684721099434115, 'eval_f1': 0.9673170445490343, 'eval_precision': 0.9680110400297258, 'eval_recall': 0.9684721099434115, 'eval_runtime': 22.5656, 'eval_samples_per_second': 54.818, 'eval_steps_per_second': 3.457, 'epoch': 2.0}
{'train_runtime': 780.0783, 'train_samples_per_second': 12.683, 'train_steps_per_second': 0.795, 'train_loss': 0.4566705519153226, 'epoch': 2.0}
Training time for albert-xlarge-v2: 780.09 seconds


  0%|          | 0/78 [00:00<?, ?it/s]

Prediction time for albert-xlarge-v2: 22.92 seconds

Classification Report for albert-xlarge-v2:
              precision    recall  f1-score   support

           0     0.9481    0.8000    0.8678       160
           1     0.9710    0.9935    0.9821      1077

    accuracy                         0.9685      1237
   macro avg     0.9596    0.8968    0.9249      1237
weighted avg     0.9680    0.9685    0.9673      1237

{'model': 'albert-xlarge-v2', 'max_length': 128, 'train_time_sec': 780.0850625, 'pred_time_sec': 22.922814453125, 'accuracy': 0.9684721099434115, 'f1': 0.9673170445490343, 'precision': 0.9680110400297258, 'recall': 0.9684721099434115}

--- Training albert-xlarge-v2 with max_length=256 ---

===== Training albert-xlarge-v2 =====


Some weights of the model checkpoint at albert-xlarge-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.dense.weight', 'predictions.decoder.bias', 'predictions.dense.bias', 'predictions.LayerNorm.weight', 'predictions.bias', 'predictions.LayerNorm.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-xlarge-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this mo

Map:   0%|          | 0/4947 [00:00<?, ? examples/s]

Map:   0%|          | 0/1237 [00:00<?, ? examples/s]



  0%|          | 0/620 [00:00<?, ?it/s]

  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.6915135383605957, 'eval_accuracy': 0.8698464025869038, 'eval_f1': 0.8100515136931218, 'eval_precision': 0.7579486857492681, 'eval_recall': 0.8698464025869038, 'eval_runtime': 331.6743, 'eval_samples_per_second': 3.73, 'eval_steps_per_second': 0.235, 'epoch': 1.0}
{'loss': 0.7368, 'learning_rate': 9.67741935483871e-06, 'epoch': 1.61}


  0%|          | 0/78 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.690826952457428, 'eval_accuracy': 0.8706548100242523, 'eval_f1': 0.8104539588557645, 'eval_precision': 0.7580397982183668, 'eval_recall': 0.8706548100242523, 'eval_runtime': 330.5914, 'eval_samples_per_second': 3.742, 'eval_steps_per_second': 0.236, 'epoch': 2.0}
{'train_runtime': 2655.4206, 'train_samples_per_second': 3.726, 'train_steps_per_second': 0.233, 'train_loss': 0.733499502366589, 'epoch': 2.0}
Training time for albert-xlarge-v2: 2655.43 seconds


  0%|          | 0/78 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Prediction time for albert-xlarge-v2: 331.56 seconds

Classification Report for albert-xlarge-v2:
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000       160
           1     0.8707    1.0000    0.9309      1077

    accuracy                         0.8707      1237
   macro avg     0.4353    0.5000    0.4654      1237
weighted avg     0.7580    0.8707    0.8105      1237

{'model': 'albert-xlarge-v2', 'max_length': 256, 'train_time_sec': 2655.4265, 'pred_time_sec': 331.5628125, 'accuracy': 0.8706548100242523, 'f1': 0.8104539588557645, 'precision': 0.7580397982183668, 'recall': 0.8706548100242523}

--- Training albert-xlarge-v2 with max_length=512 ---

===== Training albert-xlarge-v2 =====


Some weights of the model checkpoint at albert-xlarge-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.dense.weight', 'predictions.decoder.bias', 'predictions.dense.bias', 'predictions.LayerNorm.weight', 'predictions.bias', 'predictions.LayerNorm.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-xlarge-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this mo

Map:   0%|          | 0/4947 [00:00<?, ? examples/s]

Map:   0%|          | 0/1237 [00:00<?, ? examples/s]



  0%|          | 0/620 [00:00<?, ?it/s]

KeyboardInterrupt: 