In [1]:
import os
from sklearn.model_selection import train_test_split
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, Subset
from transformers import EarlyStoppingCallback
from sklearn.model_selection import StratifiedKFold 
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class MediClaimDataset(torch.utils.data.Dataset):
    def __init__(self, premises, hypothesis1, hypothesis2, hypothesis3, labels, tokenizer_name='emilyalsentzer/Bio_ClinicalBERT'):
        self.premises = premises
        self.hypothesis1 = hypothesis1
        self.hypothesis2 = hypothesis2
        self.hypothesis3 = hypothesis3
        self.labels = labels
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        separator = self.tokenizer.sep_token
        grouped_hypotheses = separator.join([self.hypothesis1[idx], self.hypothesis2[idx], self.hypothesis3[idx]])
        
        tokenized_input = self.tokenizer(
        text=self.premises[idx],
        text_pair=grouped_hypotheses,
        padding='max_length',
        truncation=True,
        max_length=512,
        return_tensors='pt')
        tokenized_input = {key: val.squeeze(0) for key, val in tokenized_input.items()}
        tokenized_input['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return tokenized_input

In [3]:
data = pd.read_excel('/home/elson/topk3_minilm.xlsx',engine='openpyxl')
df= data.dropna(subset=['label'])
label_encoder = LabelEncoder()
claims = df.claim.tolist()
labels = df.label.tolist()
encoded_labels = label_encoder.fit_transform(labels)
evidence_1 = df.top_1_minilm_ce.to_list()
evidence_2 = df.top_2_minilm_ce.to_list()
evidence_3 = df.top_3_minilm_ce.to_list()

In [4]:
print(len(data))

861


In [5]:
print(encoded_labels)
print(label_encoder.classes_)

[2 2 2 0 2 1 2 2 2 2 0 0 1 2 1 2 2 0 2 0 2 0 2 2 1 0 2 2 2 2 2 2 2 2 1 2 2
 2 2 0 2 0 2 2 1 1 1 1 1 0 0 2 2 2 2 2 2 1 1 1 1 1 1 1 1 2 2 2 2 2 0 1 0 1
 2 2 0 2 2 2 0 1 1 1 1 0 1 2 2 2 2 2 2 0 2 2 2 1 2 2 2 2 2 2 1 2 1 2 2 2 2
 2 1 2 1 2 0 1 1 2 2 1 2 2 2 2 1 1 1 2 0 1 2 2 2 0 0 1 1 1 1 2 2 1 1 2 1 2
 2 1 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 1 2 2 2 2 2 2
 2 1 2 2 2 2 2 2 2 1 0 1 1 1 2 2 2 2 2 1 2 2 2 2 2 1 0 2 2 2 2 2 1 2 2 1 1
 2 2 2 2 1 1 1 1 1 1 1 1 1 1 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 0 1 0 2 2 2 1 2
 1 2 1 1 2 2 2 1 2 2 2 2 1 2 2 1 1 2 2 2 2 2 2 2 0 2 2 0 2 2 2 2 1 2 2 1 2
 2 2 2 0 2 2 1 1 0 2 1 1 1 0 1 1 1 1 1 2 0 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1
 1 2 1 1 1 1 2 1 1 1 0 2 2 2 2 2 2 2 2 2 2 2 2 2 1 0 2 2 2 2 2 2 2 2 2 2 1
 1 1 2 1 2 0 1 2 0 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 0 0 0 0 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 2 1 1 0 2 2 2 1 2 1 2 0 1 0 1
 2 1 2 2 2 2 2 2 2 2 2 0 0 2 2 2 2 0 2 2 2 0 0 2 2 0 2 2 2 1 0 1 2 2 2 2 2
 2 0 2 0 2 2 2 0 1 2 2 1 

In [6]:
from sklearn.model_selection import train_test_split

# Perform the split
train_premises, test_premises, train_hypothesis1, test_hypothesis1,train_hypothesis2, test_hypothesis2, train_hypothesis3, test_hypothesis3, train_labels, test_labels = train_test_split(
    claims, evidence_1,evidence_2,evidence_3, encoded_labels, test_size=0.2, random_state=42)

In [7]:
import torch
print(torch.cuda.device_count())
print("Available GPUs:")
for i in range(torch.cuda.device_count()):
    print(f"GPU {i}: {torch.cuda.get_device_name(i)}")

4
Available GPUs:
GPU 0: Tesla V100-SXM2-32GB
GPU 1: Tesla V100-SXM2-32GB
GPU 2: Tesla V100-SXM2-32GB
GPU 3: Tesla V100-SXM2-32GB


In [8]:
model_name = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512)
model = AutoModelForSequenceClassification.from_pretrained(model_name,
                                 num_labels=3, ignore_mismatched_sizes=True)
device = "cuda:3"
model.to(device)

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [9]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    prec = precision_score(labels, preds, average="weighted")  # Specify average method
    recall = recall_score(labels, preds, average="weighted")  # Specify average method

    return {"accuracy": acc, "precision": prec, "recall": recall, "f1": f1}

In [10]:
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

In [11]:
import gc

torch.cuda.set_device(3)

# Clearing the cache
torch.cuda.empty_cache()
gc.collect()
# Checking GPU memory, making sure to reset peak memory stats
torch.cuda.reset_peak_memory_stats()

os.environ["CUDA_VISIBLE_DEVICES"] = "3"



In [12]:
current_device = torch.cuda.current_device()
print(f"Current CUDA device: GPU {current_device}")

Current CUDA device: GPU 3


In [13]:
k = 5

kf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
labels = np.array(train_labels)  # Ensure train_labels is a list or numpy array of your labels
dataset = MediClaimDataset(train_premises, train_hypothesis1, train_hypothesis2, train_hypothesis3, train_labels)
model = model.to('cuda:3')
for fold, (train_idx, val_idx) in enumerate(kf.split(np.zeros(len(labels)), labels)):
    print(f"Starting fold {fold + 1}/{k}")

    # Splitting the dataset
    train_subs = Subset(dataset, train_idx)
    val_subs = Subset(dataset, val_idx)

    train_loader = DataLoader(train_subs, batch_size=8, shuffle=True)
    val_loader = DataLoader(val_subs, batch_size=8)

    training_args = TrainingArguments(
    output_dir=f'/home/elson/bioclinicalbert/results/fold_{fold}',
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    dataloader_pin_memory=True,
    dataloader_num_workers=4,
    fp16=True,
    warmup_ratio=0.06,
    weight_decay=0.01,
    logging_dir=f'./logs/fold_{fold}',
    logging_steps=10,
    evaluation_strategy="epoch",  # Evaluate at the end of each epoch
    save_strategy="epoch",  # Save at the end of each epoch to match the evaluation strategy
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",  # Ensure this metric is returned from your compute_metrics function
    report_to="none")


    trainer = Trainer(
        model=model.to(device),
        args=training_args,
        train_dataset=train_subs,
        eval_dataset=val_subs,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.01)],
    )

    # Training and Evaluation
    trainer.train()
    eval_result = trainer.evaluate()

    # Optionally, you can save model for each fold
    model.save_pretrained(f'/home/elson/bioclinicalbert/model_fold_{fold}')
    tokenizer.save_pretrained(f'/home/elson/bioclinicalbert/tokenizer_fold_{fold}')

    print(f"Fold {fold + 1} completed. Eval Result: {eval_result}")


Starting fold 1/5


Using amp half precision backend
***** Running training *****
  Num examples = 495
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 80


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.9318,0.841315,0.645161,0.416233,0.645161,0.506009
2,0.922,0.80991,0.677419,0.586095,0.677419,0.593496
3,0.7366,0.78173,0.693548,0.610753,0.693548,0.643113
4,0.6447,0.80583,0.685484,0.611863,0.685484,0.645718
5,0.4925,0.832284,0.66129,0.619501,0.66129,0.637097


***** Running Evaluation *****
  Num examples = 124
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to /home/elson/bioclinicalbert/results/fold_0/checkpoint-16
Configuration saved in /home/elson/bioclinicalbert/results/fold_0/checkpoint-16/config.json
Model weights saved in /home/elson/bioclinicalbert/results/fold_0/checkpoint-16/pytorch_model.bin
tokenizer config file saved in /home/elson/bioclinicalbert/results/fold_0/checkpoint-16/tokenizer_config.json
Special tokens file saved in /home/elson/bioclinicalbert/results/fold_0/checkpoint-16/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 124
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to /home/elson/bioclinicalbert/results/fold_0/checkpoint-32
Configuration saved in /home/elson/bioclinicalbert/results/fold_0/checkpoint-32/config.json
Model weights saved in /home/elson/bioclinicalbert/results/fold_0/checkpoint-32/py

  _warn_prf(average, modifier, msg_start, len(result))
Configuration saved in /home/elson/bioclinicalbert/model_fold_0/config.json
Model weights saved in /home/elson/bioclinicalbert/model_fold_0/pytorch_model.bin
tokenizer config file saved in /home/elson/bioclinicalbert/tokenizer_fold_0/tokenizer_config.json
Special tokens file saved in /home/elson/bioclinicalbert/tokenizer_fold_0/special_tokens_map.json
PyTorch: setting up devices
Using amp half precision backend
***** Running training *****
  Num examples = 495
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 80


Fold 1 completed. Eval Result: {'eval_loss': 0.7817295789718628, 'eval_accuracy': 0.6935483870967742, 'eval_precision': 0.610752688172043, 'eval_recall': 0.6935483870967742, 'eval_f1': 0.6431131592421915, 'eval_runtime': 0.9253, 'eval_samples_per_second': 134.005, 'eval_steps_per_second': 4.323, 'epoch': 5.0}
Starting fold 2/5




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6719,0.636466,0.741935,0.676884,0.741935,0.707337
2,0.4634,0.662282,0.741935,0.676884,0.741935,0.707337
3,0.3345,0.693015,0.758065,0.68752,0.758065,0.720829
4,0.2008,0.761648,0.75,0.722086,0.75,0.725461
5,0.189,0.754237,0.766129,0.75378,0.766129,0.75779


***** Running Evaluation *****
  Num examples = 124
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to /home/elson/bioclinicalbert/results/fold_1/checkpoint-16
Configuration saved in /home/elson/bioclinicalbert/results/fold_1/checkpoint-16/config.json
Model weights saved in /home/elson/bioclinicalbert/results/fold_1/checkpoint-16/pytorch_model.bin
tokenizer config file saved in /home/elson/bioclinicalbert/results/fold_1/checkpoint-16/tokenizer_config.json
Special tokens file saved in /home/elson/bioclinicalbert/results/fold_1/checkpoint-16/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 124
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to /home/elson/bioclinicalbert/results/fold_1/checkpoint-32
Configuration saved in /home/elson/bioclinicalbert/results/fold_1/checkpoint-32/config.json
Model weights saved in /home/elson/bioclinicalbert/results/fold_1/checkpoint-32/py

Configuration saved in /home/elson/bioclinicalbert/model_fold_1/config.json
Model weights saved in /home/elson/bioclinicalbert/model_fold_1/pytorch_model.bin
tokenizer config file saved in /home/elson/bioclinicalbert/tokenizer_fold_1/tokenizer_config.json
Special tokens file saved in /home/elson/bioclinicalbert/tokenizer_fold_1/special_tokens_map.json
PyTorch: setting up devices
Using amp half precision backend
***** Running training *****
  Num examples = 495
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 80


Fold 2 completed. Eval Result: {'eval_loss': 0.7542372941970825, 'eval_accuracy': 0.7661290322580645, 'eval_precision': 0.7537797637265103, 'eval_recall': 0.7661290322580645, 'eval_f1': 0.7577898803705257, 'eval_runtime': 0.9278, 'eval_samples_per_second': 133.656, 'eval_steps_per_second': 4.311, 'epoch': 5.0}
Starting fold 3/5




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2835,0.1805,0.943548,0.951188,0.943548,0.937558
2,0.2425,0.164233,0.943548,0.951188,0.943548,0.937558
3,0.1721,0.118796,0.951613,0.955591,0.951613,0.944597
4,0.0936,0.131001,0.951613,0.955591,0.951613,0.944597


***** Running Evaluation *****
  Num examples = 124
  Batch size = 8
Saving model checkpoint to /home/elson/bioclinicalbert/results/fold_2/checkpoint-16
Configuration saved in /home/elson/bioclinicalbert/results/fold_2/checkpoint-16/config.json
Model weights saved in /home/elson/bioclinicalbert/results/fold_2/checkpoint-16/pytorch_model.bin
tokenizer config file saved in /home/elson/bioclinicalbert/results/fold_2/checkpoint-16/tokenizer_config.json
Special tokens file saved in /home/elson/bioclinicalbert/results/fold_2/checkpoint-16/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 124
  Batch size = 8
Saving model checkpoint to /home/elson/bioclinicalbert/results/fold_2/checkpoint-32
Configuration saved in /home/elson/bioclinicalbert/results/fold_2/checkpoint-32/config.json
Model weights saved in /home/elson/bioclinicalbert/results/fold_2/checkpoint-32/pytorch_model.bin
tokenizer config file saved in /home/elson/bioclinicalbert/results/fold_2/checkpoint-32/tokeni

Configuration saved in /home/elson/bioclinicalbert/model_fold_2/config.json
Model weights saved in /home/elson/bioclinicalbert/model_fold_2/pytorch_model.bin
tokenizer config file saved in /home/elson/bioclinicalbert/tokenizer_fold_2/tokenizer_config.json
Special tokens file saved in /home/elson/bioclinicalbert/tokenizer_fold_2/special_tokens_map.json
PyTorch: setting up devices
Using amp half precision backend
***** Running training *****
  Num examples = 495
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 80


Fold 3 completed. Eval Result: {'eval_loss': 0.11879612505435944, 'eval_accuracy': 0.9516129032258065, 'eval_precision': 0.9555905236471719, 'eval_recall': 0.9516129032258065, 'eval_f1': 0.9445967156277085, 'eval_runtime': 0.9287, 'eval_samples_per_second': 133.519, 'eval_steps_per_second': 4.307, 'epoch': 4.0}
Starting fold 4/5




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1144,0.065466,0.975806,0.97788,0.975806,0.97462
2,0.1102,0.067437,0.975806,0.97788,0.975806,0.97462
3,0.047,0.061788,0.975806,0.975806,0.975806,0.975806
4,0.0212,0.064716,0.967742,0.969265,0.967742,0.966528


***** Running Evaluation *****
  Num examples = 124
  Batch size = 8
Saving model checkpoint to /home/elson/bioclinicalbert/results/fold_3/checkpoint-16
Configuration saved in /home/elson/bioclinicalbert/results/fold_3/checkpoint-16/config.json
Model weights saved in /home/elson/bioclinicalbert/results/fold_3/checkpoint-16/pytorch_model.bin
tokenizer config file saved in /home/elson/bioclinicalbert/results/fold_3/checkpoint-16/tokenizer_config.json
Special tokens file saved in /home/elson/bioclinicalbert/results/fold_3/checkpoint-16/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 124
  Batch size = 8
Saving model checkpoint to /home/elson/bioclinicalbert/results/fold_3/checkpoint-32
Configuration saved in /home/elson/bioclinicalbert/results/fold_3/checkpoint-32/config.json
Model weights saved in /home/elson/bioclinicalbert/results/fold_3/checkpoint-32/pytorch_model.bin
tokenizer config file saved in /home/elson/bioclinicalbert/results/fold_3/checkpoint-32/tokeni

Configuration saved in /home/elson/bioclinicalbert/model_fold_3/config.json
Model weights saved in /home/elson/bioclinicalbert/model_fold_3/pytorch_model.bin
tokenizer config file saved in /home/elson/bioclinicalbert/tokenizer_fold_3/tokenizer_config.json
Special tokens file saved in /home/elson/bioclinicalbert/tokenizer_fold_3/special_tokens_map.json
PyTorch: setting up devices
Using amp half precision backend
***** Running training *****
  Num examples = 496
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 80


Fold 4 completed. Eval Result: {'eval_loss': 0.06546633690595627, 'eval_accuracy': 0.9758064516129032, 'eval_precision': 0.9778801843317972, 'eval_recall': 0.9758064516129032, 'eval_f1': 0.9746199876195061, 'eval_runtime': 1.1106, 'eval_samples_per_second': 111.649, 'eval_steps_per_second': 3.602, 'epoch': 4.0}
Starting fold 5/5




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1061,0.069421,0.97561,0.976694,0.97561,0.973963
2,0.085,0.113103,0.96748,0.966658,0.96748,0.966108
3,0.0609,0.103823,0.97561,0.976002,0.97561,0.974896
4,0.0177,0.093925,0.97561,0.976217,0.97561,0.975015


***** Running Evaluation *****
  Num examples = 123
  Batch size = 8
Saving model checkpoint to /home/elson/bioclinicalbert/results/fold_4/checkpoint-16
Configuration saved in /home/elson/bioclinicalbert/results/fold_4/checkpoint-16/config.json
Model weights saved in /home/elson/bioclinicalbert/results/fold_4/checkpoint-16/pytorch_model.bin
tokenizer config file saved in /home/elson/bioclinicalbert/results/fold_4/checkpoint-16/tokenizer_config.json
Special tokens file saved in /home/elson/bioclinicalbert/results/fold_4/checkpoint-16/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 123
  Batch size = 8
Saving model checkpoint to /home/elson/bioclinicalbert/results/fold_4/checkpoint-32
Configuration saved in /home/elson/bioclinicalbert/results/fold_4/checkpoint-32/config.json
Model weights saved in /home/elson/bioclinicalbert/results/fold_4/checkpoint-32/pytorch_model.bin
tokenizer config file saved in /home/elson/bioclinicalbert/results/fold_4/checkpoint-32/tokeni

Configuration saved in /home/elson/bioclinicalbert/model_fold_4/config.json
Model weights saved in /home/elson/bioclinicalbert/model_fold_4/pytorch_model.bin
tokenizer config file saved in /home/elson/bioclinicalbert/tokenizer_fold_4/tokenizer_config.json
Special tokens file saved in /home/elson/bioclinicalbert/tokenizer_fold_4/special_tokens_map.json


Fold 5 completed. Eval Result: {'eval_loss': 0.06942062824964523, 'eval_accuracy': 0.975609756097561, 'eval_precision': 0.9766943678305963, 'eval_recall': 0.975609756097561, 'eval_f1': 0.973963214466662, 'eval_runtime': 0.9054, 'eval_samples_per_second': 135.845, 'eval_steps_per_second': 4.418, 'epoch': 4.0}


In [14]:
from transformers import AutoModelForSequenceClassification

# Assuming test_dataset is prepared similarly to your training/validation datasets
test_dataset = MediClaimDataset(test_premises, test_hypothesis1, test_hypothesis2, test_hypothesis3, test_labels)
test_loader = DataLoader(test_dataset, batch_size=16)

# Load the best model from this fold
model_path = f'/home/elson/bioclinicalbert/model_fold_4'
model = AutoModelForSequenceClassification.from_pretrained(model_path).to('cuda:3')

# Evaluate on the test set
test_results = trainer.evaluate(test_dataset)
print(f"Test Results for Fold {fold + 1}: {test_results}")

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/emilyalsentzer/Bio_ClinicalBERT/resolve/main/config.json from cache at /home/elson/.cache/huggingface/transformers/dc6d60ebe42d83e1479ce0d473758bb3586763ff6c4c814bda5321acf856bd64.b74d0770929e519c6d193d16b6874051ae549f5c8c228903a48e59d36260466b
Model config BertConfig {
  "_name_or_path": "emilyalsentzer/Bio_ClinicalBERT",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.18.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading file https:/

Test Results for Fold 5: {'eval_loss': 1.1219918727874756, 'eval_accuracy': 0.6903225806451613, 'eval_precision': 0.6887250384024577, 'eval_recall': 0.6903225806451613, 'eval_f1': 0.6894720173732073, 'eval_runtime': 1.1372, 'eval_samples_per_second': 136.299, 'eval_steps_per_second': 4.397, 'epoch': 4.0}
