# Fine-tune BLIP using Hugging Face `transformers`, `datasets`, `peft` 🤗 and `bitsandbytes`

Let's leverage recent advances from Parameter Efficient Fine-Tuning methods to fine-tune a large image to text model! We will show through this tutorial that it is possible to fine-tune a 3B scale model (~6GB in half-precision)

Here we will use a dummy dataset of [football players](https://huggingface.co/datasets/ybelkada/football-dataset) ⚽ that is uploaded on the Hub. The images have been manually selected together with the captions. 
Check the 🤗 [documentation](https://huggingface.co/docs/datasets/image_dataset) on how to create and upload your own image-text dataset.

## Set-up environment

## Load the image captioning dataset

Let's load the image captioning dataset, you just need few lines of code for that.

In [1]:
from datasets import load_dataset 
from PIL import Image
import pandas as pd
import numpy as np
import torch
import random
from transformers import AutoProcessor, Blip2ForConditionalGeneration
from time import time
import re
random.seed(123)

seed=777
# random.seed(seed)

# random.seed(seed)
# np.random.seed(seed)
# torch.manual_seed(seed)
# torch.cuda.manual_seed_all(seed)

# # Garante que algumas operações sejam determinísticas
# torch.backends.cudnn.deterministic = True
# torch.backends.cudnn.benchmark = False

#dataset = load_dataset("ybelkada/football-dataset", split="train")
# embds_path = './nodules_luna_lidc_3d_metadas_embeddings_from_bbox'
embds_path = './nodules_luna_lidc_3d_metadas_embeddings'

is_training = True
is_evaluating = True



  from .autonotebook import tqdm as notebook_tqdm


Let's retrieve the caption of the first example:

## Create PyTorch Dataset

Let's define below the dataset as well as the data collator!

In [2]:
def preprocessing(df):
    new_synthetic_legends = []
    
    for idx, row in df.iterrows():
        legend = str(row.synthetic_legends)
        if 'highly unlikely' in legend.lower():
            legend = legend.replace('highly unlikely', 'remote')
        elif 'moderately unlikely' in legend.lower():
            legend = legend.replace('moderately unlikely', 'possible')
        elif 'moderately suspicious' in legend.lower():
            legend = legend.replace('moderately suspicious', 'doubtful')
        elif 'highly suspicious' in legend.lower():
            legend = legend.replace('highly suspicious', 'critical')
            
        new_synthetic_legends.append(legend)

    df['synthetic_legends'] = new_synthetic_legends
    return df.query("malignancy != 'Indeterminate'").copy()


In [3]:
df = pd.read_csv('annotations_with_legends.csv') 
df = preprocessing(df)
df_train = df.sample(frac=0.8, random_state=seed)
df_test = df.drop(df_train.index)

In [4]:
from torch.utils.data import Dataset, DataLoader


class ImageCaptioningDataset(Dataset):
    def __init__(self, df, embds_path, processor):
        self.dataset = df
        self.processor = processor
        self.embds_path = embds_path

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset.iloc[idx]
        image = np.load(f'{self.embds_path}/{item["filename"]}')
        #print(image_rgb.min(), image_rgb.max())
        #encoding = self.processor(images=image, padding="max_length", return_tensors="pt")
        # remove batch dimension
        #encoding = {k: v.squeeze() for k, v in encoding.items()}
        #encoding["text"] = item["text"]
        #text_inputs = self.processor.tokenizer(item['report_text'] ,return_tensors="pt")
        return {'embds': torch.from_numpy(image).squeeze(0), 'text': item['synthetic_legends']}

def collate_fn(batch):
    # pad the input_ids and attention_mask
    processed_batch = {}
    for key in batch[0].keys():
        if key != "text":
            processed_batch[key] = torch.stack([example[key] for example in batch])
        else:
            text_inputs = processor.tokenizer([example["text"] for example in batch], padding=True, return_tensors="pt")
            processed_batch["input_ids"] = text_inputs["input_ids"]
            processed_batch["attention_mask"] = text_inputs["attention_mask"]
            # processed_batch["original_text"] = [example["text"] for example in batch]
            
    return processed_batch


In [5]:
class VisionModelOutput:
    def __init__(self, last_hidden_state):
        self.last_hidden_state = last_hidden_state

# Adaptador para ajustar a saída do ViT3D para o Q-Former
class ViT3DAdapter(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.training = True

    def forward(self, pixel_values=None, **kwargs):
        if self.training:
            return pixel_values.unsqueeze(0)
        else:
            return VisionModelOutput(last_hidden_state=pixel_values)

In [6]:
processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")

In [7]:

if is_training:
    device = "cuda" if torch.cuda.is_available() else "cpu"
    
    
    #model = Blip2ForConditionalGeneration.from_pretrained("ybelkada/blip2-opt-2.7b-fp16-sharded", device_map="auto", load_in_8bit=True)
    #model = Blip2ForConditionalGeneration.from_pretrained("ybelkada/blip2-opt-2.7b-fp16-sharded", 
    # _map="auto", load_in_8bit=True)
    
    model = Blip2ForConditionalGeneration.from_pretrained(
        "ybelkada/blip2-opt-2.7b-fp16-sharded",
        device_map="auto",
        torch_dtype=torch.float32
    )
    
    dtype = next(model.parameters()).dtype
    print(dtype)
    
    adapted_vit3d = ViT3DAdapter().to(device)
    model.vision_model = adapted_vit3d

Loading checkpoint shards: 100%|██████████| 8/8 [00:04<00:00,  1.79it/s]

torch.float32





In [8]:
# Checar se o modelo está congelado
def check_frozen_layers(model):
    frozen_layers = []
    trainable_layers = []

    for name, param in model.named_parameters():
        if param.requires_grad:
            trainable_layers.append(name)
        else:
            frozen_layers.append(name)
    
    print("Camadas congeladas:")
    for layer in frozen_layers:
        print(layer)
    
    print("\nCamadas treináveis:")
    for layer in trainable_layers:
        print(layer)

#check_frozen_layers(model)

Next we define our `LoraConfig` object. We explicitly tell 

In [9]:
if is_training:
    from peft import LoraConfig, get_peft_model
    import torch
    
    
    # Let's define the LoraConfig
    config = LoraConfig(
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        target_modules=["q_proj", "k_proj"]
    )
    
    model = get_peft_model(model, config)
    model.print_trainable_parameters()
    
    # Congelar todas as camadas do modelo, exceto o Q-Former (apenas q_proj e k_proj)
    # for name, param in model.named_parameters():
    #     # Mantenha apenas as camadas do Q-Former como treináveis e cheque o tipo de dado
    #     if "qformer" in name:
    #         #if ('attention' in name or 'crossattention' in name) and torch.is_floating_point(param):
    #         param.requires_grad = True
    #     else:
    #         param.requires_grad = False
    
    # Congelar todas as camadas do modelo, exceto o Q-Former
    # for name, param in model.named_parameters():
    #     # Checar se o parâmetro faz parte do Q-Former e se é de ponto flutuante
    #     if "qformer" in name and torch.is_floating_point(param):
    #         param.requires_grad = True
    #     else:
    #         param.requires_grad = False
    
    
    # Configurar todos os parâmetros como treináveis, verificando se são de ponto flutuante
    # for param in model.parameters():
    #     if torch.is_floating_point(param):
    #         param.requires_grad = True
    
    # Exibir parâmetros treináveis para verificação
    print("Parâmetros treináveis no Q-Former:")
    for name, param in model.named_parameters():
        if param.requires_grad:
            # print(name)
            pass
    #model.print_trainable_parameters()


trainable params: 5,242,880 || all params: 2,763,970,560 || trainable%: 0.1897
Parâmetros treináveis no Q-Former:


Now that we have loaded the processor, let's load the dataset and the dataloader:

In [10]:
train_dataset = ImageCaptioningDataset(df_train, embds_path, processor)
test_dataset = ImageCaptioningDataset(df_test, embds_path, processor)
# train_dataset_ = train_dataset_[:100]
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=4, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, shuffle=True, batch_size=4, collate_fn=collate_fn)
#train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=4)

In [11]:
for i in range(10):
    print(f"Sample {i+1}")    
    d = train_dataset[i]
    print("LIDC-LUNA Image embed: ", d['embds'])
    print("Text Report: ", d['text'])
    print("-"*150)

Sample 1
LIDC-LUNA Image embed:  tensor([[-0.4428,  0.8732, -0.2268,  ..., -2.1778, -1.6099,  0.3623],
        [-0.5614,  0.8245, -0.1020,  ..., -2.1546, -1.5478,  0.4737],
        [-0.5594,  0.8788, -0.1258,  ..., -2.3047, -1.5051,  0.3400],
        ...,
        [-0.1541,  0.8606,  1.4765,  ..., -1.0063,  1.3016,  0.0456],
        [-0.4812,  0.8993, -0.1662,  ..., -2.1800, -1.5553,  0.3346],
        [-0.4368,  0.7845, -0.0380,  ..., -2.1643, -1.5896,  0.3561]])
Text Report:  Obvious lung nodule with a soft tissue internal structure, no calcification, and an ovoid shape. The margin is near sharp with medium lobulation and marked spiculation. The texture is solid and the malignancy is critical. The nodule measures 22.47 mm in diameter.
------------------------------------------------------------------------------------------------------------------------------------------------------
Sample 2
LIDC-LUNA Image embed:  tensor([[-0.3340,  0.8224, -0.2476,  ..., -2.1362, -1.6752,  0.4232],
 

## Train the model

Let's train the model! Run the simply the cell below for training the model

In [12]:
if is_training:
    import torch
    import os
    
    epochs = 10
    lr = 5e-4
    
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    
    # Diretório para salvar o melhor modelo
    checkpoint_dir = "./checkpoints2"
    os.makedirs(checkpoint_dir, exist_ok=True)
    
    best_model_path = os.path.join(checkpoint_dir, "best_model.pt")
    best_loss = float('inf')  # Define a melhor perda inicial como infinita
    
    for epoch in range(epochs):
        start_time = time()
        train_loss = 0
        total_train_samples = 0
        total_val_samples = 0
        validation_loss = 0
        model.train()
        model.vision_model.training = True
      
        for idx, batch in enumerate(train_dataloader):
            input_ids = batch.pop("input_ids").to(device)
            embds = batch.pop("embds").to(device, torch.float32)
            attention_mask = batch.pop("attention_mask").to(device)
    
            # print(embds.size(), input_ids.size())
            #outputs = model(input_ids=input_ids, pixel_values=embds, labels=input_ids)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, pixel_values=embds, labels=input_ids)
    
            loss = outputs.loss
            # print("Loss:", loss.item())
        
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
    
            train_loss += loss.item()
            total_train_samples += len(input_ids)
        
        model.eval()
        validation_loss = 0
        total_val_samples = 0
        correct = 0
        total = 0
        
        with torch.no_grad():
            for idx, batch in enumerate(test_dataloader):
                input_ids = batch.pop("input_ids").to(device)
                embds = batch.pop("embds").to(device, torch.float32)
                attention_mask = batch.pop("attention_mask").to(device)
                # print(f"EMDBS SIZE: {embds.size()}")
                # original_text = batch.pop("original_text")
                model.vision_model.training = True
                # outputs = model(input_ids=input_ids, pixel_values=embds, labels=input_ids)
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, pixel_values=embds, labels=input_ids)
    
                loss = outputs.loss
                validation_loss += loss.item()
                total_val_samples += len(input_ids)
                model.vision_model.training = False
                # generated_ids = model.generate(pixel_values=embds, max_length=500)
                # generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)
                # original_text = processor.batch_decode(input_ids, skip_special_tokens=True)
                
                # for original, generated in zip(original_text, generated_caption):
                #     print("Original Text -->", original)
                #     print("\n")
                #     print("Generated Text -->", generated)
                #     print("\n")
                # print("\n")
    
         # Calcula a perda média de avaliação
        avg_eval_loss = validation_loss / total_val_samples
        print(f"Validation Loss (Epoch {epoch}): {avg_eval_loss}")
    
        # Salva o modelo se a perda de validação for a melhor até agora
        if avg_eval_loss < best_loss:
            best_loss = avg_eval_loss
            # torch.save({
            #     'epoch': epoch,
            #     'model_state_dict': model.state_dict(),
            #     'optimizer_state_dict': optimizer.state_dict(),
            #     'validation_loss': best_loss,
            # }, best_model_path)
            # print(f"New best model saved with validation loss {best_loss} at {best_model_path}")
            print(f"Saving best model with loss {best_loss}")
            model.save_pretrained(checkpoint_dir)  # Salva o modelo no diretório
            optimizer_state = {"optimizer": optimizer.state_dict(), "epoch": epoch}
            torch.save(optimizer_state, os.path.join(checkpoint_dir, "optimizer.pt"))  # Salva o otimizador
    
    
        end_time = time()  # End time of the epoch
        epoch_duration = end_time - start_time  # Duration of epoch
    
        print(f'Epoch [{epoch+1}/{epochs}], \
                Train Loss: {(train_loss/total_train_samples):.4f}, \
                Val Loss: {(validation_loss/total_val_samples):.4f}, \
                Elapsed Time: {epoch_duration:.2f} sec'
             )

Expanding inputs for image tokens in BLIP-2 should be done in processing. Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your BLIP-2 model. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.


Validation Loss (Epoch 0): 0.16933943393329778
Saving best model with loss 0.16933943393329778
Epoch [1/10],                 Train Loss: 0.3979,                 Val Loss: 0.1693,                 Elapsed Time: 51.67 sec
Validation Loss (Epoch 1): 0.13911467749211523
Saving best model with loss 0.13911467749211523
Epoch [2/10],                 Train Loss: 0.1645,                 Val Loss: 0.1391,                 Elapsed Time: 50.90 sec
Validation Loss (Epoch 2): 0.13747636042535305
Saving best model with loss 0.13747636042535305
Epoch [3/10],                 Train Loss: 0.1410,                 Val Loss: 0.1375,                 Elapsed Time: 51.06 sec
Validation Loss (Epoch 3): 0.12501623233159384
Saving best model with loss 0.12501623233159384
Epoch [4/10],                 Train Loss: 0.1296,                 Val Loss: 0.1250,                 Elapsed Time: 51.93 sec
Validation Loss (Epoch 4): 0.12398934033181933
Saving best model with loss 0.12398934033181933
Epoch [5/10],                

## Inference

Let's check the results on our train dataset

In [13]:
import re

def get_features(text):
    malig_values = ['Remote', 'Possible', 'Doubtful', 'Critical']
    # Expressão regular para extrair o diâmetro (valor numérico seguido de "mm")
    diameter_match = re.search(r'(\d+(\.\d+)?)\s*mm', text)
    
    # Extrair os valores encontrados
    if diameter_match:
        diameter = float(diameter_match.group(1))
      
    malignancy = [malig for malig in malig_values if malig.lower() in text.lower()][0]
    
    return malignancy, diameter

def clip_text(text):

    # Extrair a parte da string até "malignancy"
    pattern = r'(.*malignancy:\s[^,]+)'

    # Busca a parte que termina com "malignancy"
    match = re.search(pattern, text)
    if match:
        text_until_malignancy = match.group(1)
        #print(text_until_malignancy)
        return text_until_malignancy
    else:
        print("Nenhum valor até 'malignancy' foi encontrado.")
        return text

In [14]:
if is_evaluating:
    import os
    from bert_score import score
    from rouge_score import rouge_scorer

    malig_gt_labels = []
    malig_pred_labels = []
    #malig_map = {'Remote': 1, 'Possible': 2, 'Indeterminate': 3, 'Doubtful': 4, 'Critical': 5}
    malig_map = {'Remote': 1, 'Possible': 2, 'Doubtful': 3, 'Critical': 4}
    gt_diameters = []
    predicted_diameters = []

    device = "cuda" if torch.cuda.is_available() else "cpu"
    
    # Diretório para salvar o melhor modelo
    checkpoint_dir = "./checkpoints2"
    os.makedirs(checkpoint_dir, exist_ok=True)
    
    model = Blip2ForConditionalGeneration.from_pretrained(
        checkpoint_dir,
        device_map="auto",
        torch_dtype=torch.float32
    )
    
    adapted_vit3d = ViT3DAdapter().to(device)
    model.vision_model = adapted_vit3d
    # Colocar o modelo em modo de avaliação, se necessário
    
    model.eval()
    
    validation_loss = 0
    total_val_samples = 0
    correct = 0
    total = 0
    
    precisions = []
    recalls = []
    f1s = []

    rouge_1 = []
    rouge_2 = []
    rouge_l = []
    
    
    with torch.no_grad():
        for idx, batch in enumerate(test_dataloader):
            input_ids = batch.pop("input_ids").to(device)
            embds = batch.pop("embds").to(device, torch.float32)
            attention_mask = batch.pop("attention_mask").to(device)
            # original_text = batch.pop("original_text")
            model.vision_model.training = True
            # outputs = model(input_ids=input_ids, pixel_values=embds, labels=input_ids)
            outputs = model(input_ids=input_ids, pixel_values=embds, labels=input_ids)
    
            loss = outputs.loss
            validation_loss += loss.item()
            total_val_samples += len(input_ids)
            model.vision_model.training = False
            generated_ids = model.generate(pixel_values=embds, temperature=0.0, do_sample=False, max_length=100)
            generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)
            original_text = processor.batch_decode(input_ids, skip_special_tokens=True)
            
            for original, generated in zip(original_text, generated_caption):

                print("Original Text -->", original)
                print("\n")
                print("Generated Text -->",generated)
                print("\n")

                try:
                    malignancy, diameter = get_features(original)
                    malignancy_, diameter_ = get_features(generated)
                    
                    # original_value = original_features['malignancy']
                    # predicted_value = predicted_features['malignancy']
                                
                    malig_gt_labels.append(malig_map[malignancy])
                    malig_pred_labels.append(malig_map[malignancy_])
    
                    gt_diameters.append(diameter)
                    predicted_diameters.append(diameter_)
                    
                except:
                    print("Wrong legend...")
                    continue                
                
                # P, R, F1 = score([generated], [original], lang="en", verbose=True, device=device)
            
                # print("Precision:", P)
                # print("Recall:", R)
                # print("F1:", F1)
    
                # precisions.append(P)
                # recalls.append(R)
                # f1s.append(F1)

                # # Inicializa o calculador de ROUGE com ROUGE-S (skip-bigram)
                # scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
                
                # # Calcula as pontuações ROUGE-L, que inclui ROUGE-S (skip-bigram)
                # scores = scorer.score(original, generated)
                
                # # Exibe a pontuação ROUGE-S (que faz parte de ROUGE-L)
                # print("ROUGE-S (Skip-Bigram):")
                # print("Precisão:", scores['rougeL'].precision)
                # print("Abrangência:", scores['rougeL'].recall)
                # print("F1-Score:", scores['rougeL'].fmeasure)


                # Inicializa o calculador de ROUGE com as métricas desejadas
                scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
                
                # Calcula as pontuações ROUGE
                scores = scorer.score(original, generated)
                
                # Exibe as pontuações
                print("ROUGE-1:", scores['rouge1'])
                print("ROUGE-2:", scores['rouge2'])
                print("ROUGE-L:", scores['rougeL'])
            
                print("\n")

                rouge_1.append(scores['rouge1'])
                rouge_2.append(scores['rouge2'])
                rouge_l.append(scores['rougeL'])
                
    # print("Precision mean: ", torch.mean(precisions))
    # print("Recall mean: ", torch.mean(recalls))
    # print("F1 mean: ", torch.mean(f1s))

Loading checkpoint shards: 100%|██████████| 8/8 [00:04<00:00,  1.70it/s]


Original Text --> Obvious lung nodule, composed of soft tissue, without calcification, with an ovoid shape, poorly defined margin, medium lobulation, marked spiculation, solid texture, critical for malignancy, measuring 21.83mm in diameter.


Generated Text --> Obvious lung nodule with soft tissue internal structure, no calcification, ovoid shape, medium margin, medium lobulation, and medium spiculation. The texture is solid. The nodule is critical for malignancy and measures 22.9 mm in diameter.


ROUGE-1: Score(precision=0.5833333333333334, recall=0.7, fmeasure=0.6363636363636365)
ROUGE-2: Score(precision=0.2571428571428571, recall=0.3103448275862069, fmeasure=0.28125)
ROUGE-L: Score(precision=0.5277777777777778, recall=0.6333333333333333, fmeasure=0.5757575757575758)


Original Text --> An obvious lung nodule with soft tissue internal structure, solid calcification, and ovoid/round sphericity. The margin is sharp, with no lobulation or spiculation. The texture is solid. Given these 

In [15]:
import  numpy as np
print(f"Rouge 1 - Mean  {np.mean([f1.fmeasure for f1 in rouge_1])}")
print(f"Rouge 2 - Mean  {np.mean([f1.fmeasure for f1 in rouge_2])}")
print(f"Rouge L - Mean  {np.mean([f1.fmeasure for f1 in rouge_l])}")

# Definir os pesos para cada distância
#weights = {0: 1.0, 1: 0.75, 2: 0.5, 3: 0.25, 4:0.125}
weights = {0: 1.0, 1: 0.75, 2: 0.5, 3: 0.25}

# Calcular a acurácia ponderada
weighted_accuracy = sum([weights.get(abs(true - pred), 0) for true, pred in zip(malig_gt_labels, malig_pred_labels)]) / len(malig_gt_labels)

print("Acurácia Ponderada:", weighted_accuracy)


Rouge 1 - Mean  0.6854513273071048
Rouge 2 - Mean  0.4046208218766064
Rouge L - Mean  0.6168964603543653
Acurácia Ponderada: 0.7764084507042254


In [16]:
from sklearn.metrics import classification_report
import pandas as pd


# Rótulos personalizados para as classes
target_names = ['Remote', 'Possible', 'Doubtful', 'Critical']

# Gera o relatório de classificação com rótulos personalizados
report = classification_report(malig_gt_labels, malig_pred_labels, target_names=target_names, output_dict=True)

# Converte para um DataFrame e renomeia as colunas
report_df = pd.DataFrame(report).transpose()
report_df.columns = ['Precisão', 'Revocação', 'F1-Score', 'Suporte']

print(report_df)

              Precisão  Revocação  F1-Score     Suporte
Remote        0.000000   0.000000  0.000000   34.000000
Possible      0.466667   0.636364  0.538462   44.000000
Doubtful      0.142857   0.027778  0.046512   36.000000
Critical      0.360000   0.964286  0.524272   28.000000
accuracy      0.394366   0.394366  0.394366    0.394366
macro avg     0.242381   0.407107  0.277311  142.000000
weighted avg  0.251804   0.394366  0.282016  142.000000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [17]:
import math

def rmse(y_true, y_pred):
    
    # Passo 1: Calcular o erro quadrado para cada par de valores
    squared_errors = [(y - y_hat) ** 2 for y, y_hat in zip(y_true, y_pred)]
    
    # Passo 2: Calcular a média dos erros quadrados (MSE)
    mse = sum(squared_errors) / len(y_true)
    
    # Passo 3: Tirar a raiz quadrada do MSE para obter o RMSE
    rmse = math.sqrt(mse)
    
    return rmse


In [18]:
# Calcula o RMSE médio
resultado = rmse(gt_diameters, predicted_diameters)
print(f"RMSE: {resultado} mm")


RMSE: 2.91852378922812 mm


In [19]:

for idx, (gt, pre) in enumerate(zip(gt_diameters, predicted_diameters)):
    print(f"{idx+1}:           \tGt: {gt:.2f},           \tPred: {pre:.2f},          \t Error: {abs(gt-pre):.2f} mm")

1:           	Gt: 21.83,           	Pred: 22.90,          	 Error: 1.07 mm
2:           	Gt: 5.97,           	Pred: 5.90,          	 Error: 0.07 mm
3:           	Gt: 3.81,           	Pred: 5.90,          	 Error: 2.09 mm
4:           	Gt: 4.73,           	Pred: 8.90,          	 Error: 4.17 mm
5:           	Gt: 12.55,           	Pred: 12.90,          	 Error: 0.35 mm
6:           	Gt: 10.07,           	Pred: 14.90,          	 Error: 4.83 mm
7:           	Gt: 13.05,           	Pred: 7.90,          	 Error: 5.15 mm
8:           	Gt: 12.80,           	Pred: 14.90,          	 Error: 2.10 mm
9:           	Gt: 13.05,           	Pred: 12.90,          	 Error: 0.15 mm
10:           	Gt: 7.02,           	Pred: 5.90,          	 Error: 1.12 mm
11:           	Gt: 4.70,           	Pred: 5.90,          	 Error: 1.20 mm
12:           	Gt: 25.42,           	Pred: 17.90,          	 Error: 7.52 mm
13:           	Gt: 4.90,           	Pred: 5.90,          	 Error: 1.00 mm
14:           	Gt: 21.34,         