In [1]:
import gc

import evaluate
import mlflow
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset, DatasetDict
from PIL import Image
from transformers import (
    BlipProcessor,
    BlipForConditionalGeneration,
    BlipConfig,
    AutoTokenizer,
    DataCollator,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments
)



# Pick the accelerator once: CUDA when PyTorch can see a GPU, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device.upper())

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
W1225 20:08:42.341000 73652 torch/distributed/elastic/multiprocessing/redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.
  _torch_pytree._register_pytree_node(


In [2]:
# Hub id of the pretrained BLIP captioning checkpoint used as the starting point.
checkpoint = "Salesforce/blip-image-captioning-base"

# Raw clothing dataset from the Hugging Face Hub (downloaded and cached on first use).
ds = load_dataset(path="deadprogram/clothes-with-class")

DatasetDict({
    train: Dataset({
        features: ['image', 'text'],
        num_rows: 62928
    })
    validation: Dataset({
        features: ['image', 'text'],
        num_rows: 15732
    })
    test: Dataset({
        features: ['image', 'text'],
        num_rows: 8740
    })
})

In [3]:
# Split configuration, kept in one place so both splits stay reproducible.
SPLIT_SEED = 42
TEST_FRACTION = 0.1   # share of the source data held out for testing
VAL_FRACTION = 0.2    # share of the remaining 90% used for validation

# Original dataset: ds["train"] holds 87,400 samples
# (62,928 train + 15,732 validation + 8,740 test after the splits below).
# Keep only the image and its caption, with the caption column named "text".
dataset = (
    ds
    .rename_column("description", "text")
    .select_columns(["image", "text"])
)

# first, split off the test set (10%)
split1 = dataset["train"].train_test_split(test_size=TEST_FRACTION, seed=SPLIT_SEED)
train_val_dataset = split1['train']  # 90%
test_dataset = split1['test']        # 10%

# then, split train_val into train & validation (80/20 of the 90%)
split2 = train_val_dataset.train_test_split(test_size=VAL_FRACTION, seed=SPLIT_SEED)
train_dataset = split2['train']       # 72%
val_dataset = split2['test']          # 18%

# The three splits must partition the source data: nothing lost, nothing duplicated.
assert (
    len(train_dataset) + len(val_dataset) + len(test_dataset)
    == len(dataset["train"])
), "train/validation/test sizes do not add up to the source dataset"

cloth_dataset = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset
})



BlipConfig {
  "architectures": [
    "BlipForConditionalGeneration"
  ],
  "image_text_hidden_size": 256,
  "initializer_factor": 1.0,
  "initializer_range": 0.02,
  "logit_scale_init_value": 2.6592,
  "model_type": "blip",
  "projection_dim": 512,
  "text_config": {
    "initializer_factor": 1.0,
    "model_type": "blip_text_model",
    "num_attention_heads": 12
  },
  "torch_dtype": "float32",
  "transformers_version": "4.35.2",
  "vision_config": {
    "dropout": 0.0,
    "initializer_factor": 1.0,
    "initializer_range": 0.02,
    "model_type": "blip_vision_model",
    "num_channels": 3
  }
}



In [4]:
# Load the processor from the same `checkpoint` the model is loaded from, so the
# tokenizer / image preprocessing can never drift from the model weights.
processor = BlipProcessor.from_pretrained(checkpoint)

# Resize inputs to 192x192 before they reach the vision encoder.
# NOTE(review): presumably chosen to cut memory/compute versus the checkpoint's
# default resolution -- confirm the accuracy trade-off is acceptable.
processor.image_processor.size = {"height": 192, "width": 192}

class BlipDataCollator:
    """Collate ``{"image", "text"}`` records into one BLIP training batch.

    The wrapped processor encodes the images and tokenizes the captions
    (padded/truncated to 48 tokens). The labels are a copy of the token ids
    in which every padding position is replaced by -100 so that it is
    ignored by the loss.
    """

    def __init__(self, processor):
        """Keep the processor and remember its tokenizer's padding token id."""
        self.processor = processor
        self.pad_token_id = processor.tokenizer.pad_token_id

    def __call__(self, batch):
        """Build the model inputs for a list of dataset records.

        Args:
            batch: sequence of mappings with an ``"image"`` entry (a PIL image,
                or array data accepted by ``Image.fromarray``) and a ``"text"``
                entry (the caption).

        Returns:
            dict with ``pixel_values``, ``input_ids``, ``attention_mask`` and
            ``labels`` taken from / derived from the processor output.
        """
        images = []
        for record in batch:
            picture = record["image"]
            # Anything that is not already a PIL image is converted first.
            if not isinstance(picture, Image.Image):
                picture = Image.fromarray(picture)
            images.append(picture)
        texts = [record["text"] for record in batch]

        encoding = self.processor(
            images=images,
            text=texts,
            padding="max_length",
            truncation=True,
            max_length=48,
            return_tensors="pt",
        )

        token_ids = encoding.input_ids
        # New tensor: padding positions become -100 (ignored in the loss),
        # while the input ids themselves are left untouched.
        labels = token_ids.masked_fill(token_ids == self.pad_token_id, -100)

        return dict(
            pixel_values=encoding.pixel_values,
            input_ids=token_ids,
            attention_mask=encoding.attention_mask,
            labels=labels,
        )


  if not hasattr(np, "object"):


In [5]:
# Caption-quality metrics used by compute_metrics, loaded once up front.
# NOTE(review): the recorded output of this cell shows the NLTK resource
# downloads (wordnet, punkt_tab, omw-1.4) failing with an SSL certificate
# error; METEOR presumably needs them at compute time -- confirm they are
# available locally before relying on that score.
bleu, meteor, rouge = (
    evaluate.load(metric_name) for metric_name in ("bleu", "meteor", "rouge")
)

def compute_metrics(eval_preds):
    """Score generated captions against the reference captions.

    Args:
        eval_preds: pair ``(predictions, labels)`` of token-id arrays as handed
            over by the trainer. ``predictions`` may itself be a tuple, in
            which case its first element holds the generated token ids.

    Returns:
        dict with the corpus-level ``"bleu"``, ``"meteor"`` and ``"rougeL"``
        scores.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = processor.tokenizer.pad_token_id

    # -100 is not a real token id. Labels carry it for ignored positions, and
    # generated sequences can be padded with it when batches of different
    # length are gathered. Swap it for the pad token in BOTH arrays, otherwise
    # decoding fails on the negative ids.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = [
        text.strip()
        for text in processor.tokenizer.batch_decode(preds, skip_special_tokens=True)
    ]
    decoded_labels = [
        text.strip()
        for text in processor.tokenizer.batch_decode(labels, skip_special_tokens=True)
    ]

    return {
        "bleu": bleu.compute(
            predictions=decoded_preds,
            references=decoded_labels
        )["bleu"],
        "meteor": meteor.compute(
            predictions=decoded_preds,
            references=decoded_labels
        )["meteor"],
        "rougeL": rouge.compute(
            predictions=decoded_preds,
            references=decoded_labels
        )["rougeL"],
    }


Downloading builder script: 5.94kB [00:00, 4.54MB/s]
Downloading extra modules: 4.07kB [00:00, 6.09MB/s]                   
Downloading extra modules: 3.34kB [00:00, 5.51MB/s]
Downloading builder script: 7.02kB [00:00, 3.85MB/s]
[nltk_data] Error loading wordnet: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1032)>
[nltk_data] Error loading punkt_tab: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1032)>
[nltk_data] Error loading omw-1.4: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1032)>
Downloading builder script: 6.14kB [00:00, 6.66MB/s]


In [6]:
# Trainer configuration for fine-tuning.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    # eval_steps only takes effect when the evaluation strategy is "steps";
    # with the default ("no") the validation set and compute_metrics are never
    # used during training. The argument is named `evaluation_strategy` in the
    # transformers 4.35.2 release shown in this notebook's config output
    # (newer releases call it `eval_strategy`).
    # NOTE(review): each evaluation generates captions for the whole validation
    # split with beam search -- consider a smaller eval subset if too slow.
    evaluation_strategy="steps",
    eval_steps=500,
    save_total_limit=2,
    logging_steps=100,
    save_strategy="steps",
    # Keep the raw "image"/"text" columns: the data collator reads them directly.
    remove_unused_columns=False,
    gradient_accumulation_steps=1,
    # Evaluate with generate() so compute_metrics receives generated token ids.
    predict_with_generate=True,
    generation_max_length=48,
    generation_num_beams=3,
)

In [8]:
# Fine-tune BLIP on the clothing captions and track the run in MLflow.
mlflow.set_experiment("cloth-finetune-first-experiment")
print("\nTraining model and logging with MLflow...")
with mlflow.start_run():
    model = BlipForConditionalGeneration.from_pretrained(checkpoint)
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=cloth_dataset["train"],
        eval_dataset=cloth_dataset["validation"],
        data_collator=BlipDataCollator(processor),
        compute_metrics=compute_metrics
    )
    mlflow.log_params({
        "learning_rate": training_args.learning_rate,
        "train_batch_size": training_args.per_device_train_batch_size,
        "eval_batch_size": training_args.per_device_eval_batch_size,
        "num_train_epochs": training_args.num_train_epochs,
        "eval_steps" : training_args.eval_steps,
        "save_total_limit" : training_args.save_total_limit,
    })

    train_results = trainer.train()
    trainer.save_model()
    trainer.log_metrics("train", train_results.metrics)
    trainer.save_metrics("train", train_results.metrics)
    trainer.save_state()

    # The caption metrics come from compute_metrics, which only runs during
    # evaluation; train_results.metrics never contains them. Evaluate on the
    # validation split and read them from there (the trainer prefixes the
    # keys with "eval_").
    eval_metrics = trainer.evaluate()
    trainer.log_metrics("eval", eval_metrics)
    trainer.save_metrics("eval", eval_metrics)

    # Log custom metrics
    for k in ["bleu", "meteor", "rougeL"]:
        eval_key = f"eval_{k}"
        if eval_key in eval_metrics:
            mlflow.log_metric(k, eval_metrics[eval_key])

    # Log the model AFTER training so the artifact holds the fine-tuned weights
    # rather than the freshly loaded checkpoint. mlflow.transformers documents
    # a Pipeline or a dict of pipeline components as the accepted input, so the
    # model is passed together with its tokenizer and image processor.
    mlflow.transformers.log_model(
        transformers_model={
            "model": trainer.model,
            "tokenizer": processor.tokenizer,
            "image_processor": processor.image_processor,
        },
        artifact_path="BlipForConditionalGeneration",
        task="image-to-text",
    )

    

  0%|          | 1/94392 [00:09<255:29:57,  9.74s/it]

KeyboardInterrupt: 

In [None]:
# The trainer records everything it logs (loss, learning rate, eval metrics, ...)
# in trainer.state.log_history; load that history into a frame for inspection.
log_history = trainer.state.log_history

df = pd.DataFrame(log_history)
df.head()
