In [None]:
  !pip install datasets



In [None]:
!pip install peft



In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from PIL import Image
from transformers import ViTFeatureExtractor, VisionEncoderDecoderModel, AutoProcessor, ViTImageProcessor

from datasets import load_dataset
from peft import LoraConfig, get_peft_model

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Fetch the H&M fashion caption dataset from the Hugging Face Hub by its repo id.
# Later cells index the result with 'train' and read the "image" and "text"
# fields of each sample.
dataset = load_dataset("tomytjandra/h-and-m-fashion-caption-12k")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
from torch.utils.data import Dataset
from torchvision import transforms

class ImageCaptioningDataset(Dataset):
    """Map-style dataset that yields ``(image, caption)`` pairs.

    Wraps any indexable collection whose items are mappings with an ``"image"``
    entry (a ``PIL.Image.Image``) and a ``"text"`` entry (the caption).

    Args:
        dataset: Collection supporting ``len()`` and integer indexing.
        transform: Optional callable applied to the PIL image before it is
            returned (for example a ``torchvision.transforms.Compose``).
    """

    def __init__(self, dataset, transform=None):
        self.dataset = dataset
        self.transform = transform

    def __len__(self):
        """Return the number of samples in the wrapped collection."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the (optionally transformed) image and its caption.

        Raises:
            TypeError: If the sample's ``"image"`` entry is not a PIL image.
        """
        sample = self.dataset[idx]
        image = sample["image"]

        # Fail loudly here instead of returning None: a None item only surfaces
        # later as an obscure collate error inside the DataLoader, far away
        # from the sample that actually caused it.
        if not isinstance(image, Image.Image):
            raise TypeError(
                f"Sample {idx}: expected a PIL.Image.Image under 'image', "
                f"got {type(image).__name__}."
            )

        if self.transform is not None:
            image = self.transform(image)

        return image, sample["text"]


In [None]:
from datasets import Dataset, DatasetDict

# Fixed seed so the train/validation partition is the same on every run
seed = 42

# `dataset` is assumed to be the DatasetDict loaded earlier; despite its name,
# `dataset_dict` holds just its 'train' split
dataset_dict = dataset['train']

# Hold out 20% of the samples for validation
dataset_split = dataset_dict.train_test_split(seed=seed, test_size=0.2)
train_data, val_data = dataset_split['train'], dataset_split['test']

# Report how many samples landed in each partition
print(f"Training data size: {len(train_data)}")
print(f"Validation data size: {len(val_data)}")


Training data size: 9949
Validation data size: 2488


In [None]:
# Training preprocessing: resize to the resolution used throughout this notebook,
# convert to a float tensor, then apply a random horizontal flip as augmentation.
# NOTE(review): no mean/std normalisation is applied here; the pretrained BLIP
# checkpoint ships its own image processor -- confirm this preprocessing is intended.
train_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.RandomHorizontalFlip(),
])

# Validation preprocessing must be deterministic: random augmentation here would
# make validation loss and metrics change from run to run.
eval_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

# Kept so that any code still referring to `transform` sees the training pipeline.
transform = train_transform

train_dataset = ImageCaptioningDataset(train_data, transform=train_transform)
val_dataset = ImageCaptioningDataset(val_data, transform=eval_transform)

# Shuffle only the training data; validation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=30, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=30, shuffle=False)


In [None]:
from transformers import pipeline, AutoTokenizer, get_linear_schedule_with_warmup
import torch
from torch.utils.data import DataLoader

# Assuming device is defined somewhere above
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained captioning model through the image-to-text pipeline.
# Only `pipe.model` is used for training below.
pipe = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
pipe.model.to(device)

# Tokenizer matching the checkpoint
tokenizer = AutoTokenizer.from_pretrained("Salesforce/blip-image-captioning-base")

# Token-level cross-entropy; padded target positions do not contribute
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

optimizer = torch.optim.Adam(pipe.model.parameters(), lr=5e-5)

# Training length. `total_steps` is informational only: the StepLR scheduler
# below is stepped once per epoch, not once per batch.
epochs = 5
total_steps = len(train_loader) * epochs
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.1)

# Early stopping criteria
best_val_loss = float("inf")
patience = 3
patience_counter = 0


def _caption_batch_loss(images, texts):
    """Return the next-token cross-entropy loss for one (images, captions) batch.

    The captions are tokenised and fed to the model as decoder input. The
    targets are the same token ids shifted one position to the left, so the
    logits at position t are scored against token t+1. The final position has
    no successor and is set to the pad id, which `loss_fn` ignores.

    Args:
        images: Batch of image tensors as produced by the data loader.
        texts: Sequence of caption strings for the batch.

    Returns:
        Scalar loss tensor (attached to the graph when gradients are enabled).
    """
    images = images.to(device)

    tokenized = tokenizer(list(texts), padding=True, truncation=True, return_tensors="pt")
    input_ids = tokenized.input_ids.to(device)
    attention_mask = tokenized.attention_mask.to(device)

    labels = input_ids.clone()
    labels[:, :-1] = input_ids[:, 1:]
    labels[:, -1] = tokenizer.pad_token_id

    outputs = pipe.model(pixel_values=images, input_ids=input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    return loss_fn(logits.view(-1, logits.size(-1)), labels.view(-1))


# Training loop
for epoch in range(epochs):
    pipe.model.train()
    total_loss = 0.0
    for batch_idx, (images, texts) in enumerate(train_loader):
        loss = _caption_batch_loss(images, texts)
        total_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch_idx % 10 == 0:
            print(f"Epoch [{epoch+1}/{epochs}], Batch [{batch_idx}/{len(train_loader)}], Loss: {loss.item():.4f}")

    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{epochs}, Average Training Loss: {avg_train_loss:.4f}")

    # Validation loop (no gradient tracking, dropout disabled via eval mode)
    pipe.model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for images, texts in val_loader:
            val_loss += _caption_batch_loss(images, texts).item()

    avg_val_loss = val_loss / len(val_loader)
    print(f"Epoch {epoch+1}/{epochs}, Average Validation Loss: {avg_val_loss:.4f}")

    # Early stopping check: stop once validation loss has failed to improve
    # for `patience` consecutive epochs.
    # NOTE(review): the best weights are not checkpointed, so the model left in
    # memory is the one from the last executed epoch.
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered")
            break

    # Step the scheduler (once per epoch)
    scheduler.step()

  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Epoch [1/5], Batch [0/332], Loss: 5.6277
Epoch [1/5], Batch [10/332], Loss: 2.7620
Epoch [1/5], Batch [20/332], Loss: 2.1261
Epoch [1/5], Batch [30/332], Loss: 1.9173
Epoch [1/5], Batch [40/332], Loss: 1.7266
Epoch [1/5], Batch [50/332], Loss: 1.6254
Epoch [1/5], Batch [60/332], Loss: 1.3636
Epoch [1/5], Batch [70/332], Loss: 1.5087
Epoch [1/5], Batch [80/332], Loss: 1.3350
Epoch [1/5], Batch [90/332], Loss: 1.4003
Epoch [1/5], Batch [100/332], Loss: 1.2501
Epoch [1/5], Batch [110/332], Loss: 1.3001
Epoch [1/5], Batch [120/332], Loss: 1.2976
Epoch [1/5], Batch [130/332], Loss: 1.3262
Epoch [1/5], Batch [140/332], Loss: 1.1560
Epoch [1/5], Batch [150/332], Loss: 1.2324
Epoch [1/5], Batch [160/332], Loss: 1.1827
Epoch [1/5], Batch [170/332], Loss: 1.2151
Epoch [1/5], Batch [180/332], Loss: 1.1972
Epoch [1/5], Batch [190/332], Loss: 1.2164
Epoch [1/5], Batch [200/332], Loss: 1.0743
Epoch [1/5], Batch [210/332], Loss: 1.1509
Epoch [1/5], Batch [220/332], Loss: 1.0999
Epoch [1/5], Batch [23

In [None]:
# Persist the model and the tokenizer into two separate directories.
# The training loop does not restore a best checkpoint, so what is written here
# is whatever state the model is in when this cell runs.
# NOTE(review): the target paths are under /content/drive, but no Drive mount
# call is visible in this notebook -- confirm it is mounted before running.
pipe.model.save_pretrained("/content/drive/MyDrive/Project_4/pipe")
tokenizer.save_pretrained("/content/drive/MyDrive/Project_4/token")


('/content/drive/MyDrive/Project_4/token/tokenizer_config.json',
 '/content/drive/MyDrive/Project_4/token/special_tokens_map.json',
 '/content/drive/MyDrive/Project_4/token/vocab.txt',
 '/content/drive/MyDrive/Project_4/token/added_tokens.json',
 '/content/drive/MyDrive/Project_4/token/tokenizer.json')

In [None]:
# Echo the model as the cell's last expression so the notebook displays its module tree.
pipe.model

BlipForConditionalGeneration(
  (vision_model): BlipVisionModel(
    (embeddings): BlipVisionEmbeddings(
      (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (encoder): BlipEncoder(
      (layers): ModuleList(
        (0-11): 12 x BlipEncoderLayer(
          (self_attn): BlipAttention(
            (dropout): Dropout(p=0.0, inplace=False)
            (qkv): Linear(in_features=768, out_features=2304, bias=True)
            (projection): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): BlipMLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
    (post_layernorm): LayerNorm((768,), eps=1e-0

In [None]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=0759b4d1bffb3c3c9ade42745d9e579a4cda16c390392ed7ec0bc0aec34cca2a
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [None]:
from datasets import load_metric
# Define the evaluation metrics.
# Each call loads one metric object by name; `compute_metrics` below calls
# `.compute(...)` on all three.
bleu = load_metric("bleu")
meteor = load_metric("meteor")
rouge = load_metric("rouge")

def compute_metrics(predictions, references):
    """Score caption predictions with BLEU, METEOR and ROUGE.

    The three metrics take different input shapes, so the inputs are adapted
    per metric instead of being passed through unchanged:

    * ``bleu`` works on tokens: each prediction becomes a list of tokens and
      each sample's references become a list of token lists. Passing raw
      strings raises ``ValueError: Got a string but expected a list instead``.
    * ``meteor`` and ``rouge`` work on plain strings with a single reference
      per prediction, so only the first reference of each sample is used.

    Tokenisation for BLEU is a plain whitespace split.

    Args:
        predictions: List of predicted caption strings.
        references: One entry per prediction, either a single reference string
            or a list of reference strings.

    Returns:
        Dict with keys ``"bleu"``, ``"meteor"`` and ``"rouge"`` holding each
        metric's result.

    Raises:
        ValueError: If the number of predictions and references differ, or a
            sample has no reference.
    """
    # Normalise to "list of reference strings per sample".
    reference_lists = [[ref] if isinstance(ref, str) else list(ref) for ref in references]

    if len(predictions) != len(reference_lists):
        raise ValueError(
            f"Got {len(predictions)} predictions but {len(reference_lists)} references."
        )
    if any(not refs for refs in reference_lists):
        raise ValueError("Every prediction needs at least one reference.")

    bleu_predictions = [prediction.split() for prediction in predictions]
    bleu_references = [[ref.split() for ref in refs] for refs in reference_lists]
    first_references = [refs[0] for refs in reference_lists]

    results = {
        "bleu": bleu.compute(predictions=bleu_predictions, references=bleu_references),
        "meteor": meteor.compute(predictions=predictions, references=first_references),
        "rouge": rouge.compute(predictions=predictions, references=first_references),
    }
    return results


# Generate predictions and references and compute validation loss
pipe.model.eval()
total_val_loss = 0.0
predictions = []
references = []
val_predictions = []

with torch.no_grad():
    for images, texts in val_loader:
        images = images.to(device)

        # Tokenize texts
        tokenized_texts = tokenizer(list(texts), padding=True, truncation=True, return_tensors="pt")
        input_ids = tokenized_texts.input_ids.to(device)
        attention_mask = tokenized_texts.attention_mask.to(device)

        # Targets are the inputs shifted one step left; the last position has no
        # successor and is set to the pad id so the loss ignores it.
        labels = input_ids.clone()
        labels[:, :-1] = input_ids[:, 1:]
        labels[:, -1] = tokenizer.pad_token_id

        # Forward pass
        outputs = pipe.model(pixel_values=images, input_ids=input_ids, attention_mask=attention_mask)

        # Compute the loss
        logits = outputs.logits
        loss = loss_fn(logits.view(-1, logits.size(-1)), labels.view(-1))
        total_val_loss += loss.item()

        # Collect predictions. Positions whose target is padding carry no
        # meaningful prediction; decoding them appends garbage tokens after the
        # caption, so overwrite them with the pad id, which
        # `skip_special_tokens=True` then drops.
        # NOTE(review): these are teacher-forced predictions (the model sees the
        # true previous tokens). For free-running captions the model's
        # `generate` method would be the usual choice -- confirm which is wanted.
        pred_ids = torch.argmax(logits, dim=-1)
        pred_ids = pred_ids.masked_fill(labels == tokenizer.pad_token_id, tokenizer.pad_token_id)
        pred_texts = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
        val_predictions.extend(zip(texts, pred_texts))

        # For metric calculation
        predictions.extend(pred_texts)
        references.extend([[text] for text in texts])  # BLEU expects a list of references for each prediction

avg_val_loss = total_val_loss / len(val_loader)
print(f"Validation Loss: {avg_val_loss:.4f}")

# Print a few validation examples
for i, (true_text, pred_text) in enumerate(val_predictions[:5]):  # Display first 5 examples
    print(f"Example {i+1}")
    print(f"True: {true_text}")
    print(f"Pred: {pred_text}")

# Compute and print the evaluation metrics
metrics = compute_metrics(predictions, references)
print(metrics)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Validation Loss: 0.9034
Example 1
True: solid black trousers in a crepe weave with a low ribbed waistband and straight wide legs
Pred: solid black trousers in a softepe weave with a high waistbed waistband and wide wide legs with wide jerseys jersey - - - - jerseysrrsesrersseserssrerserssrsrererersseer
Example 2
True: solid black jumper in a soft fine-knit viscose blend with a deep v-neck long sleeves and ribbing around the neckline cuffs and hem
Pred: solid black jumper in a soft fine knit knit viscose blend with a v v - neck long sleeves and ribbing around the neckline cuffs and hem rerer r knit r fibre r lining fibre rer rer longer lining r longer r longerie r r rer r
Example 3
True: solid green ankle-length kaftan in a crepe weave with flounces down the sides and a v-neck with a wide flounced trim rounded hem with a flounced trim and short slits in the sides slightly shorter at the front
Pred: solid green short - length kaftan in a crepe weave with aounces down the front and a v - 

ValueError: Got a string but expected a list instead: 'solid black trousers in a softepe weave with a high waistbed waistband and wide wide legs with wide jerseys jersey - - - - jerseysrrsesrersseserssrerserssrsrererersseer'