In [1]:
%%bash
pip install -q transformers[torch]
pip install -q evaluate
pip install -q wandb
pip install -q datasets==2.14.5
pip install -q huggingface-hub
pip install -q deepspeed
pip install -q loguru
pip install -q Pillow
pip install -q rouge_score

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
cuml 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
dask-cudf 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
cudf 23.8.0 requires pandas<1.6.0dev0,>=1.3, but you have pandas 2.0.3 which is incompatible.
cudf 23.8.0 requires protobuf<5,>=4.21, but you have protobuf 3.20.3 which is incompatible.
cuml 23.8.0 requires dask==2023.7.1, but you have dask 2023.11.0 which is incompatible.
cuml 23.8.0 requires distributed==2023.7.1, but you have distributed 2023.11.0 which is incompatible.
dask-cuda 23.8.0 requires dask==2023.7.1, but you have dask 2023.11.0 which is incompatible.
dask-cuda 23.8.0 requires distributed==2023.7.1, but you have distributed 2023.11.0 which is incompatible.
dask-cuda 23.8.0 requires pandas<1.6.0

In [2]:
import os
import json
import datasets
import torch
import torch.nn as nn
import torch.nn.functional as F
import wandb
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from torchvision import io, transforms
from torch.utils.data import Dataset, DataLoader, random_split
from PIL import Image
from transformers import (Seq2SeqTrainer,Seq2SeqTrainingArguments,
                          AutoTokenizer,GPT2Config, default_data_collator,
                          VisionEncoderDecoderModel , ViTFeatureExtractor)
from huggingface_hub import login



In [3]:
content_file = open("/kaggle/input/flickr30k/captions.txt").readlines()

In [4]:
content_file.pop(0)

'image,caption\n'

In [5]:
def remove_special_character(text):
    text = text.replace("'", "")
    text = text.replace('"', '')
    return text

In [6]:
for i in tqdm(range(len(content_file))):
    content_file[i] = remove_special_character(content_file[i])

100%|██████████| 158915/158915 [00:00<00:00, 891830.18it/s]


In [7]:
data = [tuple(file.split(', ', 1)) for file in content_file]

In [8]:
df = pd.DataFrame(data, columns=['image', 'caption'])

In [9]:
df = df.dropna()

In [10]:
df = df.drop_duplicates(subset=['image'])

In [11]:
train_df = df.iloc[: int(0.8 * len(df))]
valid_df = df.iloc[int(0.8 * len(df)) : int(0.8 * len(df)) + int(0.1 * len(df))]
test_df = df.iloc[int(0.8 * len(df)) + int(0.1 * len(df)) : ]

In [12]:
os.environ["HUGGINGFACE_TOKEN"] = "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"
os.environ["WANDB_KEY"] = "45883d116d879df59569bd98b2cffb64bc20c0c6"
os.environ["WANDB_PROJECT"] = "image-captioning"

In [13]:
class DatasetArguments:
    def __init__(self, kwargs) -> None:
        for key, value in kwargs.items():
            setattr(self, key, value)

In [14]:
transforms = transforms.Compose(
    [
        transforms.Resize((224,224)), 
        transforms.ToTensor(),
   ]
)

In [15]:
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
    outputs = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
    return outputs

AutoTokenizer.build_inputs_with_special_tokens = build_inputs_with_special_tokens

In [16]:
feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.unk_token

Downloading (…)rocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]



Downloading config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [17]:
train_dataset_arguments = DatasetArguments({
    "data_directory": "/kaggle/input/flickr30k/Images",
    "mapping_df": train_df,
    "max_length":82,
    "feature_extractor":feature_extractor,
    "tokenizer":tokenizer,
    "transform":None
})

In [18]:
valid_dataset_arguments = DatasetArguments({
    "data_directory": "/kaggle/input/flickr30k/Images",
    "mapping_df": valid_df,
    "max_length":82,
    "feature_extractor":feature_extractor,
    "tokenizer":tokenizer,
    "transform":None
})

In [19]:
class ImageDataset(Dataset):
    def __init__(self, arguments: DatasetArguments):
        self.arguments = arguments
        self.df = self.arguments.mapping_df
        self.feature_extractor = self.arguments.feature_extractor
        self.tokenizer = self.arguments.tokenizer
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        caption = self.df.iloc[index]['caption']
        image_path = os.path.join(self.arguments.data_directory, 
                                  self.df.iloc[index]['image'])
        image = Image.open(image_path).convert("RGB")
        
        if self.arguments.transform is not None:
            image = self.arguments.transform(image)
            
        pixel_values = self.feature_extractor(image, return_tensors="pt").pixel_values
        tokenized_caption = self.tokenizer(caption,
                                 padding='max_length',
                                 max_length=self.arguments.max_length,
                                truncation=True).input_ids
        tokenized_caption = [caption if caption != self.tokenizer.pad_token_id else -100 for caption in tokenized_caption]
        encoding = {"pixel_values": pixel_values.squeeze(), "labels": torch.tensor(tokenized_caption)}
        return encoding

In [20]:
train_dataset = ImageDataset(train_dataset_arguments)

In [21]:
valid_dataset = ImageDataset(valid_dataset_arguments)

In [22]:
model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained("google/vit-base-patch16-224", 
                                                                  "gpt2")

Downloading config.json:   0%|          | 0.00/69.7k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.weight', 'vit.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.7.ln_cross_attn.bias', 'h.11.crossattention.c_attn.bias', 'h.11.crossattention.c_attn.weight', 'h.4.crossattention.q_attn.bias', 'h.6.ln_cross_attn.weight', 'h.1.crossattention.c_attn.bias', 'h.3.crossattention.c_attn.weight', 'h.10.crossattention.c_attn.weight', 'h.2.crossattention.c_attn.weight', 'h.7.crossattention.c_proj.weight', 'h.10.crossattention.q_attn.bias', 'h.1.crossattention.c_proj.weight', 'h.6.crossattention.c_proj.bias', 'h.4.ln_cross_attn.weight', 'h.8.ln_cross_attn.bias', 'h.7.crossattention.c_attn.bias', 'h.10.crossattention.c_proj.weight', 'h.3.crossattention.c_proj.weight', 'h.4.ln_cross_attn.bias', 'h.10.crossattention.q_attn.weight', 'h.6.ln_cross_attn.bias', 'h.6.crossattention.c_attn.bias', 'h.0.ln_cross_attn.bias', 'h.7.crossattention.c_attn.weight', 'h.4.crossattention.c_attn.bias', 'h.8.crossattention.q_attn.weight', 'h.1.crossattention.q_att

Downloading generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [23]:
model.config.pad_token_id = tokenizer.pad_token_id
# make sure vocab size is set correctly
model.config.vocab_size = model.config.decoder.vocab_size
# set beam search parameters
model.config.eos_token_id = tokenizer.sep_token_id
model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.max_length = 50
model.config.early_stopping = True
model.config.no_repeat_ngram_size = 3
model.config.length_penalty = 2.0
model.config.num_beams = 4

In [24]:
training_args = Seq2SeqTrainingArguments(
    output_dir='sport-image-captioning',
    per_device_train_batch_size=16,
    predict_with_generate=True,
    evaluation_strategy="epoch",
    do_train=True,
    logging_steps=10,  
    save_steps=200, 
    warmup_steps=500,  
    learning_rate = 2e-5,
    num_train_epochs = 5,
    save_total_limit=1,
    report_to="wandb",
    push_to_hub=True
)

In [25]:
wandb.login(key="45883d116d879df59569bd98b2cffb64bc20c0c6")
wandb.init(entity="9h53-sportivefy", name="vit-gpt2")
login("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", add_to_git_credential=True)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mhungsvdut[0m ([33m9h53-sportivefy[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.16.1 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.16.0
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20231219_121731-0ev0uzax[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mvit-gpt2[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/9h53-sportivefy/image-captioning[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/9h53-sportivefy/image-captioning/runs/0ev0uzax[0m


Token is valid (permission: write).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential helper as default.

git config --global credential.helper store

Read https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more details.[0m
Token has not been saved to git credential helper.
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [26]:
trainer = Seq2SeqTrainer(
    tokenizer=feature_extractor,
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=default_data_collator,
)
trainer.train()



Epoch,Training Loss,Validation Loss
1,2.8044,2.802753
2,2.6103,2.709045
3,2.5352,2.67598
4,2.3521,2.670722
5,2.423,2.673084




TrainOutput(global_step=3975, training_loss=2.6307786168992147, metrics={'train_runtime': 8326.758, 'train_samples_per_second': 15.268, 'train_steps_per_second': 0.477, 'total_flos': 2.2942374478101873e+19, 'train_loss': 2.6307786168992147, 'epoch': 5.0})