In [None]:
!pip install -U accelerate
!pip install -U transformers
!pip install transformers[torch]
!pip install accelerate -U
!pip install rouge_score
!pip install evaluate

In [None]:
import os

import numpy as np
import pandas as pd
import torch
from PIL import Image
from tqdm.auto import tqdm
from transformers import AutoTokenizer

In [None]:
# Load the captions data
# Header names are stripped before renaming: a '|'-separated header written as
# "a| b| c" yields column names with a leading space, which the exact-match rename
# below would otherwise skip silently.
# NOTE(review): absolute local path; confirm it is valid on the machine running this.
text_data = (
    pd.read_csv(r"C:\Users\jeeva\Downloads\archive\captions.txt", sep='|')
    .rename(columns=str.strip)
    .rename(columns={"comment_number": "image_repeat", "comment": "caption"})
)
# Same frame under a second name (an alias, not a copy).
text_data_train = text_data
text_data_train.head()

In [None]:
# Create the ImageCaptionDataset
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence

# Image height and width, both 224.
image_height, image_width = 224, 224

In [None]:
class ImageCaptionDataset(Dataset):
    """Dataset that pairs the i-th image file with the i-th caption file.

    Both directories are listed in sorted order (``os.listdir`` alone returns
    entries in arbitrary order), so an image and its caption line up when their
    file names sort the same way, e.g. ``0001.jpg`` and ``0001.txt``. Each
    listing is read on first use and then cached on the instance.

    Args:
        image_path: Directory containing the image files.
        text_path: Directory containing the caption text files.
        tokenizer: Callable whose result exposes ``input_ids``; must also have
            a ``pad_token_id`` attribute.
        image_height: Stored on the instance; not used by this class.
        image_width: Stored on the instance; not used by this class.
        image_processor: Callable invoked as
            ``image_processor(images=[image], return_tensors='pt')`` whose
            result exposes ``pixel_values``.
        max_length: Length every caption is padded or truncated to
            (defaults to 32).
    """

    def __init__(self, image_path, text_path, tokenizer, image_height, image_width, image_processor, max_length=32):
        self.image_path = image_path
        self.text_path = text_path
        self.tokenizer = tokenizer
        self.image_height = image_height
        self.image_width = image_width
        self.image_processor = image_processor
        self.max_length = max_length
        # Filled lazily by _sorted_names so that construction does no disk access.
        self._image_files = None
        self._text_files = None

    def _sorted_names(self, cache_attr, directory):
        """Return the sorted entries of ``directory``, cached under ``cache_attr``."""
        names = getattr(self, cache_attr)
        if names is None:
            names = sorted(os.listdir(directory))
            setattr(self, cache_attr, names)
        return names

    def _encode_caption(self, caption):
        """Tokenize ``caption`` and return label ids with padding replaced by -100."""
        token_ids = self.tokenizer(
            caption,
            add_special_tokens=True,
            padding="max_length",
            max_length=self.max_length,
            truncation=True,
        ).input_ids
        pad_id = self.tokenizer.pad_token_id
        # NOTE(review): if the tokenizer reuses its EOS id as the pad id, real EOS
        # tokens are masked here as well - confirm that is acceptable.
        labels = [-100 if token == pad_id else token for token in token_ids]
        return torch.tensor(labels)

    def __len__(self):
        """Number of entries in the image directory."""
        return len(self._sorted_names("_image_files", self.image_path))

    def __getitem__(self, index):
        """Return ``{"pixel_values": tensor, "caption_token": tensor}`` for ``index``."""
        image_file = self._sorted_names("_image_files", self.image_path)[index]
        with Image.open(os.path.join(self.image_path, image_file)) as source_image:
            image = source_image.convert('RGB')
        pixel_values = self.image_processor(images=[image], return_tensors='pt').pixel_values

        text_file = self._sorted_names("_text_files", self.text_path)[index]
        with open(os.path.join(self.text_path, text_file), 'r') as f:
            caption = f.read().strip()

        return {"pixel_values": pixel_values.squeeze(), "caption_token": self._encode_caption(caption)}

def collate_fn(batch):
    """Combine dataset items into a single batch.

    Args:
        batch: List of dicts, each with a ``"pixel_values"`` tensor and a
            ``"caption_token"`` tensor.

    Returns:
        Dict with the same two keys, each holding one tensor whose first
        dimension is the batch.
    """
    pixel_values = pad_sequence([item["pixel_values"] for item in batch], batch_first=True)
    # Pad labels with -100, the same value ImageCaptionDataset uses to mask padding,
    # instead of reading the pad id from a global tokenizer.
    caption_tokens = pad_sequence(
        [item["caption_token"] for item in batch],
        batch_first=True,
        padding_value=-100,
    )

    return {"pixel_values": pixel_values, "caption_token": caption_tokens}

In [None]:
# Save the preprocessed data
import os
# exist_ok=True creates the folder only when it is missing, so the cell can be re-run
# without a separate existence check.
# NOTE(review): /kaggle/input is normally read-only on Kaggle - confirm these folders
# should not live under a writable location instead.
image_folder = '/kaggle/input/preprocessed_images/'
os.makedirs(image_folder, exist_ok=True)

text_folder = '/kaggle/input/preprocessed_captions/'
os.makedirs(text_folder, exist_ok=True)

In [None]:
# Sorted so that the image -> caption-row assignment is the same on every run.
image_files = sorted(os.listdir(image_folder))
# Caption file names are derived from the image names. Listing text_folder instead
# yields nothing when the folder has only just been created, and the loop would then
# write no captions at all.
text_files = [os.path.splitext(name)[0] + '.txt' for name in image_files]

progress_bar = tqdm(zip(image_files, text_files), total=len(image_files), desc='Preprocessing')
for i, (image_file, text_file) in enumerate(progress_bar):
    image_file_path = os.path.join(image_folder, image_file)
    # Close the source file before overwriting it in place with its RGB version.
    with Image.open(image_file_path) as source_image:
        image = source_image.convert('RGB')
    image.save(image_file_path)

    # NOTE(review): the caption is picked by row position (row i for the i-th image);
    # confirm the captions table has one row per image in this order.
    with open(os.path.join(text_folder, text_file), 'w') as f:
        f.write(text_data_train.iloc[i]['caption'])

# ImageCaptionDataset calls os.listdir on both paths, so each must be a directory;
# passing the captions.txt file as text_path fails on the first item access.
# NOTE(review): image_path previously pointed at the raw download folder; confirm that
# the preprocessed folders filled by the loop above are the intended source.
train_dataset = ImageCaptionDataset(
    image_path=image_folder,
    text_path=text_folder,
    tokenizer=decoder_tokenizer,
    image_height=224,
    image_width=224,
    image_processor=image_processor,
)