### Method 1 (Sequential)

In [10]:
import csv
import os

import torch
from PIL import Image
from tqdm import tqdm
from transformers import pipeline

In [11]:
# Build the BLIP image-captioning pipeline once and reuse it for every image.
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU so the
# notebook still runs (more slowly) instead of failing on machines without CUDA.
pipe = pipeline(
    "image-to-text",
    model="Salesforce/blip-image-captioning-large",
    device='cuda' if torch.cuda.is_available() else 'cpu',
)

In [12]:
def generate_caption(image_path, max_new_tokens=75):
    """
    Generate a caption for an image.

    Args:
        image_path: Path of the image file to caption; handed to ``Image.open``.
        max_new_tokens: Upper bound on generated tokens, forwarded to the
            captioning pipeline. Defaults to 75.

    Returns:
        The ``'generated_text'`` value of the first result returned by ``pipe``.
    """
    # Image.open() is lazy and keeps the file open; the context manager closes
    # the handle as soon as captioning is done, so looping over a large
    # directory does not accumulate open file descriptors.
    with Image.open(image_path) as img:
        # The pipeline call must stay inside the block: the pixel data is only
        # read from disk while the file is still open.
        result = pipe(img, max_new_tokens=max_new_tokens)
    return result[0]['generated_text']

In [13]:
def save_to_csv(data, filename='captions.csv'):
    """
    Save data to a CSV file, overwriting any existing file of the same name.

    Args:
        data: Iterable of dicts, each with the keys 'Image Path' and 'Caption'.
        filename: Destination path of the CSV file. Defaults to 'captions.csv'.

    Raises:
        ValueError: If a row contains a key that is not one of the field names
            (default ``csv.DictWriter`` behaviour).
        OSError: If the file cannot be opened for writing.
    """
    fieldnames = ['Image Path', 'Caption']
    # newline='' lets the csv module control line endings (otherwise Windows
    # gets a blank line after every row); an explicit encoding keeps captions
    # with non-ASCII characters portable across platforms.
    # Mode 'w' truncates an existing file and never raises FileExistsError, so
    # no handler for that exception is needed.
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)

In [14]:
def process_images(directory='images', output_file='captions.csv'):
    """
    Process all images in a directory and save captions to a CSV file.

    Args:
        directory: Folder whose entries are scanned for image files.
            Defaults to 'images'.
        output_file: Path of the CSV file to write. Defaults to 'captions.csv'.

    Each entry whose name ends in .jpg, .jpeg or .png (compared
    case-insensitively) is captioned with ``generate_caption`` and recorded as
    a row with the keys 'Image Path' and 'Caption'.
    """
    image_exts = ('.jpg', '.jpeg', '.png')
    # Filter before handing the list to tqdm so the progress total counts only
    # images; sort because os.listdir() returns entries in arbitrary order and
    # the CSV should be reproducible between runs.
    image_names = sorted(
        name for name in os.listdir(directory)
        if name.lower().endswith(image_exts)
    )

    data = []
    for filename in tqdm(image_names, desc="Processing Images"):
        image_path = os.path.join(directory, filename)
        caption = generate_caption(image_path)
        data.append({'Image Path': image_path, 'Caption': caption})

    save_to_csv(data, output_file)

In [None]:
# Entry point: caption the default directory via process_images() when this
# code runs as the main program.
if __name__ == "__main__":
    process_images()

Processing Images:  52%|████████████████████████████████████████████████████████████████████████████▎                                                                     | 116/222 [00:35<00:33,  3.17it/s]

In [7]:
import torch 
# Report whether PyTorch can see a CUDA device in this environment.
if torch.cuda.is_available():
    print("CUDA is available!")
else:
    print("CUDA is not available!")

CUDA is avalible!


### Method 2 (via Datasets)

In [9]:
import os
import torch
from PIL import Image
from transformers import pipeline
from datasets import Dataset, concatenate_datasets
from tqdm import tqdm
import pandas as pd

# Initialize the image-to-text pipeline.
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU so the
# cell still runs (more slowly) instead of failing on machines without CUDA.
pipe = pipeline(
    "image-to-text",
    model="Salesforce/blip-image-captioning-large",
    device='cuda' if torch.cuda.is_available() else 'cpu',
)

def generate_caption(image_path, max_new_tokens=75):
    """
    Generate a caption for an image.

    Args:
        image_path: Path of the image file to caption; handed to ``Image.open``.
        max_new_tokens: Upper bound on generated tokens, forwarded to the
            captioning pipeline. Defaults to 75.

    Returns:
        The ``'generated_text'`` value of the first result returned by ``pipe``.
    """
    # Image.open() is lazy and keeps the file open; the context manager closes
    # the handle as soon as captioning is done, so mapping over a large
    # dataset does not accumulate open file descriptors.
    with Image.open(image_path) as img:
        # The pipeline call must stay inside the block: the pixel data is only
        # read from disk while the file is still open.
        result = pipe(img, max_new_tokens=max_new_tokens)
    return result[0]['generated_text']

def process_image_dataset(dataset):
    """
    Caption every example of a dataset.

    Each example's 'path' value is passed to ``generate_caption`` and the
    result is stored under a new 'caption' key. A one-row slice of the mapped
    dataset is printed before the mapped dataset is returned.
    """
    def _attach_caption(record):
        # Caption the image referenced by this record and store it alongside.
        record['caption'] = generate_caption(record['path'])
        return record

    captioned = dataset.map(_attach_caption)

    preview = captioned.take(1)
    print(preview)

    return captioned

def save_to_csv(data, filename='metadata.csv'):
    """
    Save data to a CSV file, overwriting any existing file of the same name.

    Args:
        data: Iterable of dicts, each with the keys 'file_name' and 'text'.
        filename: Destination path of the CSV file. Defaults to 'metadata.csv'.

    Raises:
        ValueError: If a row contains a key that is not one of the field names
            (default ``csv.DictWriter`` behaviour).
        OSError: If the file cannot be opened for writing.
    """
    # This cell's import list does not include csv, so import it here to keep
    # the cell runnable on a fresh kernel without the earlier cells.
    import csv

    fieldnames = ['file_name', 'text']
    # newline='' lets the csv module control line endings (otherwise Windows
    # gets a blank line after every row); an explicit encoding keeps captions
    # with non-ASCII characters portable across platforms.
    # Mode 'w' truncates an existing file and never raises FileExistsError, so
    # no handler for that exception is needed.
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)

def process_images(directory='images', output_file='metadata.csv'):
    """
    Process all images in a directory and save captions to a CSV file.

    Args:
        directory: Folder whose entries are scanned for image files.
            Defaults to 'images'.
        output_file: Path of the CSV file to write. Defaults to 'metadata.csv'.

    Entries whose names end in .jpg, .jpeg or .png (compared
    case-insensitively) are placed in a one-column ('path') dataset, captioned
    through ``process_image_dataset``, and written out as rows with the keys
    'file_name' and 'text'.
    """
    image_exts = ('.jpg', '.jpeg', '.png')
    # Sort because os.listdir() returns entries in arbitrary order and the CSV
    # should be reproducible; lower-case the name so files such as "IMG_1.JPG"
    # are not silently skipped.
    image_paths = [
        os.path.join(directory, name)
        for name in sorted(os.listdir(directory))
        if name.lower().endswith(image_exts)
    ]

    # Create a dataset from the collected paths
    dataset = Dataset.from_pandas(pd.DataFrame({'path': image_paths}))

    # Process the dataset
    processed_dataset = process_image_dataset(dataset)

    # Convert the processed dataset back to a list of dicts using the column
    # names expected in the output file
    data = [{'file_name': item['path'], 'text': item['caption']} for item in processed_dataset]

    # Save to CSV
    save_to_csv(data, output_file)

# Entry point: caption the default directory via process_images() when this
# code runs as the main program.
if __name__ == "__main__":
    process_images()


Map:   0%|          | 0/221 [00:00<?, ? examples/s]

Dataset({
    features: ['path', 'caption'],
    num_rows: 1
})
