# BLIP: Bulk Caption Images Automatically
 - Run each step one after the other
 - Grant Google Drive permission when asked in step 1
 - create a folder named "my_images" in your Google Drive
 - Upload images you want to caption in "my_images" folder
 - Image captions will be saved in the "my_captions" folder in your Google Drive
 - The caption for each image will be saved as a text file with the same name as the image inside the "my_captions" folder

# 1. Install required dependencies

In [1]:
# Install pinned versions of transformers, timm and fairscale
# (presumably BLIP's runtime dependencies — confirm against the BLIP repo).
!pip3 install transformers==4.30.0 timm==0.4.12 fairscale==0.4.4
# Fetch the BLIP source code from GitHub into ./BLIP.
!git clone https://github.com/salesforce/BLIP
# Change into the cloned repository; presumably this is what lets the later
# `from models.blip import blip_decoder` import resolve — confirm.
%cd BLIP

Cloning into 'BLIP'...
remote: Enumerating objects: 277, done.
remote: Counting objects: 100% (165/165), done.
remote: Compressing objects: 100% (30/30), done.
remote: Total 277 (delta 137), reused 136 (delta 135), pack-reused 112
Receiving objects: 100% (277/277), 7.03 MiB | 12.98 MiB/s, done.
Resolving deltas: 100% (152/152), done.
/content/BLIP


# 2. Get images
Upload your images to "my_images" folder in your Google Drive.

In [2]:
from PIL import Image
import os
import torch
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

def load_demo_images(image_folder, seasons, image_size, device, max_per_season=1500):
    """Load and preprocess images from one sub-folder per season.

    Each image is converted to RGB, resized to ``image_size`` x ``image_size``
    with bicubic interpolation, turned into a tensor, normalised per channel,
    given a leading batch dimension of size 1 and moved to ``device``.

    Args:
        image_folder: Directory containing one sub-directory per season.
        seasons: Iterable of sub-directory names to read, in order.
        image_size: Side length in pixels of the (square) resized image.
        device: Torch device the preprocessed tensors are moved to.
        max_per_season: Maximum number of images read from each season
            folder; ``None`` removes the limit. Defaults to 1500.

    Returns:
        A tuple ``(images, image_names)`` of two parallel lists: the
        preprocessed image tensors and the file names they were read from.

    Raises:
        OSError: If a season folder cannot be listed or a file cannot be
            opened as an image.
    """
    # The preprocessing pipeline does not depend on the individual image, so
    # build it once instead of once per file.
    transform = transforms.Compose([
        transforms.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC),
        transforms.ToTensor(),
        transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
    ])

    images = []
    image_names = []
    for season in seasons:
        abs_path = os.path.join(image_folder, season)
        count = 0
        # os.listdir returns entries in arbitrary order; sorting makes the
        # selection under the cap (and the output order) reproducible.
        for image_name in sorted(os.listdir(abs_path)):
            if max_per_season is not None and count >= max_per_season:
                break
            img_path = os.path.join(abs_path, image_name)
            # Skip sub-directories (e.g. notebook checkpoint folders), which
            # cannot be opened as images.
            if not os.path.isfile(img_path):
                continue
            # Close the underlying file as soon as the pixel data has been
            # copied into the converted image.
            with Image.open(img_path) as img_file:
                raw_image = img_file.convert('RGB')

            image = transform(raw_image).unsqueeze(0).to(device)
            images.append(image)
            image_names.append(image_name)
            count += 1
        # Report completion for every season, not only when the cap was hit.
        print(f"{season} images loaded")
    return images, image_names

# 3. Image Captioning
- Perform image captioning using finetuned BLIP model.
- Result will be saved in "my_captions" folder in your Google Drive

In [3]:
from models.blip import blip_decoder

# Preprocess every spring and summer image up front (tensors live on `device`).
image_size = 512
images, image_names = load_demo_images(
    '/content/season_images',
    ['spring', 'summer'],
    image_size=image_size,
    device=device,
)

# Build the BLIP caption decoder from the pretrained checkpoint, move it to
# `device` and switch it to evaluation mode.
model_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth'
model = blip_decoder(pretrained=model_url, image_size=image_size, vit='base')
model = model.to(device)
model.eval()

# Captions are appended to /content/season_captions.txt, one
# "<image name>, <caption>" line per image.
# NOTE(review): captions may themselves contain commas, so the file is not
# unambiguous CSV — confirm how it is consumed downstream.
text_folder = '/content'
output_file = 'season_captions.txt'
output_file_path = os.path.join(text_folder, output_file)

# Gradients are not needed for generation, so disable autograd tracking.
with open(output_file_path, "a+") as f, torch.no_grad():
    for image_name, image in zip(image_names, images):
        # NOTE(review): with sample=True BLIP presumably uses sampling and
        # ignores num_beams — confirm against models/blip.py.
        caption = model.generate(image, sample=True, num_beams=3, max_length=40, min_length=5)
        print(f"caption: {caption[0]}")
        f.write(f"{image_name}, {caption[0]}\n")

spring images loaded
summer images loaded
reshape position embedding from 196 to 1024
load checkpoint from https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth
caption: an old bridge over water
caption: the city from the water
caption: a white flower on green leaves
caption: some flowers hanging from the tree
caption: sakura blossoms growing in my parents garden
caption: some white flowers
caption: a tree with pink flowers
caption: the waves on the ocean
caption: the flowers and shrubs in bloom
caption: my wife, under the pink blossoms
caption: pink flowers on a field
caption: a white flower in a garden
caption: the branches and flowers on the trees
caption: istanbul from the water
caption: a cup and two flowers
caption: a tree with a lot of purple flowers
caption: a flower with a bee on it
caption: some white flowers on a tree
caption: little girl with her eyes closed in the middle of the field of flowers
caption: blue flowers growing in

In [3]:
# Run this cell after restarting the session. It relies on `os`, `torch`,
# `device` and `load_demo_images`, which this cell does not define itself.
from models.blip import blip_decoder

# Preprocess every fall and winter image up front (tensors live on `device`).
image_size = 512
images, image_names = load_demo_images(
    '/content/season_images',
    ['fall', 'winter'],
    image_size=image_size,
    device=device,
)

# Build the BLIP caption decoder from the pretrained checkpoint, move it to
# `device` and switch it to evaluation mode.
model_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth'
model = blip_decoder(pretrained=model_url, image_size=image_size, vit='base')
model = model.to(device)
model.eval()

# Captions are appended to /content/season_captions.txt, one
# "<image name>, <caption>" line per image.
# NOTE(review): captions may themselves contain commas, so the file is not
# unambiguous CSV — confirm how it is consumed downstream.
text_folder = '/content'
output_file = 'season_captions.txt'
output_file_path = os.path.join(text_folder, output_file)

# Gradients are not needed for generation, so disable autograd tracking.
with open(output_file_path, "a+") as f, torch.no_grad():
    for image_name, image in zip(image_names, images):
        # NOTE(review): with sample=True BLIP presumably uses sampling and
        # ignores num_beams — confirm against models/blip.py.
        caption = model.generate(image, sample=True, num_beams=3, max_length=40, min_length=5)
        print(f"caption: {caption[0]}")
        f.write(f"{image_name}, {caption[0]}\n")

fall images loaded
winter images loaded
reshape position embedding from 196 to 1024
load checkpoint from https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth
caption: the outside of a building with the top section of a parking lot to the left
caption: fall leaves and weeds along the trail
caption: the leaf in autumn
caption: the dirt road leading to a small hill and village
caption: vineyards in autumn with hills in the background
caption: a city from the outside
caption: a menu and magazine
caption: a bunch of different types of pumpkins
caption: the river with water reflections
caption: a leaf
caption: a fall scene with colorful foliage
caption: the green lake, surrounded by colorful autumn foliage, from above
caption: a tree that is blowing on the wind
caption: someone holding a leaf on their left hand
caption: a courtyard with benches, potted flower pots and plants in the ground
caption: a man with his head in the air looking seriousl

In [8]:
# !pip install numba

# from numba import cuda
# device = cuda.get_current_device()
# device.reset()

# # Restart the runtime session to use cuda again

