In [None]:
# Mount Google Drive at /content/drive (this import only exists on Colab).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
%pip install open_clip_torch transformers

In [5]:
import os
import torch
import open_clip
import pandas as pd
from tqdm import tqdm
from PIL import Image
import multiprocessing
import matplotlib.pyplot as plt
from transformers import pipeline

In [6]:
# Load the CoCa captioning model (ViT-L/14, MS-COCO fine-tuned checkpoint)
# through open_clip. The call returns three values; the second is discarded.
model, _, transform = open_clip.create_model_and_transforms(
    pretrained="mscoco_finetuned_laion2B-s13B-b90k",
    model_name="coca_ViT-L-14",
)


open_clip_pytorch_model.bin:   0%|          | 0.00/2.55G [00:00<?, ?B/s]

In [7]:
# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Move the captioning model onto the selected device.
model = model.to(device)

In [8]:
# Initialize the zero-shot classifier.
# Request GPU 0 only when CUDA is actually available and fall back to the CPU
# (device=-1) otherwise, instead of unconditionally asking for device 0, which
# fails on a CPU-only runtime.
classifier = pipeline(
  "zero-shot-classification",
  model="facebook/bart-large-mnli",
  device=0 if torch.cuda.is_available() else -1,
)

config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [9]:
# citation: https://colab.research.google.com/github/mlfoundations/open_clip/blob/master/docs/Interacting_with_open_coca.ipynb?authuser=2#scrollTo=hGbTjj6wY6xm
def generate_caption(img_path):
  """Generate a caption for a single image with the CoCa model.

  Args:
    img_path: Path of the image file to caption.

  Returns:
    The decoded caption string: the text before the first `<end_of_text>`
    marker, with any `<start_of_text>` marker removed.
  """
  # Close the file handle as soon as the pixels are decoded; convert() hands
  # back a separate in-memory image, so nothing below needs the open file.
  with Image.open(img_path) as raw:
    rgb = raw.convert("RGB")

  # Follow the device the model's weights live on rather than assuming CUDA,
  # so the function also works on a CPU-only runtime.
  model_device = next(model.parameters()).device
  batch = transform(rgb).unsqueeze(0).to(model_device)

  # Mixed precision is only enabled when the model is on a CUDA device.
  use_amp = model_device.type == "cuda"
  with torch.no_grad(), torch.autocast(device_type="cuda", enabled=use_amp):
    generated = model.generate(batch)

  decoded = open_clip.decode(generated[0])
  caption = decoded.split("<end_of_text>")[0].replace("<start_of_text>", "")
  return caption


In [10]:
def classify(caption, categories=None):
  """Run zero-shot classification of a caption against a set of labels.

  Args:
    caption: Text to classify.
    categories: Optional iterable of candidate label strings. When omitted,
      the built-in list of post categories below is used, which matches the
      previous hard-coded behaviour.

  Returns:
    Whatever the zero-shot `classifier` pipeline returns for this caption,
    unchanged.
  """
  if categories is None:
    # categories idea citation: Multimodal Post Attentive Profiling for Influencer Marketing
    categories = ["travel", "food", "sports", "fashion", "technology","friend","family","beauty","fitness","pet","art","music","interior"]
  # Hand the pipeline a real list so any iterable of labels is accepted.
  return classifier(caption, candidate_labels=list(categories))


In [11]:
def generate_table(image_folder='/content/drive/MyDrive/ML Caption Craft/Likable Images'):
  """Caption and categorise every image in a folder.

  Only files whose name ends in .png, .jpg or .jpeg (case-insensitive) are
  processed. Files are handled in sorted name order so the resulting rows are
  reproducible between runs.

  Args:
    image_folder: Directory that contains the images.

  Returns:
    A pandas DataFrame with the columns 'path' (file name inside
    `image_folder`), 'caption' (generated caption) and 'category' (first
    label of the classification result). The columns are present even when
    no image was found.
  """
  columns = ['path', 'caption', 'category']
  image_suffixes = ('.png', '.jpg', '.jpeg')

  # Filter before wrapping in tqdm so the progress total counts real work
  # only, and sort because os.listdir gives no ordering guarantee.
  image_names = sorted(
    name for name in os.listdir(image_folder)
    if name.lower().endswith(image_suffixes)
  )

  # Collect plain dicts and build the DataFrame once at the end.
  data_list = []
  for img_name in tqdm(image_names):
    full_img_path = os.path.join(image_folder, img_name)

    # Generate caption and classify
    caption = generate_caption(full_img_path)
    result = classify(caption)
    label = result["labels"][0]

    data_list.append({'path': img_name, 'caption': caption, 'category': label})

  # Explicit columns keep the schema stable when data_list is empty.
  return pd.DataFrame(data_list, columns=columns)


In [None]:
# Build the caption/category table for the default Drive image folder.
# Every image is captioned and then classified, so this cell is slow on a
# large folder.
df = generate_table()

 42%|████▏     | 10191/24325 [2:34:37<3:25:46,  1.14it/s]