# Import essential libraries

In [1]:
import os
import json
import os
import pathlib
from typing import Generator, List
import pandas as pd
import subprocess

# Body

## Convert the DataFrame to a JSON Lines file compatible with Hugging Face Datasets

In [2]:
# Load the training metadata table; the "image" and "title" columns are the
# ones consumed by the conversion step further down.
train_csv_path = "../Datasets/train.csv"
data = pd.read_csv(train_csv_path)
data.head()

Unnamed: 0,posting_id,image,image_phash,title,label_group
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,249114794
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",2937985045
2,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2395904891
3,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,4093212188
4,train_3369186413,00136d1cf4edede0203f32f05f660588.jpg,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,3648931069


In [8]:
# Function to collect image paths and their corresponding titles from a dataset.
def collect_captioned_images(
    data: List[dict],
    root_folder: str,
):
    """Lazily yield ``(image_path, title)`` pairs for every record in ``data``.

    Args:
        data: Records that each provide an ``"image"`` and a ``"title"`` key.
        root_folder: Folder joined in front of every ``"image"`` value.

    Yields:
        A 2-tuple of the joined image path and the record's title, in the
        same order as the input records.

    Raises:
        KeyError: If a record lacks the ``"image"`` or ``"title"`` key.
    """
    for record in data:
        file_name, caption = record["image"], record["title"]
        yield os.path.join(root_folder, file_name), caption

# Function to convert the collected image and text pairs into a JSON format suitable for Hugging Face datasets.
def convert_text_image_pairs_to_huggingface_json(root_folder, out_json, records=None):
    """Write image/caption pairs to ``out_json`` as one JSON object per line.

    Each line has the form ``{"image": <path>, "caption": <title>}``, with
    compact separators and no indentation.

    Args:
        root_folder: Folder joined in front of each record's ``"image"`` value
            to build the image path.
        out_json: Path of the file to write. Its parent directory is created
            if it does not exist; an existing file is overwritten.
        records: Optional iterable of dicts with ``"image"`` and ``"title"``
            keys. When omitted, the rows of the notebook-level ``data``
            DataFrame are used.

    Side effects:
        Writes ``out_json`` and prints the number of lines written.
    """
    if records is None:
        # Backward-compatible fallback to the notebook-level DataFrame.
        records = data.to_dict(orient="records")

    # Create the directory that will hold the output file. root_folder is only
    # used to build image paths, so it is not the directory to create here.
    pathlib.Path(out_json).parent.mkdir(parents=True, exist_ok=True)

    written_count = 0
    with open(out_json, "w", encoding="utf-8") as f:
        for image_path, caption in collect_captioned_images(records, root_folder):
            line_dict = {"image": image_path, "caption": caption}
            json_line = json.dumps(line_dict, indent=None, separators=(",", ":"))
            f.write(json_line + "\n")
            written_count += 1
    print(f"wrote {written_count} lines to {out_json}")

In [10]:
# Destination JSON-lines file and the folder that holds the training images.
out_json = "../Datasets/train_data.json"
root_folder = "../Datasets/train_images/"

# Emit one {"image", "caption"} line per row of the loaded table.
convert_text_image_pairs_to_huggingface_json(root_folder, out_json)

wrote 34250 lines to ./data/train_data.json


## Start training process

In [11]:
# Settings forwarded to the fine-tuning command below.
num_train_epochs = 10  # passed as --num_train_epochs
batch_size = 16  # passed as --per_device_train_batch_size
repo_id = "openai/clip-vit-base-patch16"  # passed as --model_name_or_path
output_folder = "./cached/clip-finetuned"  # passed as --output_dir

In [12]:
# Echo the run configuration before the training command is launched.
print(
    f"Finetuning {repo_id} for {num_train_epochs} epochs "
    f"with batch size {batch_size}, "
    f"and then saving output to {output_folder}."
)

Finetuning openai/clip-vit-base-patch16 for 10 epochs with batch size 16, and then saving output to ./cached/clip-finetuned.


In [None]:
# Run the fine-tuning script through the shell. The {name} placeholders are
# filled in by IPython from notebook variables set in earlier cells:
# output_folder, repo_id, out_json, num_train_epochs and batch_size.
# The --image_column / --caption_column values ("image", "caption") match the
# keys written to each line of the JSON file produced above.
# NOTE(review): fine_tune_clip.py is not shown in this notebook — confirm it
# lives in the working directory and accepts every flag listed here.
!python fine_tune_clip.py \
    --output_dir {output_folder} \
    --model_name_or_path {repo_id} \
    --train_file {out_json} \
    --image_column image \
    --overwrite_output_dir=True \
    --max_seq_length=77 \
    --num_train_epochs={num_train_epochs} \
    --caption_column caption \
    --remove_unused_columns=False \
    --do_train \
    --per_device_train_batch_size={batch_size} \
    --learning_rate="5e-5" \
    --warmup_steps="0" \
    --weight_decay 0.1


In [None]:
# Closing banner that points at the folder given to the training command
# as --output_dir.
completion_lines = (
    "--\nDONE",
    f"If it worked, trained data should be in {output_folder}",
)
for message in completion_lines:
    print(message)