# Training the CLIP model on our Skin Rash Dataset

In [1]:
import torch

# Ask PyTorch's CUDA caching allocator to release cached GPU memory that is
# currently unused. NOTE(review): presumably meant to free memory left over from
# earlier work in this kernel before fine-tuning starts — confirm it is still needed.
torch.cuda.empty_cache()

In [4]:
!pip install -q datasets pillow

[33mDEPRECATION: flatbuffers 1.12.1-git20200711.33e2d80-dfsg1-0.6 has a non-standard version number. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of flatbuffers or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


Installing the required version of transformers

In [5]:
# we need v4.26 of transformers - as of writing pip only provides up to v4.25
!pip install -q git+https://github.com/huggingface/transformers
print("--\nDONE")

[33mDEPRECATION: flatbuffers 1.12.1-git20200711.33e2d80-dfsg1-0.6 has a non-standard version number. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of flatbuffers or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
--
DONE


### Preparing the dataset to be compatible with our task

The dataset is provided as a collection of images as .jpg or .jpeg files. For each file, there should be a .txt file with the same name that contains the caption.

In [6]:
import os
import json
import os
import pathlib
from typing import Generator

def collect_captioned_images(root_folder: str) -> Generator[tuple[str,str], None, None]:
    """Yield ``(image_path, caption)`` for every captioned image below ``root_folder``.

    The folder tree is walked recursively. A file counts as an image when its
    extension is ``.jpg`` or ``.jpeg`` (compared case-insensitively, so ``.JPG``
    is accepted too). Its caption is read from a ``.txt`` file with the same
    base name in the same directory; images without such a file are skipped.

    Args:
        root_folder: Directory to search.

    Yields:
        Tuples of the image path (``directory`` joined with the file name, so it
        is relative whenever ``root_folder`` is relative) and the caption text
        with newlines replaced by spaces and surrounding whitespace removed.
    """
    image_extensions = ('.jpg', '.jpeg')

    for directory, _, filenames in os.walk(root_folder):
        for image_filename in filenames:
            stem, extension = os.path.splitext(image_filename)
            if extension.lower() not in image_extensions:
                continue

            caption_path = os.path.join(directory, stem + '.txt')
            if not os.path.isfile(caption_path):
                continue

            # Explicit encoding keeps the result independent of the OS locale.
            with open(caption_path, 'r', encoding='utf-8') as f:
                # strip() removes the stray space a trailing newline would leave.
                caption = f.read().replace('\n', ' ').strip()

            # Yield only after the file is closed so no handle stays open while
            # the generator is suspended (or abandoned) by the consumer.
            yield os.path.join(directory, image_filename), caption

                
def convert_text_image_pairs_to_huggingface_json(root_folder, out_json):
    """Write one JSON object per captioned image under ``root_folder`` to ``out_json``.

    Each line of the output file has the form
    ``{"image":<image path>,"caption":<caption text>}``; the pairs come from
    ``collect_captioned_images(root_folder)``. The number of written lines is
    printed when done.

    Args:
        root_folder: Directory that is searched for image/caption pairs.
        out_json: Path of the file to write. Missing parent directories are
            created; an existing file is overwritten.
    """
    # Create the directory that will hold the OUTPUT file (the parent of
    # out_json), not the parent of the input folder.
    out_folder = os.path.dirname(out_json)
    if out_folder:
        pathlib.Path(out_folder).mkdir(parents=True, exist_ok=True)

    written_count = 0
    with open(out_json, "w", encoding="utf-8") as f:
        for image_path, caption in collect_captioned_images(root_folder):
            line_dict = {"image": image_path, "caption": caption}
            # Compact separators keep every record on a single short line.
            f.write(json.dumps(line_dict, indent=None, separators=(",", ":")) + "\n")
            written_count += 1
    print(f"wrote {written_count} lines to {out_json}")

The function above writes a JSON Lines file (one JSON object per image–caption pair) that Hugging Face `datasets` can load. The next cell sets the input folder and the output path for it.

In [7]:
root_folder = "data"       # folder that holds the images and their .txt captions
out_json = "output.json"   # JSON Lines index consumed by the following cells

# Build the index only when it does not exist yet: a fresh "Restart & Run All"
# then works without manual steps, while an already generated file is left untouched.
if not os.path.isfile(out_json):
    convert_text_image_pairs_to_huggingface_json(root_folder, out_json)

Checking if it worked:

In [8]:
# Round-trip check: reload the JSON Lines file and show its first record.
from datasets import load_dataset

dataset = load_dataset("json", data_files=out_json)
first_example = dataset["train"][0]
print(f"first image: {first_example['image']}, caption: '{first_example['caption']}'")

first image: data/189.jpg, caption: 'tinea type of rash at the foot area on a fair skin'




<b>Defining the model, output folder, batch size and number of epochs</b>

In [9]:
# Settings read by the fine-tuning command in the next code cell.
repo_id = "openai/clip-vit-large-patch14-336"  # model identifier passed as --model_name_or_path
output_folder = "clip_finetuned"  # passed as --output_dir
batch_size = 10  # passed as --per_device_train_batch_size
num_train_epochs = 3  # passed as --num_train_epochs

<b>Running the script</b>

In [10]:
import warnings
# Silence every Python warning raised inside this notebook kernel.
# NOTE(review): the training script below is started with `!python`, i.e. in a
# separate process, so this filter presumably does not affect warnings emitted
# by the script itself — confirm whether that was the intent.
warnings.filterwarnings('ignore')

print(f"Finetuning {repo_id} for {num_train_epochs} epochs with batch size {batch_size}, and then saving output to {output_folder}.")

# Run the fine-tuning script as a shell command. IPython replaces the {name}
# placeholders with the values of the notebook variables set in earlier cells
# (output_folder, repo_id, out_json, num_train_epochs, batch_size).
# The script huggingface_finetune_clip.py is not part of this notebook; it is
# looked up relative to the directory the command runs in.
!python huggingface_finetune_clip.py \
    --output_dir {output_folder} \
    --model_name_or_path {repo_id} \
    --train_file {out_json} \
    --image_column 'image' \
    --overwrite_output_dir=True \
    --max_seq_length 77 \
    --num_train_epochs {num_train_epochs} \
    --caption_column 'caption' \
    --remove_unused_columns False \
    --do_train \
    --per_device_train_batch_size {batch_size} \
    --learning_rate 5e-5 --warmup_steps 0 --weight_decay 0.1

# These messages are printed unconditionally; nothing here checks whether the
# script above actually succeeded.
print("--\nDONE")
print(f"If it worked, trained data should be in {output_folder}")

Finetuning openai/clip-vit-large-patch14-336 for 3 epochs with batch size 10, and then saving output to clip_finetuned.
2024-04-23 20:45:33.092962: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-23 20:45:33.134451: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX512F AVX512_VNNI, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  return self.fget.__get__(instance, owner)()
 does not have profile information (Triggered internally at ./third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
Traceback (most recent call last):
  File "/home/ubuntu/huggingface_f