<a href="https://colab.research.google.com/github/iOS-Forks/Queryable/blob/cloud/CLIPImageModel_to_CoreML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install clip-benchmark>=1.4.0 datasets>=2.8.0 open-clip-torch>=2.20.0 timm>=0.9.5 coremltools


In [None]:
# Clone Apple's MobileCLIP repository: https://github.com/apple/ml-mobileclip
!git clone https://github.com/apple/ml-mobileclip.git

# Install MobileCLIP from the checkout in editable mode (-e), with quiet output (-q).
# %cd changes the kernel's working directory, so relative paths in later cells
# (e.g. checkpoints/) resolve inside ml-mobileclip.
# NOTE(review): re-running this cell without restarting the kernel runs the clone
# again from inside the checkout — confirm that is acceptable.
%cd ml-mobileclip
!pip install -e . -q

In [None]:
#Download pretrained checkpoints
%mkdir -p checkpoints
!wget wget https://docs-assets.developer.apple.com/ml-research/datasets/mobileclip/mobileclip_s0.pt -P checkpoints

In [None]:
#Convert the reparameterized MobileCLIP image encoder to Core ML
import coremltools
import torch
import mobileclip
from mobileclip.modules.common.mobileone import reparameterize_model
from mobileclip.modules.text.tokenizer import (
    ClipTokenizer,
)
from mobileclip.clip import CLIP
from typing import Dict, Optional, Any
import json
import os

# Configuration for the mobileclip_s0 variant: a shared embedding size plus
# separate settings for the image encoder and the text encoder.
model_cfg = dict(
    embed_dim=512,
    image_cfg=dict(
        image_size=256,
        model_name="mci0",
    ),
    text_cfg=dict(
        context_length=77,
        vocab_size=49408,
        dim=512,
        ffn_multiplier_per_layer=4.0,
        n_heads_per_layer=8,
        n_transformer_layers=4,
        norm_layer="layer_norm_fp32",
        causal_masking=False,
        model_name="mct",
    ),
)

class CLIP_encode_image(CLIP):
    """CLIP subclass whose ``forward`` runs only the image-encoding path."""

    def __init__(self, cfg: Dict, output_dict: bool = False, *args, **kwargs) -> None:
        """Pass every constructor argument through unchanged to ``CLIP.__init__``."""
        super().__init__(cfg, output_dict, *args, **kwargs)

    def forward(self, image: Optional[torch.Tensor] = None) -> Any:
        """Return ``self.encode_image(image, normalize=True)``.

        When ``image`` is ``None`` nothing is encoded and ``None`` is returned.
        """
        if image is None:
            return None
        return self.encode_image(image, normalize=True)


# Instantiate the image-only CLIP wrapper from the mobileclip_s0 config and switch
# it to eval mode before loading weights.
model_ie = CLIP_encode_image(cfg=model_cfg)
model_ie.eval()

# map_location="cpu" remaps all checkpoint tensors onto the CPU, so loading also
# works on runtimes without a GPU regardless of the device the file was saved from.
chkpt = torch.load("checkpoints/mobileclip_s0.pt", map_location="cpu")
model_ie.load_state_dict(chkpt)

# Reparameterize the model with MobileCLIP's helper, then keep the result in eval mode.
reparameterized_model = reparameterize_model(model_ie)
reparameterized_model.eval()


# Example input used only to record the traced graph: one 3-channel 256x256 image
# with values in [0, 1).
image = torch.rand(1, 3, 256, 256)
traced_model = torch.jit.trace(reparameterized_model, image)

# Define the input as an image type.
# scale=1/255.0 with a zero bias rescales the pixel values by 1/255 before they reach
# the network; the shape matches the example tensor used for tracing.
input_image = coremltools.ImageType(name="input_image", shape=(1, 3, 256, 256), color_layout=coremltools.colorlayout.RGB, scale=1/255.0, bias=[0, 0, 0])
output_tensor = [coremltools.TensorType(name="output_embeddings")]

# Convert the traced model to an ML Program targeting iOS 17 and save it as an .mlpackage.
ml_model = coremltools.convert(
        model=traced_model,
        outputs=output_tensor,
        inputs=[input_image],
        convert_to="mlprogram",
        minimum_deployment_target=coremltools.target.iOS17,
        compute_units=coremltools.ComputeUnit.ALL,
        debug=True,
    )
ml_model.save("clip_mci_image_s0.mlpackage")

In [None]:
# Inspect the converted Core ML model: print its type, its full description,
# and the input/output entries of that description.
spec = ml_model.get_spec()
description = spec.description
report = (
    ("model type: {}", spec.WhichOneof('Type')),
    ("model description: {}", description),
    ("model inputs: {}", description.input),
    ("model outputs: {}", description.output),
)
for template, value in report:
    print(template.format(value))

In [None]:
# Zip the saved .mlpackage directory so it can be downloaded as a single file.
import shutil
# Optional (Colab only): uncomment this import and the files.download call below.
# from google.colab import files
directory_path = 'clip_mci_image_s0.mlpackage'
zip_path = f"{directory_path}.zip"
# make_archive appends ".zip" to the base name it is given, so using the package
# directory name as the base already produces zip_path. The original followed this
# with an os.rename from that path to the same path, which did nothing.
shutil.make_archive(directory_path, 'zip', directory_path)
# files.download(zip_path)

In [None]:
!pip install graphviz==0.20.3 torchview==0.2.6
#Optional: Visualize the model
# !pip graphviz torchview
from torchview import draw_graph
import graphviz
graphviz.set_jupyter_format('png')
model_graph = draw_graph(reparameterized_model, input_data = image, expand_nested = True, depth = 5)
model_graph.visual_graph