# AI Technical Test

In [5]:
import io

import requests
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

# Load the CLIP model and processor
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Download an image from the internet.
# - The timeout keeps the cell from hanging forever on a stalled connection.
# - raise_for_status() turns an HTTP error (404, 500, ...) into an exception
#   here instead of a confusing image-decoding error further down.
# - The fully downloaded (and content-decoded) body is handed to PIL through an
#   in-memory buffer rather than the raw socket stream.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
response = requests.get(url, timeout=30)
response.raise_for_status()
image = Image.open(io.BytesIO(response.content))

# Prepare inputs for the model: every candidate caption is scored against the image
candidate_labels = ["a photo of a cat", "a photo of a dog"]
inputs = processor(text=candidate_labels, images=image, return_tensors="pt", padding=True)

# Perform the classification.
# This is inference only, so autograd is disabled: no graph is recorded and the
# resulting tensors carry no grad_fn.
with torch.no_grad():
    outputs = model(**inputs)
logits_per_image = outputs.logits_per_image
# Softmax over dim=1 (the caption axis) gives one probability per caption
probs = logits_per_image.softmax(dim=1)

# Print the probability for each class (same order as candidate_labels)
print("Probabilities:", probs)

Probabilities: tensor([[0.9949, 0.0051]], grad_fn=<SoftmaxBackward0>)


In [10]:
import tensorflow as tf
from transformers import TFCLIPModel, CLIPProcessor

# Load the pre-trained TensorFlow CLIP model and its processor from one checkpoint
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"
model = TFCLIPModel.from_pretrained(CLIP_CHECKPOINT)
processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)

# Define a new TensorFlow model that outputs only the logits_per_image
class CLIPServingModel(tf.keras.Model):
    """Keras wrapper around a CLIP model that exposes only ``logits_per_image``.

    The wrapped model is stored on ``self.clip_model``; ``call`` forwards the
    input dictionary to it unchanged and returns a single-key dictionary.
    """

    def __init__(self, clip_model):
        """Store the CLIP model that ``call`` delegates to."""
        super().__init__()
        self.clip_model = clip_model

    @tf.function(
        input_signature=[
            dict(
                input_ids=tf.TensorSpec([None, None], tf.int32),
                attention_mask=tf.TensorSpec([None, None], tf.int32),
                pixel_values=tf.TensorSpec([None, 3, 224, 224], tf.float32),
            )
        ]
    )
    def call(self, inputs):
        """Run the wrapped CLIP model and keep only the image-to-text logits.

        Args:
            inputs: dict with int32 ``input_ids`` and ``attention_mask`` of
                rank 2, and float32 ``pixel_values`` of shape
                ``[batch, 3, 224, 224]``, as fixed by the input signature.

        Returns:
            dict with the single key ``"logits_per_image"``.
        """
        clip_outputs = self.clip_model(inputs)
        return dict(logits_per_image=clip_outputs.logits_per_image)

# Instantiate the serving model
serving_model = CLIPServingModel(model)

# Prepare a dummy batch matching the input signature:
# one 5-token text sequence and one 3x224x224 image.
dummy_inputs = {
    "input_ids": tf.zeros([1, 5], dtype=tf.int32),
    "attention_mask": tf.ones([1, 5], dtype=tf.int32),
    "pixel_values": tf.zeros([1, 3, 224, 224], dtype=tf.float32)
}

# Smoke-test the exact function that is exported below, so a signature or
# shape problem fails here rather than after a broken SavedModel has been
# written to disk. One image scored against one text gives a [1, 1] matrix.
dummy_outputs = serving_model.call(dummy_inputs)
assert dummy_outputs["logits_per_image"].shape.as_list() == [1, 1], (
    "unexpected logits_per_image shape: "
    f"{dummy_outputs['logits_per_image'].shape}"
)

# Save the serving model as a SavedModel.
# The serving signature is passed explicitly instead of relying on
# tf.saved_model.save discovering a suitable tf.function on its own.
tf.saved_model.save(
    serving_model,
    "./saved_model",
    signatures={"serving_default": serving_model.call},
)

All model checkpoint layers were used when initializing TFCLIPModel.

All the layers of TFCLIPModel were initialized from the model checkpoint at openai/clip-vit-base-patch32.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFCLIPModel for predictions without further training.




















INFO:tensorflow:Assets written to: ./saved_model\assets


INFO:tensorflow:Assets written to: ./saved_model\assets
