In [None]:

from PIL import Image
import requests

from transformers import CLIPProcessor, CLIPModel

# Zero-shot image classification demo with CLIP.
# The processor is created here, from the same checkpoint as the model, so
# that this cell runs on a fresh kernel (Restart Kernel -> Run All) instead
# of relying on a `processor` variable defined further down the notebook.
model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors here rather than as an obscure
# "cannot identify image file" error from PIL.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)

inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True)

outputs = model(**inputs)
logits_per_image = outputs.logits_per_image # this is the image-text similarity score
probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities

In [None]:
import transformers
from transformers import PreTrainedModel,CLIPPreTrainedModel,CLIPTextModel
from transformers import BertModel,CLIPTextConfig,XLMRobertaTokenizer,XLMRobertaModel
from transformers import RobertaPreTrainedModel,RobertaConfig
import torch 
from typing import Optional,List,Union,Tuple
class KDmodel(RobertaPreTrainedModel):
    """Knowledge-distillation wrapper: an XLM-R student learns to match a frozen CLIP text teacher.

    Both encoders are run on their own tokenization of the same text, each
    output is pooled into one vector per sequence, the student vector is
    projected to the teacher's hidden size, and a distillation loss between
    the two vectors is returned.

    Attributes:
        student: trainable ``XLMRobertaModel`` ("xlm-roberta-large").
        teacher: frozen ``CLIPTextModel`` ("openai/clip-vit-base-patch32").
        projection: trainable linear map from the student hidden size to the
            teacher hidden size; the two encoders have different widths, so
            an element-wise loss is impossible without it.
        pooling: ``'average'`` (mean over tokens of the final-layer output,
            ignoring padding when a mask is given) or ``'single'`` (one token
            per sequence: the student's first token and the teacher's
            ``pooler_output``).
        loss_fn: ``'mse'``, ``'cosine'`` or ``'logits'``.
    """

    def __init__(self,config):
        super().__init__(config)
        self.student = XLMRobertaModel.from_pretrained("xlm-roberta-large")
        self.teacher = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
        # Bridge the width mismatch between student and teacher representations.
        self.projection = torch.nn.Linear(
            self.student.config.hidden_size,
            self.teacher.config.hidden_size,
        )
        self.pooling = 'average'
        self.loss_fn = 'mse'
        self.freeze()

    def freeze(self):
        """Disable gradients for every teacher parameter; only the student and projection train."""
        for _, param in self.teacher.named_parameters():
            param.requires_grad_(False)

    @staticmethod
    def _masked_mean(hidden, mask):
        """Average ``hidden`` over the token axis (-2), skipping positions where ``mask`` is 0.

        Falls back to a plain mean when no mask is supplied.
        """
        if mask is None:
            return hidden.mean(-2)
        weights = mask.unsqueeze(-1).to(hidden.dtype)
        # clamp guards against division by zero for an all-padding row
        return (hidden * weights).sum(-2) / weights.sum(-2).clamp(min=1.0)

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        teacher_input_ids = None,
        teacher_attention_mask = None,
    ) :
        """Run student and teacher on the same text and return the distillation loss.

        Args:
            input_ids .. output_attentions: forwarded by keyword to the
                student ``XLMRobertaModel``.
            return_dict: accepted for signature compatibility but not
                forwarded; the encoders are always called with
                ``return_dict=True`` so outputs can be read by name, and this
                method always returns a dict.
            teacher_input_ids: token ids produced by the CLIP tokenizer for
                the same text (required).
            teacher_attention_mask: attention mask matching ``teacher_input_ids``.

        Returns:
            ``{'loss': loss}`` where ``loss`` is a scalar tensor.

        Raises:
            ValueError: if ``teacher_input_ids`` is missing, or if
                ``self.pooling`` / ``self.loss_fn`` hold an unsupported value.
        """
        if teacher_input_ids is None:
            raise ValueError("teacher_input_ids is required to compute the distillation loss")

        # Keyword arguments + named outputs: no reliance on positional order
        # or on tuple indices that shift with the requested outputs.
        student_out = self.student(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            return_dict=True,
        )

        # The teacher is frozen, so skip building an autograd graph for it.
        with torch.no_grad():
            teacher_out = self.teacher(
                input_ids=teacher_input_ids,
                attention_mask=teacher_attention_mask,
                return_dict=True,
            )

        # pooling: reduce each sequence to a single vector
        if self.pooling=='average':
            student_repr = self._masked_mean(student_out.last_hidden_state, attention_mask)
            teacher_repr = self._masked_mean(teacher_out.last_hidden_state, teacher_attention_mask)
        elif self.pooling=='single':
            # NOTE(review): 'single' is interpreted as "one token per sequence";
            # confirm this matches the intended training recipe.
            student_repr = student_out.last_hidden_state[:, 0]
            teacher_repr = teacher_out.pooler_output
        else:
            raise ValueError(f"unsupported pooling: {self.pooling!r}")

        student_repr = self.projection(student_repr)

        # loss: the student is always the prediction, the teacher the target
        if self.loss_fn=='mse':
            loss = torch.nn.MSELoss()(student_repr, teacher_repr)
        elif self.loss_fn=='cosine':
            # CosineEmbeddingLoss needs a target; +1 means "pull the pair together".
            target = torch.ones(student_repr.size(0), device=student_repr.device)
            loss = torch.nn.CosineEmbeddingLoss()(student_repr, teacher_repr, target)
        elif self.loss_fn=='logits':
            # NOTE(review): treats the teacher vector as soft labels via softmax;
            # confirm this is the intended meaning of 'logits'.
            loss = torch.nn.CrossEntropyLoss()(student_repr, teacher_repr.softmax(dim=-1))
        else:
            raise ValueError(f"unsupported loss_fn: {self.loss_fn!r}")

        return {
            'loss':loss,
        }

In [1]:
# preprocess
# NOTE: this cell uses names defined in the earlier cells (CLIPProcessor,
# XLMRobertaTokenizer, RobertaConfig, KDmodel). Run those cells first --
# executing this one on its own raises NameError.
sample = 'a lonely puppy.'
tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-large")
# NOTE(review): the processor checkpoint (large-patch14) differs from the
# teacher checkpoint used inside KDmodel (base-patch32); presumably both share
# the same text tokenizer -- confirm.
processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")

# return_tensors="pt" is required: the model's forward expects tensors, not
# the plain Python lists the tokenizers return by default.
student_input = tokenizer([sample,], padding=True, truncation=True, return_tensors="pt")

# Pass the text by keyword so the call does not depend on the position of the
# processor's `text` parameter.
teacher_encoding = processor(text=[sample,], padding=True, truncation=True, return_tensors="pt")
# Rename the teacher tensors so they do not collide with the student's keys.
teacher_input = {
    'teacher_input_ids': teacher_encoding['input_ids'],
    'teacher_attention_mask': teacher_encoding['attention_mask'],
}

inputs = {**student_input,**teacher_input}

config = RobertaConfig.from_pretrained("xlm-roberta-large")
model = KDmodel(config)
model(**inputs)


NameError: name 'XLMRobertaTokenizer' is not defined