diff --git a/docs/source/en/model_doc/metaclip_2.md b/docs/source/en/model_doc/metaclip_2.md index 62d14fa66e18..b9fbba090f0a 100644 --- a/docs/source/en/model_doc/metaclip_2.md +++ b/docs/source/en/model_doc/metaclip_2.md @@ -32,7 +32,7 @@ MetaCLIP 2 is a replication of the original CLIP model trained on 300+ languages This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/facebookresearch/MetaCLIP). -You can find all the MetaCLIP 2 checkpoints under the [Meta](https://huggingface.co/facebook?search_models=metaclip-2) organization. +You can find all the MetaCLIP 2 checkpoints under the [Meta](https://huggingface.co/facebook/models?search=metaclip-2) organization. > [!TIP] > Click on the MetaCLIP 2 models in the right sidebar for more examples of how to apply MetaCLIP 2 to different image and language tasks. diff --git a/src/transformers/models/metaclip_2/configuration_metaclip_2.py b/src/transformers/models/metaclip_2/configuration_metaclip_2.py index 32b21a193e8d..a0cec0f3c5b3 100644 --- a/src/transformers/models/metaclip_2/configuration_metaclip_2.py +++ b/src/transformers/models/metaclip_2/configuration_metaclip_2.py @@ -14,18 +14,18 @@ class MetaClip2TextConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`MetaClip2TextModel`]. It is used to instantiate a METACLIP_2 - text encoder according to the specified arguments, defining the model architecture. Instantiating a configuration - with the defaults will yield a similar configuration to that of the text encoder of the METACLIP_2 - [openai/metaclip_2-vit-base-patch32](https://huggingface.co/openai/metaclip_2-vit-base-patch32) architecture. + This is the configuration class to store the configuration of a [`MetaClip2TextModel`]. It is used to instantiate + a MetaClip2 text encoder according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the MetaClip2 + [facebook/metaclip-2-worldwide-huge-quickgelu](https://huggingface.co/facebook/metaclip-2-worldwide-huge-quickgelu) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. Args: vocab_size (`int`, *optional*, defaults to 49408): - Vocabulary size of the METACLIP_2 text model. Defines the number of different tokens that can be represented by - the `inputs_ids` passed when calling [`MetaClip2Model`]. + Vocabulary size of the MetaClip2 text model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling [`MetaClip2TextModel`]. hidden_size (`int`, *optional*, defaults to 512): Dimensionality of the encoder layers and the pooler layer. 
intermediate_size (`int`, *optional*, defaults to 2048): @@ -63,10 +63,10 @@ class MetaClip2TextConfig(PretrainedConfig): ```python >>> from transformers import MetaClip2TextConfig, MetaClip2TextModel - >>> # Initializing a MetaClip2TextConfig with openai/metaclip_2-vit-base-patch32 style configuration + >>> # Initializing a MetaClip2TextConfig with facebook/metaclip-2-worldwide-huge-quickgelu style configuration >>> configuration = MetaClip2TextConfig() - >>> # Initializing a MetaClip2TextModel (with random weights) from the openai/metaclip_2-vit-base-patch32 style configuration + >>> # Initializing a MetaClip2TextModel (with random weights) from the facebook/metaclip-2-worldwide-huge-quickgelu style configuration >>> model = MetaClip2TextModel(configuration) >>> # Accessing the model configuration @@ -115,10 +115,10 @@ def __init__( class MetaClip2VisionConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`MetaClip2VisionModel`]. It is used to instantiate a - METACLIP_2 vision encoder according to the specified arguments, defining the model architecture. Instantiating a - configuration with the defaults will yield a similar configuration to that of the vision encoder of the METACLIP_2 - [openai/metaclip_2-vit-base-patch32](https://huggingface.co/openai/metaclip_2-vit-base-patch32) architecture. + This is the configuration class to store the configuration of a [`MetaClip2VisionModel`]. It is used to instantiate a MetaClip2 + vision encoder according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the vision encoder of the MetaClip2 + [facebook/metaclip-2-worldwide-huge-quickgelu](https://huggingface.co/facebook/metaclip-2-worldwide-huge-quickgelu) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. @@ -158,10 +158,10 @@ class MetaClip2VisionConfig(PretrainedConfig): ```python >>> from transformers import MetaClip2VisionConfig, MetaClip2VisionModel - >>> # Initializing a MetaClip2VisionConfig with openai/metaclip_2-vit-base-patch32 style configuration + >>> # Initializing a MetaClip2VisionConfig with facebook/metaclip-2-worldwide-huge-quickgelu style configuration >>> configuration = MetaClip2VisionConfig() - >>> # Initializing a MetaClip2VisionModel (with random weights) from the openai/metaclip_2-vit-base-patch32 style configuration + >>> # Initializing a MetaClip2VisionModel (with random weights) from the facebook/metaclip-2-worldwide-huge-quickgelu style configuration >>> model = MetaClip2VisionModel(configuration) >>> # Accessing the model configuration @@ -207,10 +207,10 @@ def __init__( class MetaClip2Config(PretrainedConfig): r""" - [`MetaClip2Config`] is the configuration class to store the configuration of a [`MetaClip2Model`]. It is used to instantiate - a METACLIP_2 model according to the specified arguments, defining the text model and vision model configs. Instantiating - a configuration with the defaults will yield a similar configuration to that of the METACLIP_2 - [openai/metaclip_2-vit-base-patch32](https://huggingface.co/openai/metaclip_2-vit-base-patch32) architecture. + [`MetaClip2Config`] is the configuration class to store the configuration of a [`MetaClip2Model`]. 
It is used to + instantiate a MetaClip2 model according to the specified arguments, defining the text model and vision model configs. + Instantiating a configuration with the defaults will yield a similar configuration to that of the MetaClip2 + [facebook/metaclip-2-worldwide-huge-quickgelu](https://huggingface.co/facebook/metaclip-2-worldwide-huge-quickgelu) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. @@ -223,7 +223,7 @@ class MetaClip2Config(PretrainedConfig): projection_dim (`int`, *optional*, defaults to 512): Dimensionality of text and vision projection layers. logit_scale_init_value (`float`, *optional*, defaults to 2.6592): - The initial value of the *logit_scale* parameter. Default is used as per the original METACLIP_2 implementation. + The initial value of the *logit_scale* parameter. Default is used as per the original MetaClip2 implementation. kwargs (*optional*): Dictionary of keyword arguments. @@ -232,10 +232,10 @@ class MetaClip2Config(PretrainedConfig): ```python >>> from transformers import MetaClip2Config, MetaClip2Model - >>> # Initializing a MetaClip2Config with openai/metaclip_2-vit-base-patch32 style configuration + >>> # Initializing a MetaClip2Config with facebook/metaclip-2-worldwide-huge-quickgelu style configuration >>> configuration = MetaClip2Config() - >>> # Initializing a MetaClip2Model (with random weights) from the openai/metaclip_2-vit-base-patch32 style configuration + >>> # Initializing a MetaClip2Model (with random weights) from the facebook/metaclip-2-worldwide-huge-quickgelu style configuration >>> model = MetaClip2Model(configuration) >>> # Accessing the model configuration diff --git a/src/transformers/models/metaclip_2/modeling_metaclip_2.py b/src/transformers/models/metaclip_2/modeling_metaclip_2.py index 56ab43c03010..e326cc954987 100644 --- a/src/transformers/models/metaclip_2/modeling_metaclip_2.py +++ b/src/transformers/models/metaclip_2/modeling_metaclip_2.py @@ -542,6 +542,36 @@ def forward( """ ) class MetaClip2TextModel(MetaClip2PreTrainedModel): + """ + The text model from MetaClip2 without any head or projection on top. + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Args: + config ([`MetaClip2TextConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
+ + Examples: + + ```python + >>> from transformers import AutoTokenizer, MetaClip2TextModel + + >>> model = MetaClip2TextModel.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + + >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooled_output = outputs.pooler_output # pooled (EOS token) states + ```""" + config: MetaClip2TextConfig _no_split_modules = ["MetaClip2TextEmbeddings", "MetaClip2EncoderLayer"] @@ -575,8 +605,8 @@ def forward( ```python >>> from transformers import AutoTokenizer, MetaClip2TextModel - >>> model = MetaClip2TextModel.from_pretrained("openai/metaclip_2-vit-base-patch32") - >>> tokenizer = AutoTokenizer.from_pretrained("openai/metaclip_2-vit-base-patch32") + >>> model = MetaClip2TextModel.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") @@ -614,6 +644,36 @@ class MetaClip2TextModelOutput(ModelOutput): @auto_docstring class MetaClip2TextModelWithProjection(MetaClip2PreTrainedModel): + """ + MetaClip2 text model with a projection layer on top (a linear layer on top of the pooled output). + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Args: + config ([`MetaClip2TextConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
+ + Examples: + + ```python + >>> from transformers import AutoTokenizer, MetaClip2TextModelWithProjection + + >>> model = MetaClip2TextModelWithProjection.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + + >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> text_embeds = outputs.text_embeds + ```""" + config: MetaClip2TextConfig _no_split_modules = ["MetaClip2TextEmbeddings", "MetaClip2EncoderLayer"] @@ -651,8 +711,8 @@ def forward( ```python >>> from transformers import AutoTokenizer, MetaClip2TextModelWithProjection - >>> model = MetaClip2TextModelWithProjection.from_pretrained("openai/metaclip_2-vit-base-patch32") - >>> tokenizer = AutoTokenizer.from_pretrained("openai/metaclip_2-vit-base-patch32") + >>> model = MetaClip2TextModelWithProjection.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") @@ -740,6 +800,42 @@ def _get_vector_norm(tensor: torch.Tensor) -> torch.Tensor: @auto_docstring class MetaClip2Model(MetaClip2PreTrainedModel): + """ + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Args: + config ([`MetaClip2Config`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, MetaClip2Model + + >>> model = MetaClip2Model.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + >>> processor = AutoProcessor.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor( + ... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True + ... 
) + + >>> outputs = model(**inputs) + >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score + >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities + ```""" + config: MetaClip2Config _no_split_modules = ["MetaClip2TextEmbeddings", "MetaClip2EncoderLayer", "MetaClip2VisionEmbeddings"] _supports_flash_attn = False # mask creation only accounts for sdpa/eager @@ -798,8 +894,8 @@ def get_text_features( ```python >>> from transformers import AutoTokenizer, MetaClip2Model - >>> model = MetaClip2Model.from_pretrained("openai/metaclip_2-vit-base-patch32") - >>> tokenizer = AutoTokenizer.from_pretrained("openai/metaclip_2-vit-base-patch32") + >>> model = MetaClip2Model.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") >>> text_features = model.get_text_features(**inputs) @@ -843,8 +939,8 @@ def get_image_features( >>> import requests >>> from transformers import AutoProcessor, MetaClip2Model - >>> model = MetaClip2Model.from_pretrained("openai/metaclip_2-vit-base-patch32") - >>> processor = AutoProcessor.from_pretrained("openai/metaclip_2-vit-base-patch32") + >>> model = MetaClip2Model.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + >>> processor = AutoProcessor.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) @@ -885,8 +981,9 @@ def forward( interpolate_pos_encoding: bool = False, ) -> MetaClip2Output: r""" - return_loss (`bool`, *optional*): - Whether or not to return the contrastive loss. + Args: + return_loss (`bool`, *optional*): + Whether or not to return the contrastive loss. Examples: @@ -895,8 +992,8 @@ def forward( >>> import requests >>> from transformers import AutoProcessor, MetaClip2Model - >>> model = MetaClip2Model.from_pretrained("openai/metaclip_2-vit-base-patch32") - >>> processor = AutoProcessor.from_pretrained("openai/metaclip_2-vit-base-patch32") + >>> model = MetaClip2Model.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + >>> processor = AutoProcessor.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) @@ -1015,6 +1112,42 @@ def forward( """ ) class MetaClip2VisionModel(MetaClip2PreTrainedModel): + """ + The vision model from MetaClip2 without any head or projection on top. + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Args: + config ([`MetaClip2VisionConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
+ + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, MetaClip2VisionModel + + >>> model = MetaClip2VisionModel.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + >>> processor = AutoProcessor.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooled_output = outputs.pooler_output # pooled CLS states + ```""" + config: MetaClip2VisionConfig main_input_name = "pixel_values" _no_split_modules = ["MetaClip2EncoderLayer"] @@ -1038,15 +1171,15 @@ def forward( interpolate_pos_encoding: bool = False, ) -> BaseModelOutputWithPooling: r""" - Example: + Examples: ```python >>> from PIL import Image >>> import requests >>> from transformers import AutoProcessor, MetaClip2VisionModel - >>> model = MetaClip2VisionModel.from_pretrained("openai/metaclip_2-vit-base-patch32") - >>> processor = AutoProcessor.from_pretrained("openai/metaclip_2-vit-base-patch32") + >>> model = MetaClip2VisionModel.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + >>> processor = AutoProcessor.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) @@ -1086,6 +1219,41 @@ class MetaClip2VisionModelOutput(ModelOutput): @auto_docstring class MetaClip2VisionModelWithProjection(MetaClip2PreTrainedModel): + """ + MetaClip2 vision model with a projection layer on top (a linear layer on top of the pooled output). + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Args: + config ([`MetaClip2VisionConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
+ + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, MetaClip2VisionModelWithProjection + + >>> model = MetaClip2VisionModelWithProjection.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + >>> processor = AutoProcessor.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> image_embeds = outputs.image_embeds + ```""" + config: MetaClip2VisionConfig main_input_name = "pixel_values" @@ -1120,8 +1288,8 @@ def forward( >>> import requests >>> from transformers import AutoProcessor, MetaClip2VisionModelWithProjection - >>> model = MetaClip2VisionModelWithProjection.from_pretrained("openai/metaclip_2-vit-base-patch32") - >>> processor = AutoProcessor.from_pretrained("openai/metaclip_2-vit-base-patch32") + >>> model = MetaClip2VisionModelWithProjection.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + >>> processor = AutoProcessor.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) diff --git a/src/transformers/models/metaclip_2/modular_metaclip_2.py b/src/transformers/models/metaclip_2/modular_metaclip_2.py index d4c259849e69..fb80b06c9ac9 100644 --- a/src/transformers/models/metaclip_2/modular_metaclip_2.py +++ b/src/transformers/models/metaclip_2/modular_metaclip_2.py @@ -28,15 +28,178 @@ logger = logging.get_logger(__name__) +_CHECKPOINT_FOR_DOC = "facebook/metaclip-2-worldwide-huge-quickgelu" +_CONFIG_FOR_DOC = "MetaClip2Config" + + class MetaClip2TextConfig(CLIPTextConfig): + r""" + This is the configuration class to store the configuration of a [`MetaClip2TextModel`]. It is used to instantiate + a MetaClip2 text encoder according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the MetaClip2 + [facebook/metaclip-2-worldwide-huge-quickgelu](https://huggingface.co/facebook/metaclip-2-worldwide-huge-quickgelu) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 49408): + Vocabulary size of the MetaClip2 text model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling [`MetaClip2TextModel`]. + hidden_size (`int`, *optional*, defaults to 512): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 2048): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + projection_dim (`int`, *optional*, defaults to 512): + Dimensionality of text and vision projection layers. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer encoder. + max_position_embeddings (`int`, *optional*, defaults to 77): + The maximum sequence length that this model might ever be used with. 
Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported. + layer_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the layer normalization layers. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + initializer_factor (`float`, *optional*, defaults to 1.0): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). + pad_token_id (`int`, *optional*, defaults to 1): + Padding token id. + bos_token_id (`int`, *optional*, defaults to 49406): + Beginning of stream token id. + eos_token_id (`int`, *optional*, defaults to 49407): + End of stream token id. + + Example: + + ```python + >>> from transformers import MetaClip2TextConfig, MetaClip2TextModel + + >>> # Initializing a MetaClip2TextConfig with facebook/metaclip-2-worldwide-huge-quickgelu style configuration + >>> configuration = MetaClip2TextConfig() + + >>> # Initializing a MetaClip2TextModel (with random weights) from the facebook/metaclip-2-worldwide-huge-quickgelu style configuration + >>> model = MetaClip2TextModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + pass class MetaClip2VisionConfig(CLIPVisionConfig): + r""" + This is the configuration class to store the configuration of a [`MetaClip2VisionModel`]. It is used to instantiate a MetaClip2 + vision encoder according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the vision encoder of the MetaClip2 + [facebook/metaclip-2-worldwide-huge-quickgelu](https://huggingface.co/facebook/metaclip-2-worldwide-huge-quickgelu) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + projection_dim (`int`, *optional*, defaults to 512): + Dimensionality of text and vision projection layers. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 32): + The size (resolution) of each patch. + hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the layer normalization layers. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + initializer_factor (`float`, *optional*, defaults to 1.0): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). + + Example: + + ```python + >>> from transformers import MetaClip2VisionConfig, MetaClip2VisionModel + + >>> # Initializing a MetaClip2VisionConfig with facebook/metaclip-2-worldwide-huge-quickgelu style configuration + >>> configuration = MetaClip2VisionConfig() + + >>> # Initializing a MetaClip2VisionModel (with random weights) from the facebook/metaclip-2-worldwide-huge-quickgelu style configuration + >>> model = MetaClip2VisionModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + pass class MetaClip2Config(CLIPConfig): + r""" + [`MetaClip2Config`] is the configuration class to store the configuration of a [`MetaClip2Model`]. It is used to + instantiate a MetaClip2 model according to the specified arguments, defining the text model and vision model configs. + Instantiating a configuration with the defaults will yield a similar configuration to that of the MetaClip2 + [facebook/metaclip-2-worldwide-huge-quickgelu](https://huggingface.co/facebook/metaclip-2-worldwide-huge-quickgelu) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + text_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`MetaClip2TextConfig`]. + vision_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`MetaClip2VisionConfig`]. + projection_dim (`int`, *optional*, defaults to 512): + Dimensionality of text and vision projection layers. + logit_scale_init_value (`float`, *optional*, defaults to 2.6592): + The initial value of the *logit_scale* parameter. Default is used as per the original MetaClip2 implementation. + kwargs (*optional*): + Dictionary of keyword arguments. + + Example: + + ```python + >>> from transformers import MetaClip2Config, MetaClip2Model + + >>> # Initializing a MetaClip2Config with facebook/metaclip-2-worldwide-huge-quickgelu style configuration + >>> configuration = MetaClip2Config() + + >>> # Initializing a MetaClip2Model (with random weights) from the facebook/metaclip-2-worldwide-huge-quickgelu style configuration + >>> model = MetaClip2Model(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + + >>> # We can also initialize a MetaClip2Config from a MetaClip2TextConfig and a MetaClip2VisionConfig + >>> from transformers import MetaClip2TextConfig, MetaClip2VisionConfig + + >>> # Initializing a MetaClip2Text and MetaClip2Vision configuration + >>> config_text = MetaClip2TextConfig() + >>> config_vision = MetaClip2VisionConfig() + + >>> config = MetaClip2Config.from_text_vision_configs(config_text, config_vision) + ```""" + pass @@ -175,14 +338,105 @@ def forward( class MetaClip2TextModel(CLIPTextModel): + """ + The text model from MetaClip2 without any head or projection on top. 
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Args: + config ([`MetaClip2TextConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. + + Examples: + + ```python + >>> from transformers import AutoTokenizer, MetaClip2TextModel + + >>> model = MetaClip2TextModel.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + + >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooled_output = outputs.pooler_output # pooled (EOS token) states + ```""" + def __init__(self, config: MetaClip2TextConfig): super().__init__(config) self.text_model = MetaClip2TextTransformer(config) # Initialize weights and apply final processing self.post_init() + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + ): + r""" + Examples: + + ```python + >>> from transformers import AutoTokenizer, MetaClip2TextModel + + >>> model = MetaClip2TextModel.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + + >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooled_output = outputs.pooler_output # pooled (EOS token) states + ```""" + return super().forward( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + class MetaClip2TextModelWithProjection(CLIPTextModelWithProjection): + """ + MetaClip2 text model with a projection layer on top (a linear layer on top of the pooled output). + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Args: + config ([`MetaClip2TextConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
+ + Examples: + + ```python + >>> from transformers import AutoTokenizer, MetaClip2TextModelWithProjection + + >>> model = MetaClip2TextModelWithProjection.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + + >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> text_embeds = outputs.text_embeds + ```""" + def __init__(self, config: MetaClip2TextConfig): super().__init__(config) @@ -194,8 +448,74 @@ def __init__(self, config: MetaClip2TextConfig): # Initialize weights and apply final processing self.post_init() + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + ): + r""" + Examples: + + ```python + >>> from transformers import AutoTokenizer, MetaClip2TextModelWithProjection + + >>> model = MetaClip2TextModelWithProjection.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + + >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> text_embeds = outputs.text_embeds + ```""" + return super().forward( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + class MetaClip2Model(CLIPModel): + """ + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Args: + config ([`MetaClip2Config`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, MetaClip2Model + + >>> model = MetaClip2Model.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + >>> processor = AutoProcessor.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor( + ... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True + ... 
) + + >>> outputs = model(**inputs) + >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score + >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities + ```""" + def __init__(self, config: MetaClip2Config): super().__init__(config) @@ -219,13 +539,263 @@ def __init__(self, config: MetaClip2Config): # Initialize weights and apply final processing self.post_init() + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + return_loss: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + ): + r""" + Args: + return_loss (`bool`, *optional*): + Whether or not to return the contrastive loss. + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, MetaClip2Model + + >>> model = MetaClip2Model.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + >>> processor = AutoProcessor.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor( + ... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True + ... ) + + >>> outputs = model(**inputs) + >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score + >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities + ```""" + return super().forward( + input_ids=input_ids, + pixel_values=pixel_values, + attention_mask=attention_mask, + position_ids=position_ids, + return_loss=return_loss, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, + ) + + def get_text_features( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + ): + r""" + Returns: + text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by + applying the projection layer to the pooled output of [`MetaClip2TextModel`].
+ + Examples: + + ```python + >>> from transformers import AutoTokenizer, MetaClip2Model + + >>> model = MetaClip2Model.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + + >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") + >>> text_features = model.get_text_features(**inputs) + ```""" + return super().get_text_features( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + + def get_image_features( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + ): + r""" + Returns: + image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by + applying the projection layer to the pooled output of [`MetaClip2VisionModel`]. + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, MetaClip2Model + + >>> model = MetaClip2Model.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + >>> processor = AutoProcessor.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> image_features = model.get_image_features(**inputs) + ```""" + return super().get_image_features( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, + ) + class MetaClip2VisionModel(CLIPVisionModel): - pass + """ + The vision model from MetaClip2 without any head or projection on top. + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Args: + config ([`MetaClip2VisionConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+ + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, MetaClip2VisionModel + + >>> model = MetaClip2VisionModel.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + >>> processor = AutoProcessor.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooled_output = outputs.pooler_output # pooled CLS states + ```""" + + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + ): + r""" + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, MetaClip2VisionModel + + >>> model = MetaClip2VisionModel.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + >>> processor = AutoProcessor.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooled_output = outputs.pooler_output # pooled CLS states + ```""" + return super().forward( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, + ) class MetaClip2VisionModelWithProjection(CLIPVisionModelWithProjection): - pass + """ + MetaClip2 vision model with a projection layer on top (a linear layer on top of the pooled output). + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Args: + config ([`MetaClip2VisionConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
+ + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, MetaClip2VisionModelWithProjection + + >>> model = MetaClip2VisionModelWithProjection.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + >>> processor = AutoProcessor.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> image_embeds = outputs.image_embeds + ```""" + + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + ): + r""" + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, MetaClip2VisionModelWithProjection + + >>> model = MetaClip2VisionModelWithProjection.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + >>> processor = AutoProcessor.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> image_embeds = outputs.image_embeds + ```""" + return super().forward( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, + ) class MetaClip2ForImageClassification(CLIPForImageClassification):