diff --git a/scripts/extra/clip.py b/scripts/extra/clip.py
index 06d1c04af..60dc88b9d 100644
--- a/scripts/extra/clip.py
+++ b/scripts/extra/clip.py
@@ -8,6 +8,13 @@ class CLIPVisionOnnxConfig(ViTOnnxConfig):
     pass
 
 
+class CLIPVisionModelOnnxConfig(CLIPVisionOnnxConfig):
+    @property
+    def outputs(self) -> Dict[str, Dict[int, str]]:
+        outputs = {"pooler_output": {0: "batch_size"},}
+        for i in range(self._normalized_config.num_hidden_layers + 1):
+            outputs[f"hidden_states.{i}"] = {0: "batch_size", 1: "sequence_length"}
+        return outputs
 
 class CLIPTextModelWithProjectionOnnxConfig(CLIPTextOnnxConfig):
     @property
diff --git a/src/models.js b/src/models.js
index a8112912e..322789e27 100644
--- a/src/models.js
+++ b/src/models.js
@@ -3169,6 +3169,15 @@ export class CLIPVisionModelWithProjection extends CLIPPreTrainedModel {
         return super.from_pretrained(pretrained_model_name_or_path, options);
     }
 }
+
+export class CLIPVisionModel extends CLIPPreTrainedModel {
+    /** @type {PreTrainedModel.from_pretrained} */
+    static async from_pretrained(pretrained_model_name_or_path, options = {}) {
+        // Update default model file name if not provided
+        options.model_file_name ??= 'vision_model';
+        return super.from_pretrained(pretrained_model_name_or_path, options);
+    }
+}
 //////////////////////////////////////////////////
 
 