From 77401c86350f5704ed2607983fc970a6767e27f7 Mon Sep 17 00:00:00 2001
From: ydshieh
Date: Tue, 16 Apr 2024 10:41:05 +0200
Subject: [PATCH 1/4] fix

---
 docker/transformers-all-latest-gpu/Dockerfile |  3 ++
 docs/source/en/model_doc/code_llama.md        |  1 +
 docs/source/en/pipeline_tutorial.md           |  6 ++-
 docs/source/en/task_summary.md                |  2 +-
 src/transformers/models/clap/modeling_clap.py |  6 +--
 .../models/encodec/modeling_encodec.py        |  2 +-
 .../grounding_dino/modeling_grounding_dino.py |  8 ++--
 .../models/siglip/modeling_siglip.py          | 38 ++++++++++++++-----
 utils/not_doctested.txt                       |  1 +
 utils/tests_fetcher.py                        |  2 +
 10 files changed, 48 insertions(+), 21 deletions(-)

diff --git a/docker/transformers-all-latest-gpu/Dockerfile b/docker/transformers-all-latest-gpu/Dockerfile
index 682ec1355eaa..4f596c3c1cf9 100644
--- a/docker/transformers-all-latest-gpu/Dockerfile
+++ b/docker/transformers-all-latest-gpu/Dockerfile
@@ -52,6 +52,9 @@ RUN python3 -m pip install --no-cache-dir natten==0.15.1+torch220$CUDA -f https:
 # For `nougat` tokenizer
 RUN python3 -m pip install --no-cache-dir python-Levenshtein
 
+# For `FastSpeech2ConformerTokenizer` tokenizer
+RUN python3 -m pip install --no-cache-dir g2p-en
+
 # When installing in editable mode, `transformers` is not recognized as a package.
 # this line must be added in order for python to be aware of transformers.
 RUN cd transformers && python3 setup.py develop
diff --git a/docs/source/en/model_doc/code_llama.md b/docs/source/en/model_doc/code_llama.md
index 6906cf431214..cd32a38f5a6a 100644
--- a/docs/source/en/model_doc/code_llama.md
+++ b/docs/source/en/model_doc/code_llama.md
@@ -97,6 +97,7 @@ If you only want the infilled part:
 
 >>> generator = pipeline("text-generation",model="codellama/CodeLlama-7b-hf",torch_dtype=torch.float16, device_map="auto")
 >>> generator('def remove_non_ascii(s: str) -> str:\n    """ <FILL_ME>\n    return result', max_new_tokens = 128)
+[{'generated_text': 'def remove_non_ascii(s: str) -> str:\n    """ <FILL_ME>\n    return resultRemove non-ASCII characters from a string. """\n    result = ""\n    for c in s:\n        if ord(c) < 128:\n            result += c'}]
 ```
 
 Under the hood, the tokenizer [automatically splits by `<FILL_ME>`](https://huggingface.co/docs/transformers/main/model_doc/code_llama#transformers.CodeLlamaTokenizer.fill_token) to create a formatted input string that follows [the original training pattern](https://github.com/facebookresearch/codellama/blob/cb51c14ec761370ba2e2bc351374a79265d0465e/llama/generation.py#L402). This is more robust than preparing the pattern yourself: it avoids pitfalls, such as token glueing, that are very hard to debug. To see how much CPU and GPU memory you need for this model or others, try [this calculator](https://huggingface.co/spaces/hf-accelerate/model-memory-usage) which can help determine that value.
diff --git a/docs/source/en/pipeline_tutorial.md b/docs/source/en/pipeline_tutorial.md
index 42ea3b1d5fbc..8518f639ab9d 100644
--- a/docs/source/en/pipeline_tutorial.md
+++ b/docs/source/en/pipeline_tutorial.md
@@ -270,11 +270,13 @@ For example, if you use this [invoice image](https://huggingface.co/spaces/impir
 >>> from transformers import pipeline
 
 >>> vqa = pipeline(model="impira/layoutlm-document-qa")
->>> vqa(
+>>> output = vqa(
 ...     image="https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png",
 ...     question="What is the invoice number?",
 ... )
-[{'score': 0.42515, 'answer': 'us-001', 'start': 16, 'end': 16}]
+>>> output[0]["score"] = round(output[0]["score"], 3)
+>>> output
+[{'score': 0.425, 'answer': 'us-001', 'start': 16, 'end': 16}]
 ```
diff --git a/docs/source/en/task_summary.md b/docs/source/en/task_summary.md
index 8f7eb041f1f2..a5e2192f8759 100644
--- a/docs/source/en/task_summary.md
+++ b/docs/source/en/task_summary.md
@@ -326,7 +326,7 @@ Document question answering is a task that answers natural language questions fr
 >>> from PIL import Image
 >>> import requests
 
->>> url = "https://datasets-server.huggingface.co/assets/hf-internal-testing/example-documents/--/hf-internal-testing--example-documents/test/2/image/image.jpg"
+>>> url = "https://huggingface.co/datasets/hf-internal-testing/example-documents/resolve/main/jpeg_images/2.jpg"
 >>> image = Image.open(requests.get(url, stream=True).raw)
 
 >>> doc_question_answerer = pipeline("document-question-answering", model="magorshunov/layoutlm-invoices")
diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py
index b2c0df4866b1..7b20b30137d2 100644
--- a/src/transformers/models/clap/modeling_clap.py
+++ b/src/transformers/models/clap/modeling_clap.py
@@ -1719,7 +1719,7 @@ def forward(
         >>> from datasets import load_dataset
         >>> from transformers import AutoProcessor, ClapAudioModel
 
-        >>> dataset = load_dataset("ashraq/esc50")
+        >>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
         >>> audio_sample = dataset["train"]["audio"][0]["array"]
 
         >>> model = ClapAudioModel.from_pretrained("laion/clap-htsat-fused")
@@ -2067,7 +2067,7 @@ def forward(
         >>> from datasets import load_dataset
         >>> from transformers import AutoProcessor, ClapModel
 
-        >>> dataset = load_dataset("ashraq/esc50")
+        >>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
         >>> audio_sample = dataset["train"]["audio"][0]["array"]
 
         >>> model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
@@ -2260,7 +2260,7 @@ def forward(
         >>> model = ClapAudioModelWithProjection.from_pretrained("laion/clap-htsat-fused")
         >>> processor = ClapProcessor.from_pretrained("laion/clap-htsat-fused")
 
-        >>> dataset = load_dataset("ashraq/esc50")
+        >>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
         >>> audio_sample = dataset["train"]["audio"][0]["array"]
 
         >>> inputs = processor(audios=audio_sample, return_tensors="pt")
diff --git a/src/transformers/models/encodec/modeling_encodec.py b/src/transformers/models/encodec/modeling_encodec.py
index 5a299b601b47..48498b741d18 100644
--- a/src/transformers/models/encodec/modeling_encodec.py
+++ b/src/transformers/models/encodec/modeling_encodec.py
@@ -776,7 +776,7 @@ def forward(
         >>> from datasets import load_dataset
         >>> from transformers import AutoProcessor, EncodecModel
 
-        >>> dataset = load_dataset("ashraq/esc50")
+        >>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
         >>> audio_sample = dataset["train"]["audio"][0]["array"]
 
         >>> model_id = "facebook/encodec_24khz"
diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py
index 7f9149de9155..a98901015c94 100644
--- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py
+++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py
@@ -2996,10 +2996,10 @@ def forward(
         ...     outputs, threshold=0.35, target_sizes=target_sizes
         ... )[0]
 
         >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
-        ...     box = [round(i, 2) for i in box.tolist()]
-        ...     print(f"Detected {label.item()} with confidence " f"{round(score.item(), 3)} at location {box}")
-        Detected 1 with confidence 0.453 at location [344.82, 23.18, 637.4, 373.83]
-        Detected 1 with confidence 0.408 at location [11.92, 51.58, 316.57, 472.89]
+        ...     box = [round(i, 1) for i in box.tolist()]
+        ...     print(f"Detected {label.item()} with confidence " f"{round(score.item(), 2)} at location {box}")
+        Detected 1 with confidence 0.45 at location [344.8, 23.2, 637.4, 373.8]
+        Detected 1 with confidence 0.41 at location [11.9, 51.6, 316.6, 472.9]
         ```"""
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
diff --git a/src/transformers/models/siglip/modeling_siglip.py b/src/transformers/models/siglip/modeling_siglip.py
index 6e225803b4a0..dc207846d818 100644
--- a/src/transformers/models/siglip/modeling_siglip.py
+++ b/src/transformers/models/siglip/modeling_siglip.py
@@ -48,10 +48,6 @@
 _CONFIG_FOR_DOC = "SiglipConfig"
 _CHECKPOINT_FOR_DOC = "google/siglip-base-patch16-224"
 
-# Image classification docstring
-_IMAGE_CLASS_CHECKPOINT = "google/siglip-base-patch16-224"
-_IMAGE_CLASS_EXPECTED_OUTPUT = "LABEL_1"
-
 from ..deprecated._archive_maps import SIGLIP_PRETRAINED_MODEL_ARCHIVE_LIST  # noqa: F401, E402
 
 
@@ -1218,12 +1214,7 @@ def __init__(self, config: SiglipConfig) -> None:
         self.post_init()
 
     @add_start_docstrings_to_model_forward(SIGLIP_INPUTS_DOCSTRING)
-    @add_code_sample_docstrings(
-        checkpoint=_IMAGE_CLASS_CHECKPOINT,
-        output_type=ImageClassifierOutput,
-        config_class=_CONFIG_FOR_DOC,
-        expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
-    )
+    @replace_return_docstrings(output_type=ImageClassifierOutput, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
         pixel_values: Optional[torch.Tensor] = None,
@@ -1237,6 +1228,33 @@ def forward(
             Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
             config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
             `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoImageProcessor, SiglipForImageClassification
+        >>> import torch
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> torch.manual_seed(3)  # doctest: +IGNORE_RESULT
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> # note: we are loading a `SiglipModel` from the hub here,
+        >>> # so the head will be randomly initialized, hence the predictions will be random if seed is not set above.
+        >>> image_processor = AutoImageProcessor.from_pretrained("google/siglip-base-patch16-224")
+        >>> model = SiglipForImageClassification.from_pretrained("google/siglip-base-patch16-224")
+
+        >>> inputs = image_processor(images=image, return_tensors="pt")
+        >>> outputs = model(**inputs)
+        >>> logits = outputs.logits
+        >>> # model predicts one of the two classes
+        >>> predicted_class_idx = logits.argmax(-1).item()
+        >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
+        Predicted class: LABEL_0
         """
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
diff --git a/utils/not_doctested.txt b/utils/not_doctested.txt
index 4ac104ee2e42..54924759cafd 100644
--- a/utils/not_doctested.txt
+++ b/utils/not_doctested.txt
@@ -165,6 +165,7 @@ docs/source/en/model_doc/megatron-bert.md
 docs/source/en/model_doc/megatron_gpt2.md
 docs/source/en/model_doc/mgp-str.md
 docs/source/en/model_doc/mistral.md
+docs/source/en/model_doc/mixtral.md
 docs/source/en/model_doc/mluke.md
 docs/source/en/model_doc/mms.md
 docs/source/en/model_doc/mobilebert.md
diff --git a/utils/tests_fetcher.py b/utils/tests_fetcher.py
index a7a24d66596d..60a1c8f53c1f 100644
--- a/utils/tests_fetcher.py
+++ b/utils/tests_fetcher.py
@@ -506,6 +506,8 @@ def get_all_doctest_files() -> List[str]:
     test_files_to_run = py_files + md_files
     # change to use "/" as path separator
     test_files_to_run = ["/".join(Path(x).parts) for x in test_files_to_run]
+    # don't run doctest for files in `src/transformers/models/deprecated`
+    test_files_to_run = [x for x in test_files_to_run if "models/deprecated" not in x]
     # only include files in `src` or `docs/source/en/`
     test_files_to_run = [x for x in test_files_to_run if x.startswith(("src/", "docs/source/en/"))]

From 3c6f046b588d8393b0b52c7bd73df9deeebfaf1c Mon Sep 17 00:00:00 2001
From: ydshieh
Date: Tue, 16 Apr 2024 10:53:47 +0200
Subject: [PATCH 2/4] update

---
 src/transformers/models/siglip/modeling_siglip.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/transformers/models/siglip/modeling_siglip.py b/src/transformers/models/siglip/modeling_siglip.py
index dc207846d818..b963afa1f505 100644
--- a/src/transformers/models/siglip/modeling_siglip.py
+++ b/src/transformers/models/siglip/modeling_siglip.py
@@ -33,7 +33,6 @@
 from ...modeling_utils import PreTrainedModel
 from ...utils import (
     ModelOutput,
-    add_code_sample_docstrings,
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
     logging,

From 3d8068a5bc11c67c65b0297642f210bc3b1e2674 Mon Sep 17 00:00:00 2001
From: ydshieh
Date: Tue, 16 Apr 2024 11:01:01 +0200
Subject: [PATCH 3/4] update

---
 docs/source/es/task_summary.md         | 2 +-
 docs/source/hi/pipeline_tutorial.md    | 6 ++++--
 docs/source/ja/model_doc/code_llama.md | 1 +
 docs/source/ja/pipeline_tutorial.md    | 6 ++++--
 docs/source/ja/task_summary.md         | 2 +-
 docs/source/zh/pipeline_tutorial.md    | 6 ++++--
 docs/source/zh/task_summary.md         | 2 +-
 7 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/docs/source/es/task_summary.md b/docs/source/es/task_summary.md
index 3c24f0dad14f..639654c3697a 100644
--- a/docs/source/es/task_summary.md
+++ b/docs/source/es/task_summary.md
@@ -325,7 +325,7 @@ Las respuestas a preguntas de documentos es una tarea que responde preguntas en
 >>> from PIL import Image
 >>> import requests
 
->>> url = "https://datasets-server.huggingface.co/assets/hf-internal-testing/example-documents/--/hf-internal-testing--example-documents/test/2/image/image.jpg"
+>>> url = "https://huggingface.co/datasets/hf-internal-testing/example-documents/resolve/main/jpeg_images/2.jpg"
"https://huggingface.co/datasets/hf-internal-testing/example-documents/resolve/main/jpeg_images/2.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) >>> doc_question_answerer = pipeline("document-question-answering", model="magorshunov/layoutlm-invoices") diff --git a/docs/source/hi/pipeline_tutorial.md b/docs/source/hi/pipeline_tutorial.md index 5f3cd680480d..d20d5d617a97 100644 --- a/docs/source/hi/pipeline_tutorial.md +++ b/docs/source/hi/pipeline_tutorial.md @@ -270,11 +270,13 @@ NLP कार्यों के लिए [`pipeline`] का उपयोग >>> from transformers import pipeline >>> vqa = pipeline(model="impira/layoutlm-document-qa") ->>> vqa( +>>> output = vqa( ... image="https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png", ... question="What is the invoice number?", ... ) -[{'score': 0.42515, 'answer': 'us-001', 'start': 16, 'end': 16}] +>>> output[0]["score"] = round(output[0]["score"], 3) +>>> output +[{'score': 0.425, 'answer': 'us-001', 'start': 16, 'end': 16}] ``` diff --git a/docs/source/ja/model_doc/code_llama.md b/docs/source/ja/model_doc/code_llama.md index dcbcf9bd93d3..5f6e4e43b45d 100644 --- a/docs/source/ja/model_doc/code_llama.md +++ b/docs/source/ja/model_doc/code_llama.md @@ -95,6 +95,7 @@ def remove_non_ascii(s: str) -> str: >>> generator = pipeline("text-generation",model="codellama/CodeLlama-7b-hf",torch_dtype=torch.float16, device_map="auto") >>> generator('def remove_non_ascii(s: str) -> str:\n """ \n return result', max_new_tokens = 128) +[{'generated_text': 'def remove_non_ascii(s: str) -> str:\n """ \n return resultRemove non-ASCII characters from a string. """\n result = ""\n for c in s:\n if ord(c) < 128:\n result += c'}] ``` 内部では、トークナイザーが [`` によって自動的に分割](https://huggingface.co/docs/transformers/main/model_doc/code_llama#transformers.CodeLlamaTokenizer.fill_token) して、[ に続く書式設定された入力文字列を作成します。オリジナルのトレーニング パターン](https://github.com/facebookresearch/codellama/blob/cb51c14ec761370ba2e2bc351374a79265d0465e/llama/generation.py#L402)。これは、パターンを自分で準備するよりも堅牢です。トークンの接着など、デバッグが非常に難しい落とし穴を回避できます。このモデルまたは他のモデルに必要な CPU および GPU メモリの量を確認するには、その値を決定するのに役立つ [この計算ツール](https://huggingface.co/spaces/hf-accelerate/model-memory-usage) を試してください。 diff --git a/docs/source/ja/pipeline_tutorial.md b/docs/source/ja/pipeline_tutorial.md index 354e2a2be380..5dbda5ce4d4a 100644 --- a/docs/source/ja/pipeline_tutorial.md +++ b/docs/source/ja/pipeline_tutorial.md @@ -246,11 +246,13 @@ for out in pipe(KeyDataset(dataset, "audio")): >>> from transformers import pipeline >>> vqa = pipeline(model="impira/layoutlm-document-qa") ->>> vqa( +>>> output = vqa( ... image="https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png", ... question="What is the invoice number?", ... 
-[{'score': 0.42515, 'answer': 'us-001', 'start': 16, 'end': 16}]
+>>> output[0]["score"] = round(output[0]["score"], 3)
+>>> output
+[{'score': 0.425, 'answer': 'us-001', 'start': 16, 'end': 16}]
 ```
diff --git a/docs/source/ja/task_summary.md b/docs/source/ja/task_summary.md
index 0069f6afaf32..93f4783b1520 100644
--- a/docs/source/ja/task_summary.md
+++ b/docs/source/ja/task_summary.md
@@ -340,7 +340,7 @@ score: 0.9327, start: 30, end: 54, answer: huggingface/transformers
 >>> from PIL import Image
 >>> import requests
 
->>> url = "https://datasets-server.huggingface.co/assets/hf-internal-testing/example-documents/--/hf-internal-testing--example-documents/test/2/image/image.jpg"
+>>> url = "https://huggingface.co/datasets/hf-internal-testing/example-documents/resolve/main/jpeg_images/2.jpg"
 >>> image = Image.open(requests.get(url, stream=True).raw)
 
 >>> doc_question_answerer = pipeline("document-question-answering", model="magorshunov/layoutlm-invoices")
diff --git a/docs/source/zh/pipeline_tutorial.md b/docs/source/zh/pipeline_tutorial.md
index 568f8bb63603..ab2136022913 100644
--- a/docs/source/zh/pipeline_tutorial.md
+++ b/docs/source/zh/pipeline_tutorial.md
@@ -257,11 +257,13 @@ for out in pipe(KeyDataset(dataset, "audio")):
 >>> from transformers import pipeline
 
 >>> vqa = pipeline(model="impira/layoutlm-document-qa")
->>> vqa(
+>>> output = vqa(
 ...     image="https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png",
 ...     question="What is the invoice number?",
 ... )
-[{'score': 0.42515, 'answer': 'us-001', 'start': 16, 'end': 16}]
+>>> output[0]["score"] = round(output[0]["score"], 3)
+>>> output
+[{'score': 0.425, 'answer': 'us-001', 'start': 16, 'end': 16}]
 ```
diff --git a/docs/source/zh/task_summary.md b/docs/source/zh/task_summary.md
index 8d088bfa71b2..8a6a6a51ead9 100644
--- a/docs/source/zh/task_summary.md
+++ b/docs/source/zh/task_summary.md
@@ -332,7 +332,7 @@ score: 0.9327, start: 30, end: 54, answer: huggingface/transformers
 >>> from PIL import Image
 >>> import requests
 
->>> url = "https://datasets-server.huggingface.co/assets/hf-internal-testing/example-documents/--/hf-internal-testing--example-documents/test/2/image/image.jpg"
+>>> url = "https://huggingface.co/datasets/hf-internal-testing/example-documents/resolve/main/jpeg_images/2.jpg"
 >>> image = Image.open(requests.get(url, stream=True).raw)
 
 >>> doc_question_answerer = pipeline("document-question-answering", model="magorshunov/layoutlm-invoices")

From c70edf19bbfb76e17983acbf3ef753aa041e5d1a Mon Sep 17 00:00:00 2001
From: ydshieh
Date: Tue, 16 Apr 2024 11:11:16 +0200
Subject: [PATCH 4/4] fix

---
 src/transformers/models/siglip/modeling_siglip.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/siglip/modeling_siglip.py b/src/transformers/models/siglip/modeling_siglip.py
index b963afa1f505..cf83e8a39ebb 100644
--- a/src/transformers/models/siglip/modeling_siglip.py
+++ b/src/transformers/models/siglip/modeling_siglip.py
@@ -1254,7 +1254,7 @@ def forward(
         >>> predicted_class_idx = logits.argmax(-1).item()
         >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
         Predicted class: LABEL_0
-        """
+        ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states