From 7ee957205dd4df381f6eb2f1d33a20d1d54bce38 Mon Sep 17 00:00:00 2001
From: hugehope <166521727+hugehope@users.noreply.github.com>
Date: Thu, 11 Apr 2024 16:49:36 +0800
Subject: [PATCH] chore: remove repetitive words (#30174)

Signed-off-by: hugehope
---
 src/transformers/models/canine/modeling_canine.py    | 2 +-
 src/transformers/models/mamba/configuration_mamba.py | 2 +-
 src/transformers/models/rwkv/configuration_rwkv.py   | 2 +-
 src/transformers/optimization.py                     | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/transformers/models/canine/modeling_canine.py b/src/transformers/models/canine/modeling_canine.py
index 023287153afc3..39d89c6e0b3da 100644
--- a/src/transformers/models/canine/modeling_canine.py
+++ b/src/transformers/models/canine/modeling_canine.py
@@ -608,7 +608,7 @@ def forward(
                 chunk_end = min(from_seq_length, chunk_start + self.attend_from_chunk_width)
                 from_chunks.append((chunk_start, chunk_end))

-            # Determine the chunks (windows) that will will attend *to*.
+            # Determine the chunks (windows) that will attend *to*.
             to_chunks = []
             if self.first_position_attends_to_all:
                 to_chunks.append((0, to_seq_length))
diff --git a/src/transformers/models/mamba/configuration_mamba.py b/src/transformers/models/mamba/configuration_mamba.py
index 695d9a62737dc..b3e9b4eb946b9 100644
--- a/src/transformers/models/mamba/configuration_mamba.py
+++ b/src/transformers/models/mamba/configuration_mamba.py
@@ -67,7 +67,7 @@ class MambaConfig(PretrainedConfig):
         residual_in_fp32 (`bool`, *optional*, defaults to `True`):
             Whether or not residuals should be in `float32`. If set to `False` residuals will keep the same `dtype` as the rest of the model
         time_step_rank (`Union[int,str]`, *optional*, defaults to `"auto"`):
-            Rank of the the discretization projection matrix. `"auto"` means that it will default to `math.ceil(self.hidden_size / 16)`
+            Rank of the discretization projection matrix. `"auto"` means that it will default to `math.ceil(self.hidden_size / 16)`
         time_step_scale (`float`, *optional*, defaults to 1.0):
             Scale used used to scale `dt_proj.bias`.
         time_step_min (`float`, *optional*, defaults to 0.001):
diff --git a/src/transformers/models/rwkv/configuration_rwkv.py b/src/transformers/models/rwkv/configuration_rwkv.py
index a6abfc549e667..5e0598dad5c42 100644
--- a/src/transformers/models/rwkv/configuration_rwkv.py
+++ b/src/transformers/models/rwkv/configuration_rwkv.py
@@ -41,7 +41,7 @@ class RwkvConfig(PretrainedConfig):
             Vocabulary size of the RWKV model. Defines the number of different tokens that can be represented by the
             `inputs_ids` passed when calling [`RwkvModel`].
         context_length (`int`, *optional*, defaults to 1024):
-            The maximum sequence length that this model can be be used with in a single forward (using it in RNN mode
+            The maximum sequence length that this model can be used with in a single forward (using it in RNN mode
             lets use any sequence length).
         hidden_size (`int`, *optional*, defaults to 4096):
             Dimensionality of the embeddings and hidden states.
diff --git a/src/transformers/optimization.py b/src/transformers/optimization.py
index ce9f9b78dcebe..3727784fba9ee 100644
--- a/src/transformers/optimization.py
+++ b/src/transformers/optimization.py
@@ -273,7 +273,7 @@ def get_polynomial_decay_schedule_with_warmup(

     lr_init = optimizer.defaults["lr"]
     if not (lr_init > lr_end):
-        raise ValueError(f"lr_end ({lr_end}) must be be smaller than initial lr ({lr_init})")
+        raise ValueError(f"lr_end ({lr_end}) must be smaller than initial lr ({lr_init})")

     lr_lambda = partial(
         _get_polynomial_decay_schedule_with_warmup_lr_lambda,
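
As a quick illustration (not part of the patch itself), the `optimization.py` hunk above touches the validation in `get_polynomial_decay_schedule_with_warmup`, which requires `lr_end` to be smaller than the optimizer's initial learning rate. Below is a minimal sketch of how that check surfaces in practice; the toy model and `torch.optim.AdamW` optimizer are assumptions chosen purely for illustration.

```python
import torch
from torch.optim import AdamW
from transformers import get_polynomial_decay_schedule_with_warmup

# Toy setup, purely illustrative.
model = torch.nn.Linear(8, 2)
optimizer = AdamW(model.parameters(), lr=5e-5)

# Valid: lr_end (1e-7) is smaller than the initial lr (5e-5).
scheduler = get_polynomial_decay_schedule_with_warmup(
    optimizer, num_warmup_steps=10, num_training_steps=100, lr_end=1e-7
)

# Invalid: lr_end >= initial lr raises the (now correctly worded) error:
#   ValueError: lr_end (0.001) must be smaller than initial lr (5e-05)
# get_polynomial_decay_schedule_with_warmup(
#     optimizer, num_warmup_steps=10, num_training_steps=100, lr_end=1e-3
# )
```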