From 4410ee6cc01ffce53b0a56339ebe6fc5fcd7b0fa Mon Sep 17 00:00:00 2001 From: Alexander Pivovarov Date: Thu, 16 Mar 2023 20:25:18 -0700 Subject: [PATCH] Fix typos --- src/diffusers/models/attention.py | 2 +- src/diffusers/models/controlnet.py | 16 +++++++-------- src/diffusers/models/modeling_utils.py | 4 ++-- src/diffusers/models/resnet.py | 2 +- src/diffusers/models/transformer_2d.py | 6 +++--- src/diffusers/models/unet_1d.py | 2 +- src/diffusers/models/unet_1d_blocks.py | 2 +- src/diffusers/models/unet_2d_blocks.py | 2 +- src/diffusers/models/unet_2d_condition.py | 20 +++++++++---------- src/diffusers/pipelines/pipeline_utils.py | 2 +- .../versatile_diffusion/modeling_text_unet.py | 20 +++++++++---------- tests/models/test_models_unet_2d_condition.py | 10 +++++----- 12 files changed, 44 insertions(+), 44 deletions(-) diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py index aa10bdd0e952..5c7e54e7cd32 100644 --- a/src/diffusers/models/attention.py +++ b/src/diffusers/models/attention.py @@ -69,7 +69,7 @@ def __init__( self.value = nn.Linear(channels, channels) self.rescale_output_factor = rescale_output_factor - self.proj_attn = nn.Linear(channels, channels, 1) + self.proj_attn = nn.Linear(channels, channels, bias=True) self._use_memory_efficient_attention_xformers = False self._attention_op = None diff --git a/src/diffusers/models/controlnet.py b/src/diffusers/models/controlnet.py index 0d59605fe046..ac6e64e4c779 100644 --- a/src/diffusers/models/controlnet.py +++ b/src/diffusers/models/controlnet.py @@ -344,7 +344,7 @@ def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, Atte `processor (`dict` of `AttentionProcessor` or `AttentionProcessor`): The instantiated processor class or a dictionary of processor classes that will be set as the processor of **all** `Attention` layers. - In case `processor` is a dict, the key needs to define the path to the corresponding cross attention processor. This is strongly recommended when setting trainablae attention processors.: + In case `processor` is a dict, the key needs to define the path to the corresponding cross attention processor. This is strongly recommended when setting trainable attention processors.: """ count = len(self.attn_processors.keys()) @@ -379,24 +379,24 @@ def set_attention_slice(self, slice_size): Args: slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`): When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If - `"max"`, maxium amount of memory will be saved by running only one slice at a time. If a number is + `"max"`, maximum amount of memory will be saved by running only one slice at a time. If a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim` must be a multiple of `slice_size`. 
""" sliceable_head_dims = [] - def fn_recursive_retrieve_slicable_dims(module: torch.nn.Module): + def fn_recursive_retrieve_sliceable_dims(module: torch.nn.Module): if hasattr(module, "set_attention_slice"): sliceable_head_dims.append(module.sliceable_head_dim) for child in module.children(): - fn_recursive_retrieve_slicable_dims(child) + fn_recursive_retrieve_sliceable_dims(child) # retrieve number of attention layers for module in self.children(): - fn_recursive_retrieve_slicable_dims(module) + fn_recursive_retrieve_sliceable_dims(module) - num_slicable_layers = len(sliceable_head_dims) + num_sliceable_layers = len(sliceable_head_dims) if slice_size == "auto": # half the attention head size is usually a good trade-off between @@ -404,9 +404,9 @@ def fn_recursive_retrieve_slicable_dims(module: torch.nn.Module): slice_size = [dim // 2 for dim in sliceable_head_dims] elif slice_size == "max": # make smallest slice possible - slice_size = num_slicable_layers * [1] + slice_size = num_sliceable_layers * [1] - slice_size = num_slicable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size + slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size if len(slice_size) != len(sliceable_head_dims): raise ValueError( diff --git a/src/diffusers/models/modeling_utils.py b/src/diffusers/models/modeling_utils.py index a7c2c750f654..e51b40ce4509 100644 --- a/src/diffusers/models/modeling_utils.py +++ b/src/diffusers/models/modeling_utils.py @@ -575,7 +575,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P raise ValueError( f"Cannot load {cls} from {pretrained_model_name_or_path} because the following keys are" f" missing: \n {', '.join(missing_keys)}. \n Please make sure to pass" - " `low_cpu_mem_usage=False` and `device_map=None` if you want to randomely initialize" + " `low_cpu_mem_usage=False` and `device_map=None` if you want to randomly initialize" " those weights or else make sure your checkpoint file is correct." ) @@ -591,7 +591,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P set_module_tensor_to_device(model, param_name, param_device, value=param) else: # else let accelerate handle loading and dispatching. # Load weights and dispatch according to the device_map - # by deafult the device_map is None and the weights are loaded on the CPU + # by default the device_map is None and the weights are loaded on the CPU accelerate.load_checkpoint_and_dispatch(model, model_file, device_map, dtype=torch_dtype) loading_info = { diff --git a/src/diffusers/models/resnet.py b/src/diffusers/models/resnet.py index 7c14a7c4832d..d159115d7ee3 100644 --- a/src/diffusers/models/resnet.py +++ b/src/diffusers/models/resnet.py @@ -418,7 +418,7 @@ class ResnetBlock2D(nn.Module): time_embedding_norm (`str`, *optional*, default to `"default"` ): Time scale shift config. By default, apply timestep embedding conditioning with a simple shift mechanism. Choose "scale_shift" or "ada_group" for a stronger conditioning with scale and shift. - kernal (`torch.FloatTensor`, optional, default to None): FIR filter, see + kernel (`torch.FloatTensor`, optional, default to None): FIR filter, see [`~models.resnet.FirUpsample2D`] and [`~models.resnet.FirDownsample2D`]. output_scale_factor (`float`, *optional*, default to be `1.0`): the scale factor to use for the output. 
use_in_shortcut (`bool`, *optional*, default to `True`): diff --git a/src/diffusers/models/transformer_2d.py b/src/diffusers/models/transformer_2d.py index 2515c54bc227..23364bfa1d16 100644 --- a/src/diffusers/models/transformer_2d.py +++ b/src/diffusers/models/transformer_2d.py @@ -105,7 +105,7 @@ def __init__( self.attention_head_dim = attention_head_dim inner_dim = num_attention_heads * attention_head_dim - # 1. Transformer2DModel can process both standard continous images of shape `(batch_size, num_channels, width, height)` as well as quantized image embeddings of shape `(batch_size, num_image_vectors)` + # 1. Transformer2DModel can process both standard continuous images of shape `(batch_size, num_channels, width, height)` as well as quantized image embeddings of shape `(batch_size, num_image_vectors)` # Define whether input is continuous or discrete depending on configuration self.is_input_continuous = (in_channels is not None) and (patch_size is None) self.is_input_vectorized = num_vector_embeds is not None @@ -198,7 +198,7 @@ def __init__( # 4. Define output layers self.out_channels = in_channels if out_channels is None else out_channels if self.is_input_continuous: - # TODO: should use out_channels for continous projections + # TODO: should use out_channels for continuous projections if use_linear_projection: self.proj_out = nn.Linear(inner_dim, in_channels) else: @@ -223,7 +223,7 @@ def forward( """ Args: hidden_states ( When discrete, `torch.LongTensor` of shape `(batch size, num latent pixels)`. - When continous, `torch.FloatTensor` of shape `(batch size, channel, height, width)`): Input + When continuous, `torch.FloatTensor` of shape `(batch size, channel, height, width)`): Input hidden_states encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, encoder_hidden_states dim)`, *optional*): Conditional embeddings for cross attention layer. If not given, cross-attention defaults to diff --git a/src/diffusers/models/unet_1d.py b/src/diffusers/models/unet_1d.py index eada6ddf3a1c..5062295fc668 100644 --- a/src/diffusers/models/unet_1d.py +++ b/src/diffusers/models/unet_1d.py @@ -59,7 +59,7 @@ class UNet1DModel(ModelMixin, ConfigMixin): obj:`(32, 32, 64)`): Tuple of block output channels. mid_block_type (`str`, *optional*, defaults to "UNetMidBlock1D"): block type for middle of UNet. out_block_type (`str`, *optional*, defaults to `None`): optional output processing of UNet. - act_fn (`str`, *optional*, defaults to None): optional activitation function in UNet blocks. + act_fn (`str`, *optional*, defaults to None): optional activation function in UNet blocks. norm_num_groups (`int`, *optional*, defaults to 8): group norm member count in UNet blocks. layers_per_block (`int`, *optional*, defaults to 1): added number of layers in a UNet block. 
downsample_each_block (`int`, *optional*, defaults to False: diff --git a/src/diffusers/models/unet_1d_blocks.py b/src/diffusers/models/unet_1d_blocks.py index a30f1f8e002e..a0f0e58f9103 100644 --- a/src/diffusers/models/unet_1d_blocks.py +++ b/src/diffusers/models/unet_1d_blocks.py @@ -331,7 +331,7 @@ def __init__(self, in_channels, n_head=1, dropout_rate=0.0): self.key = nn.Linear(self.channels, self.channels) self.value = nn.Linear(self.channels, self.channels) - self.proj_attn = nn.Linear(self.channels, self.channels, 1) + self.proj_attn = nn.Linear(self.channels, self.channels, bias=True) self.dropout = nn.Dropout(dropout_rate, inplace=True) diff --git a/src/diffusers/models/unet_2d_blocks.py b/src/diffusers/models/unet_2d_blocks.py index f865b42eb9d5..3070351279b8 100644 --- a/src/diffusers/models/unet_2d_blocks.py +++ b/src/diffusers/models/unet_2d_blocks.py @@ -2684,7 +2684,7 @@ def __init__( dropout=dropout, bias=attention_bias, cross_attention_dim=None, - cross_attention_norm=None, + cross_attention_norm=False, ) # 2. Cross-Attn diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index 8cd3dcf42307..79a361763c76 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -197,7 +197,7 @@ def __init__( timestep_input_dim = block_out_channels[0] else: raise ValueError( - f"{time_embedding_type} does not exist. Pleaes make sure to use one of `fourier` or `positional`." + f"{time_embedding_type} does not exist. Please make sure to use one of `fourier` or `positional`." ) self.time_embedding = TimestepEmbedding( @@ -391,7 +391,7 @@ def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, Atte `processor (`dict` of `AttentionProcessor` or `AttentionProcessor`): The instantiated processor class or a dictionary of processor classes that will be set as the processor of **all** `Attention` layers. - In case `processor` is a dict, the key needs to define the path to the corresponding cross attention processor. This is strongly recommended when setting trainablae attention processors.: + In case `processor` is a dict, the key needs to define the path to the corresponding cross attention processor. This is strongly recommended when setting trainable attention processors.: """ count = len(self.attn_processors.keys()) @@ -425,24 +425,24 @@ def set_attention_slice(self, slice_size): Args: slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`): When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If - `"max"`, maxium amount of memory will be saved by running only one slice at a time. If a number is + `"max"`, maximum amount of memory will be saved by running only one slice at a time. If a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim` must be a multiple of `slice_size`. 
""" sliceable_head_dims = [] - def fn_recursive_retrieve_slicable_dims(module: torch.nn.Module): + def fn_recursive_retrieve_sliceable_dims(module: torch.nn.Module): if hasattr(module, "set_attention_slice"): sliceable_head_dims.append(module.sliceable_head_dim) for child in module.children(): - fn_recursive_retrieve_slicable_dims(child) + fn_recursive_retrieve_sliceable_dims(child) # retrieve number of attention layers for module in self.children(): - fn_recursive_retrieve_slicable_dims(module) + fn_recursive_retrieve_sliceable_dims(module) - num_slicable_layers = len(sliceable_head_dims) + num_sliceable_layers = len(sliceable_head_dims) if slice_size == "auto": # half the attention head size is usually a good trade-off between @@ -450,9 +450,9 @@ def fn_recursive_retrieve_slicable_dims(module: torch.nn.Module): slice_size = [dim // 2 for dim in sliceable_head_dims] elif slice_size == "max": # make smallest slice possible - slice_size = num_slicable_layers * [1] + slice_size = num_sliceable_layers * [1] - slice_size = num_slicable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size + slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size if len(slice_size) != len(sliceable_head_dims): raise ValueError( @@ -515,7 +515,7 @@ def forward( returning a tuple, the first element is the sample tensor. """ # By default samples have to be AT least a multiple of the overall upsampling factor. - # The overall upsampling factor is equal to 2 ** (# num of upsampling layears). + # The overall upsampling factor is equal to 2 ** (# num of upsampling layers). # However, the upsampling interpolation output size can be forced to fit any upsampling size # on the fly if necessary. default_overall_up_factor = 2**self.num_upsamplers diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 6560f305c18e..6ebed5e1df76 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -1351,7 +1351,7 @@ def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto Args: slice_size (`str` or `int`, *optional*, defaults to `"auto"`): When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If - `"max"`, maxium amount of memory will be saved by running only one slice at a time. If a number is + `"max"`, maximum amount of memory will be saved by running only one slice at a time. If a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim` must be a multiple of `slice_size`. """ diff --git a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py index 7b021c597d10..dd5410dbc0b0 100644 --- a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py +++ b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py @@ -287,7 +287,7 @@ def __init__( timestep_input_dim = block_out_channels[0] else: raise ValueError( - f"{time_embedding_type} does not exist. Pleaes make sure to use one of `fourier` or `positional`." + f"{time_embedding_type} does not exist. Please make sure to use one of `fourier` or `positional`." 
) self.time_embedding = TimestepEmbedding( @@ -481,7 +481,7 @@ def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, Atte `processor (`dict` of `AttentionProcessor` or `AttentionProcessor`): The instantiated processor class or a dictionary of processor classes that will be set as the processor of **all** `Attention` layers. - In case `processor` is a dict, the key needs to define the path to the corresponding cross attention processor. This is strongly recommended when setting trainablae attention processors.: + In case `processor` is a dict, the key needs to define the path to the corresponding cross attention processor. This is strongly recommended when setting trainable attention processors.: """ count = len(self.attn_processors.keys()) @@ -515,24 +515,24 @@ def set_attention_slice(self, slice_size): Args: slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`): When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If - `"max"`, maxium amount of memory will be saved by running only one slice at a time. If a number is + `"max"`, maximum amount of memory will be saved by running only one slice at a time. If a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim` must be a multiple of `slice_size`. """ sliceable_head_dims = [] - def fn_recursive_retrieve_slicable_dims(module: torch.nn.Module): + def fn_recursive_retrieve_sliceable_dims(module: torch.nn.Module): if hasattr(module, "set_attention_slice"): sliceable_head_dims.append(module.sliceable_head_dim) for child in module.children(): - fn_recursive_retrieve_slicable_dims(child) + fn_recursive_retrieve_sliceable_dims(child) # retrieve number of attention layers for module in self.children(): - fn_recursive_retrieve_slicable_dims(module) + fn_recursive_retrieve_sliceable_dims(module) - num_slicable_layers = len(sliceable_head_dims) + num_sliceable_layers = len(sliceable_head_dims) if slice_size == "auto": # half the attention head size is usually a good trade-off between @@ -540,9 +540,9 @@ def fn_recursive_retrieve_slicable_dims(module: torch.nn.Module): slice_size = [dim // 2 for dim in sliceable_head_dims] elif slice_size == "max": # make smallest slice possible - slice_size = num_slicable_layers * [1] + slice_size = num_sliceable_layers * [1] - slice_size = num_slicable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size + slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size if len(slice_size) != len(sliceable_head_dims): raise ValueError( @@ -605,7 +605,7 @@ def forward( returning a tuple, the first element is the sample tensor. """ # By default samples have to be AT least a multiple of the overall upsampling factor. - # The overall upsampling factor is equal to 2 ** (# num of upsampling layears). + # The overall upsampling factor is equal to 2 ** (# num of upsampling layers). # However, the upsampling interpolation output size can be forced to fit any upsampling size # on the fly if necessary. 
default_overall_up_factor = 2**self.num_upsamplers diff --git a/tests/models/test_models_unet_2d_condition.py b/tests/models/test_models_unet_2d_condition.py index 5ca0bae66c02..ab6f12085e0f 100644 --- a/tests/models/test_models_unet_2d_condition.py +++ b/tests/models/test_models_unet_2d_condition.py @@ -223,23 +223,23 @@ def test_model_attention_slicing(self): output = model(**inputs_dict) assert output is not None - def test_model_slicable_head_dim(self): + def test_model_sliceable_head_dim(self): init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() init_dict["attention_head_dim"] = (8, 16) model = self.model_class(**init_dict) - def check_slicable_dim_attr(module: torch.nn.Module): + def check_sliceable_dim_attr(module: torch.nn.Module): if hasattr(module, "set_attention_slice"): assert isinstance(module.sliceable_head_dim, int) for child in module.children(): - check_slicable_dim_attr(child) + check_sliceable_dim_attr(child) # retrieve number of attention layers for module in model.children(): - check_slicable_dim_attr(module) + check_sliceable_dim_attr(module) def test_special_attn_proc(self): class AttnEasyProc(torch.nn.Module): @@ -658,7 +658,7 @@ def test_set_attention_slice_list(self): torch.cuda.reset_max_memory_allocated() torch.cuda.reset_peak_memory_stats() - # there are 32 slicable layers + # there are 32 sliceable layers slice_list = 16 * [2, 3] unet = self.get_unet_model() unet.set_attention_slice(slice_list)
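
Note (not part of the patch): most hunks above are pure spelling fixes, but the `nn.Linear(channels, channels, 1)` -> `nn.Linear(channels, channels, bias=True)` change in attention.py and unet_1d_blocks.py deserves a word. The third positional parameter of torch.nn.Linear is `bias`, so the literal `1` was already being interpreted as a truthy bias flag; the rewrite only makes the intent explicit and does not change behavior. A minimal standalone sketch illustrating this (the value of `channels` and the variable names are hypothetical, chosen only for the demonstration):

    # Why the nn.Linear change is a readability fix, not a behavior change:
    # torch.nn.Linear(in_features, out_features, bias=True) takes `bias` as
    # its third positional argument, so passing `1` already enabled the bias.
    import torch.nn as nn

    channels = 8  # hypothetical channel count for the example

    old_proj = nn.Linear(channels, channels, 1)          # `1` is read as bias=True
    new_proj = nn.Linear(channels, channels, bias=True)  # same layer, explicit intent

    assert old_proj.bias is not None and new_proj.bias is not None
    assert old_proj.weight.shape == new_proj.weight.shape == (channels, channels)

Checkpoints saved before and after this patch therefore remain interchangeable for these layers.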