[sync] Sync feature/colossal-infer with main #5737

Merged

Changes from all commits (21 commits)
9efc79e
add parallel output for mistral model
wangbluo Apr 30, 2024
2632916
remove useless code
wangbluo May 1, 2024
c25f83c
fix missing pad token (#5690)
Edenzzzz May 6, 2024
77ec773
[zero]remove registered gradients hooks (#5687)
flybird11111 May 7, 2024
58954b2
[misc] Add an existing issue checkbox in bug report (#5691)
Edenzzzz May 7, 2024
88f057c
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 7, 2024
108ddfb
add parallel_output for the opt model
wangbluo May 3, 2024
ca56b93
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 7, 2024
a8408b4
remove comment code
wangbluo May 7, 2024
4e50cce
fix the mistral model
wangbluo May 7, 2024
2229778
Merge pull request #5684 from wangbluo/parallel_output
wangbluo May 8, 2024
d4c5ef4
[gemini]remove registered gradients hooks (#5696)
flybird11111 May 9, 2024
a3cc68c
[Shardformer] Support the Qwen2 model (#5699)
wangbluo May 9, 2024
537f6a3
[Shardformer]fix the num_heads assert for llama model and qwen model …
wangbluo May 10, 2024
785cd9a
[misc] Update PyTorch version in docs (#5711)
Edenzzzz May 13, 2024
393c8f5
[hotfix] fix inference typo (#5438)
hugo-syn May 13, 2024
43995ee
[Feature] Distributed optimizers: Lamb, Galore, CAME and Adafactor (#…
Edenzzzz May 14, 2024
913c920
[Colossal-LLaMA] Fix sft issue for llama2 (#5719)
TongLi3701 May 15, 2024
2011b13
[misc] Update PyTorch version in docs (#5724)
binmakeswell May 16, 2024
9d83c6d
[lazy] fix lazy cls init (#5720)
flybird11111 May 17, 2024
8633c15
[sync] Sync feature/colossal-infer with main
yuanheng-zhao May 20, 2024
7 changes: 7 additions & 0 deletions .github/ISSUE_TEMPLATE/bug-report.yml
@@ -8,6 +8,13 @@ body:
attributes:
value: >
#### Not suitable for your needs? [Open a blank issue](https://github.com/hpcaitech/ColossalAI/issues/new).
- type: checkboxes
attributes:
label: Is there an existing issue for this bug?
description: Please search [here](https://github.com/hpcaitech/ColossalAI/issues) to see if an open or closed issue already exists for the bug you have encountered.
options:
- label: I have searched the existing issues
required: true
- type: textarea
attributes:
label: 🐛 Describe the bug
2 changes: 1 addition & 1 deletion .github/workflows/build_on_pr.yml
@@ -140,7 +140,7 @@ jobs:

- name: Install Colossal-AI
run: |
BUILD_EXT=1 pip install -v -e .
pip install -v -e .
pip install -r requirements/requirements-test.txt

- name: Store Colossal-AI Cache
2 changes: 1 addition & 1 deletion README.md
@@ -418,7 +418,7 @@ Please visit our [documentation](https://www.colossalai.org/) and [examples](htt
## Installation

Requirements:
- PyTorch >= 1.11 and PyTorch <= 2.1
- PyTorch >= 2.1
- Python >= 3.7
- CUDA >= 11.0
- [NVIDIA GPU Compute Capability](https://developer.nvidia.com/cuda-gpus) >= 7.0 (V100/RTX20 and higher)
3 changes: 2 additions & 1 deletion applications/Colossal-LLaMA/prepare_sft_dataset.py
@@ -10,7 +10,7 @@
import os
from multiprocessing import cpu_count

from colossal_llama.dataset.conversation import default_conversation
from colossal_llama.dataset.conversation import LLaMA2_Conv
from colossal_llama.dataset.spliced_and_tokenized_dataset import supervised_tokenize_sft
from datasets import dataset_dict, load_dataset
from transformers import AddedToken, AutoTokenizer
@@ -78,6 +78,7 @@ def main():
# Fix </s> split issue: https://github.com/huggingface/transformers/issues/23833
if args.llama_version == 2:
tokenizer.add_tokens(AddedToken("</s>", normalized=False, special=True), special_tokens=True)
default_conversation = LLaMA2_Conv

tokenizer.add_bos_token = False
tokenizer.add_eos_token = False
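The hunk above registers </s> as an unsplittable special token to work around huggingface/transformers#23833 and switches the conversation template to LLaMA2_Conv for LLaMA-2. The same tokenizer fix in isolation looks roughly like the sketch below; the checkpoint path is a placeholder.

from transformers import AddedToken, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("/path/to/llama-2-checkpoint")  # placeholder path
# Register </s> as a special, non-normalized token so the fast tokenizer does not
# split it into "<", "/", "s", ">" pieces during SFT data tokenization.
tokenizer.add_tokens(AddedToken("</s>", normalized=False, special=True), special_tokens=True)
tokenizer.add_bos_token = False
tokenizer.add_eos_token = False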
40 changes: 31 additions & 9 deletions colossalai/booster/plugin/hybrid_parallel_plugin.py
@@ -1,7 +1,9 @@
import ctypes
import random
import warnings
from collections import defaultdict
from contextlib import contextmanager
from copy import deepcopy
from functools import partial
from types import MethodType
from typing import Any, Callable, Dict, Iterator, List, Optional, OrderedDict, Tuple, Union
@@ -24,6 +26,8 @@
from colossalai.checkpoint_io import CheckpointIO, HybridParallelCheckpointIO
from colossalai.cluster import ProcessGroupMesh
from colossalai.interface import AMPModelMixin, ModelWrapper, OptimizerWrapper
from colossalai.interface.optimizer import DistributedOptim
from colossalai.nn.optimizer import DistGaloreAwamW
from colossalai.pipeline.schedule import InterleavedSchedule, OneForwardOneBackwardSchedule
from colossalai.pipeline.stage_manager import PipelineStageManager
from colossalai.shardformer import GradientCheckpointConfig, ShardConfig, ShardFormer
@@ -735,7 +739,7 @@ def _get_grads_to_sync(all_working_grads) -> Union[List[Tensor], None]:
# Get all working gradients and gradients to be synchronized.
all_working_grads = _get_all_working_grads()
grads_to_sync = _get_grads_to_sync(all_working_grads)
if self.require_grad_sync and grads_to_sync is not None:
if self._grad_store.require_grad_sync and grads_to_sync is not None:
# Synchronize sequence parallelism gradients if required.
SeqParallelUtils.allreduce_partial_data_grad(process_group=self.tp_pg, grads=grads_to_sync)
else:
@@ -759,7 +763,7 @@ def backward(self, loss, retain_graph=False):
# Call the superclass backward method to compute gradients.
super().backward(loss, retain_graph)

if self.require_grad_sync and self.model.shard_config.enable_sequence_parallelism:
if self._grad_store.require_grad_sync and self.model.shard_config.enable_sequence_parallelism:
# If gradient synchronization is required, sync sequence parallelism gradients.
self._sync_sp_grads()
else:
@@ -784,7 +788,7 @@ def backward_by_grad(self, tensor, grad):
# Call the superclass backward_by_grad method to compute gradients.
super().backward_by_grad(tensor, grad)

if self.require_grad_sync and self.model.shard_config.enable_sequence_parallelism:
if self._grad_store.require_grad_sync and self.model.shard_config.enable_sequence_parallelism:
# If gradient synchronization is required, sync sequence parallelism gradients.
self._sync_sp_grads()
else:
@@ -1171,6 +1175,15 @@ def configure(
lr_scheduler: Optional[LRScheduler] = None,
) -> Tuple[Module, OptimizerWrapper, Callable, DataLoader, LRScheduler]:
param_info = get_param_info(optimizer)

# TODO: Support Galore + ZeRO
zero_stage = self.zero_stage
zero_config = deepcopy(self.zero_config)
if isinstance(optimizer, DistGaloreAwamW) and zero_stage > 0 and self.dp_size > 0:
warnings.warn("Galore is only supported for Tensor Parallel and vanilla Data Parallel yet. Disabling ZeRO.")
zero_config["partition_grad"] = False
zero_stage = 0

if not isinstance(model, ModelWrapper):
use_ddp = (self.dp_size > 1 and self.pp_size == 1 and self.zero_stage == 0) or (
self.dp_size == 1
@@ -1194,7 +1207,8 @@ def configure(
custom_policy=self.custom_policy,
)
if optimizer is not None and not isinstance(optimizer, OptimizerWrapper):
if self.zero_stage == 0:
if zero_stage == 0:
is_zero = False
if self.precision in ["fp16", "bf16"]:
optimizer = HybridParallelAMPOptimizer(
optimizer,
@@ -1218,11 +1232,11 @@
tp_process_group=self.tp_group,
)
else:
zero_dp_size = dist.get_world_size(dp_group)
if zero_dp_size == 1:
is_zero = self.dp_size > 1
if self.dp_size == 1:
warnings.warn(
"Use Zero Optimizer when data parallel size is 1 may introduce unnecessary overhead. "
"If you are not intended to use cpu_offload, please consider set zero_stage=0."
"If you do not intend to use cpu_offload, please consider set zero_stage=0."
)

assert self.precision != "fp32", "Please set precision to 'fp16' or 'bf16' when using ZeRO."
@@ -1236,11 +1250,19 @@
pp_process_group=self.pp_group,
verbose=True,
clip_grad_norm=self.max_norm,
**self.zero_config,
**zero_config,
**self.amp_config,
)
# inject update_master_params
model.update_master_params = MethodType(optimizer.update_master_params, model)

# Setup optimizers that require global states
optim = optimizer.optim
if isinstance(optim, DistributedOptim):
shard_to_param = optimizer.get_master_to_working_map() if is_zero else {}
padding_map = optimizer.get_param_padding_map() if is_zero else defaultdict(int)
optim.setup_distributed(self.tp_group, self.dp_group, shard_to_param, padding_map, is_zero)

return model, optimizer, criterion, dataloader, lr_scheduler

def execute_pipeline(
@@ -1272,7 +1294,7 @@ def execute_pipeline(

# run with gradients accumulation
if model.require_grad_sync == False or (
isinstance(optimizer, HybridParallelZeroOptimizer) and optimizer.require_grad_sync == False
isinstance(optimizer, HybridParallelZeroOptimizer) and optimizer._grad_store.require_grad_sync == False
):
return outputs

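For context, a minimal sketch (not part of this diff) of how the new DistributedOptim hook is reached through the booster; the DistGaloreAwamW constructor arguments and the launch call are assumptions and may differ from the actual API.

import torch.nn as nn

import colossalai
from colossalai.booster import Booster
from colossalai.booster.plugin import HybridParallelPlugin
from colossalai.nn.optimizer import DistGaloreAwamW

colossalai.launch_from_torch()  # launch signature may differ across versions
model = nn.Sequential(nn.Linear(512, 512), nn.GELU(), nn.Linear(512, 512))
optimizer = DistGaloreAwamW(model.parameters(), lr=1e-3)  # assumed constructor args

# Requesting ZeRO together with Galore now hits the warning added above and
# configure() falls back to zero_stage=0 (TP / vanilla DP only).
plugin = HybridParallelPlugin(tp_size=2, pp_size=1, zero_stage=1, precision="bf16")
booster = Booster(plugin=plugin)
model, optimizer, *_ = booster.boost(model, optimizer)
# Inside configure(), any DistributedOptim subclass then receives
# optimizer.optim.setup_distributed(tp_group, dp_group, shard_to_param, padding_map, is_zero).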
25 changes: 24 additions & 1 deletion colossalai/booster/plugin/low_level_zero_plugin.py
@@ -8,7 +8,10 @@
from typing import Callable, Dict, Iterator, List, Optional, Tuple

import torch
import torch.distributed
import torch.distributed as dist
import torch.nn as nn
from torch.distributed.distributed_c10d import _get_default_group
from torch.nn import Parameter
from torch.optim import Optimizer
from torch.optim.lr_scheduler import _LRScheduler as LRScheduler
@@ -28,6 +31,8 @@
sharded_optimizer_loading_epilogue,
)
from colossalai.interface import AMPModelMixin, ModelWrapper, OptimizerWrapper
from colossalai.interface.optimizer import DistributedOptim
from colossalai.nn.optimizer import DistGaloreAwamW
from colossalai.quantization import BnbQuantizationConfig, quantize_model
from colossalai.zero import LowLevelZeroOptimizer

@@ -428,13 +433,31 @@ def configure(
if not isinstance(model, ModelWrapper):
model = LowLevelZeroModel(model, self.precision)

# TODO: Support Galore + ZeRO
zero_stage = self.stage
zero_optim_kwargs = {**self.zero_optim_kwargs}
dp_size = dist.get_world_size()
if isinstance(optimizer, DistGaloreAwamW) and zero_stage > 0 and dp_size > 0:
warnings.warn("Galore is only supported for Tensor Parallel and vanilla Data Parallel yet. Disabling ZeRO.")
zero_optim_kwargs["partition_grad"] = False
zero_stage = 0

if optimizer is not None and not isinstance(optimizer, OptimizerWrapper):
optimizer: LowLevelZeroOptimizer = LowLevelZeroOptimizer(
optimizer, **self.zero_optim_kwargs, verbose=self.verbose
optimizer, **zero_optim_kwargs, verbose=self.verbose
)
# inject update_master_params
model.update_master_params = MethodType(optimizer.update_master_params, model)

# Setup optimizers that require global states
optim = optimizer.optim
is_zero = dp_size > 1 and zero_stage > 0
dp_group = _get_default_group() # Use the whole world
if isinstance(optim, DistributedOptim):
shard_to_param = optimizer.get_master_to_working_map()
padding_map = optimizer.get_param_padding_map()
optim.setup_distributed(None, dp_group, shard_to_param, padding_map, is_zero)

return model, optimizer, criterion, dataloader, lr_scheduler

def control_checkpoint_io(self) -> bool:
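The LowLevelZeroPlugin path mirrors the hybrid plugin above; a brief sketch of the differences, with the same caveat that the optimizer constructor arguments are assumptions.

import torch.nn as nn

from colossalai.booster import Booster
from colossalai.booster.plugin import LowLevelZeroPlugin
from colossalai.nn.optimizer import DistGaloreAwamW

model = nn.Linear(1024, 1024)
optimizer = DistGaloreAwamW(model.parameters(), lr=1e-3)  # assumed constructor args
plugin = LowLevelZeroPlugin(stage=1, precision="bf16")    # stage is forced to 0 for Galore
booster = Booster(plugin=plugin)
model, optimizer, *_ = booster.boost(model, optimizer)
# Here there is no TP group, so setup_distributed() receives tp_group=None and
# dp_group=_get_default_group(), i.e. the whole world acts as the ZeRO/DP group.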
7 changes: 6 additions & 1 deletion colossalai/cluster/process_group_mesh.py
@@ -38,7 +38,12 @@ class ProcessGroupMesh:

def __init__(self, *size: int) -> None:
assert dist.is_initialized(), "Please initialize torch.distributed first."
assert prod(size) == dist.get_world_size(), "The product of the size must be equal to the world size."
world_size = dist.get_world_size()
prod_size = prod(size)
assert (
prod_size == world_size
), f"The product of the size({prod_size}) must be equal to the world size({world_size})."

self._shape = size
self._rank = dist.get_rank()
self._coord = ProcessGroupMesh.unravel(self._rank, self._shape)
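A quick illustration of the more informative assertion, assuming torch.distributed runs on 8 ranks (e.g. launched via torchrun):

import torch.distributed as dist

from colossalai.cluster import ProcessGroupMesh

dist.init_process_group(backend="nccl")  # normally handled by colossalai.launch
mesh = ProcessGroupMesh(2, 4)            # fine: 2 * 4 == world size of 8
# ProcessGroupMesh(2, 3) would now fail with
# "The product of the size(6) must be equal to the world size(8)."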
6 changes: 3 additions & 3 deletions colossalai/device/device_mesh.py
@@ -306,9 +306,8 @@ def _init_global_to_logical_rank_mapping(
# index means the local rank in the current axis
# inner_tensor refers to the processes with the same local rank

if inner_tensor.numel() == 1:
# if the inner_tensor only has one element, it means that
# it already reaches the last axis
if inner_tensor.dim() == 0:
# if the inner_tensor already reaches the last axis,
# we append its local_rank in the last axis to the index_list
# and assign to the mapping
# the value of the mapping is the the local rank at the indexed axis of the device mesh
@@ -459,6 +458,7 @@ def _collate_global_ranks_in_same_process_group(self, global_rank):

# replace the local rank in the given dimension with the
# local rank of the current process iterated

process_coordinates[dim] = _local_rank
processes_in_the_same_process_group[dim].append(process_coordinates)

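The switch from numel() == 1 to dim() == 0 matters when a mesh axis has size 1: a partially indexed slice can already hold a single element before the last axis is reached. A small sketch of the distinction:

import torch

mesh = torch.tensor([[3]])  # logical mesh of shape (1, 1) holding global rank 3
inner = mesh[0]             # shape (1,): one element left, but one axis still remains
print(inner.numel() == 1, inner.dim() == 0)  # True False -> the old check fires too early
leaf = inner[0]             # shape (): a true scalar, i.e. the last axis was consumed
print(leaf.dim() == 0)      # True -> the new check only fires at the bottom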
25 changes: 24 additions & 1 deletion colossalai/interface/optimizer.py
@@ -1,6 +1,7 @@
from typing import Union
from typing import Dict, Optional, Union

import torch
import torch.distributed as dist
import torch.nn as nn
from torch import Tensor
from torch.optim import Optimizer
@@ -133,3 +134,25 @@ def unwrap(self):
Unwrap the optimizer for checkpoint saving/loading.
"""
return self.optim


class DistributedOptim(Optimizer):
def setup_distributed(
self,
tp_group: Optional[dist.ProcessGroup] = None,
dp_group: Optional[dist.ProcessGroup] = None,
shard_to_working_param: Optional[Dict] = {},
padding_map: Optional[Dict] = None,
is_zero: Optional[bool] = False,
):
"""Assign process groups for TP and ZeRO 2.
Arguments:
tp_group (dist.ProcessGroup): Tensor Parallel process group
dp_group (dist.ProcessGroup): ZeRO stage 2 process group
shard_to_working_param (Dict): ZeRO stage 2 feeds the optimizer a sharded param view to match grad shape.
This maps from id(view) to model params used in forward & backward.
padding_map (Dict): Per-param padding from ZeRO stage 2
is_zero (bool): Whether to use ZeRO stage 2.
"""

raise NotImplementedError("setup_distributed for TP/DP isn't supported by this optimizer yet!")
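
A minimal sketch of what a concrete subclass is expected to do with these arguments; the class below is hypothetical and only stashes the state that step() would later need (step() itself is omitted).

from typing import Dict, Optional

import torch.distributed as dist

from colossalai.interface.optimizer import DistributedOptim


class MyDistributedAdamW(DistributedOptim):  # hypothetical example, not part of this PR
    def __init__(self, params, lr: float = 1e-3):
        super().__init__(params, defaults=dict(lr=lr))

    def setup_distributed(
        self,
        tp_group: Optional[dist.ProcessGroup] = None,
        dp_group: Optional[dist.ProcessGroup] = None,
        shard_to_working_param: Optional[Dict] = None,
        padding_map: Optional[Dict] = None,
        is_zero: Optional[bool] = False,
    ):
        # Keep the process groups and ZeRO-2 bookkeeping so that step() can
        # later all-reduce/gather optimizer states across TP and ZeRO shards.
        self.tp_group = tp_group
        self.dp_group = dp_group
        self.shard_to_working_param = shard_to_working_param or {}
        self.padding_map = padding_map or {}
        self.is_zero = is_zero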
23 changes: 23 additions & 0 deletions colossalai/lazy/pretrained.py
@@ -1,3 +1,4 @@
import copy
import os
from typing import Callable, Optional, Union

@@ -74,6 +75,24 @@ def new_from_pretrained(
subfolder = kwargs.pop("subfolder", "")
commit_hash = kwargs.pop("_commit_hash", None)
variant = kwargs.pop("variant", None)

kwargs.pop("state_dict", None)
kwargs.pop("from_tf", False)
kwargs.pop("from_flax", False)
kwargs.pop("output_loading_info", False)
kwargs.pop("trust_remote_code", None)
kwargs.pop("low_cpu_mem_usage", None)
kwargs.pop("device_map", None)
kwargs.pop("max_memory", None)
kwargs.pop("offload_folder", None)
kwargs.pop("offload_state_dict", False)
kwargs.pop("load_in_8bit", False)
kwargs.pop("load_in_4bit", False)
kwargs.pop("quantization_config", None)
kwargs.pop("adapter_kwargs", {})
kwargs.pop("adapter_name", "default")
kwargs.pop("use_flash_attention_2", False)

use_safetensors = kwargs.pop("use_safetensors", None if is_safetensors_available() else False)

if len(kwargs) > 0:
@@ -108,6 +127,10 @@ def new_from_pretrained(
**kwargs,
)
else:
config = copy.deepcopy(config)
kwarg_attn_imp = kwargs.pop("attn_implementation", None)
if kwarg_attn_imp is not None and config._attn_implementation != kwarg_attn_imp:
config._attn_implementation = kwarg_attn_imp
model_kwargs = kwargs

if commit_hash is None:
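These pops mirror the loader-only kwargs that transformers' own from_pretrained consumes, so they no longer leak into model_kwargs during lazy initialization. A hedged usage sketch follows; the checkpoint path is a placeholder and exact LazyInitContext arguments may differ by version.

from colossalai.lazy import LazyInitContext
from transformers import AutoConfig, AutoModelForCausalLM

config = AutoConfig.from_pretrained("/path/to/checkpoint")  # placeholder path
with LazyInitContext():
    # The patched new_from_pretrained() strips loader-only kwargs (device_map,
    # load_in_8bit, ...) and, when a config is passed explicitly, applies
    # attn_implementation to a deepcopy instead of mutating the caller's config.
    model = AutoModelForCausalLM.from_pretrained(
        "/path/to/checkpoint",
        config=config,
        attn_implementation="eager",
    )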
6 changes: 3 additions & 3 deletions colossalai/legacy/inference/async_manager.py
@@ -55,14 +55,14 @@ def _step(self):
self.stats_tool.count_prompt_tokens(new_batch)
self.running_batch = new_batch
has_new_finished, outputs = self._prefill_batch(self.running_batch)
self._filter_runing_batch()
self._filter_running_batch()
self.has_wait_tokens = 0

else:
if self.has_wait_tokens < self.max_wait_tokens:
self.stats_tool.count_output_tokens(self.running_batch)
has_new_finished, outputs = self._decode_batch(self.running_batch)
self._filter_runing_batch()
self._filter_running_batch()
self.has_wait_tokens += 1

else:
@@ -78,7 +78,7 @@ def _step(self):
else:
self.stats_tool.count_output_tokens(self.running_batch)
has_new_finished, outputs = self._decode_batch(self.running_batch)
self._filter_runing_batch()
self._filter_running_batch()
self.has_wait_tokens += 1

if has_new_finished:
8 changes: 4 additions & 4 deletions colossalai/legacy/inference/manager.py
@@ -131,14 +131,14 @@ def _step(self):
self.stats_tool.count_prompt_tokens(new_batch)
self.running_batch = new_batch
yield from self._prefill_batch(self.running_batch)
self._filter_runing_batch()
self._filter_running_batch()
self.has_wait_tokens = 0
return

if self.has_wait_tokens < self.max_wait_tokens:
self.stats_tool.count_output_tokens(self.running_batch)
yield from self._decode_batch(self.running_batch)
self._filter_runing_batch()
self._filter_running_batch()
self.has_wait_tokens += 1
return
else:
@@ -154,7 +154,7 @@ def _step(self):
else:
self.stats_tool.count_output_tokens(self.running_batch)
yield from self._decode_batch(self.running_batch)
self._filter_runing_batch()
self._filter_running_batch()
self.has_wait_tokens += 1

return
@@ -243,7 +243,7 @@ def _handle_finish_req(self, batch: Batch, has_new_finished_req):
self._filter_batch(batch)
yield from self._output_process(finished_reqs)

def _filter_runing_batch(self):
def _filter_running_batch(self):
if self.running_batch is not None and self.running_batch.is_clear():
self.running_batch = None
