FIX Make DoRA work with Conv1D layers #1588

Merged
15 changes: 11 additions & 4 deletions src/peft/tuners/lora/layer.py
@arash2060 commented on Mar 25, 2024:
Lines 399 and 405 also need to be updated. I had to transpose `delta_weight` in `weight_norm = self._get_weight_norm(orig_weights, delta_weight, scaling=1)` and transpose `dora_factor.view(-1, 1)` to make the dimensions match.

The same change is needed for the unsafe merge.
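To make the shape issue concrete, here is a minimal sketch (illustration only, not code from this PR), assuming gpt2's `c_attn` projection, which is a `transformers` `Conv1D` that stores its weight as `(in_features, out_features)` instead of `nn.Linear`'s `(out_features, in_features)`:

```python
import torch
from transformers.pytorch_utils import Conv1D

# gpt2's c_attn is Conv1D(nf=2304, nx=768): its weight has shape (768, 2304),
# i.e. (in_features, out_features), the opposite orientation of nn.Linear.
conv = Conv1D(nf=2304, nx=768)
delta = torch.randn_like(conv.weight)  # a LoRA delta in the same layout

# DoRA keeps one norm (and one magnitude entry) per *output* feature, so the
# column-wise norm has to be taken after flipping the weight into (out, in):
norm_wrong = torch.linalg.norm(conv.weight + delta, dim=1)      # shape (768,)  -> wrong axis
norm_right = torch.linalg.norm((conv.weight + delta).T, dim=1)  # shape (2304,) -> one per output
print(norm_wrong.shape, norm_right.shape)
```

Because the magnitude vector has one entry per output feature, both the weight norm and the `(-1, 1)` broadcast have to follow the `(out, in)` orientation for Conv1D layers.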

Member Author:

Thanks for the feedback. Do you have a code snippet that results in an error without these additional changes?

@arash2060 commented on Mar 25, 2024:

Nothing that I can share, unfortunately. To replicate, it should be enough to initialize a DoRA model on gpt2 and call merge_and_unload() on it.


There you go:

import transformers, peft
model = transformers.AutoModelForCausalLM.from_pretrained('gpt2')
dora_config = peft.LoraConfig(r=4, use_dora=True)
model = peft.get_peft_model(model, dora_config)
model.merge_and_unload('/tmp/m/')


Unloading and merging model:   5%|▌         | 9/176 [00:00<00:01, 113.93it/s]
Traceback (most recent call last):
  File "...python3.9/site-packages/IPython/core/interactiveshell.py", line 3526, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-4-3672d1da59ca>", line 1, in <module>
    model.merge_and_unload('/tmp/m/')
  File "...python3.9/site-packages/peft/tuners/lora/model.py", line 784, in merge_and_unload
    return self._unload_and_optionally_merge(
  File "...python3.9/site-packages/peft/tuners/lora/model.py", line 438, in _unload_and_optionally_merge
    target.merge(safe_merge=safe_merge, adapter_names=adapter_names)
  File "...python3.9/site-packages/peft/tuners/lora/layer.py", line 420, in merge
    weight_norm = self._get_weight_norm(base_layer.weight, delta_weight, scaling=1).detach()
  File "...python3.9/site-packages/peft/tuners/lora/layer.py", line 176, in _get_weight_norm
    weight = weight + scaling * lora_weight
RuntimeError: The size of tensor a (768) must match the size of tensor b (2304) at non-singleton dimension 1

Member Author:

Thanks a lot, I added a fix and tests for merge_and_unload. If you could check again, that would be great.


Works on my end. Thank you!

@@ -172,6 +172,7 @@ def loftq_init(self, adapter_name):
 
 def _get_weight_norm(self, weight, lora_weight, scaling) -> torch.Tensor:
 # calculate L2 norm of weight matrix, column-wise
+weight = transpose(weight, self.fan_in_fan_out)
 weight = weight + scaling * lora_weight
 weight_norm = torch.linalg.norm(weight, dim=1).to(weight.dtype)
 return weight_norm
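As context for this hunk, the `transpose` helper comes from `peft.utils` and roughly behaves like the sketch below (the exact implementation is an assumption here); it flips fan_in_fan_out (Conv1D-style) weights into the `(out_features, in_features)` orientation that the rest of the LoRA/DoRA math expects and is a no-op otherwise:

```python
import torch

def transpose(weight: torch.Tensor, fan_in_fan_out: bool) -> torch.Tensor:
    # No-op for nn.Linear-style weights; flips Conv1D-style (in, out) weights to (out, in).
    return weight.T if fan_in_fan_out else weight
```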
@@ -395,13 +396,16 @@ def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = N
 else:
 # handle dora
 # since delta_weight already includes scaling, set it to 1 here
-weight_norm = self._get_weight_norm(orig_weights, delta_weight, scaling=1).detach()
+weight_norm = self._get_weight_norm(
+orig_weights, transpose(delta_weight, self.fan_in_fan_out), scaling=1
+).detach()
 # We need to cache weight_norm because it has to be based on the original weights. We
 # cannot calculate it on the fly based on the merged weights when unmerging because its a
 # different value
 self._cache_store(f"{active_adapter}-weight_norm", weight_norm)
 dora_factor = self.lora_magnitude_vector[active_adapter] / weight_norm
-orig_weights = dora_factor.view(-1, 1) * (orig_weights + delta_weight)
+dora_factor = transpose(dora_factor.view(-1, 1), self.fan_in_fan_out)
+orig_weights = dora_factor * (orig_weights + delta_weight)
 
 if not torch.isfinite(orig_weights).all():
 raise ValueError(
@@ -416,13 +420,16 @@ def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = N
 else:
 # handle dora
 # since delta_weight already includes scaling, set it to 1 here
-weight_norm = self._get_weight_norm(base_layer.weight, delta_weight, scaling=1).detach()
+weight_norm = self._get_weight_norm(
+base_layer.weight, transpose(delta_weight, self.fan_in_fan_out), scaling=1
+).detach()
 # We need to cache weight_norm because it has to be based on the original weights. We
 # cannot calculate it on the fly based on the merged weights when unmerging because its a
 # different value
 self._cache_store(f"{active_adapter}-weight_norm", weight_norm)
 dora_factor = self.lora_magnitude_vector[active_adapter] / weight_norm
-new_weight = dora_factor.view(-1, 1) * (base_layer.weight.data + delta_weight)
+dora_factor = transpose(dora_factor.view(-1, 1), self.fan_in_fan_out)
+new_weight = dora_factor * (base_layer.weight.data + delta_weight)
 base_layer.weight.data = new_weight
 
 self.merged_adapters.append(active_adapter)
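For reference, the merged DoRA weight that these changes compute for a fan_in_fan_out (Conv1D-style) layer can be sketched standalone as follows; the function name and the random inputs are illustrative, not the PR's exact code:

```python
import torch

def merge_dora_conv1d(weight, delta_weight, magnitude):
    """Illustrative DoRA merge for a Conv1D-style weight stored as (in_features, out_features).

    weight:       base weight, shape (in_features, out_features)
    delta_weight: LoRA delta, already scaled, same shape as weight
    magnitude:    DoRA magnitude vector, one entry per output feature, shape (out_features,)
    """
    # Column-wise L2 norm of the combined weight, taken per output feature.
    weight_norm = torch.linalg.norm((weight + delta_weight).T, dim=1)  # (out_features,)
    dora_factor = magnitude / weight_norm                              # (out_features,)
    # With an (in, out) layout the per-output factor scales columns, hence the extra
    # transpose of the usual view(-1, 1) column vector.
    return dora_factor.view(-1, 1).T * (weight + delta_weight)

# gpt2 c_attn shapes: hidden size 768, fused qkv output 3 * 768 = 2304
w = torch.randn(768, 2304)
d = 0.01 * torch.randn(768, 2304)
m = torch.ones(2304)
print(merge_dora_conv1d(w, d, m).shape)  # torch.Size([768, 2304])
```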
16 changes: 16 additions & 0 deletions tests/test_custom_models.py
@@ -986,6 +986,22 @@ def test_double_wrapping_merge_and_unload(self, method):
 
 assert isinstance(unloaded.classifier, nn.Linear)
 
+def test_gpt2_dora_merge_and_unload(self):
+# see https://github.com/huggingface/peft/pull/1588#discussion_r1537914207
+model = AutoModelForCausalLM.from_pretrained("gpt2")
+config = LoraConfig(task_type="CAUSAL_LM", use_dora=True)
+model = get_peft_model(model, config)
+# should not raise an error
+model.merge_and_unload()
+
+def test_gpt2_dora_merge_and_unload_safe_merge(self):
+# see https://github.com/huggingface/peft/pull/1588#discussion_r1537914207
+model = AutoModelForCausalLM.from_pretrained("gpt2")
+config = LoraConfig(task_type="CAUSAL_LM", use_dora=True)
+model = get_peft_model(model, config)
+# should not raise an error
+model.merge_and_unload(safe_merge=True)
+
 
 class TestMultiRankAdapter(unittest.TestCase):
 """Tests related to multirank LoRA adapters"""
59 changes: 55 additions & 4 deletions tests/test_gpu_examples.py
@@ -814,7 +814,7 @@ def test_causal_lm_training_4bit_dora(self):
 with tempfile.TemporaryDirectory() as tmp_dir:
 model = AutoModelForCausalLM.from_pretrained(
 self.causal_lm_model_id,
-load_in_4bit=True,
+quantization_config=BitsAndBytesConfig(load_in_4bit=True),
 device_map="auto",
 )
 
@@ -872,7 +872,7 @@ def test_causal_lm_training_multi_gpu_4bit_dora(self):
 model = AutoModelForCausalLM.from_pretrained(
 self.causal_lm_model_id,
 device_map="auto",
-load_in_4bit=True,
+quantization_config=BitsAndBytesConfig(load_in_4bit=True),
 )
 
 assert set(model.hf_device_map.values()) == set(range(torch.cuda.device_count()))
@@ -931,7 +931,7 @@ def test_causal_lm_training_8bit_dora(self):
 with tempfile.TemporaryDirectory() as tmp_dir:
 model = AutoModelForCausalLM.from_pretrained(
 self.causal_lm_model_id,
-load_in_8bit=True,
+quantization_config=BitsAndBytesConfig(load_in_8bit=True),
 device_map="auto",
 )
 
@@ -989,7 +989,7 @@ def test_causal_lm_training_multi_gpu_8bit_dora(self):
 model = AutoModelForCausalLM.from_pretrained(
 self.causal_lm_model_id,
 device_map="auto",
-load_in_8bit=True,
+quantization_config=BitsAndBytesConfig(load_in_8bit=True),
 )
 
 assert set(model.hf_device_map.values()) == set(range(torch.cuda.device_count()))
@@ -1040,6 +1040,57 @@ def test_causal_lm_training_multi_gpu_8bit_dora(self):
 # assert loss is not None
 assert trainer.state.log_history[-1]["train_loss"] is not None
 
+@pytest.mark.single_gpu_tests
+def test_causal_lm_training_gpt2_dora(self):
+r"""
+Same as test_causal_lm_training_4bit but with DoRA
+"""
+with tempfile.TemporaryDirectory() as tmp_dir:
+model = AutoModelForCausalLM.from_pretrained("gpt2", device_map="auto")
+
+tokenizer = AutoTokenizer.from_pretrained(self.causal_lm_model_id)
+model = prepare_model_for_kbit_training(model)
+
+config = LoraConfig(
+r=16,
+lora_alpha=32,
+lora_dropout=0.05,
+bias="none",
+task_type="CAUSAL_LM",
+use_dora=True,
+)
+
+model = get_peft_model(model, config)
+
+data = load_dataset("ybelkada/english_quotes_copy")
+data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)
+
+trainer = Trainer(
+model=model,
+train_dataset=data["train"],
+args=TrainingArguments(
+per_device_train_batch_size=4,
+gradient_accumulation_steps=4,
+warmup_steps=2,
+max_steps=3,
+learning_rate=2e-4,
+fp16=True,
+logging_steps=1,
+output_dir=tmp_dir,
+),
+data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
+)
+model.config.use_cache = False
+trainer.train()
+
+model.cpu().save_pretrained(tmp_dir)
+
+assert "adapter_config.json" in os.listdir(tmp_dir)
+assert SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir)
+
+# assert loss is not None
+assert trainer.state.log_history[-1]["train_loss"] is not None
+
 
 @require_torch_gpu
 @require_auto_gptq