In [1]:
# !pip -q install transformers datasets accelerate adapter-transformers bitsandbytes
# !git clone https://github.com/jonathan-hsu123/LLM-Adapters.git

In [2]:
# %cd LLM-Adapters
# !pip install -r requirements.txt

In [3]:
# %cd peft
# !pip install .

In [4]:
import sys
import subprocess

def in_venv():
    return sys.prefix != sys.base_prefix

in_venv()


True

In [5]:
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

Your runtime has 134.6 gigabytes of available RAM

You are using a high-RAM runtime!


In [6]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Sat Nov 30 17:28:20 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA RTX A6000               Off |   00000000:17:00.0 Off |                  Off |
| 30%   44C    P8             20W /  300W |   14674MiB /  49140MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [7]:
# !huggingface-cli login
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [8]:
import importlib
import math
import re
import warnings
from dataclasses import asdict, dataclass, field
import enum
import json
import os
import sys
from enum import Enum
from typing import List, Optional, Union

import torch
import torch.nn as nn
import torch.nn.functional as F

from huggingface_hub import hf_hub_download
from transformers.activations import ACT2FN
from transformers.utils import PushToHubMixin

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

In [9]:
# in LLM-Adapters/peft/src/peft/utils/config.py

class PeftType(str, enum.Enum):
    PROMPT_TUNING = "PROMPT_TUNING"
    P_TUNING = "P_TUNING"
    PREFIX_TUNING = "PREFIX_TUNING"
    LORA = "LORA"
    BOTTLENECK = "BOTTLENECK"

class TaskType(str, enum.Enum):
    SEQ_CLS = "SEQ_CLS"
    SEQ_2_SEQ_LM = "SEQ_2_SEQ_LM"
    CAUSAL_LM = "CAUSAL_LM"
    TOKEN_CLS = "TOKEN_CLS"

@dataclass
class PeftConfigMixin(PushToHubMixin):
    r"""
    This is the base configuration class for PEFT adapter models. It contains all the methods that are common to all
    PEFT adapter models. This class inherits from `transformers.utils.PushToHubMixin` which contains the methods to
    push your model to the Hub. The method `save_pretrained` will save the configuration of your adapter model in a
    directory. The method `from_pretrained` will load the configuration of your adapter model from a directory.

    Args:
        peft_type (Union[[`~peft.utils.config.PeftType`], `str`]): The type of Peft method to use.
    """
    peft_type: Optional[PeftType] = field(default=None, metadata={"help": "The type of PEFT model."})

    @property
    def __dict__(self):
        return asdict(self)

    def to_dict(self):
        return self.__dict__

    def save_pretrained(self, save_directory, **kwargs):
        r"""
        This method saves the configuration of your adapter model in a directory.

        Args:
            save_directory (`str`):
                The directory where the configuration will be saved.
            **kwargs:
                Additional keyword arguments passed along to the `transformers.utils.PushToHubMixin.push_to_hub`
                method.
        """
        if os.path.isfile(save_directory):
            raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")

        os.makedirs(save_directory, exist_ok=True)

        output_dict = self.__dict__
        output_path = os.path.join(save_directory, CONFIG_NAME)

        # save it
        with open(output_path, "w") as writer:
            writer.write(json.dumps(output_dict, indent=2, sort_keys=True))

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        r"""
        This method loads the configuration of your adapter model from a directory.

        Args:
            pretrained_model_name_or_path (`str`):
                The directory or the hub-id where the configuration is saved.
            **kwargs:
                Additional keyword arguments passed along to the child class initialization.
        """
        if os.path.isfile(os.path.join(pretrained_model_name_or_path, CONFIG_NAME)):
            config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
        else:
            try:
                config_file = hf_hub_download(pretrained_model_name_or_path, CONFIG_NAME)
            except Exception:
                raise ValueError(f"Can't find config.json at '{pretrained_model_name_or_path}'")

        loaded_attributes = cls.from_json_file(config_file)

        config = cls(**kwargs)

        for key, value in loaded_attributes.items():
            if hasattr(config, key):
                setattr(config, key, value)

        return config

    @classmethod
    def from_json_file(cls, path_json_file, **kwargs):
        r"""
        Loads a configuration file from a json file.

        Args:
            path_json_file (`str`):
                The path to the json file.
        """
        with open(path_json_file, "r") as file:
            json_object = json.load(file)

        return json_object


@dataclass
class PeftConfig(PeftConfigMixin):
    """
    This is the base configuration class to store the configuration of a :class:`~peft.PeftModel`.

    Args:
        peft_type (Union[[`~peft.utils.config.PeftType`], `str`]): The type of Peft method to use.
        task_type (Union[[`~peft.utils.config.TaskType`], `str`]): The type of task to perform.
        inference_mode (`bool`, defaults to `False`): Whether to use the Peft model in inference mode.
    """

    base_model_name_or_path: str = field(default=None, metadata={"help": "The name of the base model to use."})
    peft_type: Union[str, PeftType] = field(default=None, metadata={"help": "Peft type"})
    task_type: Union[str, TaskType] = field(default=None, metadata={"help": "Task type"})
    inference_mode: bool = field(default=False, metadata={"help": "Whether to use inference mode"})

In [10]:
# in LLM-Adapters/peft/src/peft/utils/other.py
from transformers import Conv1D


def transpose(weight, fan_in_fan_out):
    return weight.T if fan_in_fan_out else weight


def is_bnb_available():
    return importlib.util.find_spec("bitsandbytes") is not None


if is_bnb_available():
    import bitsandbytes as bnb



class AdapterLayer:
    def __init__(
        self,
        bottleneck_size: int,
        non_linearity: str,
        adapter_dropout: float,
        scaling: Union[float, str],
    ):
        self.bottleneck_size = bottleneck_size
        self.non_linearity = non_linearity
        self.scaling = scaling
        #optional dropout
        if adapter_dropout > 0.0:
            self.adapter_dropout = nn.Dropout(p=adapter_dropout)
        else:
            self.adapter_dropout = lambda x: x
        self.disable_adapters = False


class Linear(nn.Linear, AdapterLayer):
    """
    Bottleneck adapter in a dense layer. The adapter can be applied after the multi-head attention layer and/or
    after the feed-forward layer.
    """
    def __init__(
        self,
        in_features: int,
        out_features: int,
        adapter_type: str,
        bottleneck_size: int,
        non_linearity: str,
        adapter_dropout: float,
        scaling: Union[float, str],
        init_weights: str,
        **kwargs,
    ):
        nn.Linear.__init__(self, in_features, out_features, **kwargs)
        AdapterLayer.__init__(self, bottleneck_size=bottleneck_size,
                                non_linearity=non_linearity,
                                adapter_dropout=adapter_dropout,
                                scaling=scaling)

        self.init_weights = init_weights
        self.adapter_type = adapter_type
        if isinstance(scaling, float):
            self.adapter_scaling = scaling
        elif scaling == "learned":
            self.adapter_scaling = nn.Parameter(torch.ones(1))
        # Actual trainable parameters
        self.adapter_down = nn.Linear(in_features, bottleneck_size, bias=False)
        self.adapter_up = nn.Linear(bottleneck_size, out_features, bias=False)
        self.act_fn = ACT2FN[self.non_linearity]
        #Freezing the pre-trained weight matrix
        self.weight.requires_grad = False
        self.reset_parameters()

    def reset_parameters(self):
        nn.Linear.reset_parameters(self)
        # if we want to initialize with the bert strategy then this function is called for all the linear layers
        if hasattr(self, "adapter_down"):
            if self.init_weights == "bert":
                self.adapter_down.apply(self.init_bert_weights)
                self.adapter_up.apply(self.init_bert_weights)
            elif self.init_weights == "mam_adapter":
                nn.init.kaiming_uniform_(self.adapter_down.weight, a=math.sqrt(5))
                nn.init.zeros_(self.adapter_up.weight)
            else:
                raise ValueError("Unknown init_weights type: {}".format(config["init_weights"]))

    # This is copied from the BertPreTrainedModel class to make this a self containing class.
    @staticmethod
    def init_bert_weights(module):
        """Initialize the weights."""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            # std defaults to 0.02, this might need to be changed
            module.weight.data.normal_(mean=0.0, std=0.02)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()

    def train(self, mode: bool = True):
        nn.Linear.train(self, mode)
        self.adapter_down.train(mode)
        self.adapter_up.train(mode)

    def eval(self):
        nn.Linear.eval(self)
        self.adapter_down.eval()
        self.adapter_up.eval()

    def forward(self, x: torch.Tensor):
        if self.disable_adapters:
            return F.linear(x, self.weight, bias=self.bias)
        else:
            if self.adapter_type == "mh_adapter":
                # for mh_adapter, x will pass the adapter first and then the linear layer
                expected_dtype = x.dtype
                residual = x

                if x.dtype != torch.float32:
                    x = x.float()
                output = self.adapter_up(self.act_fn(self.adapter_down(self.adapter_dropout(x)))).to(expected_dtype) * self.adapter_scaling

                output = output + residual

                result = F.linear(output, self.weight, bias=self.bias)
            elif self.adapter_type == "output_adapter":
                # for output_adapter, x will pass the linear layer first and then the adapter
                x = F.linear(x, self.weight, bias=self.bias)
                expected_dtype = x.dtype
                residual = x

                if x.dtype != torch.float32:
                    x = x.float()

                output = self.adapter_up(self.act_fn(self.adapter_down(self.adapter_dropout(x)))).to(expected_dtype) * self.adapter_scaling

                result = output + residual
            elif self.adapter_type == "parallel_adapter":
                # for parallel_adapter, x will pass the linear layer first and the adapter layer parallelly.
                # The output of the adapter layer will be added to the output of the linear layer
                result = F.linear(x, self.weight, bias=self.bias)
                expected_dtype = result.dtype

                if x.dtype != torch.float32:
                    x = x.float()
                output = self.adapter_up(self.act_fn(self.adapter_down(self.adapter_dropout(x)))).to(expected_dtype) * self.adapter_scaling

                result = result + output
            return result


if is_bnb_available():

    class Linear8bitLt(bnb.nn.Linear8bitLt, AdapterLayer):
        # Aadapter layer for 8bit linear layer
        def __init__(
            self,
            in_features: int,
            out_features: int,
            adapter_type: str,
            bottleneck_size: int,
            non_linearity: str,
            adapter_dropout: float,
            scaling: Union[float, str],
            init_weights: str,
            **kwargs,
        ):
            bnb.nn.Linear8bitLt.__init__(
                self,
                in_features,
                out_features,
                bias=kwargs.get("bias", True),
                has_fp16_weights=kwargs.get("has_fp16_weights", True),
                memory_efficient_backward=kwargs.get("memory_efficient_backward", False),
                threshold=kwargs.get("threshold", 0.0),
                index=kwargs.get("index", None),
            )
            AdapterLayer.__init__(
                self,
                bottleneck_size=bottleneck_size,
                non_linearity=non_linearity,
                adapter_dropout=adapter_dropout,
                scaling=scaling,)

            self.init_weights = init_weights
            self.adapter_type = adapter_type
            if isinstance(scaling, float):
                self.adapter_scaling = scaling
            elif scaling == "learned":
                self.adapter_scaling = nn.Parameter(torch.ones(1))
            # Actual trainable parameters
            self.adapter_down = nn.Linear(in_features, bottleneck_size, bias=False)
            self.adapter_up = nn.Linear(bottleneck_size, out_features, bias=False)
            self.act_fn = ACT2FN[self.non_linearity]
            #Freezing the pre-trained weight matrix
            self.weight.requires_grad = False
            self.reset_parameters()

        def reset_parameters(self):
            nn.Linear.reset_parameters(self)
            # if we want to initialize with the bert strategy then this function is called for all the linear layers
            if hasattr(self, "adapter_down"):
                if self.init_weights == "bert":
                    self.adapter_down.apply(self.init_bert_weights)
                    self.adapter_up.apply(self.init_bert_weights)
                elif self.init_weights == "mam_adapter":
                    nn.init.kaiming_uniform_(self.adapter_down.weight, a=math.sqrt(5))
                    nn.init.zeros_(self.adapter_up.weight)
                else:
                    raise ValueError("Unknown init_weights type: {}".format(config["init_weights"]))

        # This is copied from the BertPreTrainedModel class to make this a self containing class.
        @staticmethod
        def init_bert_weights(module):
            """Initialize the weights."""
            if isinstance(module, (nn.Linear, nn.Embedding)):
                # std defaults to 0.02, this might need to be changed
                module.weight.data.normal_(mean=0.0, std=0.02)
            elif isinstance(module, nn.LayerNorm):
                module.bias.data.zero_()
                module.weight.data.fill_(1.0)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()

        def forward(self, x: torch.Tensor):
            result_pre_forward = super().forward(x)

            if self.disable_adapters:
                return result_pre_forward
            else:
                if self.adapter_type == "mh_adapter":
                    if not torch.is_autocast_enabled():
                        expected_dtype = x.dtype

                        if x.dtype != torch.float32:
                            x = x.float()

                        residual = x
                        output = self.adapter_up(self.act_fn(self.adapter_down(self.adapter_dropout(x)))).to(expected_dtype) * self.adapter_scaling
                        output = (output + residual).to(expected_dtype)

                        result = super().forward(output)
                    else:
                        residual = x
                        output = self.adapter_up(self.act_fn(self.adapter_down(self.adapter_dropout(x)))) * self.adapter_scaling
                        output = output + residual

                        print(output.shape, "2")

                        result = super().forward(output)
                elif self.adapter_type == "output_adapter":
                    if not torch.is_autocast_enabled():
                        expected_dtype = result_pre_forward.dtype

                        if result_pre_forward.dtype != torch.float32:
                            result_pre_forward = result_pre_forward.float()

                        residual = result_pre_forward
                        output = self.adapter_up(self.act_fn(self.adapter_down(self.adapter_dropout(result_pre_forward)))).to(expected_dtype) * self.adapter_scaling
                        result = (output + residual).to(expected_dtype)
                    else:
                        residual = result_pre_forward
                        output = self.adapter_up(self.act_fn(self.adapter_down(self.adapter_dropout(result_pre_forward)))) * self.adapter_scaling
                        result = output + residual
                elif self.adapter_type == "parallel_adapter":
                    if not torch.is_autocast_enabled():
                        expected_dtype = result_pre_forward.dtype

                        if x.dtype != torch.float32:
                            x = x.float()

                        output = self.adapter_up(self.act_fn(self.adapter_down(self.adapter_dropout(x)))).to(expected_dtype) * self.adapter_scaling
                        result = result_pre_forward + output
                    else:
                        output = self.adapter_up(self.act_fn(self.adapter_down(self.adapter_dropout(x)))) * self.adapter_scaling
                        result = result_pre_forward + output

                return result


class Conv1DAdapter(Conv1D, AdapterLayer):
    def __init__(
        self,
        nf: int,
        nx: int,
        adapter_type: str,
        bottleneck_size: int,
        non_linearity: str,
        adapter_dropout: float,
        scaling: Union[float, str],
        init_weights: str,
        **kwargs,
    ):
        Conv1D.__init__(self, nf, nx)
        AdapterLayer.__init__(
            self,
            bottleneck_size=bottleneck_size,
            non_linearity=non_linearity,
            adapter_dropout=adapter_dropout,
            scaling=scaling,
        )

        self.nx = nx
        self.nf = nf
        self.init_weights = init_weights
        self.adapter_type = adapter_type
        if isinstance(scaling, float):
            self.adapter_scaling = scaling
        elif scaling == "learned":
            self.adapter_scaling = nn.Parameter(torch.ones(1))
        else:
            self.adapter_scaling = 1.0  # Default scaling
        # Adapter layers
        self.adapter_down = Conv1D(bottleneck_size, nx)
        self.adapter_up = Conv1D(nf, bottleneck_size)
        self.act_fn = ACT2FN[self.non_linearity]
        # Freeze the pre-trained weights
        self.weight.requires_grad = False
        self.bias.requires_grad = False
    def train(self, mode: bool = True):
        torch.nn.Conv1d.train(self, mode)
        self.adapter_down.train(mode)
        self.adapter_up.train(mode)
    def eval(self):
        torch.nn.Conv1d.eval(self)
        self.adapter_down.eval()
        self.adapter_up.eval()
    def forward(self, x: torch.Tensor):
        if self.disable_adapters:
            return super().forward(x)
        else:
            if self.adapter_type == "mh_adapter":
                # For mh_adapter, x will pass the adapter first and then the convolutional layer
                expected_dtype = x.dtype
                residual = x
                if x.dtype != torch.float32:
                    x = x.float()
                output = (
                    self.adapter_up(
                        self.act_fn(self.adapter_down(self.adapter_dropout(x)))
                    ).to(expected_dtype)
                    * self.adapter_scaling
                )
                output = output + residual

                # print(output.size()[:-1] + (self.nf,))
                # print(output.shape, residual.shape, "1")
                # x = x.view(*size_out)
                result = F.linear(output, self.weight.T, bias=self.bias)

                # print(output.shape, result.shape)
                # result = super().forward(output)
                # result = output + residual
            elif self.adapter_type == "output_adapter":
                # For output_adapter, x will pass the convolutional layer first and then the adapter
                # x = super().forward(x)
                x = F.linear(x, self.weight.T, bias=self.bias)
                expected_dtype = x.dtype
                residual = x
                if x.dtype != torch.float32:
                    x = x.float()
                output = (
                    self.adapter_up(
                        self.act_fn(self.adapter_down(self.adapter_dropout(x)))
                    ).to(expected_dtype)
                    * self.adapter_scaling
                )
                result = output + residual
            elif self.adapter_type == "parallel_adapter":
                # For parallel_adapter, x will pass the convolutional layer first and the adapter layer parallelly.
                # The output of the adapter layer will be added to the output of the convolutional layer
                result = super().forward(x)
                expected_dtype = result.dtype
                if x.dtype != torch.float32:
                    x = x.float()
                output = (
                    self.adapter_up(
                        self.act_fn(self.adapter_down(self.adapter_dropout(x)))
                    ).to(expected_dtype)
                    * self.adapter_scaling
                )
                result = result + output
            else:
                raise ValueError(
                    f"Unknown adapter_type: {self.adapter_type}. Expected one of ['mh_adapter', 'output_adapter', 'parallel_adapter']."
                )
            return result

In [11]:
TRANSFORMERS_MODELS_TO_ADAPTER_TYPE_MAPPING = {
    "bloom": {"dense_h_to_4h": "mh_adapter", "dense_4h_to_h": "output_adapter"},
    "gptj": {"fc_in":"mh_adapter", "fc_out":"output_adapter"},
    "gpt_neo": {"c_fc":"mh_adapter", "c_proj":"output_adapter"},
    "gpt2": {"c_fc":"mh_adapter", "c_proj":"output_adapter"},
    "llama": {"gate_proj": "mh_adapter", "up_proj":"mh_adapter", "down_proj":"output_adapter"},
    "opt": {"fc1":"mh_adapter", "fc2":"output_adapter"},
    "chatglm": {"dense_h_to_4h": "mh_adapter", "dense_4h_to_h": "output_adapter"},
}

@dataclass
class BottleneckConfig(PeftConfig):
    """
    This is the configuration class to store the configuration of a [`~peft.Bottleneck`].

    Args:
        bottleneck_size (`int`): The size of the bottleneck.
        non_linearity (`str`): The non-linearity to apply to the bottleneck.
        dropout (`float`, optional): The dropout probability of the bottleneck. Default to 0.0
        bias ('str'): Bias type for Bottleneck. Can be 'none', 'all' or 'adapter_only'. Default to 'none'.
        use_parallel_adapter (:obj:`bool`, optional): Whether to use parallel adapter. Defaults to False.
        scaling (:obj:`float` or :obj:`str`, optional):
            Scaling factor to use for scaled addition of adapter outputs as done by He et al. (2021). Can be either a
            constant factor (float) or the string "learned", in which case the scaling factor is learned. Defaults to
            1.0.
        target_modules (`Union[List[str],str]`): The names of the modules to apply Adapter to.
        init_weights (:obj:`str`, optional): Initialization method for the weights of the adapter modules.
            Currently, this can be either "bert" (default) or "mam_adapter".
        modules_to_save (`List[str]`):List of modules apart from Bottleneck adapter layers to be set as trainable
            and saved in the final checkpoint.
    """
    bottleneck_size : int = field(default=256, metadata={"help": "The size of the bottleneck"})
    non_linearity : str = field(default="tanh", metadata={"help": "The non-linearity to apply to the bottleneck"})
    adapter_dropout : float = field(default=0.0, metadata={"help": "The dropout probability of the bottleneck, default to 0.0"})
    target_modules: Optional[Union[List[str], str]] = field(
        default=None,
        metadata={
            "help": "List of module names or regex expression of the module names to replace with Adapter."
            "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' "
        },
    )
    use_parallel_adapter: bool = field(default=False, metadata={"help": "Whether to use parallel adapter"})
    use_adapterp: bool = field(default=False, metadata={"help": "Whether to use adapterp"})
    scaling: Union[float, str] = 1.0
    bias: str = field(default="none", metadata={"help": "Bias type for Bottleneck. Can be 'none', 'all' or 'adapter_only'"})
    init_weights: str = field(default="bert", metadata={"help": "Initialization method for the weights of the adapter modules."})
    modules_to_save: Optional[List[str]] = field(
        default=None,
        metadata={
            "help": "List of modules apart from Adapter layers to be set as trainable and saved in the final checkpoint. "
            "For example, in Sequence Classification or Token Classification tasks, "
            "the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved."
        },
    )

    def __post_init__(self):
        self.peft_type = PeftType.BOTTLENECK

class BottleneckModel(torch.nn.Module):
    """
    Creates Bottleneck adapter model for a pretrained trainsformers model.

    Args:
        model ('transformers.PreTrainedModel'): The pretrained model to be adapted.
        config (`BottleneckConfig`): The configuration of the Bottleneck adapter.

    Returns:
        `torch.nn.Module`: The Bottleneck adapter model.

    Example::

        >>> from transformers import AutoModelForCausalLM, BottleneckConfig
        >>> from peft import BottleneckModel, BottleneckConfig
        >>> config = BottleneckConfig(
            peft_type="BOTTLNECK", task="CAUSAL_LM", target_modules=["gate_proj", "up_proj", "down_proj"],
            bottleneck_size=256, non_linearity="tanh",
        )
        >>> model = AutoModelForCausalLM.from_pretrained("decapoda-research/llama-7b-hf")
        >>> bottleneck_model = BottleneckModel(config, model)

    **Attribute**:
        - **model** (`transformers.PreTrainedModel`): The pretrained model to be adapted.
        - **peft_config** (`BottleneckConfig`): The configuration of the Bottleneck adapter.
    """

    def __init__(self, config, model):
        super().__init__()
        self.model = model
        self.peft_config = config
        self._find_and_replace()
        mark_only_adapter_as_trainable(self.model, self.peft_config.bias)
        self.forward = self.model.forward
        print(self.forward)

    def _find_and_replace(self):
        loaded_in_8bit = getattr(self.model, "is_loaded_in_8bit", False)
        if loaded_in_8bit and not is_bnb_available():
            raise ImportError(
                "To use Adapter with 8-bit quantization, please install the `bitsandbytes` package. "
                "You can install it with `pip install bitsandbytes`."
            )
        is_target_modules_in_base_model = False
        is_hf_device_map_available = hasattr(self.model, "hf_device_map")
        kwargs = {
            "bottleneck_size": self.peft_config.bottleneck_size,
            "non_linearity": self.peft_config.non_linearity,
            "adapter_dropout": self.peft_config.adapter_dropout,
            "scaling": self.peft_config.scaling,
            "init_weights": self.peft_config.init_weights,
        }
        key_list = [key for key, _ in self.model.named_modules()]
        for key in key_list:
            if isinstance(self.peft_config.target_modules, str):
                target_module_found = re.fullmatch(self.peft_config.target_modules, key)
            else:
                target_module_found = any(key.endswith(target_key) for target_key in self.peft_config.target_modules)
            if target_module_found:
                if not is_target_modules_in_base_model:
                    is_target_modules_in_base_model = True
                parent, target, target_name = self._get_submodules(key)
                # determine the type of adapter to be used, this will effect the forward pass
                if self.peft_config.use_parallel_adapter:
                    adapter_type = "parallel_adapter"
                else:
                    adapter_type = TRANSFORMERS_MODELS_TO_ADAPTER_TYPE_MAPPING[self.model.config.model_type][target_name]
                kwargs.update({"adapter_type": adapter_type})

                bias = target.bias is not None
                # print(loaded_in_8bit, isinstance(target, bnb.nn.Linear8bitLt), target, isinstance(target, Conv1D))
                if loaded_in_8bit and isinstance(target, bnb.nn.Linear8bitLt):
                    kwargs.update(
                        {
                            "has_fp16_weights": target.state.has_fp16_weights,
                            "memory_efficient_backward": target.state.memory_efficient_backward,
                            "threshold": target.state.threshold,
                            "index": target.index,
                        }
                    )
                    if adapter_type == "mh_adapter":
                        new_module = Linear8bitLt(target.in_features, target.in_features, bias=bias, **kwargs)
                    elif adapter_type == "output_adapter":
                        new_module = Linear8bitLt(target.out_features, target.out_features, bias=bias, **kwargs)
                    elif adapter_type == "parallel_adapter":
                        new_module = Linear8bitLt(target.in_features, target.out_features, bias=bias, **kwargs)
                elif isinstance(target, torch.nn.Linear):
                    if adapter_type == "mh_adapter":
                        new_module = Linear(target.in_features, target.in_features, bias=bias, **kwargs)
                    elif adapter_type == "output_adapter":
                        new_module = Linear(target.out_features, target.out_features, bias=bias, **kwargs)
                    elif adapter_type == "parallel_adapter":
                        new_module = Linear(target.in_features, target.out_features, bias=bias, **kwargs)
                # elif isinstance(target, Conv1D):
                #     if adapter_type == "mh_adapter":
                #         new_module = Linear(target.nx, target.nx, bias=bias, **kwargs)
                #     elif adapter_type == "output_adapter":
                #         new_module = Linear(target.nf, target.nf, bias=bias, **kwargs)
                #     elif adapter_type == "parallel_adapter":
                #         new_module = Linear(target.nx, target.nf, bias=bias, **kwargs)
                elif isinstance(target, Conv1D):
                    if adapter_type == "mh_adapter":
                        nx = target.nx  # Input features
                        nf = target.nx  # Output features
                    elif adapter_type == "output_adapter":
                        nx = target.nf               # Input features
                        nf = target.nf               # Output features
                    elif adapter_type == "parallel_adapter":
                        nx = target.nx  # Input features
                        nf = target.nf               # Output features
                    new_module = Conv1DAdapter(
                      nf=nf,
                      nx=nx,
                      **kwargs
                    )
                self._replace_module(parent, target_name, new_module, target)
        if not is_target_modules_in_base_model:
            raise ValueError(
                f"Target modules {self.peft_config.target_modules} not found in the base model. "
                f"Please check the target modules and try again."
            )

    def _get_submodules(self, key):
        parent = self.model.get_submodule(".".join(key.split(".")[:-1]))
        target_name = key.split(".")[-1]
        target = self.model.get_submodule(key)
        return parent, target, target_name

    def _replace_module(self, parent_module, child_name, new_module, old_module):
        setattr(parent_module, child_name, new_module)
        new_module.weight = old_module.weight
        if old_module.bias is not None:
            new_module.bias = old_module.bias
        if getattr(old_module, "state", None) is not None:
            new_module.state = old_module.state
            new_module.to(old_module.weight.device)

        # dispatch to correct device
        for name, module in new_module.named_modules():
            if "adapter_" in name:
                module.to(old_module.weight.device)

    def __getattr__(self, name: str):
        """Forward missing attributes to the wrapped module."""
        try:
            return super().__getattr__(name)  # defer to nn.Module's logic
        except AttributeError:
            return getattr(self.model, name)

    @property
    def modules_to_save(self):
        return None

    def get_peft_config_as_dict(self, inference: bool = False):
        config = {k: v.value if isinstance(v, Enum) else v for k, v in asdict(self.peft_config).items()}
        if inference:
            config["inference_mode"] = True
        return config

    def _set_adapter_layers(self, enabled=True):
        for module in self.model.modules():
            if isinstance(module, AdapterLayer):
                module.disable_adapters = False if enabled else True

    def enable_adapter_layers(self):
        self._set_adapter_layers(enabled=True)

    def disable_adapter_layers(self):
        self._set_adapter_layers(enabled=False)


# Below code is based on https://github.com/adapter-hub/adapter-transformers/blob/master/src/transformers/adapters/modeling.py and lora.py from huggingfance PEFT
# and modified to work with PyTorch FSDP


#  ------------------------------------------------------------------------------------------
#  Copyright (c) Microsoft Corporation. All rights reserved.
#  Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
#  ------------------------------------------------------------------------------------------


# Copy from lora.py
# had to adapt it for `lora_only` to work
def mark_only_adapter_as_trainable(model: nn.Module, bias: str = "none") -> None:
    for n, p in model.named_parameters():
        if "adapter_" not in n:
            p.requires_grad = False
    if bias == "none":
        return
    elif bias == "all":
        for n, p in model.named_parameters():
            if "bias" in n:
                p.requires_grad = True
    elif bias == "adapter_only":
        for m in model.modules():
            if isinstance(m, AdapterLayer) and hasattr(m, "bias") and m.bias is not None:
                m.bias.requires_grad = True
    else:
        raise NotImplementedError


In [12]:
from datasets import load_dataset
dataset = load_dataset("boolq")

Using custom data configuration boolq-150dc58bba7bf36d
Found cached dataset parquet (/u/poyuchen/.cache/huggingface/datasets/parquet/boolq-150dc58bba7bf36d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

In [13]:
def format_example(example):
    question = example['question']
    passage = example['passage']
    answer = 'Yes' if example['answer'] else 'No'
    prompt = f"Question: {question}\nPassage: {passage}\nAnswer:"
    return {'input_text': prompt, 'target_text': answer}
formatted_dataset = dataset.map(format_example, remove_columns=dataset['train'].column_names)

Loading cached processed dataset at /u/poyuchen/.cache/huggingface/datasets/parquet/boolq-150dc58bba7bf36d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-a1e8f8b774871ade.arrow
Loading cached processed dataset at /u/poyuchen/.cache/huggingface/datasets/parquet/boolq-150dc58bba7bf36d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-d9531fa37afdd21a.arrow


In [14]:
from transformers import AutoTokenizer

# tokenizer = LlamaTokenizer.from_pretrained('meta-llama/Llama-2-7b-hf')
# tokenizer.pad_token = tokenizer.eos_token
# tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    inputs = [f"{input_text} {target_text}" for input_text, target_text in zip(examples['input_text'], examples['target_text'])]

    # Tokenize the combined text and return PyTorch tensors
    model_inputs = tokenizer(
        inputs,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_tensors='pt',  # Ensure tensors are returned
    )

    # Create labels by cloning the input IDs
    labels = model_inputs['input_ids'].clone()

    # Determine the length of the input_text for each example
    input_lengths = []
    for input_text in examples['input_text']:
        input_ids = tokenizer(
            input_text,
            max_length=512,
            truncation=True,
            add_special_tokens=False,  # Do not add special tokens here
        )['input_ids']
        input_lengths.append(len(input_ids))

    # Mask the labels for the input_text portion
    for i, input_length in enumerate(input_lengths):
        labels[i, :input_length] = -100  # Use -100 to ignore the input tokens

    model_inputs['labels'] = labels

    return model_inputs

tokenized_dataset = formatted_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=formatted_dataset['train'].column_names,
)

Loading cached processed dataset at /u/poyuchen/.cache/huggingface/datasets/parquet/boolq-150dc58bba7bf36d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-b3d5c1713fbdc4df.arrow


  0%|          | 0/10 [00:00<?, ?ba/s]

In [15]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # We are not using masked language modeling
)

In [16]:
# model = LlamaForCausalLM.from_pretrained('meta-llama/Llama-2-7b-hf', load_in_8bit=True, device_map='auto')
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")  

config = BottleneckConfig(
    bottleneck_size=256,
    non_linearity="tanh",
    adapter_dropout=0.0,
    use_parallel_adapter=True,
    use_adapterp=False,
    scaling=1.0,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["gate_proj", "up_proj", "down_proj"],
)
model = BottleneckModel(config, model)

<bound method LlamaForCausalLM.forward of LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(
            in_features=2048, out_features=8192, bias=False
            (adapter_down): Linear(in_features=2048, out_features=256, bias=False)
            (adapter_up): Linear(in_features=256, out_features=8192, bias=False)
            (act_fn): Tanh()
          )
          (up_proj): Linear(
            in_features=2048, out_features=8192, bias=False

In [17]:
# Detailed parameter counting
total_params = 0
trainable_params = 0
non_trainable_params = 0

for name, parameter in model.named_parameters():
    param_count = parameter.numel()
    total_params += param_count

    if parameter.requires_grad:
        trainable_params += param_count
    else:
        non_trainable_params += param_count

print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
print(f"Non-trainable parameters: {non_trainable_params:,}")
print(f"percentage: {(trainable_params/total_params)*100:.4f}%")

Total parameters: 1,361,643,520
Trainable parameters: 125,829,120
Non-trainable parameters: 1,235,814,400
percentage: 9.2410%


In [18]:
from transformers import TrainingArguments

# Set the pad_token to the eos_token
tokenizer.pad_token = tokenizer.eos_token

# Update model's config
model.config.pad_token_id = tokenizer.pad_token_id
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=100,
    # evaluation_strategy='steps',
    # eval_steps=1000,
    save_steps=3000,
    logging_dir='./logs',
    logging_steps=1000,
    learning_rate=5e-4,
    report_to='none',
    save_safetensors=False,
)
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [19]:
trainer.train()

Step,Training Loss
1000,3.0593
2000,2.8622
3000,2.2078
4000,1.9169
5000,1.5556
6000,0.8139
7000,0.7614


TrainOutput(global_step=7071, training_loss=1.8709106342810382, metrics={'train_runtime': 5435.8731, 'train_samples_per_second': 5.203, 'train_steps_per_second': 1.301, 'total_flos': 9.547812414318182e+16, 'train_loss': 1.8709106342810382, 'epoch': 3.0})

In [20]:
from datasets import load_dataset

# Load the validation split
dataset = load_dataset("boolq", split='validation')
formatted_dataset = dataset.map(format_example)

Using custom data configuration boolq-150dc58bba7bf36d
Found cached dataset parquet (/u/poyuchen/.cache/huggingface/datasets/parquet/boolq-150dc58bba7bf36d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/3270 [00:00<?, ?ex/s]

In [21]:
import torch
model.eval()

def generate_answer(example):
    input_text = example['input_text']

    # Tokenize and move inputs to the model's device
    inputs = tokenizer(
        input_text,
        return_tensors='pt',
        max_length=512,
        truncation=True,
        padding='max_length',
    )
    input_ids = inputs['input_ids'].to(model.device)
    attention_mask = inputs['attention_mask'].to(model.device)

    # Generate the model's output
    with torch.no_grad():
        output_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=15,
            do_sample=False,
            num_beams=1,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode the generated tokens
    generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Extract the answer by removing the input prompt
    answer = generated_text[len(input_text):].strip()

    # Handle empty outputs
    if not answer:
        answer = "No"
        return {'predicted_answer': answer}

    # # Keep only the first word (Yes or No)
    # answer = answer.split()[0]
    # print(answer)

    # # Handle unexpected outputs
    if 'yes' in answer.lower():
        answer = 'Yes'
    elif 'no' in answer.lower():
        answer = 'No'
    else:
        answer = 'No'

    return {'predicted_answer': answer}


# Map the generate_answer function over the validation set
predictions = formatted_dataset.map(generate_answer, batched=False)




  0%|          | 0/3270 [00:00<?, ?ex/s]

Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


In [22]:
import numpy as np

def compute_accuracy(predictions):
    # Convert the true and predicted answers to lowercase for consistency
    true_answers = [answer.lower() for answer in predictions['target_text']]
    predicted_answers = [answer.lower() for answer in predictions['predicted_answer']]

    # Calculate the number of correct predictions
    correct_predictions = sum([
        true == pred for true, pred in zip(true_answers, predicted_answers)
    ])

    # Compute accuracy
    accuracy = correct_predictions / len(true_answers)
    return accuracy

# Calculate accuracy
accuracy = compute_accuracy(predictions)
print(f"Accuracy on the validation set: {accuracy * 100:.2f}%")

Accuracy on the validation set: 40.61%
