In [1]:
pip install -qU transformers

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import numpy as np

In [3]:
# NOTE(review): `--token` is passed without a value, so this command fails with the argparse
# usage error recorded in this cell's output. Supply the token from an environment variable or an
# interactive prompt rather than hardcoding it in the notebook.
!huggingface-cli login --token

usage: huggingface-cli <command> [<args>] login [-h] [--token TOKEN] [--add-to-git-credential]
huggingface-cli <command> [<args>] login: error: argument --token: expected one argument


In [38]:
class QuantizerLayer(nn.Module):
    """Linear layer that stores its weight as int8 with one scale per output row.

    The int8 ``weight`` and the floating-point ``scale`` (and ``bias``, when present) are
    registered as buffers, not parameters. ``forward`` casts the int8 weight to the input dtype,
    applies ``F.linear`` and multiplies the result by ``scale``.

    Args:
        in_features: Size of the last input dimension.
        out_features: Number of output features (rows of ``weight``).
        bias: When true, a ``(1, out_features)`` bias buffer is created; otherwise ``bias`` is None.
        dtype: Floating-point dtype used for the ``scale`` and ``bias`` buffers.
    """

    def __init__(self, in_features, out_features, bias=True, dtype=torch.float32):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features

        # Random placeholders only: real values are installed by quantize().
        self.register_buffer("weight", torch.randint(-128, 127, (out_features, in_features)).to(torch.int8))
        self.register_buffer("scale", torch.randn((out_features), dtype=dtype))
        if bias:
            self.register_buffer("bias", torch.randn((1, out_features), dtype=dtype))
        else:
            self.bias = None

    @torch.no_grad()
    def quantize(self, weight):
        """Quantize ``weight`` row-wise to int8 and store the result in this layer.

        For each row, ``scale = max(|row|) / 127`` is stored in ``weight.dtype`` and the row is
        stored as ``round(row / scale)`` clamped to the int8 range.

        Args:
            weight: Floating-point tensor whose last dimension is reduced to compute the scales.
                May be an ``nn.Parameter``; no autograd graph is recorded.
        """
        # Detach and run under no_grad so the stored buffers never carry a grad_fn that would keep
        # the original full-precision weights alive.
        weight_f32 = weight.detach().to(torch.float32)

        Qmin = torch.iinfo(torch.int8).min
        Qmax = torch.iinfo(torch.int8).max

        # The scale is rounded to the storage dtype first so the integers are computed against the
        # exact value that forward() will later multiply by.
        scale = (weight_f32.abs().max(dim=-1).values / Qmax).to(weight.dtype)
        # An all-zero row (or a scale that underflows in the storage dtype) would divide by zero
        # and feed NaN/inf into the int8 cast; a scale of 1 maps such rows to zeros.
        scale = torch.where(scale == 0, torch.ones_like(scale), scale)

        # Divide in float32: rounding the quotient in bfloat16/float16 first can shift the result
        # by a whole quantization step.
        ratio = weight_f32 / scale.to(torch.float32).unsqueeze(-1)
        quantized_weight = torch.clamp(torch.round(ratio), Qmin, Qmax).to(torch.int8)

        self.weight = quantized_weight
        self.scale = scale

    def forward(self, input):
        """Return ``F.linear(input, weight) * scale`` plus ``bias`` when a bias is set."""
        output = F.linear(input, self.weight.to(input.dtype)) * self.scale
        if self.bias is not None:
            output = output + self.bias

        return output


In [39]:
def replace_linearLayer(base_model, quantizer_class, exclude_list, quantized=True):
    """Recursively replace ``nn.Linear`` children of ``base_model`` in place.

    Each direct child that is an ``nn.Linear`` and whose attribute name is not equal to any entry
    of ``exclude_list`` is replaced by ``quantizer_class(in_features, out_features, bias=...,
    dtype=...)``. Every other child (including excluded Linear layers) is recursed into.

    Args:
        base_model: Module whose children are inspected and modified.
        quantizer_class: Callable building the replacement layer; the result must provide
            ``quantize(weight)`` when ``quantized`` is true.
        exclude_list: Iterable of child names that must be left untouched.
        quantized: When true, ``quantize`` is called with the original Linear weight.

    Returns:
        None. ``base_model`` is modified in place.
    """
    for child_name, submodule in base_model.named_children():
        is_excluded = any(excluded == child_name for excluded in exclude_list)
        if not isinstance(submodule, nn.Linear) or is_excluded:
            # Not a replacement target: descend and look for Linear layers further down.
            replace_linearLayer(submodule, quantizer_class, exclude_list, quantized=quantized)
            continue

        linear_bias = submodule.bias
        linear_weight = submodule.weight
        has_bias = linear_bias is not None

        replacement = quantizer_class(
            submodule.in_features,
            submodule.out_features,
            bias=has_bias,
            dtype=linear_weight.dtype,
        )
        setattr(base_model, child_name, replacement)

        if quantized:
            replacement.quantize(linear_weight)

        # The original bias object is carried over unchanged.
        if has_bias:
            replacement.bias = linear_bias



In [40]:
# Load the tokenizer and the causal-LM checkpoint "facebook/opt-350m"; torch_dtype requests
# bfloat16 weights.
# NOTE(review): `tokenzier` is a misspelling of "tokenizer", but later cells reference this exact
# name, so any rename must be applied everywhere at once.
tokenzier = AutoTokenizer.from_pretrained("facebook/opt-350m")
model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", torch_dtype=torch.bfloat16)

In [41]:
# Show the module tree of the freshly loaded model (the Linear layers are still nn.Linear here).
print(model)

OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 512, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 1024)
      (project_out): Linear(in_features=1024, out_features=512, bias=False)
      (project_in): Linear(in_features=512, out_features=1024, bias=False)
      (layers): ModuleList(
        (0-23): 24 x OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=409

In [42]:
# Baseline: footprint reported by get_memory_footprint() before any layer is replaced,
# printed in units of 1e9 bytes.
model_memory_size_before_quantization = model.get_memory_footprint()
print(f"Total memory size before quantization (in GB): {model_memory_size_before_quantization / 1e+9}")

Total memory size before quantization (in GB): 0.662392832


In [43]:
# Build a text-generation pipeline around the unquantized model and generate a baseline
# completion of up to 50 new tokens.
pipe = pipeline("text-generation", model=model, tokenizer=tokenzier)
pipe("Hello There, ", max_new_tokens=50)

[{'generated_text': "Hello There,  I have an egg that matches your TSV. Could you hatch it for me?   Thanks in advance!\nSure thing! I'll be available for the next few hours.\nI'm available now.\nI'm available now.\n"}]

In [44]:
# Second baseline completion from the unquantized model; a similar prompt is used again after
# quantization.
pipe("India is a beautiful country and ", max_new_tokens=50)

[{'generated_text': "India is a beautiful country and  I would love to visit it. But I don't think I can afford to go there.\nI'm sure you can afford to go there."}]

In [45]:
# Replace every nn.Linear in the model, except the child named "lm_head", with a QuantizerLayer
# and quantize the original weights. This mutates `model` in place.
replace_linearLayer(model, QuantizerLayer, ["lm_head"], quantized=True)

In [46]:
# Show the module tree again to confirm which layers were replaced by QuantizerLayer.
print(model)

OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 512, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 1024)
      (project_out): QuantizerLayer()
      (project_in): QuantizerLayer()
      (layers): ModuleList(
        (0-23): 24 x OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): QuantizerLayer()
            (v_proj): QuantizerLayer()
            (q_proj): QuantizerLayer()
            (out_proj): QuantizerLayer()
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (fc1): QuantizerLayer()
          (fc2): QuantizerLayer()
          (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
  )
  (lm_head): Linear(in_features=512, out_features=50272, bias=False)
)


In [47]:
# Footprint measured after the Linear layers were replaced by QuantizerLayer. It is stored under
# its own name so the pre-quantization measurement is not overwritten, and the message is labelled
# "after" to match what is being measured.
model_memory_size_after_quantization = model.get_memory_footprint()
print(f"Total memory size after quantization (in GB): {model_memory_size_after_quantization / 1e+9}")

Total memory size before quantization (in GB): 0.359799808


In [48]:
# Rebuild the pipeline around the quantized model and generate a completion to compare with the
# baseline.
# NOTE(review): this prompt ends with two spaces while the baseline "India ..." prompt ends with
# one, so the two generations are not produced from identical input; use the same string for a
# like-for-like comparison.
pipe = pipeline("text-generation", model=model, tokenizer=tokenzier)
pipe("India is a beautiful country and  ", max_new_tokens=50)

[{'generated_text': "India is a beautiful country and   I'm sure you'll be able to find a job there.                                     "}]