In [104]:
import numpy as np
from transformers import GPT2Tokenizer, TFGPT2Model

In [1]:
!pip install transformers accelerate bitsandbytes>0.37.0

In [139]:
# Load the pretrained GPT-2 base model and its tokenizer. This un-quantized
# model is the baseline for the memory and latency comparisons further down.
from transformers import GPT2Model, GPT2Tokenizer

model = GPT2Model.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [153]:
# Parameter memory in bytes: element count times bytes per element, summed
# over every parameter tensor (4 bytes per element for 32-bit floats).
mem = sum(p.numel() * p.element_size() for p in model.parameters())
mem

497759232

In [158]:
# Convert the byte count to units of 1024**3 bytes for readability.
print(f"size in gb {mem / 1024**3}")

size in gb 0.4635744094848633


# 8-bit model

In [11]:
# Load GPT-2 with 8-bit quantization requested through a BitsAndBytesConfig.
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(load_in_8bit=True)
model_8bit = AutoModelForCausalLM.from_pretrained(
    "openai-community/gpt2", quantization_config=quantization_config
)

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now default to True since model is quantized.


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [74]:
# Memory footprint (bytes) of the un-quantized baseline model. It is computed
# here because no cell in the notebook defines `mf`; on a fresh kernel the
# ratio below would otherwise raise NameError.
mf = model.get_memory_footprint()

# Memory footprint (bytes) of the 8-bit model.
q_mf = model_8bit.get_memory_footprint()
print(q_mf)

# How many times smaller the 8-bit model is than the baseline.
ratio = mf / q_mf
ratio

176527896


2.8910002530138352

In [75]:
# Show the module tree of the 8-bit model to see which layers were replaced.
model_8bit

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Linear8bitLt(in_features=768, out_features=2304, bias=True)
          (c_proj): Linear8bitLt(in_features=768, out_features=768, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Linear8bitLt(in_features=768, out_features=3072, bias=True)
          (c_proj): Linear8bitLt(in_features=3072, out_features=768, bias=True)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwi

In [76]:
# Shape of the c_fc weight in the MLP of the first transformer block.
model_8bit.transformer.h[0].mlp.c_fc.weight.shape

torch.Size([3072, 768])

In [77]:
# Inspect the stored c_fc weight of the first block to see its values and dtype.
model_8bit.transformer.h[0].mlp.c_fc.weight

Parameter containing:
Parameter(Int8Params([[ 25, -34,  13,  ...,  13,   9,  -8],
            [ 26, -18, -10,  ...,  42,  34,   0],
            [ -7,   7, -11,  ...,   0, -10, -11],
            ...,
            [-43,  47,  16,  ..., -96, -47, -10],
            [ 36, -29, -17,  ...,  22,  31,  13],
            [ 18, -16,  21,  ...,   6, -11,  49]], device='cuda:0',
           dtype=torch.int8))

In [78]:
# Largest and smallest stored values of the first block's c_fc weight,
# printed one per line.
print(
    model_8bit.transformer.h[0].mlp.c_fc.weight.max(),
    model_8bit.transformer.h[0].mlp.c_fc.weight.min(),
    sep="\n",
)

tensor(127, device='cuda:0', dtype=torch.int8)
tensor(-127, device='cuda:0', dtype=torch.int8)


# 4-bit model

In [84]:
# Load GPT-2 with 4-bit quantization through bitsandbytes.
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# `bnb_4bit_quant_type` defaults to "fp4"; it is set explicitly so that the
# configuration really is the NF4 one its name announces.
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
)

model_4bit = AutoModelForCausalLM.from_pretrained(
    "openai-community/gpt2",
    quantization_config=nf4_config
)

`low_cpu_mem_usage` was None, now default to True since model is quantized.


In [85]:
# Memory footprint (bytes) of the 4-bit model.
q4_mf = model_4bit.get_memory_footprint()
print(q4_mf)

# Baseline footprint (bytes) of the un-quantized model, computed here rather
# than read from an `mf` variable that no cell in the notebook defines.
mf = model.get_memory_footprint()

# Size reduction of the 4-bit model relative to the baseline.
ratio = mf / q4_mf
print('origina to q4 ration', ratio)

# Size reduction of the 4-bit model relative to the 8-bit model.
ratio = q_mf / q4_mf
print('q8 to q4 ration', ratio)

134060568
origina to q4 ration 3.8068031458735874
q8 to q4 ration 1.3167771749258887


In [86]:
# Show the module tree of the 4-bit model to see which layers were replaced.
model_4bit

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Linear4bit(in_features=768, out_features=2304, bias=True)
          (c_proj): Linear4bit(in_features=768, out_features=768, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Linear4bit(in_features=768, out_features=3072, bias=True)
          (c_proj): Linear4bit(in_features=3072, out_features=768, bias=True)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affin

In [90]:
# Largest and smallest stored values of the first block's c_fc weight in the
# 4-bit model, printed one per line.
print(
    model_4bit.transformer.h[0].mlp.c_fc.weight.max(),
    model_4bit.transformer.h[0].mlp.c_fc.weight.min(),
    sep="\n",
)

tensor(255, device='cuda:0', dtype=torch.uint8)
tensor(0, device='cuda:0', dtype=torch.uint8)


# Latency

In [114]:
# `device` is not defined by any cell in the notebook, so this line fails on a
# fresh kernel. Take the device from the already-loaded quantized model so the
# baseline model and the inputs created from `device` sit on the same device
# as the quantized models they are compared with.
device = model_8bit.device

# Reload the un-quantized baseline and move it to that device.
model = GPT2Model.from_pretrained('gpt2').to(device)

In [115]:
# Sample prompt shared by all three latency measurements.
text = "Replace me by any text you'd like."

# Tokenize to PyTorch tensors and move them to the device used by the models.
encoded_input = tokenizer(
    text,
    return_tensors="pt",
).to(device)

In [116]:
%%time
# Time a single forward pass of the baseline model on the encoded prompt.
# NOTE(review): one un-warmed call; as the first timed call it may include
# one-off start-up cost — confirm with repeated runs before comparing.
output = model(**encoded_input)

CPU times: user 128 ms, sys: 53.2 ms, total: 182 ms
Wall time: 682 ms


In [117]:
%%time
# Time a single forward pass of the 8-bit model on the same encoded prompt.
# NOTE(review): a single run, not an average — treat the figure as indicative
# only until confirmed with repeated runs.
output = model_8bit(**encoded_input)

CPU times: user 93 ms, sys: 4.83 ms, total: 97.8 ms
Wall time: 223 ms


In [118]:
%%time
# Time a single forward pass of the 4-bit model on the same encoded prompt.
# NOTE(review): a single run, not an average — treat the figure as indicative
# only until confirmed with repeated runs.
output = model_4bit(**encoded_input)

CPU times: user 37.1 ms, sys: 0 ns, total: 37.1 ms
Wall time: 75.9 ms


