In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import bitsandbytes


Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues


In [2]:

tokenizer = AutoTokenizer.from_pretrained(
  'kakaobrain/kogpt', revision='KoGPT6B-ryan1.5b-float16',  # or float32 version: revision=KoGPT6B-ryan1.5b
  bos_token='[BOS]', eos_token='[EOS]', unk_token='[UNK]', pad_token='[PAD]', mask_token='[MASK]'
)
model = AutoModelForCausalLM.from_pretrained(
  'kakaobrain/kogpt', revision='KoGPT6B-ryan1.5b-float16',  # or float32 version: revision=KoGPT6B-ryan1.5b
  pad_token_id=tokenizer.eos_token_id, low_cpu_mem_usage=True, device_map='auto', load_in_8bit=True
)



In [20]:
with torch.no_grad():
    prompt = tokenizer.encode("한화의 김성근 감독님", return_tensors='pt').to(device='cuda', non_blocking=True)
    out = model.generate(**prompt, min_length=128, max_length=128, do_sample=True)
    generated = tokenizer.batch_decode(out)[0]
    
print(generated)



RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [3]:
for param in model.parameters():
    param.requires_grad = False

In [4]:
import torch
import torch.nn.functional as F
from torch import nn
from torch.cuda.amp import custom_fwd, custom_bwd

In [5]:
class LoRALinear(bitsandbytes.nn.Linear8bitLt):
    pass

class LoRAEmbedding(bitsandbytes.nn.Linear8bitLt):
    pass

In [11]:
def add_adapters(model, adapter_dim=16):
    assert adapter_dim > 0

    for module in model.modules():
        if type(module) == bitsandbytes.nn.Linear8bitLt or type(module) == nn.Linear:
            module.adapter = nn.Sequential(
                LoRALinear(module.in_features, adapter_dim, bias=False),
                LoRALinear(adapter_dim, module.out_features, bias=False),
            )
            nn.init.zeros_(module.adapter[1].weight)
        elif type(module) == nn.Embedding:
            module.adapter = nn.Sequential(
                LoRAEmbedding(module.num_embeddings, adapter_dim),
                LoRALinear(adapter_dim, module.embedding_dim, bias=False),
            )
            nn.init.zeros_(module.adapter[1].weight)

add_adapters(model)

In [14]:
print(model)

GPTJForCausalLM(
  (transformer): GPTJModel(
    (wte): Embedding(
      64512, 4096
      (adapter): Sequential(
        (0): LoRAEmbedding(in_features=64512, out_features=16, bias=True)
        (1): LoRALinear(in_features=16, out_features=4096, bias=False)
      )
    )
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPTJBlock(
        (ln_1): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
        (attn): GPTJAttention(
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
          (k_proj): Linear8bitLt(
            in_features=4096, out_features=4096, bias=False
            (adapter): Sequential(
              (0): LoRALinear(in_features=4096, out_features=16, bias=False)
              (1): LoRALinear(in_features=16, out_features=4096, bias=False)
            )
          )
          (v_proj): Linear8bitLt(
            in_features=4096, out_features=4096, bias=False
            (adapter): S