In [1]:
from transformers import BitsAndBytesConfig
from transformers import AutoModelForCausalLM
import torch

# Hub identifier of the checkpoint used throughout the notebook.
model_id = 'tiiuae/falcon-7b-instruct'

# Quantization settings: 4-bit loading with the 'nf4' quant type, double
# quantization enabled, and bfloat16 as the compute dtype.
quantization_options = {
  'load_in_4bit': True,
  'bnb_4bit_quant_type': 'nf4',
  'bnb_4bit_use_double_quant': True,
  'bnb_4bit_compute_dtype': torch.bfloat16,
}
nf4_config = BitsAndBytesConfig(**quantization_options)

# Load the causal LM with the quantization config applied at load time.
# NOTE(review): trust_remote_code=True allows code shipped with the checkpoint
# to run locally -- confirm this is still required for this model.
model_nf4 = AutoModelForCausalLM.from_pretrained(
  model_id,
  trust_remote_code=True,
  quantization_config=nf4_config,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
from transformers import AutoTokenizer
from transformers import pipeline

# Tokenizer matching the checkpoint loaded above.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Build the text-generation pipeline around the already-loaded quantized model.
# NOTE: this rebinds the name `pipeline` from the imported factory function to
# the pipeline object; the import at the top of this cell restores the factory
# whenever the whole cell is re-run. Later cells call `pipeline(...)` directly.
pipeline = pipeline(
  task='text-generation',
  tokenizer=tokenizer,
  model=model_nf4,
  device_map='auto',
  torch_dtype=torch.bfloat16,
  trust_remote_code=True,
)

In [3]:
# Prompt: a short persona description followed by the start of a dialogue that
# the model is expected to continue.
input_text = (
  'Girafatron is obsessed with giraffes, the most glorious animal on the face of this Earth. '
  'Giraftron believes all other animals are irrelevant when compared to the glorious majesty of the giraffe.\n'
  'Daniel: Hello, Girafatron!\n'
  'Girafatron:'
)

# Generation settings: sampling enabled with top_k=10, a single returned
# sequence, and the tokenizer's EOS id passed as the end-of-sequence token.
sampling_options = {
  'max_length': 200,
  'do_sample': True,
  'top_k': 10,
  'num_return_sequences': 1,
  'eos_token_id': tokenizer.eos_token_id,
}
sequences = pipeline(input_text, **sampling_options)

# Last expression of the cell: display the generated result.
sequences

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


[{'generated_text': "Girafatron is obsessed with giraffes, the most glorious animal on the face of this Earth. Giraftron believes all other animals are irrelevant when compared to the glorious majesty of the giraffe.\nDaniel: Hello, Girafatron!\nGirafatron: Hey, Daniel! How is it going?\nDaniel: It's going great. I'm doing a great job here in Animal Control.\nGirafatron: That's a great job, man! Keep up the good work!\nGirafatron is obsessed with giraffes, the most glorious animal on the face of this Earth. Giraftron believes all other animals are irrelevant when compared to the glorious majesty of the giraffe."}]

In [4]:
from transformers.generation.streamers import BaseStreamer
import numpy as np

class Streamer(BaseStreamer):
  """Streamer that accumulates token ids and prints the decoded text so far.

  Every call to `put` appends the received token ids to `self.result` and then
  prints the decoding of *all* ids collected up to that point (the full text,
  not only the newly added piece). `end` prints the complete text once more.
  Decoding relies on the module-level `tokenizer`.

  Attributes:
    result: list of token ids collected by this instance, in arrival order.
  """

  def __init__(self):
    super().__init__()
    # Per-instance buffer. A class-level `result = []` would be one list shared
    # by every instance (in-place `+=` mutates it), so a second Streamer would
    # start out pre-filled with the tokens of the previous run.
    self.result = []

  def _print_decoded(self):
    """Decode every token id collected so far and print the resulting text."""
    print(tokenizer.decode(self.result))

  def put(self, value):
    """Append the token ids carried by `value` and print the text so far.

    Args:
      value: tensor of token ids exposing `.shape` and integer indexing.
        - rank 2: the ids of the first row are appended (other rows are
          ignored).
        - rank 1: only the first element is appended.
        - any other rank: nothing is appended; the current text is still
          printed.
    """
    rank = len(value.shape)
    if rank == 2:
      # `.tolist()` yields plain Python ints and, unlike `.numpy()`, does not
      # require the tensor to live on the CPU.
      self.result.extend(value[0].tolist())
    elif rank == 1:
      self.result.append(value[0].item())
    self._print_decoded()

  def end(self):
    """Print the full decoded text one final time; the buffer is kept."""
    self._print_decoded()


# Fresh streamer that prints the decoded text as tokens arrive.
streamer = Streamer()

# Same sampling settings as the previous generation cell, but with
# max_length=1024 and the streamer attached.
streaming_options = {
  'max_length': 1024,
  'do_sample': True,
  'top_k': 10,
  'num_return_sequences': 1,
  'eos_token_id': tokenizer.eos_token_id,
  'streamer': streamer,
}
sequences = pipeline(input_text, **streaming_options)

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


Girafatron is obsessed with giraffes, the most glorious animal on the face of this Earth. Giraftron believes all other animals are irrelevant when compared to the glorious majesty of the giraffe.
Daniel: Hello, Girafatron!
Girafatron:
Girafatron is obsessed with giraffes, the most glorious animal on the face of this Earth. Giraftron believes all other animals are irrelevant when compared to the glorious majesty of the giraffe.
Daniel: Hello, Girafatron!
Girafatron: Hello
Girafatron is obsessed with giraffes, the most glorious animal on the face of this Earth. Giraftron believes all other animals are irrelevant when compared to the glorious majesty of the giraffe.
Daniel: Hello, Girafatron!
Girafatron: Hello,
Girafatron is obsessed with giraffes, the most glorious animal on the face of this Earth. Giraftron believes all other animals are irrelevant when compared to the glorious majesty of the giraffe.
Daniel: Hello, Girafatron!
Girafatron: Hello, Daniel
Girafatron is obsessed with giraf