In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteria, StoppingCriteriaList
import torch

# Pick the compute device: Apple's MPS backend when PyTorch reports it, else CPU.
if torch.backends.mps.is_available():
    # Drop cached allocator memory before reporting the usage figures.
    torch.mps.empty_cache()
    memory_report = (
        ("Driver Memory = ", torch.mps.driver_allocated_memory),
        ("Current allocated memory = ", torch.mps.current_allocated_memory),
    )
    for label, read_bytes in memory_report:
        print(label, read_bytes())
    # Both names are bound, exactly as before: `mps_device` and `device`.
    device = mps_device = torch.device("mps")
    print("MPS available on mac. Using it")
else:
    device = "cpu"


Driver Memory =  393216
Current allocated memory =  0
MPS available on mac. Using it


In [3]:
# Load the TinyLlama chat checkpoint. The id is named once so the tokenizer and
# the model can never be loaded from two different checkpoints by accident.
MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# NOTE(review): offload_folder presumably only matters when weights are offloaded
# via device_map; it is kept unchanged here -- confirm whether it is still needed.
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, offload_folder="offload-tinyllama")

# Use half precision only on an accelerator. float16 on CPU is slow and, on older
# PyTorch builds, some ops are not implemented for it, so CPU stays in float32.
if torch.device(device).type != "cpu":
    # Cast before the transfer so only the fp16 weights are copied to the device.
    model = model.half()
model = model.to(device)

print(model.device, model.dtype)

In [13]:
# Hand-written chat prompt: a system turn, a user turn, then an open assistant
# turn for the model to complete. Each finished turn is closed with "</s>".
prompt = (
    "<|system|>\n"
    "You are a chatbot who can help answer queries!</s>\n"
    "<|user|>\n"
    "What is the capital of India?</s>\n"
    "<|assistant|>\n"
)

# Encode as PyTorch tensors, then move them to the same device as the model.
encoded_prompt = tokenizer(prompt, return_tensors="pt")
inputs = encoded_prompt.to(device)
inputs

{'input_ids': tensor([[    1,   529, 29989,  5205, 29989, 29958,    13,  3492,   526,   263,
         13563,  7451,  1058,   508,  1371,  1234,  9365, 29991,     2, 29871,
            13, 29966, 29989,  1792, 29989, 29958,    13,  5618,   338,   278,
          7483,   310,  7513, 29973,     2, 29871,    13, 29966, 29989,   465,
         22137, 29989, 29958,    13]], device='mps:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
       device='mps:0')}

In [2]:
class StopOnTokens(StoppingCriteria):
    """Stop generation as soon as the most recent token is one of ``stop_ids``.

    Only the first sequence of the batch is inspected, as in the original
    implementation.

    Args:
        stop_ids: Iterable of token ids that end generation. When omitted,
            ``DEFAULT_STOP_IDS`` is used, so ``StopOnTokens()`` keeps working.
    """

    # 2 is the id this notebook's tokenizer reports for "</s>" (its EOS token);
    # 1 and 0 are kept from the original list. The previous 50277-50279 entries
    # are larger than any id in this notebook's encodings and presumably came
    # from another model's vocabulary -- TODO confirm.
    DEFAULT_STOP_IDS = (2, 1, 0)

    def __init__(self, stop_ids=None):
        super().__init__()
        chosen = self.DEFAULT_STOP_IDS if stop_ids is None else stop_ids
        # A frozenset gives constant-time membership checks on every step.
        self.stop_ids = frozenset(int(token_id) for token_id in chosen)

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Return True when the last token of the first sequence is a stop id."""
        # int() reads the scalar out of the tensor once, instead of comparing
        # the tensor against every stop id in turn.
        last_token = int(input_ids[0][-1])
        return last_token in self.stop_ids


# Greedy decoding. The previous settings (do_sample=True, temperature=0.01)
# approximated greedy decoding by sampling, which is not reproducible without a
# seed, and dividing the logits by 0.01 risks overflow on a half-precision model.
# max_new_tokens stays at 2, so only the first two generated tokens are produced.
tokens = model.generate(
    **inputs,
    max_new_tokens=2,
    do_sample=False,
    # stopping_criteria=StoppingCriteriaList([StopOnTokens()])
)

# tokens[0] holds the prompt ids followed by the newly generated ids.
tokenizer.decode(tokens[0])

NameError: name 'StoppingCriteria' is not defined

In [13]:
# Encode a short phrase, asking for PyTorch tensors rather than Python lists.
tokenizer(
    "am Krishan",
    return_tensors="pt",
)

{'input_ids': tensor([[  312, 41918,   266]]), 'attention_mask': tensor([[1, 1, 1]])}

In [3]:
# Conversation expressed as role/content records: one system turn, one user turn.
messages = [
    dict(
        role="system",
        content="You are a friendly chatbot who always responds in the style of a pirate",
    ),
    dict(
        role="user",
        content="How many helicopters can a human eat in one sitting?",
    ),
]

# Render the conversation to a plain string (tokenize=False) and append the
# marker that opens the assistant's reply (add_generation_prompt=True).
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
prompt


NameError: name 'tokenizer' is not defined

In [9]:
# Encode a fully written-out chat prompt, one string segment per prompt line.
tokenizer(
    "<|system|>\n"
    "You are a friendly chatbot who always responds in the style of a pirate</s>\n"
    "<|user|>\n"
    "How many helicopters can a human eat in one sitting?</s>\n"
    "<|assistant|>\n"
)

{'input_ids': [1, 529, 29989, 5205, 29989, 29958, 13, 3492, 526], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [21]:
# Check how the "<|user|>" role marker is encoded after ordinary text and a newline.
tokenizer(
    "hello\n"
    "<|user|>"
)

{'input_ids': [1, 22172, 13, 29966, 29989, 1792, 29989, 29958], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [1]:
# Decode a leading id 1 followed by the ids recorded for "<|user|>" back into text.
tokenizer.decode(
    [1, 29966, 29989, 1792, 29989, 29958]
)

NameError: name 'tokenizer' is not defined

In [16]:
# Show which special tokens (BOS / EOS / UNK / PAD) this tokenizer declares.
tokenizer.special_tokens_map

{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'pad_token': '</s>'}

In [18]:
# Encode the end-of-sequence marker on its own to see which id it maps to.
tokenizer(
    "</s>"
)

{'input_ids': [1, 2], 'attention_mask': [1, 1]}