**Load saved model**

In [None]:
from unsloth import FastLanguageModel
import torch
from transformers import TextStreamer
import gc

# Release any model/tokenizer left over from an earlier run of this cell so a
# re-run does not hold a second copy of the weights in GPU memory.
try:
    del model, tokenizer
except NameError:
    # Fresh kernel: nothing has been loaded yet.
    pass
gc.collect()
torch.cuda.empty_cache()

# Loader settings.
max_seq_length = 1024  # Sequence length handed to the loader (the Unsloth template notes RoPE scaling is handled internally).
dtype = None  # None lets the loader auto-detect; Float16 for Tesla T4/V100, Bfloat16 for Ampere+.
load_in_4bit = True  # 4-bit quantization to cut memory usage; may be set to False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "test_model",  # Name/path of the model that was saved after training.
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# Switch the model into Unsloth's inference mode.
FastLanguageModel.for_inference(model)

# Use a reserved special token as the padding token.
# NOTE(review): the token string and the literal id are set independently;
# presumably they refer to the same vocabulary entry for this checkpoint —
# confirm against the tokenizer actually saved in "test_model".
tokenizer.pad_token = '<|reserved_special_token_250|>'
tokenizer.pad_token_id = 128255


In [None]:
# Generation settings shared by the cells below.
# Lower and upper bounds on the number of newly generated tokens.
min_tokens, max_tokens = 64, 1024

# System prompt placed in front of every conversation.
instructions = "You are a helpful AI assistant."

**Generate response**

In [None]:
# Generate a complete reply for a single prompt and print only the newly
# generated text (the prompt itself is excluded from the output).
prompt = ""  # User message to send to the model; fill in before running.

msgs = [
    {"role": "system", "content": instructions},
    {"role": "user", "content": prompt}
]

# Harmless if the model is already in inference mode.
FastLanguageModel.for_inference(model)

# Render the conversation with the tokenizer's chat template and append the
# generation prompt so the model continues with the assistant turn.
inputs = tokenizer.apply_chat_template(
    msgs,
    tokenize = True,
    add_generation_prompt = True,
    return_tensors = "pt"
).to("cuda")
# Attend to every position that is not the padding token.
attention_mask = (inputs != tokenizer.pad_token_id).int()

outputs = model.generate(
    input_ids = inputs,
    attention_mask = attention_mask,
    min_new_tokens = min_tokens,
    max_new_tokens = max_tokens,
    # temperature / top_p only take effect when sampling is enabled; request
    # it explicitly instead of relying on the loaded generation config.
    do_sample = True,
    temperature = 0.8,
    repetition_penalty = 1.25,
    top_p = 0.9,
    use_cache = True
)

# The returned sequence starts with the prompt tokens. Cut them off by token
# count rather than by decoded-string length, which is only correct when the
# decoded prompt is an exact prefix of the decoded output.
prompt_length = inputs.shape[-1]
generated_tokens = outputs[0, prompt_length:]
generated_response = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
print(generated_response)

**Generate response token by token**

In [None]:
# Generate a reply for a single prompt and stream it to the cell output as the
# tokens are produced.
prompt = ""  # User message to send to the model; fill in before running.

msgs = [
    {"role": "system", "content": instructions},
    {"role": "user", "content": prompt}
]

# Render the conversation with the tokenizer's chat template and append the
# generation prompt so the model continues with the assistant turn.
inputs = tokenizer.apply_chat_template(
    msgs,
    tokenize = True,
    add_generation_prompt = True,
    return_tensors = "pt"
).to("cuda")
# Attend to every position that is not the padding token.
attention_mask = (inputs != tokenizer.pad_token_id).int()

# skip_prompt=True keeps the echoed prompt out of the streamed text.
text_streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
outputs = model.generate(
    input_ids = inputs,
    attention_mask = attention_mask,
    streamer = text_streamer,
    min_new_tokens = min_tokens,
    max_new_tokens = max_tokens,
    # temperature / top_p only take effect when sampling is enabled; request
    # it explicitly instead of relying on the loaded generation config.
    do_sample = True,
    temperature = 0.8,
    repetition_penalty = 1.25,
    top_p = 0.9,
    use_cache = True
)