# Mistral-7b inference

In [None]:
text = "<s>[INST] What is your favourite condiment? [/INST]"
"Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!</s> "
"[INST] Do you have mayonnaise recipes? [/INST]"


In [None]:
import torch
import time
import re
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

base_model_id = "mistralai/Mistral-7B-Instruct-v0.1"
# base_model_id = "mistralai/Mistral-7B-v0.1"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(base_model_id, quantization_config=bnb_config)

tokenizer = AutoTokenizer.from_pretrained(base_model_id)


In [None]:
messages = [
    {"role": "user", "content": "What is your favourite condiment?"},
    {"role": "assistant", "content": "Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!"},
    {"role": "user", "content": "Do you have mayonnaise recipes?"}
]

encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt")
model_inputs = encodeds.to("cuda")

generated_ids = model.generate(model_inputs, max_new_tokens=256, do_sample=False)
decoded = tokenizer.batch_decode(generated_ids)
print(decoded[0])

## chatbot

In [1]:
import torch
import time
import re
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import subprocess
import os
os.environ['PYGAME_HIDE_SUPPORT_PROMPT'] = "hide"
import speech_recognition as sr
from IPython.display import display, clear_output

base_model_id = "mistralai/Mistral-7B-Instruct-v0.1"
# base_model_id = "mistralai/Mistral-7B-v0.1"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(base_model_id, quantization_config=bnb_config)
# disables training
model.eval() 
tokenizer = AutoTokenizer.from_pretrained(base_model_id)

r = sr.Recognizer()

def stt():
    text = ''
    with sr.Microphone() as source:
        clear_output(wait=True)
        r.adjust_for_ambient_noise(source)
        print("Speak something:")
        audio = r.listen(source)

    try:
        text = r.recognize_google(audio)

    except sr.UnknownValueError:
        print("Could not understand audio")
    except sr.RequestError as e:
        print("Error with the service: {0}".format(e))
    
    return text

def tts(text):
    command = f'echo "{text}" | festival --tts 2>/dev/null'
    subprocess.call(command, shell=True)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

: 

In [None]:
messages = [
    {"role": "user", "content": "What is your favourite condiment?"},
    {"role": "assistant", "content": "Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!"},
    {"role": "user", "content": "Do you have mayonnaise recipes?"}
]

encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt")
model_inputs = encodeds.to("cuda")

generated_ids = model.generate(model_inputs, max_new_tokens=256, do_sample=False)
decoded = tokenizer.batch_decode(generated_ids)
print(decoded[0])

## speaking to model

In [None]:
context = []
question = ''

while (question != 'shut down'):
    question = stt()
    context.append({'role':'user','content':question})
    print(f'lj: {question}')
    if question == 'shut down':
        break

    # tokenize -> inference -> decode
    model_input = tokenizer.apply_chat_template(context, return_tensors="pt").to('cuda')
    generated_ids = model.generate(model_input, max_new_tokens=256, do_sample=False)
    raw_resp = tokenizer.batch_decode(generated_ids)[0]

    # if it followed the formatting, remove everything before the response, remove unrecognized characters
    i_clip = raw_resp.rfind("[/INST]")
    if i_clip != -1:
        raw_resp = raw_resp[i_clip + len("[/INST]"):]
    i_end = raw_resp.rfind("</s>")
    if i_end != -1:
        raw_resp = raw_resp[:i_end]
    resp = re.sub(r'[^\x00-\x7F]+', '', raw_resp)
    context.append({'role':'assistant','content':resp})
    print(f'misT: {resp}')
    tts(resp)

## typing to model

In [None]:
context = []
question = ''

while (question != 'shut down'):
    question = input('question: ')
    context.append({'role':'user','content':question})
    print(f'lj: {question}')
    if question == 'shut down':
        break
    # tokenize -> inference -> decode
    model_input = tokenizer.apply_chat_template(context, return_tensors="pt").to('cuda')
    generated_ids = model.generate(model_input, max_new_tokens=256, do_sample=False)
    raw_resp = tokenizer.batch_decode(generated_ids)[0]

    # if it followed the formatting, remove everything before the response, remove unrecognized characters
    i_clip = raw_resp.rfind("[/INST]")
    if i_clip != -1:
        raw_resp = raw_resp[i_clip + len("[/INST]"):]
    i_end = raw_resp.rfind("</s>")
    if i_end != -1:
        raw_resp = raw_resp[:i_end]
    resp = re.sub(r'[^\x00-\x7F]+', '', raw_resp)
    context.append({'role':'assistant','content':resp})
    print(f'misT: {resp}')

# fine tuning

### data prep

In [1]:
import torch
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig
import pandas as pd
import ast
import time
import re
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, GenerationConfig
from datasets import Dataset, load_dataset
from peft import LoraConfig, get_peft_model, PeftConfig, PeftModel, prepare_model_for_kbit_training
from trl import SFTTrainer
from accelerate import FullyShardedDataParallelPlugin, Accelerator
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_colwidth', None)

base_model_id = "mistralai/Mistral-7B-Instruct-v0.1"
# base_model_id = "mistralai/Mistral-7B-v0.1"

tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    padding_side="left",
    add_eos_token=True,
    add_bos_token=True,
    padding='max_length',
    max_length=512,
    truncation=True
)
tokenizer.pad_token = tokenizer.eos_token

In [2]:
messages = [
    {"role": "user", "content": "What is your favourite condiment?"},
    {"role": "assistant", "content": "Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!"},
    {"role": "user", "content": "Do you have mayonnaise recipes?"}
]

encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt")
tokenizer.decode(encodeds[0])


"<s> [INST] What is your favourite condiment? [/INST]Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!</s>  [INST] Do you have mayonnaise recipes? [/INST]"

In [3]:
dset = pd.read_csv('ns_qa_p1_200.csv')

def clean_strings(answer_list):
    return [sentence.replace("\\", "") for sentence in ast.literal_eval(answer_list)]

dset['answers'] = dset['answers'].apply(clean_strings)

print(dset['question'][0], dset['answers'][0])

dset['pretoken_input'] = None

for j in range(len(dset['question'])):
    message = [{"role": "user", "content": dset['question'][j]}]
    for i, resp in enumerate(dset['answers'][j]):
        # alternate between "user" and "assistant" -> "op" and "other"
        if i % 2 == 0:
            message.append({"role": "assistant", "content": resp})
        else:
            message.append({"role": "user", "content": resp})
            
    dset.at[j, 'pretoken_input'] = message

how do you get to Chad's Gap from the bottom of Alta? ['I heard ski patrol blew it up', 'my ankles are broken']


In [4]:
hf_dset = Dataset.from_pandas(dset)
hf_dset = hf_dset.add_column('formatted_pretoken_input', [tokenizer.apply_chat_template(msg, tokenize=False) for msg in hf_dset['pretoken_input']])

hf_dset['formatted_pretoken_input'][0]

"<s>[INST] how do you get to Chad's Gap from the bottom of Alta? [/INST]I heard ski patrol blew it up</s> [INST] my ankles are broken [/INST]"

In [5]:
hf_dset = Dataset.from_pandas(dset)
hf_dset = hf_dset.add_column('formatted_pretoken_input', [tokenizer.apply_chat_template(msg, tokenize=False) for msg in hf_dset['pretoken_input']])
# hf_dset = hf_dset.add_column('tokenized_input', [tokenizer.encode(msg, padding='max_length', max_length=512, truncation=True) for msg in hf_dset['formatted_pretoken_input']]) # SFTTrainer won't let me token inputs myself
hf_dset = hf_dset.select_columns('formatted_pretoken_input')
hf_dset = hf_dset.train_test_split(test_size=0.1)
# hf_dset.save_to_disk('./hf_dset') # commented to prevent overwrite
hf_dset

Saving the dataset (0/1 shards):   0%|          | 0/7128 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/793 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['formatted_pretoken_input'],
        num_rows: 7128
    })
    test: Dataset({
        features: ['formatted_pretoken_input'],
        num_rows: 793
    })
})

In [6]:
hf_dset['train']['formatted_pretoken_input'][2] # first converation, first message of conversation

"<s>[INST] Update To Trying to Start a Rail Jam: So far i have gotten a confirmation from Honey Wax (@ridehoneywax) and support from ski addiction (ski_addiction). Also got advice from Mathew (@midwest_skier). As of now i’m just looking for one more sponsor before i propose the plan to my local hill. I have an outline started but still needs some work. thank you to all who’ve helped and please reach out if there is anymore advice or contacts of people i should reach out to. [/INST]Where is this at? I'd be interested in helping in the ways that I can. Getting sponsors is an important part.</s> [INST] this is at little switzerland slinger Wi. it is a tiny local hill but all our riders would love to see an event hosted. [/INST]**This post was edited on Nov 11th 2022 at 11:34:16am</s> [INST] Reach out to as many snowboard/skate shops as you can [/INST]Some local businesses typically sponsor the smaller events in my area, even if they aren’t snow sport related</s> [INST] **This post was edi

In [5]:
from datasets import load_from_disk

hf_dset = load_from_disk('./hf_dset/')
hf_dset['train']['formatted_pretoken_input'][2]


"<s>[INST] Update To Trying to Start a Rail Jam: So far i have gotten a confirmation from Honey Wax (@ridehoneywax) and support from ski addiction (ski_addiction). Also got advice from Mathew (@midwest_skier). As of now i’m just looking for one more sponsor before i propose the plan to my local hill. I have an outline started but still needs some work. thank you to all who’ve helped and please reach out if there is anymore advice or contacts of people i should reach out to. [/INST]Where is this at? I'd be interested in helping in the ways that I can. Getting sponsors is an important part.</s> [INST] this is at little switzerland slinger Wi. it is a tiny local hill but all our riders would love to see an event hosted. [/INST]**This post was edited on Nov 11th 2022 at 11:34:16am</s> [INST] Reach out to as many snowboard/skate shops as you can [/INST]Some local businesses typically sponsor the smaller events in my area, even if they aren’t snow sport related</s> [INST] **This post was edi

### model prep

In [None]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,            # load model in 4-bit precision
    bnb_4bit_quant_type="nf4",    # pre-trained model should be quantized in 4-bit NF format
    bnb_4bit_use_double_quant=True, # Using double quantization as mentioned in QLoRA paper
    bnb_4bit_compute_dtype=torch.bfloat16, # During computation, pre-trained model should be loaded in BF16 format
)

# auto maps to GPUs shouldn't matter since I only have 1, trust_remote_code is for custom defined models to pull from HF hub
model = AutoModelForCausalLM.from_pretrained(base_model_id, quantization_config=bnb_config, trust_remote_code=True)


In [None]:
# Check the number of available GPUs
num_gpus = torch.cuda.device_count()

# List the available GPUs
gpu_list = [torch.cuda.get_device_name(i) for i in range(num_gpus)]

# Print GPU memory information
for i, gpu in enumerate(gpu_list):
    device = torch.device(f"cuda:{i}")
    mem_info = torch.cuda.get_device_properties(device)
    print(f"GPU {i} ({gpu}):")
    print(f"  Total Memory: {mem_info.total_memory / (1024**3):.1f} GB")
    print(f"  Free Memory: {torch.cuda.max_memory_allocated(i) / (1024**3):.1f} GB")
    print(f"  Allocated Memory: {torch.cuda.memory_allocated(i) / (1024**3):.1f} GB")
    print(f"  Cached Memory: {torch.cuda.memory_reserved(i) / (1024**3):.1f} GB")
    print(model.device)
    print()


In [None]:
fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=FullStateDictConfig(offload_to_cpu=True, rank0_only=False),
    optim_state_dict_config=FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False),
)

accelerator = Accelerator(fsdp_plugin=fsdp_plugin)

In [None]:
model.device

In [None]:
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [26]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

print_trainable_parameters(model)

trainable params: 0 || all params: 3752071168 || trainable%: 0.0


In [27]:
print(model)

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )

In [None]:
# scaling factor for learned weights is alpha/r and bigger rank = more computation tradeoff. paper did 64/16=8, we'll try 16/2=8
config = LoraConfig(
    r=2,
    lora_alpha=16,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    bias="none",
    lora_dropout=0.05,  # Conventional
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

# Apply the accelerator. You can comment this out to remove the accelerator.
model = accelerator.prepare_model(model)

In [None]:

if torch.cuda.device_count() > 1: # If more than 1 GPU
    print("More than one GPU")
    model.is_parallelizable = True
    model.model_parallel = True

In [None]:
training_arguments = TrainingArguments(
        output_dir="./model_ft",
        warmup_steps=2,
        per_device_train_batch_size=1, # reduce if OOM by 2x
        gradient_accumulation_steps=1, # x 2x if batch size is reduced
        max_steps=100,
        learning_rate=2.5e-5, # Want about 10x smaller than the Mistral learning rate
        logging_steps=10,
        logging_dir="./logs", # Directory for storing logs
        fp16 = False,
        bf16 = True, # With an A100
        optim="paged_adamw_8bit", 
        save_strategy="steps",       # Save the model checkpoint every logging step
        save_steps=25,                # Save checkpoints every 50 steps
        evaluation_strategy="steps", # Evaluate the model every logging step
        eval_steps=25,               # Evaluate and save checkpoints every 50 steps
        do_eval=True,                # Perform evaluation at the end of training
    )

# not using
# max_grad_norm = 0.3 # maximum gradient norm (for gradient clipping)
# report_to="wandb",           # Weights & baises analytics
# run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}"          # Name of the W&B run
# warmup_ratio = 0.03 # number of steps used for a linear warmup from 0 to learning_rate
# lr_scheduler_type = "cosine"  # learning rate scheduler

In [None]:
trainer = SFTTrainer(
    model=model,
    train_dataset=hf_dset['train'],
    eval_dataset=hf_dset['test'],
    dataset_text_field='formatted_pretoken_input',
    data_collator=None,
    max_seq_length=512,
    tokenizer=tokenizer,
    args=training_arguments,
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
model.to(accelerator.device)

trainer.train()

In [32]:
import torch
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig
import pandas as pd
import ast
import time
import re
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, GenerationConfig
from datasets import Dataset, load_dataset
from peft import LoraConfig, get_peft_model, PeftConfig, PeftModel, prepare_model_for_kbit_training
from trl import SFTTrainer
from accelerate import FullyShardedDataParallelPlugin, Accelerator
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_colwidth', None)

base_model_id='mistralai/Mistral-7B-Instruct-v0.1'
PEFT_MODEL = './model_ft/checkpoint-100'

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,            # load model in 4-bit precision
    bnb_4bit_quant_type="nf4",    # pre-trained model should be quantized in 4-bit NF format
    bnb_4bit_use_double_quant=True, # Using double quantization as mentioned in QLoRA paper
    bnb_4bit_compute_dtype=torch.bfloat16, # During computation, pre-trained model should be loaded in BF16 format
)

model = AutoModelForCausalLM.from_pretrained(base_model_id, quantization_config=bnb_config, trust_remote_code=True)
model = PeftModel.from_pretrained(model, PEFT_MODEL)
model = model.merge_and_unload()


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )

In [3]:
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    padding_side="left",
    add_eos_token=True,
    add_bos_token=True,
    padding='max_length',
    max_length=512,
    truncation=True
)
tokenizer.pad_token = tokenizer.eos_token

In [33]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import pipeline
from peft import PeftModel, PeftConfig

base_model_id='mistralai/Mistral-7B-Instruct-v0.1'
PEFT_MODEL = './model_ft/checkpoint-100'

model = AutoModelForCausalLM.from_pretrained(base_model_id, load_in_4bit=True)

model = PeftModel.from_pretrained(model, PEFT_MODEL)

pipe = pipeline("text-generation",
                model=model,
                tokenizer=tokenizer,
                max_new_tokens=200,
)

prompt = "<s>[INST] how do you get to chads gap from the bottom of alta? [/INST]"

print(pipe(prompt)[0]['generated_text'])

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MptForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PersimmonForCausalLM', 'PLBartFo

who is tanner hall?
User 1: Tanner Hall is a character in the TV show "The Office" played by actor Dwight Schrute. He is the Assistant Regional Manager of the Dunder Mifflin Paper Company branch in Scranton, Pennsylvania. He is known for his obsession with beets, his love of the show "The Office" and his desire to become the Regional Manager.

Tanner Hall is portrayed as a quirky and eccentric character who is often misunderstood by his coworkers. He is also known for his love of beets and his desire to become the Regional Manager. He is often seen eating beets and talking about them, much to the amusement of his coworkers.

Tanner Hall is a fan favorite and is often praised for his portrayal in the show. He is known for his unique personality and his love of beets.


In [21]:
model.eval()
with torch.no_grad():
    text_input = "<s>[INST] how do you get to Chad's Gap from the bottom of Alta? [/INST]"
    tokenized_input = tokenizer(text_input, return_tensors="pt")
    input_ids = tokenized_input['input_ids']
    output_ids = model.generate(input_ids, max_new_tokens=512)
    decoded_output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(decoded_output)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


[INST] how do you get to Chad's Gap from the bottom of Alta? [/INST]


In [31]:
model.eval()
text_input = "<s>[INST] how do you get to Chad's Gap from the bottom of Alta? [/INST]"
model_input = tokenizer.encode(text_input, return_tensors="pt").to('cuda')
generated_ids = model.generate(model_input, max_new_tokens=256, do_sample=True)
tokenizer.batch_decode(generated_ids)[0]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


"<s><s> [INST] how do you get to Chad's Gap from the bottom of Alta? [/INST]</s></s>"