In [1]:
!pip install  --upgrade \
  "transformers==4.36.2" \
  "datasets==2.16.1" \
  "accelerate==0.26.1" \
  "evaluate==0.4.1" \
  "bitsandbytes==0.42.0" \
  # "trl==0.7.10" # \
  # "peft==0.7.1" \

# install peft & trl from github
!pip install git+https://github.com/huggingface/trl@a3c5b7178ac4f65569975efadc97db2f3749c65e --upgrade
!pip install git+https://github.com/huggingface/peft@4a1559582281fc3c9283892caea8ccef1d6f5a4f --upgrade

import torch; assert torch.cuda.get_device_capability()[0] >= 8, 'Hardware not supported for Flash Attention'
# install flash-attn
!pip install ninja packaging
!MAX_JOBS=4 pip install flash-attn --no-build-isolation

Collecting transformers==4.36.2
  Downloading transformers-4.36.2-py3-none-any.whl.metadata (126 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.8/126.8 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting datasets==2.16.1
  Downloading datasets-2.16.1-py3-none-any.whl.metadata (20 kB)
Collecting accelerate==0.26.1
  Downloading accelerate-0.26.1-py3-none-any.whl.metadata (18 kB)
Collecting evaluate==0.4.1
  Downloading evaluate-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting bitsandbytes==0.42.0
  Downloading bitsandbytes-0.42.0-py3-none-any.whl.metadata (9.9 kB)
Collecting huggingface-hub<1.0,>=0.19.3 (from transformers==4.36.2)
  Downloading huggingface_hub-0.21.3-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers==4.36.2)
  Downloading regex-2023.12.25-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40

In [None]:
import os

import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Base chat model and the PEFT adapter that is applied on top of it.
model_path = "meta-llama/Llama-2-7b-chat-hf"
peft_path = "jhlim8/ListenerLM"

# A Hugging Face token is needed for access to Llama 2. Request access at
# https://huggingface.co/meta-llama/Llama-2-7b and create a token at
# https://huggingface.co/settings/tokens
# The token is read from the HF_TOKEN environment variable so that it never has
# to be pasted into (and accidentally committed with) the notebook. When the
# variable is unset or empty, None is passed instead of an empty string.
huggingface_token = os.environ.get("HF_TOKEN") or None

# 4-bit NF4 quantization with double quantization; compute is done in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    quantization_config=bnb_config,
    attn_implementation="flash_attention_2",
    device_map={'': 'cuda:0'},  # place the whole model on GPU 0 (single-GPU setup)
    torch_dtype=torch.bfloat16,
    token=huggingface_token,
)

# Wrap the base model with the adapter; is_trainable=False because this
# notebook only runs inference.
model = PeftModel.from_pretrained(model, peft_path, is_trainable=False, token=huggingface_token)
# NOTE(review): the base model is already placed on cuda:0 via device_map, so
# this call is presumably only relevant for the adapter weights - confirm.
model.to("cuda:0")
model.eval()

# The tokenizer is loaded from the adapter repository, not from the base model.
tok = AutoTokenizer.from_pretrained(peft_path, token=huggingface_token)




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/2.56G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.76k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

In [None]:
# Interactive multi-turn chat. `msg` holds the running conversation as a list
# of {"role", "content"} dicts; type 'exit' at any prompt to stop.
msg = []
next_input = input("Enter your message: ")

# Check for 'exit' *before* recording the message or generating, so the exit
# command is neither added to the history nor sent to the model.
while next_input != 'exit':
  msg.append({"role":"user", "content": next_input})

  encmessages = tok.apply_chat_template(msg, return_tensors="pt", tokenize=True).to('cuda:0')
  generated_ids = model.generate(input_ids=encmessages, max_new_tokens=500, do_sample=False, use_cache=True)

  # The generated sequence starts with the prompt tokens, so keep only the
  # tokens after the prompt. Decoding just those with skip_special_tokens=True
  # replaces the previous approach of chopping the last 4 characters and
  # splitting on '[/INST] ', which lost real text whenever generation stopped
  # at max_new_tokens and mis-split replies that contained the marker.
  new_token_ids = generated_ids[0, encmessages.shape[-1]:]
  output = tok.decode(new_token_ids, skip_special_tokens=True).strip()

  msg.append({"role":"assistant", "content": output})
  print(output)
  next_input = input()