In [1]:
import os
import platform
from huggingface_hub import login
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, GenerationConfig

In [5]:
# Load the python-dotenv IPython extension and run it so that variables from a
# local .env file are available through os.environ in later cells.
# NOTE(review): presumably this is what supplies HF_TOKEN for the login cell —
# if so, this cell has to run before it; confirm the intended cell order.
%load_ext dotenv
%dotenv

In [2]:
# Authenticate against the Hugging Face Hub with the token held in HF_TOKEN.
# An unset or blank variable is passed as None rather than "", so `login` falls
# back to its own prompt instead of trying to validate an empty token.
login(token=os.environ.get("HF_TOKEN") or None, add_to_git_credential=True)

Token is valid (permission: read).
Your token has been saved in your configured git credential helpers (osxkeychain).
Your token has been saved to /Users/hermeschen/.cache/huggingface/token
Login successful


In [3]:
# Read the test split from a local JSON file using the built-in "json" loader.
# The path is relative to the kernel's working directory; the FileNotFoundError
# recorded for this cell shows where it resolved, so adjust it if the notebook
# is started from a different folder.
# `trust_remote_code` is not passed: the packaged "json" builder does not need
# to execute any repository-provided code.
test_data = load_dataset("json", data_files={"test": "../data/daily_dialog_test.json"})

FileNotFoundError: Unable to find '/Users/hermeschen/Repo/chat-bot/bot/response_generator/../data/daily_dialog_test.json'

In [3]:
# Load the tokenizer published with the meta-llama/Llama-2-7b-chat-hf checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
	pretrained_model_name_or_path="meta-llama/Llama-2-7b-chat-hf",
	trust_remote_code=True,
)

In [6]:
# Add a "prompt" column holding one instruction-formatted string per row.
# With batched=True, `map` hands the function a dict of column name -> list of
# values, so the needed columns are zipped row-wise (iterating the dict itself
# would only yield the column names).
# NOTE(review): assumes the columns "emotion", "response" and "emotion_next"
# exist in the loaded JSON — confirm against the dataset schema.
# NOTE(review): the template starts with an upper-case "<S>", unlike the "<s>"
# used in the hand-written sample prompt — confirm which form the model expects.
test_data = test_data.map(lambda samples: {
	"prompt": [
		f"<S>[INST] ### user({user_emotion}): {user_text} ### bot({bot_emotion}):  [/INST]"
		for user_emotion, user_text, bot_emotion in zip(
			samples["emotion"], samples["response"], samples["emotion_next"]
		)
	]
}, batched=True)

In [7]:
# Drop the "id", "emotion" and "utterance" columns from the dataset.
# Re-running this cell without reloading the data will fail, because the
# columns are already gone after the first run.
test_data = test_data.remove_columns(
	column_names=["id", "emotion", "utterance"],
)

In [4]:
# Model
# 4-bit bitsandbytes quantization settings with float16 as the compute dtype.
quantization_config = BitsAndBytesConfig(
	load_in_4bit=True,
	bnb_4bit_compute_dtype=torch.float16
)
# Suffix selecting which fine-tuned checkpoint directory is loaded.
prompt_type = "script"
# Hand the quantization settings to the loader, but only when a CUDA device is
# present: the warning captured for this cell shows bitsandbytes running without
# GPU support, in which case the model is loaded unquantized instead.
model = AutoModelForCausalLM.from_pretrained(
	f"response_generator_llama_on_daily_dialog_type_{prompt_type}",
	quantization_config=quantization_config if torch.cuda.is_available() else None,
	low_cpu_mem_usage=True,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


In [6]:
# Wrap the loaded model with torch.compile and rebind `model` to the wrapper.
# The cell rebinds its own input, so running it twice wraps the model twice.
model = torch.compile(model=model)

In [5]:
# Decoding settings shared by every `generate` call below: each reply gets
# between 5 and 20 newly generated tokens, with a repetition penalty of 1.5.
generation_config = GenerationConfig(
	min_new_tokens=5,
	max_new_tokens=20,
	repetition_penalty=1.5,
)

In [7]:
# Hand-written prompt for a quick single-example check: one user turn tagged
# "angry", followed by an empty bot turn tagged "sadness".
sample = (
	"<s>[INST]"
	"user(angry): I hate you!\n"
	"bot(sadness): "
	"[/INST]"
)

In [None]:
# Generate a reply for the single hand-written prompt.
# The encoded inputs are moved to whichever device the model lives on, so the
# cell works whether the model was loaded on CPU or on a GPU.
inputs = tokenizer(sample, return_tensors="pt").to(model.device)
response_ids = model.generate(**inputs, generation_config=generation_config)
# Decode the first (only) returned sequence, dropping special tokens.
response_text = tokenizer.decode(response_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

In [10]:
# Accumulator for generated replies; the generation loop appends one list per batch.
response = list()

In [11]:
# Generate one reply per prompt, batch by batch, appending each batch's list of
# decoded strings to `response`.
# NOTE(review): assumes iterating `test_data` yields batches exposing a "prompt"
# list — confirm how `test_data` is batched before this cell runs.
# Appending whole batches keeps finished batches in `response` if the run is
# interrupted; re-initialise `response` before re-running to avoid duplicates.
for batch in test_data:
	batch_response = []
	# `prompt` (not `sample`) so the hand-written sample prompt is not overwritten.
	for prompt in batch["prompt"]:
		# Follow the model's device instead of hard-coding "cuda", so the loop
		# also runs when the model is on CPU or on a different accelerator.
		inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
		response_ids = model.generate(**inputs, generation_config=generation_config)
		response_text = tokenizer.decode(response_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
		batch_response.append(response_text.strip())
	response.append(batch_response)

KeyboardInterrupt: 

In [12]:
# Display everything collected so far: a list with one inner list of decoded
# strings per processed batch. As the output shows, each string still starts
# with the prompt text, followed by the generated continuation.
response

[["<S>[INST] ### user(neutral): Hey man, you wanna buy some weed?  ### bot(surprise):  [/INST]  0h my god! Are u serious. I can't believe it... You are s",
  "<S>[INST] ### user(surprise):  Some what?  ### bot(neutral):  [/INST]  10,523.47 yuan. That's the amount I have",
  '<S>[INST] ### user(neutral):  Weed! You know? Pot, Ganja, Mary Jane some chronic!  ### bot(neutral):  [/INST]  10 seconds later... " I don\'t think you should be talking about that. It',
  "<S>[INST] ### user(neutral):  Oh, umm, no thanks.  ### bot(neutral):  [/INST]  Come on! It's free and it will help you relax after your long flight.You",
  '<S>[INST] ### user(neutral):  I also have blow if you prefer to do a few lines.  ### bot(neutral):  [/INST]  10 cigarettes, please.Could i pay with my credit card?\n<',
  "<S>[INST] ### user(neutral):  No, I am ok, really.  ### bot(neutral):  [/INST]  Well then.Let's go and have a drink together this evening? It's been",
  '<S>[INST] ### user(neutral):  Come on man! I even g