In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Hub id of the base checkpoint that the fine-tuned adapter is later applied to.
base_model_id = "mistralai/Mixtral-8x7B-v0.1"

# 4-bit NF4 quantization settings.
# NOTE: this config is NOT passed to `from_pretrained` below, so the weights
# are loaded unquantized in float16. The name is kept so that any cell still
# referring to `bnb_config` continues to work.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the full model on the CPU in float16; "offload" is the directory handed
# to the loader for offloaded weights.
# NOTE(review): `trust_remote_code=True` allows code from the Hub repo to run
# locally; presumably not required for this checkpoint -- confirm and drop.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=torch.float16,
    device_map={"": "cpu"},
    trust_remote_code=True,
    offload_folder="offload",
)

# The evaluation cells further down call `eval_tokenizer`; define it here so
# they do not fail with a NameError on a fresh kernel.
eval_tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)

Loading checkpoint shards:   0%|          | 0/19 [00:00<?, ?it/s]

In [6]:
# Evaluation prompt: a system-style instruction (Tang-poetry assistant), an
# INPUT section with author and tags, and an empty OUTPUT section for the
# model to complete.
# NOTE(review): the prompt embeds a literal "<s>"; the tokenizer may also
# prepend its own BOS token -- confirm this matches the training format.
eval_prompt = """<s>你是一个唐诗助手,帮助用户写一首对应要求的唐诗

INPUT:
作者:李白
标签:五言绝句;宫怨

OUTPUT:
"""
# Move the encoded prompt to whatever device the model lives on. A hard-coded
# "cuda" breaks generation when the model was loaded with a CPU device map.
model_input = eval_tokenizer(eval_prompt, return_tensors="pt").to(base_model.device)

base_model.eval()
with torch.no_grad():
    # Generate up to 500 new tokens after the prompt.
    generated_ids = base_model.generate(**model_input, max_new_tokens=500)

# Decode the first (only) sequence, dropping special tokens such as <s> / </s>.
print(eval_tokenizer.decode(generated_ids[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


你是一个唐诗助手,帮助用户写一首对应要求的唐诗

INPUT:
作者:李白
标签:五言绝句;宫怨

OUTPUT:























































































































































































































































































































































































































































































































In [2]:
from peft import PeftModel

# Wrap the already-loaded base model with the adapter stored in
# "mix-qlora-result", using the same CPU / float16 / offload settings as the
# base-model load.
ft_model = PeftModel.from_pretrained(
    base_model,
    "mix-qlora-result",
    device_map={"": "cpu"},
    torch_dtype=torch.float16,
    offload_folder="offload",
)

# The fine-tuned evaluation cell refers to `ft_model`, while the merge cell
# calls `base_model.merge_and_unload()`. Keep `base_model` bound to the wrapped
# model as well so both cells keep working.
base_model = ft_model

In [8]:
# Same evaluation prompt as used for the base model, so the two generations
# can be compared directly.
# Requires `ft_model` (the adapter-wrapped model) and `eval_tokenizer` to have
# been defined by earlier cells.
eval_prompt = """<s>你是一个唐诗助手,帮助用户写一首对应要求的唐诗

INPUT:
作者:李白
标签:五言绝句;宫怨

OUTPUT:
"""
# Move the encoded prompt to the model's own device instead of a hard-coded
# "cuda", which fails when the model was loaded with a CPU device map.
model_input = eval_tokenizer(eval_prompt, return_tensors="pt").to(ft_model.device)

ft_model.eval()
with torch.no_grad():
    # Generate up to 500 new tokens after the prompt.
    generated_ids = ft_model.generate(**model_input, max_new_tokens=500)

# Decode the first (only) sequence, dropping special tokens such as <s> / </s>.
print(eval_tokenizer.decode(generated_ids[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


你是一个唐诗助手,帮助用户写一首对应要求的唐诗

INPUT:
作者:李白
标签:五言绝句;宫怨

OUTPUT:
宮詞
玉階生白露，夜久侵羅襪。
却下水精簾，倚虛臥真賞。


In [7]:
# Load the tokenizer from the same "mix-qlora-result" directory the adapter is
# loaded from; this is the tokenizer that gets pushed and saved later.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path="mix-qlora-result")

In [4]:
# Display the tokenizer repr to check its special tokens and padding settings.
tokenizer

LlamaTokenizerFast(name_or_path='mix-qlora-result', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<unk>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [3]:
# Merge the adapter into the underlying model via PEFT's `merge_and_unload`
# and rebind `base_model` to the result, so the push/save cells below operate
# on the merged model.
# NOTE(review): this rebinding is not re-runnable -- a second execution calls
# `merge_and_unload` on the already-merged object, which presumably no longer
# provides it. Re-run the load cells first if this cell must be repeated.
base_model = base_model.merge_and_unload()

In [4]:
# Target Hub repository ("<namespace>/<name>") for the merged model.
new_model = 'HenryJJ/tangshi-mixtral'

# Upload the model held in `base_model`; the returned commit info is shown as
# the cell output.
base_model.push_to_hub(repo_id=new_model)

model-00010-of-00019.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00018-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00009-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00017-of-00019.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

Upload 19 LFS files:   0%|          | 0/19 [00:00<?, ?it/s]

model-00014-of-00019.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00005-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00013-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00012-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00001-of-00019.safetensors:   0%|          | 0.00/4.89G [00:00<?, ?B/s]

model-00004-of-00019.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00008-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00016-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00019-of-00019.safetensors:   0%|          | 0.00/4.22G [00:00<?, ?B/s]

model-00007-of-00019.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00006-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00011-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00015-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/HenryJJ/tangshi-mixtral/commit/ab69c9f01780a93c52120a6d3502a329629a6049', commit_message='Upload MixtralForCausalLM', commit_description='', oid='ab69c9f01780a93c52120a6d3502a329629a6049', pr_url=None, pr_revision=None, pr_num=None)

In [12]:
# Push the tokenizer to the same fully-qualified repo as the model
# (`new_model`) rather than a bare 'tangshi-mixtral', whose namespace would
# depend on whichever account is currently logged in.
tokenizer.push_to_hub(new_model)

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/HenryJJ/tangshi-mixtral/commit/ae9a6d9db4332f13c66890895a44042f18f185ac', commit_message='Upload tokenizer', commit_description='', oid='ae9a6d9db4332f13c66890895a44042f18f185ac', pr_url=None, pr_revision=None, pr_num=None)

In [8]:
# Interactive Hugging Face Hub login (renders a widget in the notebook).
# NOTE(review): this cell sits after the `push_to_hub` cells in file order;
# on a fresh kernel it presumably has to be run before them -- confirm.
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
# Write the model held in `base_model` to the local 'merged-tangshi' directory.
base_model.save_pretrained(save_directory='merged-tangshi')

In [8]:
# Write the tokenizer files next to the model in 'merged-tangshi'; the
# returned tuple of written file paths is shown as the cell output.
tokenizer.save_pretrained(save_directory='merged-tangshi')

('merged-tangshi/tokenizer_config.json',
 'merged-tangshi/special_tokens_map.json',
 'merged-tangshi/tokenizer.model',
 'merged-tangshi/added_tokens.json',
 'merged-tangshi/tokenizer.json')