In [4]:
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

model_name_or_path = "TheBloke/Mistral-7B-v0.1-AWQ"

# Load model
model = AutoAWQForCausalLM.from_quantized(model_name_or_path, fuse_layers=True,
                                          trust_remote_code=False, safetensors=True).cuda()
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=False)

prompt = "Tell me about AI, and stuff:"
prompt_template=f'''{prompt}

'''

print("\n\n*** Generate:")

tokens = tokenizer(
    prompt_template,
    return_tensors='pt'
).input_ids.cuda()

# Generate output
generation_output = model.generate(
    tokens,
    do_sample=True,
    temperature=0.7,
)

print("Output: ", tokenizer.decode(generation_output[0]))

"""
# Inference should be possible with transformers pipeline as well in future
# But currently this is not yet supported by AutoAWQ (correct as of September 25th 2023)
from transformers import pipeline

print("*** Pipeline:")
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    do_sample=True,
    temperature=0.7,
    top_p=0.95,
    top_k=40,
    repetition_penalty=1.1
)

print(pipe(prompt_template)[0]['generated_text'])
"""


Fetching 11 files: 100%|██████████| 11/11 [00:00<?, ?it/s]
Replacing layers...: 100%|██████████| 32/32 [00:02<00:00, 12.35it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.




*** Generate:


RuntimeError: The size of tensor a (11) must match the size of tensor b (12) at non-singleton dimension 3

In [7]:
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer, TextStreamer

quant_path = "casperhansen/mistral-7b-instruct-v0.1-awq"

# Load model
model = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=True).cuda()
tokenizer = AutoTokenizer.from_pretrained(quant_path, trust_remote_code=True)
streamer = TextStreamer(tokenizer, skip_special_tokens=True)

# Convert prompt to tokens
text = "<s>[INST] What is your favourite condiment? [/INST]"
"Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!</s> "
"[INST] Do you have mayonnaise recipes? [/INST]"

tokens = tokenizer(
    text, 
    return_tensors='pt'
).input_ids.cuda()

# Generate output
generation_output = model.generate(
    tokens, 
    streamer=streamer,
    max_new_tokens=512
)


Fetching 10 files: 100%|██████████| 10/10 [00:00<?, ?it/s]
Replacing layers...: 100%|██████████| 32/32 [00:03<00:00,  9.26it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[INST] What is your favourite condiment? [/INST] 

RuntimeError: The size of tensor a (16) must match the size of tensor b (17) at non-singleton dimension 3

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

model_name_or_path = "TheBloke/Mistral-7B-Instruct-v0.1-GPTQ"
# To use a different branch, change revision
# For example: revision="gptq-4bit-32g-actorder_True"
model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
                                             device_map="auto",
                                             trust_remote_code=False,
                                             revision="main")

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)





Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.




*** Generate:


KeyboardInterrupt: 

In [None]:
prompt = "Tell me about AI"
prompt_template=f'''<s>[INST] {prompt} [/INST]
'''

print("\n\n*** Generate:")

input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.cuda()
output = model.generate(inputs=input_ids, temperature=0.7, do_sample=True, top_p=0.95, top_k=40, max_new_tokens=512)
print(tokenizer.decode(output[0]))

# Inference can also be done using transformers' pipeline

print("*** Pipeline:")
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    do_sample=True,
    temperature=0.7,
    top_p=0.95,
    top_k=40,
    repetition_penalty=1.1
)

print(pipe(prompt_template)[0]['generated_text'])

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

model_name_or_path = "TheBloke/Mistral-7B-Instruct-v0.1-GPTQ"
# To use a different branch, change revision
# For example: revision="gptq-4bit-32g-actorder_True"
model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
                                             device_map="auto",
                                             trust_remote_code=False,
                                             revision="gptq-4bit-32g-actorder_True").cuda()

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

  from .autonotebook import tqdm as notebook_tqdm
CUDA extension not installed.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
prompt = "Tell me about AI"
prompt_template=f'''<s>[INST] {prompt} [/INST]
'''

print("\n\n*** Generate:")

input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.cuda()
output = model.generate(inputs=input_ids, temperature=0.7, do_sample=True, top_p=0.95, top_k=40, max_new_tokens=512)
print(tokenizer.decode(output[0]))

# Inference can also be done using transformers' pipeline

print("*** Pipeline:")
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    do_sample=True,
    temperature=0.7,
    top_p=0.95,
    top_k=40,
    repetition_penalty=1.1
)

print(pipe(prompt_template)[0]['generated_text'])



*** Generate:




<s><s>[INST] Tell me about AI [/INST]

Artificial Intelligence (AI) refers to the ability of machines to perform tasks that would normally require human intelligence to complete. These tasks can include things like understanding natural language, recognizing patterns, and making decisions based on data.

AI can be divided into two main categories: narrow or weak AI and general or strong AI. Narrow AI is designed to perform a specific task and is limited in its ability to learn or adapt. General AI, on the other hand, is designed to be able to perform a wide range of tasks and is capable of learning and adapting as it goes.

AI is used in many different fields, including healthcare, finance, transportation, and education. It has the potential to revolutionize the way we live and work, but it also raises important ethical and social questions about the role of machines in our lives.

Overall, AI is an exciting and rapidly evolving field with many potential benefits, but it also presents 

KeyboardInterrupt: 

In [3]:
print(pipe(prompt_template)[0]['generated_text'])

KeyboardInterrupt: 