In [1]:
# Export PYTORCH_CUDA_ALLOC_CONF so PyTorch is asked to use its
# "cudaMallocAsync" allocator backend.
# NOTE(review): presumably this must run before torch first touches CUDA in
# this kernel for the setting to take effect - confirm.
%set_env PYTORCH_CUDA_ALLOC_CONF=backend:cudaMallocAsync

import importer
from exllama.lora import ExLlamaLora
from exllama.tokenizer import ExLlamaTokenizer

# from exllama_extras.model_loaders import ooba_loader
# from exllama_extras import model_init

env: PYTORCH_CUDA_ALLOC_CONF=backend:cudaMallocAsync
/home/jovyan/work/importer.py


In [3]:
## Taken from https://github.com/jllllll/exllama/blob/9c90bbebf3d4dff7c2aa35218552ad935ab91339/example_basic.py

from exllama.model import ExLlama, ExLlamaCache, ExLlamaConfig
from exllama.tokenizer import ExLlamaTokenizer
from exllama.generator import ExLlamaGenerator
import os, glob

# Directory containing model, tokenizer, generator

model_directory = "models/TheBloke_GPT4All-13B-Snoozy-SuperHOT-8K-GPTQ"

# Locate files we need within that directory

tokenizer_path = os.path.join(model_directory, "tokenizer.model")
model_config_path = os.path.join(model_directory, "config.json")
st_pattern = os.path.join(model_directory, "*.safetensors")

# glob returns an empty list when the directory is missing or holds no weights
# file; fail with an actionable message instead of a bare IndexError.
st_matches = glob.glob(st_pattern)
if not st_matches:
    raise FileNotFoundError(f"No .safetensors file matches {st_pattern!r}")
model_path = st_matches[0]

# Create config, model, tokenizer and generator

config = ExLlamaConfig(model_config_path)  # create config from config.json
config.model_path = model_path  # supply path to model weights file

model = ExLlama(config)  # create ExLlama instance and load the weights
tokenizer = ExLlamaTokenizer(
    tokenizer_path
)  # create tokenizer from tokenizer model file

cache = ExLlamaCache(model)  # create cache for inference
generator = ExLlamaGenerator(model, tokenizer, cache)  # create generator

# Configure generator

# EOS is disallowed so generation always runs for the full token budget.
generator.disallow_tokens([tokenizer.eos_token_id])

generator.settings.token_repetition_penalty_max = 1.2
generator.settings.temperature = 0.95
generator.settings.top_p = 0.65
generator.settings.top_k = 100
generator.settings.typical = 0.5

# Produce a simple generation

prompt = "Once upon a time,"
print(prompt, end="")

output = generator.generate_simple(prompt, max_new_tokens=200)

# The prompt was already printed above, so only the text past the prompt's
# length is printed here.
print(output[len(prompt) :])

Once upon a time, there was an island in the middle of nowhere. It had been abandoned for years and nobody knew why it existed or what secrets lay hidden within its walls.
Verse 1:
On this deserted isle so far from home
Lies a mystery waiting to be known
No one knows how long it's stood here alone
But something about it draws us close like stone

Chorus:
We come seeking answers to questions unasked
Hoping that our journey won't leave us masked
For on this forgotten land we may find
The truth behind a story left undefined

Bridge:
As we explore every corner and crevice
Our hearts race with anticipation and persistence
Could it be possible that legends are true?
That magic exists just beyond view

Chorus:
We come seeking answers to questions unasked
Hoping that our journey won't leave us masked


In [1]:
from exllama.model import ExLlama, ExLlamaCache, ExLlamaConfig
from exllama.tokenizer import ExLlamaTokenizer
from exllama.generator import ExLlamaGenerator
import os, glob
import torch

# Directory containing model, tokenizer, generator

model_directory = "models/TheBloke_GPT4All-13B-Snoozy-SuperHOT-8K-GPTQ"

# Locate files we need within that directory

tokenizer_path = os.path.join(model_directory, "tokenizer.model")
model_config_path = os.path.join(model_directory, "config.json")
st_pattern = os.path.join(model_directory, "*.safetensors")

# glob returns an empty list when the directory is missing or holds no weights
# file; fail with an actionable message instead of a bare IndexError.
st_matches = glob.glob(st_pattern)
if not st_matches:
    raise FileNotFoundError(f"No .safetensors file matches {st_pattern!r}")
model_path = st_matches[0]

# Batched prompts

prompts = [
    "Once upon a time,",
    "I don't like to",
    "A turbo encabulator is a",
    "In the words of Mark Twain,",
]

# Create config, model, tokenizer and generator

config = ExLlamaConfig(model_config_path)  # create config from config.json
config.model_path = model_path  # supply path to model weights file

with torch.no_grad():
    model = ExLlama(config)  # create ExLlama instance and load the weights
    tokenizer = ExLlamaTokenizer(
        tokenizer_path
    )  # create tokenizer from tokenizer model file

    # The cache is created with one batch slot per prompt.
    cache = ExLlamaCache(model, batch_size=len(prompts))  # create cache for inference
    generator = ExLlamaGenerator(model, tokenizer, cache)  # create generator

    # Configure generator

    generator.disallow_tokens([tokenizer.eos_token_id])

    generator.settings.token_repetition_penalty_max = 1.2
    generator.settings.temperature = 0.95
    generator.settings.top_p = 0.65
    generator.settings.top_k = 100
    generator.settings.typical = 0.5

    # Generate, batched

    for line in prompts:
        print(line)

    output = generator.generate_simple(prompts, max_new_tokens=200)

    # One result per prompt, each preceded by a separator line.
    for line in output:
        print("---")
        print(line)

Once upon a time,
I don't like to
A turbo encabulator is a
In the words of Mark Twain,
---
Once upon a time, in the land of Aire, there was an evil sorcerer named Malacar. He had long been known for his wicked ways and dark magic, but no one dared to stand up against him because he possessed immense power. One day, however, fate intervened when a young heroine by the name of Sage stumbled into town. She came from far away lands where she learned about ancient magic that could rival even Malacar's own powers. Armed with her knowledge and determination, Sage set out on a quest to defeat the evil sorcerer once and for all. But as she delved deeper into the mysteries of magic, she realized that defeating Malacar would not be easy - it would require all of her strength, courage, and cunning. With each step closer to victory, Sage grew more determined than ever before. Will she succeed? Or will Malacar prove too powerful for her to overcome? Only time
---
I don't like to get my hands dirty.


In [1]:
from dataclasses import dataclass, field
from typing import Optional
import importer
from exllama.model import ExLlama, ExLlamaCache, ExLlamaConfig
from exllama.lora import ExLlamaLora
from exllama.tokenizer import ExLlamaTokenizer
from exllama.generator import ExLlamaGenerator

# from exllama_extras import model_init
import argparse
import torch
import sys
import os
import glob
import torch
import gc

# Simple interactive chatbot script

# Inference only: switch autograd off globally for the rest of the kernel.
torch.set_grad_enabled(False)
# Initialise CUDA eagerly through the public API (torch.cuda.init) rather than
# the private torch.cuda._lazy_init helper it wraps.
torch.cuda.init()

model_directory = "models/TheBloke_GPT4All-13B-Snoozy-SuperHOT-8K-GPTQ"

# Parse arguments

# The parser is still constructed, but nothing registers or parses arguments
# while the model_init call below stays commented out.
parser = argparse.ArgumentParser(description="Simple chatbot example for ExLlama")

# model_init.add_args(parser)


@dataclass
class Args:
    """Hand-written option container used in place of parsed CLI arguments.

    The notebook fills this in directly instead of calling the commented-out
    ``model_init.add_args(parser)``. The eight leading fields have no default
    and must be supplied; every other field is optional.

    NOTE(review): several fields annotated ``Optional[...]`` default to a
    number rather than ``None``; the annotations are left as written.
    """
    # --- Required: chat participants -----------------------------------
    username: str
    botname: str
    # --- Required: model file locations --------------------------------
    # `tokenizer` is passed to ExLlamaTokenizer; `config` and `model` are
    # not read anywhere in the visible notebook.
    tokenizer: str
    config: str
    model: str
    # --- Required: repetition penalty ----------------------------------
    repetition_penalty: float
    repetition_penalty_sustain: int
    # NOTE(review): never read in the visible notebook - the generator's
    # decay is derived from the sustain value instead.
    repetition_penalty_decay: int
    # --- Model tuning knobs (not read in the visible notebook) ---------
    fused_mlp_thd: Optional[int] = 0
    sdp_thd: Optional[int] = 0
    matmul_fused_remap: Optional[int] = 0
    compress_pos_emb: Optional[float] = 1
    matmul_recons_thd: Optional[int] = 0
    # --- LoRA: setting lora_dir makes the notebook derive the other two -
    lora_dir: Optional[str] = None
    lora_config: Optional[str] = None
    lora: Optional[str] = None
    # --- Sampling / generation -----------------------------------------
    length: Optional[int] = None
    temperature: Optional[float] = None
    top_k: Optional[int] = None
    top_p: Optional[float] = None
    min_p: Optional[float] = None
    beams: Optional[int] = None
    beam_length: Optional[int] = None
    # --- Chat behaviour flags ------------------------------------------
    no_newline: Optional[bool] = None
    botfirst: Optional[bool] = None
    # --- GPU options (none of these are read in the visible notebook) --
    gpu: Optional[int] = None
    gpu_split: Optional[int] = None
    gpu_peer_fix: Optional[bool] = None
    gpu_allow_growth: Optional[bool] = None
    gpu_memory_fraction: Optional[float] = None
    gpu_memory_growth: Optional[bool] = None
    gpu_memory_allow_growth: Optional[bool] = None
    gpu_memory_limit: Optional[int] = None
    gpu_memory_max_usage: Optional[float] = None
    gpu_memory_min_usage: Optional[float] = None
    gpu_memory_soft_limit: Optional[int] = None
    gpu_memory_hard_limit: Optional[int] = None
    # --- Optional path to a prompt template file -----------------------
    prompt: Optional[str] = None


# Concrete settings for this chat session, listed in the same order the
# fields are declared on Args.
args = Args(
    # Chat participants
    username="User",
    botname="Bot",
    # Model files, all resolved against model_directory
    tokenizer=f"{model_directory}/tokenizer.model",
    config=f"{model_directory}/config.json",
    model=model_directory,
    # Repetition penalty
    repetition_penalty=1.2,
    repetition_penalty_sustain=1,
    repetition_penalty_decay=1,
    # Sampling
    temperature=0.95,
    top_k=100,
    top_p=0.65,
    min_p=0.0,
    # Beam search
    beams=1,
    beam_length=3,
)


# Paths

# A LoRA directory, when given, determines both adapter file paths.
if args.lora_dir is not None:
    args.lora_config = os.path.join(args.lora_dir, "adapter_config.json")
    args.lora = os.path.join(args.lora_dir, "adapter_model.bin")

# Some feedback

print(f" -- Sequence length: {args.length}")
# `temperature` defaults to None on Args; fall back to 0 like the other float
# settings below so the ":.2f" format cannot raise TypeError.
print(f" -- Temperature: {args.temperature or 0:.2f}")
print(f" -- Top-K: {args.top_k}")
print(f" -- Top-P: {args.top_p or 0:.2f}")
print(f" -- Min-P: {args.min_p or 0:.2f}")
print(f" -- Repetition penalty: {args.repetition_penalty or 0:.2f}")
print(f" -- Beams: {args.beams} x {args.beam_length}")

# Collected for the (currently commented-out) model_init.print_options call.
print_opts = []
if args.no_newline:
    print_opts.append("no_newline")
if args.botfirst:
    print_opts.append("botfirst")

# model_init.print_options(args, print_opts)

# Load prompt file

username, bot_name = args.username, args.botname

if args.prompt is None:
    # No template supplied: open the conversation with a stock greeting.
    past = f"{bot_name}: Hello, {username}\n"
else:
    # Read the template, substitute both placeholders, and normalise the
    # result so it ends in exactly one newline.
    with open(args.prompt, "r") as prompt_file:
        template = prompt_file.read()
    past = (
        template.replace("{username}", username)
        .replace("{bot_name}", bot_name)
        .strip()
        + "\n"
    )

# past += "User: Hi. Please say \"Shhhhhh\"?\n"
# args.botfirst = True

# Instantiate model and generator

# config = model_init.make_config(args)

# model_directory is already assigned earlier in this cell (and used to build
# `args`), so it is reused here rather than assigned a second time.

# Locate files we need within that directory

tokenizer_path = os.path.join(model_directory, "tokenizer.model")
model_config_path = os.path.join(model_directory, "config.json")
st_pattern = os.path.join(model_directory, "*.safetensors")

# glob returns an empty list when the directory is missing or holds no weights
# file; fail with an actionable message instead of a bare IndexError.
st_matches = glob.glob(st_pattern)
if not st_matches:
    raise FileNotFoundError(f"No .safetensors file matches {st_pattern!r}")
model_path = st_matches[0]

config = ExLlamaConfig(model_config_path)  # create config from config.json
config.model_path = model_path  # supply path to model weights file

# Bind every name the cleanup in `finally` releases before entering the try
# block. If loading fails part-way (e.g. ExLlama(config) raises), the real
# error then propagates instead of being masked by a NameError from `del`.
model = cache = tokenizer = generator = lora = None

with torch.no_grad():
    try:
        model = ExLlama(config)
        cache = ExLlamaCache(model)
        tokenizer = ExLlamaTokenizer(args.tokenizer)

        # model_init.print_stats(model)

        # Load LoRA (only when a weights path was supplied; otherwise `lora`
        # stays None)

        if args.lora:
            print(f" -- LoRA config: {args.lora_config}")
            print(f" -- Loading LoRA: {args.lora}")
            if args.lora_config is None:
                print(f" ## Error: please specify lora path to adapter_config.json")
                sys.exit()
            lora = ExLlamaLora(model, args.lora_config, args.lora)
            if lora.bias_ignored:
                print(f" !! Warning: LoRA zero bias ignored")

        # Generator

        generator = ExLlamaGenerator(model, tokenizer, cache)
        generator.settings = ExLlamaGenerator.Settings()
        generator.settings.temperature = args.temperature
        generator.settings.top_k = args.top_k
        generator.settings.top_p = args.top_p
        generator.settings.min_p = args.min_p
        generator.settings.token_repetition_penalty_max = args.repetition_penalty
        generator.settings.token_repetition_penalty_sustain = (
            args.repetition_penalty_sustain
        )
        # NOTE(review): decay is derived from the sustain value (half of it,
        # integer division); args.repetition_penalty_decay is never consulted.
        # Confirm whether that is intended before wiring the argument in.
        generator.settings.token_repetition_penalty_decay = (
            generator.settings.token_repetition_penalty_sustain or 1
        ) // 2
        generator.settings.beams = args.beams
        generator.settings.beam_length = args.beam_length

        generator.lora = lora

        break_on_newline = not args.no_newline

        # Be nice to Chatbort

        min_response_tokens = 4
        max_response_tokens = 256
        torch.cuda.empty_cache()
        extra_prune = 256

        print(past, end="")
        ids = tokenizer.encode(past)
        generator.gen_begin(ids)

        next_userprompt = username + ": "

        first_round = True

        while True:
            res_line = bot_name + ":"
            res_tokens = tokenizer.encode(res_line)
            num_res_tokens = res_tokens.shape[-1]  # Decode from here

            if first_round and args.botfirst:
                in_tokens = res_tokens

            else:
                # Read and format input

                in_line = input(next_userprompt)

                # An empty line ends the chat.
                if in_line.strip() == "":
                    break

                in_line = username + ": " + in_line.strip() + "\n"

                next_userprompt = username + ": "

                # No need for this, really, unless we were logging the chat. The actual history we work on is kept in the
                # tokenized sequence in the generator and the state in the cache.

                past += in_line

                # SentencePiece doesn't tokenize spaces separately so we can't know from individual tokens if they start a new word
                # or not. Instead, repeatedly decode the generated response as it's being built, starting from the last newline,
                # and print out the differences between consecutive decodings to stream out the response.

                in_tokens = tokenizer.encode(in_line)
                in_tokens = torch.cat((in_tokens, res_tokens), dim=1)

            # If we're approaching the context limit, prune some whole lines from the start of the context. Also prune a
            # little extra so we don't end up rebuilding the cache on every line when up against the limit.

            expect_tokens = in_tokens.shape[-1] + max_response_tokens
            max_tokens = config.max_seq_len - expect_tokens
            if generator.gen_num_tokens() >= max_tokens:
                generator.gen_prune_to(
                    max_tokens - extra_prune,
                    tokenizer.newline_token_id,
                )

            # Feed in the user input and "{bot_name}:", tokenized

            generator.gen_feed_tokens(in_tokens)

            # Generate with streaming

            print(res_line, end="")

            generator.begin_beam_search()

            for i in range(max_response_tokens):
                # Disallowing the end condition tokens seems like a clean way to force longer replies.

                if i < min_response_tokens:
                    generator.disallow_tokens(
                        [tokenizer.newline_token_id, tokenizer.eos_token_id]
                    )
                else:
                    generator.disallow_tokens(None)

                # Get a token

                gen_token = generator.beam_search()

                # If token is EOS, replace it with newline before continuing

                if gen_token.item() == tokenizer.eos_token_id:
                    generator.replace_last_token(tokenizer.newline_token_id)

                # Decode the current line and print any characters added

                num_res_tokens += 1
                text = tokenizer.decode(
                    generator.sequence_actual[:, -num_res_tokens:][0]
                )
                new_text = text[len(res_line) :]

                skip_space = res_line.endswith("\n") and new_text.startswith(
                    " "
                )  # Bit prettier console output
                res_line += new_text
                if skip_space:
                    new_text = new_text[1:]

                print(new_text, end="")  # (character streaming output is here)

                # End conditions

                if break_on_newline and gen_token.item() == tokenizer.newline_token_id:
                    break
                if gen_token.item() == tokenizer.eos_token_id:
                    break

                # Some models will not (or will inconsistently) emit EOS tokens but in a chat sequence will often begin
                # generating for the user instead. Try to catch this and roll back a few tokens to begin the user round.

                if res_line.endswith(f"{username}:"):
                    plen = tokenizer.encode(f"{username}:").shape[-1]
                    generator.gen_rewind(plen)
                    next_userprompt = " "
                    break

            generator.end_beam_search()

            past += res_line
            first_round = False
    finally:
        # Release order matters: drop the references, collect garbage, and only
        # then empty the CUDA cache, which can release unoccupied memory only.
        # generator and lora are dropped too: both were built from `model`
        # (the generator also from `cache`) and presumably keep references.
        del generator, lora, model, cache, tokenizer
        gc.collect()
        torch.cuda.empty_cache()

/home/jovyan/work/importer.py
 -- Sequence length: None
 -- Temperature: 0.95
 -- Top-K: 100
 -- Top-P: 0.65
 -- Min-P: 0.00
 -- Repetition penalty: 1.20
 -- Beams: 1 x 3
Bot: Hello, User
Bot: How can I assist you today?
Bot: Sure, what would you like to know about Justin Bieber?
Bot: The Pittsburgh Steelers won the Super Bowl the year Justin Bieber was born.
Bot: Justin Bieber was born on March 1, 199.
Bot: March 1, 199.
Bot: Actually, Justin Bieber is currently 21 years old.
Bot: Justin Bieber has been performing since he was a child, and he has been able to continue performing as an adult due to his talent and hard work.
Bot: I'm sorry, but that is incorrect. Justin Bieber was actually born on March 1, 199.
Bot: Justin Bieber was born on March 1, 199.
Bot: Justin Bieber was born on March 1, 199.
Bot: Justin Bieber was born on March 1, 199.
Bot: Justin Bieber was born on March 1, 199.
Bot: Justin Bieber was born on March 1, 199.
Bot: Justin Bieber was born on March 1, 199.
Bot: Justi

In [2]:
from exllama.model import ExLlama, ExLlamaCache, ExLlamaConfig
from exllama.tokenizer import ExLlamaTokenizer
from exllama.generator import ExLlamaGenerator
from exllama.lora import ExLlamaLora
import os, glob
import torch

# Directory containing model, tokenizer, generator

model_directory = "models/TheBloke_GPT4All-13B-Snoozy-SuperHOT-8K-GPTQ"
# Directory containing LoRA config and weights
# NOTE(review): this adapter is named as a FLAN-T5 LoRA, and the recorded run
# of this cell fails in ExLlamaLora with "unsupported layer ...
# encoder.block.0.layer.0.SelfAttention.q.lora_A.weight". Point this at a LoRA
# trained for the base model above before relying on this cell.

lora_directory = "models/sooolee_flan-t5-base-cnn-samsum-lora"

# Locate files we need within those directories

tokenizer_path = os.path.join(model_directory, "tokenizer.model")
model_config_path = os.path.join(model_directory, "config.json")
st_pattern = os.path.join(model_directory, "*.safetensors")

# glob returns an empty list when the directory is missing or holds no weights
# file; fail with an actionable message instead of a bare IndexError.
st_matches = glob.glob(st_pattern)
if not st_matches:
    raise FileNotFoundError(f"No .safetensors file matches {st_pattern!r}")
model_path = st_matches[0]

lora_config_path = os.path.join(lora_directory, "adapter_config.json")
lora_path = os.path.join(lora_directory, "adapter_model.bin")

# Create config, model, tokenizer and generator

config = ExLlamaConfig(model_config_path)  # create config from config.json
config.model_path = model_path  # supply path to model weights file

model = ExLlama(config)  # create ExLlama instance and load the weights
tokenizer = ExLlamaTokenizer(
    tokenizer_path
)  # create tokenizer from tokenizer model file

cache = ExLlamaCache(model)  # create cache for inference
generator = ExLlamaGenerator(model, tokenizer, cache)  # create generator

# Load LoRA

lora = ExLlamaLora(model, lora_config_path, lora_path)

# Configure generator

generator.settings.token_repetition_penalty_max = 1.2
generator.settings.temperature = 0.65
generator.settings.top_p = 0.4
generator.settings.top_k = 0
generator.settings.typical = 0.0

# Alpaca prompt

prompt = (
    "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n"
    "\n"
    "### Instruction:\n"
    "List five colors in alphabetical order.\n"
    "\n"
    "### Response:"
)

# Generate with LoRA

print(" --- LoRA ----------------- ")
print("")

generator.lora = lora
# Both runs are seeded identically, so the adapter is the only difference
# between them.
torch.manual_seed(1337)
output = generator.generate_simple(prompt, max_new_tokens=200)
print(output)

# Generate without LoRA

print("")
print(" --- No LoRA -------------- ")
print("")

generator.lora = None
torch.manual_seed(1337)
output = generator.generate_simple(prompt, max_new_tokens=200)
print(output)

ValueError:  ## Error: unsupported layer in models/sooolee_flan-t5-base-cnn-samsum-lora/adapter_model.bin: base_model.model.encoder.block.0.layer.0.SelfAttention.q.lora_A.weight