In [1]:
%reload_ext autoreload
%autoreload 2

## Example Interacting With The Service

In [38]:
from server.deepsparse.deepsparse_causal_lm import DeepSparseCausalLMBatch, DeepSparseCausalLM
from server.deepsparse.deepsparse_service import DeepSparseService
from server.deepsparse.deepsparse_requests import (
    PrefillRequest, DecodeRequest, FilterBatchRequest, Request
)

In [3]:
tokenizer_path = "/home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/deployment"
onnx_path = "/home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/model.onnx/model.onnx"

model = DeepSparseCausalLM(
    tokenizer_path=tokenizer_path,
    model_path=onnx_path
)

service = DeepSparseService(model=model)

Using pad_token, but it is not set yet.
2023-08-22 03:09:19 deepsparse.transformers.utils.helpers INFO     Overwriting in-place the input shapes of the transformer model at /home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/model.onnx/model.onnx
DeepSparse, Copyright 2021-present / Neuralmagic, Inc. version: 1.6.0.20230815 COMMUNITY | (134dba40) (release) (optimized) (system=avx2, binary=avx2)


deepsparse.engine.Engine:
	onnx_file_path: /home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/model.onnx/model.onnx
	batch_size: 1
	num_cores: 8
	num_streams: 1
	scheduler: Scheduler.default
	fraction_of_supported_ops: 1.0
	cpu_avx_type: avx2
	cpu_vnni: False


2023-08-22 03:09:45 deepsparse.transformers.utils.helpers INFO     Overwriting in-place the input shapes of the transformer model at /home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/model.onnx/model.onnx


deepsparse.engine.Engine:
	onnx_file_path: /home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/model.onnx/model.onnx
	batch_size: 1
	num_cores: 8
	num_streams: 1
	scheduler: Scheduler.default
	fraction_of_supported_ops: 1.0
	cpu_avx_type: avx2
	cpu_vnni: False


In [31]:
prompts = [
    "Finish the following function for computing a fibonacci sequence: \n\n fib(n):",
    "Write a function for filtering a list of integers to include only positive numbers:\n\nfilter(lst):",
    "Write a function for reversing a string:\n\ndef reverse_string(s):",
    "Write a function for checking if a word if a palindrome:\n\ndef is_palindrome(word):",
    "Write a function for sorting an array of integers:\n\ndef merge_sort(arr):",
]

def make_batch(id, prompt):
    return Batch(
        id=id,
        requests=[Request(id=id, prompt=prompt)]
    )

class PrefillQueue:
    def __init__(self, prompts):
        self.queue = {
            idx: PrefillRequest(batch=make_batch(id=idx, prompt=prompt))
            for idx, prompt in enumerate(prompts)
        }

    def pop(self):
        keys = list(self.queue.keys())
        if len(keys) == 0:
            return None
        else:
            return self.queue.pop(keys[0])

In [63]:
service.ClearCache()

# prefill queue
prefill_queue = PrefillQueue(prompts)

# cached batches
cached_batches = []

# generated
generated_text = {}

def prefill(request):
    generation, cached_batch = service.Prefill(request)
    
    assert request.batch.requests[0].id == generation.request_id
    assert generation.request_id not in generated_text.keys()
    
    generated_text[generation.request_id] = request.batch.requests[0].prompt + generation.generated_text

    return cached_batch

def decode(request):
    for cached_batch in request.batches:
        for request_id in cached_batch.request_ids:
            assert request_id in generated_text.keys()

    generations, cached_batch = service.Decode(request)
    if cached_batch is None:
        print("All requests done!\n\n")
        return None
    
    active_request_ids = []
    stopped_request_ids = []
    
    for generation in generations:
        assert generation.request_id in generated_text.keys()

        # if text is None, we stopped
        if generation.generated_text is None:
            print(f"Request {generation.request_id} is done!")
            stopped_request_ids.append(generation.request_id)
            
        else:
            generated_text[generation.request_id] += generation.generated_text
            active_request_ids.append(generation.request_id)
        
    # if any stopped, return this
    if len(stopped_request_ids) > 0:
        cached_batch = service.FilterBatch(FilterBatchRequest(
            batch_id=cached_batch.batch_id,
            request_ids=active_request_ids,
        ))
        
    return cached_batch

# run a prefille
queue_not_empty = True
while queue_not_empty:
    prefill_request = prefill_queue.pop()
    if prefill_request is not None:
        cached_batch = prefill(prefill_request)
        cached_batches.append(cached_batch)
    else:
        queue_not_empty = False
    
    # run a few decodes
    for _ in range(5):
        cached_batches = [decode(DecodeRequest(cached_batches))]

In [64]:
# run a few decodes
for _ in range(100):
    cached_batch = decode(DecodeRequest(cached_batches))
    if cached_batch is None:
        break
    cached_batches = [cached_batch]
    
for idx, value in generated_text.items():
    print(f"INDEX = {idx}:")
    print(value)
    print("\n")

print(cached_batches)

Request 0 is done!
Request 1 is done!
Request 3 is done!
Request 2 is done!
All Requests Done!


INDEX = 0:
Finish the following function for computing a fibonacci sequence: 

 fib(n):

    if n == 0:
        return 0
    elif n == 1:
        return 1
    else:
        return fib(n-1) + fib(n-2)

# Call the function.
print(fib(5))

# This code is contributed by Nikhil Kumar Singh(nickzuck_007)



INDEX = 1:
Write a function for filtering a list of integers to include only positive numbers:

filter(lst):

lst = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

def filter_positive(lst):
    return [num for num in lst if num > 0]

print(filter_positive(lst))

# filter_positive([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

# filter_positive([1, 2, 3, 4, 5


INDEX = 2:
Write a function for reversing a string:

def reverse_string(s):
    return s[::-1]

# Test
print(reverse_string("hello"))
print(reverse_string(""))
print(reverse_string("a"))
print(reverse_string(""))
print(reverse_string(""))
print(reverse_string(""))


## Example DeepSparseCausalLMBatch

In [1]:
from server.deepsparse.deepsparse_causal_lm import DeepSparseCausalLMBatch, DeepSparseCausalLM
from server.deepsparse.deepsparse_requests import Request, Batch
from transformers import AutoTokenizer

tokenizer_path = "/home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/deployment"
onnx_path = "/home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/model.onnx/model.onnx"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)



In [2]:
ds_model = DeepSparseCausalLM(
    tokenizer_path=tokenizer_path,
    model_path=onnx_path
)

Using pad_token, but it is not set yet.
2023-08-22 01:33:25 deepsparse.transformers.utils.helpers INFO     Overwriting in-place the input shapes of the transformer model at /home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/model.onnx/model.onnx
DeepSparse, Copyright 2021-present / Neuralmagic, Inc. version: 1.6.0.20230815 COMMUNITY | (134dba40) (release) (optimized) (system=avx2, binary=avx2)


deepsparse.engine.Engine:
	onnx_file_path: /home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/model.onnx/model.onnx
	batch_size: 1
	num_cores: 8
	num_streams: 1
	scheduler: Scheduler.default
	fraction_of_supported_ops: 1.0
	cpu_avx_type: avx2
	cpu_vnni: False


2023-08-22 01:33:49 deepsparse.transformers.utils.helpers INFO     Overwriting in-place the input shapes of the transformer model at /home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/model.onnx/model.onnx


deepsparse.engine.Engine:
	onnx_file_path: /home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/model.onnx/model.onnx
	batch_size: 1
	num_cores: 8
	num_streams: 1
	scheduler: Scheduler.default
	fraction_of_supported_ops: 1.0
	cpu_avx_type: avx2
	cpu_vnni: False


In [3]:
sequence = "Finish the following function for computing a fibonacci sequence: \n\n fib(n):"

def make_n_requests(n=1):
    requests = []
    for i in range(n):
        request = Request(
            id=i,
            prompt=sequence,
        )
        requests.append(request)
    return requests

batch_size = 2
batch = Batch(
    id=0,
    requests = make_n_requests(n=batch_size),
)

ds_batch = DeepSparseCausalLMBatch.from_batch(
    batch=batch,
    tokenizer=tokenizer, 
)

next_batch = ds_batch
for _ in range(64):
    # print(tokenizer.batch_decode(next_batch.input_ids_list[0]))
    generation, next_batch = ds_model.generate_token(next_batch)

for input_ids in next_batch.input_ids_list:
    print(tokenizer.batch_decode(input_ids)[0])

Using pad_token, but it is not set yet.


Finish the following function for computing a fibonacci sequence: 

 fib(n):

    if n == 0:
        return 0
    elif n == 1:
        return 1
    else:
        return fib(n-1) + fib(n-2)

# Call the function.
print(fib(5))

# This code
Finish the following function for computing a fibonacci sequence: 

 fib(n):

    if n == 0:
        return 0
    elif n == 1:
        return 1
    else:
        return fib(n-1) + fib(n-2)

# Call the function.
print(fib(5))

# This code


## Compare to DeepSparse Pipeline

In [134]:
multitoken_length = 4

def sample_token(logits):
    assert(logits.shape[0] == 1)        # assert b=1 for now
    return np.argmax(logits[0,-1,:])  
    
def prefill_pipeline(pipeline, tokens):
    num_tokens_processed = 0
    for engine_inputs in pipeline.engine_inputs_for_prefill(tokens):
        _, logits = pipeline.multitoken_engine(engine_inputs)
        num_tokens_processed += multitoken_length
    
    if num_tokens_processed > 0:
        pipeline.engine.transfer_cache_state(cache=pipeline.multitoken_engine.kv_cache)

    run_tokens = [] if num_tokens_processed == 0 else tokens[:num_tokens_processed]
    for token in tokens[num_tokens_processed:]:
        run_tokens.append(token)
        new_token, logits = pipeline.autoregressive_inference(run_tokens)
    return logits
    
pipeline._reset_engines_cache()
engine_inputs = pipeline.process_inputs(pipeline.parse_inputs(sequences=sequence))[0]
tokens = engine_inputs[0][engine_inputs[1].nonzero()].tolist()

logits = prefill_pipeline(pipeline, tokens)
# print(logits)
tokens.append(sample_token(logits))

for _ in range(64):
    _, logits = pipeline.autoregressive_inference(tokens)
    # print(logits)
    tokens.append(sample_token(logits))

print(pipeline.tokenizer.decode(tokens))

Finish the following function for computing a fibonacci sequence: 

 fib(n):

    if n == 0:
        return 0
    elif n == 1:
        return 1
    else:
        return fib(n-1) + fib(n-2)

# Call the function.
print(fib(5))

# This code is


In [None]:
from deepsparse import Pipeline
pipeline = Pipeline.create(
    task="text-generation", 
    model_path="zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none",
    use_deepsparse_cache=False,
    prompt_processing_sequence_length=4,
    max_generated_tokens=64,
    sequence_length=128
)

In [101]:
next_batch.input_ids_list[0].shape

(1, 18)

In [97]:
type(next_batch.input_ids_list[0])

numpy.ndarray

In [98]:
tokenizer.decode(next_batch.input_ids_list[0])

TypeError: argument 'ids': 'list' object cannot be interpreted as an integer

In [80]:
print(next_batch.input_ids_list[1].shape)

(1, 20)


Using pad_token, but it is not set yet.
2023-08-21 18:14:09 deepsparse.transformers.utils.helpers INFO     Overwriting in-place the input shapes of the transformer model at /home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/model.onnx/model.onnx
DeepSparse, Copyright 2021-present / Neuralmagic, Inc. version: 1.6.0.20230815 COMMUNITY | (134dba40) (release) (optimized) (system=avx2, binary=avx2)


deepsparse.engine.Engine:
	onnx_file_path: /home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/model.onnx/model.onnx
	batch_size: 1
	num_cores: 8
	num_streams: 1
	scheduler: Scheduler.default
	fraction_of_supported_ops: 1.0
	cpu_avx_type: avx2
	cpu_vnni: False


2023-08-21 18:14:33 deepsparse.transformers.utils.helpers INFO     Overwriting in-place the input shapes of the transformer model at /home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/model.onnx/model.onnx


deepsparse.engine.Engine:
	onnx_file_path: /home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/model.onnx/model.onnx
	batch_size: 1
	num_cores: 8
	num_streams: 1
	scheduler: Scheduler.default
	fraction_of_supported_ops: 1.0
	cpu_avx_type: avx2
	cpu_vnni: False


In [73]:
import numpy as np
a = np.array([np.arange(10)]*2)
b = np.array([np.arange(10)]*1)

print(b[:,-1:])
print(b.shape)

[[9]]
(1, 10)


(1, 18)
(1, 19)


In [None]:
import deepsparse
import torch
from transformers import AutoTokenizer
from server.text_generation_server.models.deepsparse_causal_lm import DeepSparseCausalLMBatch
from text_generation_server.utils import NextTokenChooser, StoppingCriteria, Sampling, StopSequenceCriteria

from server.text_generation_server.pb.generate_pb2 import (
    Batch,    
    Request, 
    NextTokenChooserParameters, 
    StoppingCriteriaParameters
)

In [None]:
model_path = "/home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/deployment"
onnx_path = "/home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/model.onnx/model.onnx"
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [None]:
next_token_chooser = NextTokenChooser(
    watermark=False,
    temperature=1.0,
    repetition_penalty=0.0,
    top_k=None,
    top_p=None,
    typical_p=None,
    do_sample=False,
    seed=0,
    device="cpu",
)

In [None]:
stopping_crtieria=StoppingCriteria(
    eos_token_id=tokenizer.eos_token_id,
    stop_sequence_criterias=[],
    max_new_tokens=20,
    ignore_eos_token=False,
)

In [None]:
sequence = "Finish the following function for computing a fibonacci sequence: \n\n fib(n):"

In [None]:
request_idx = 0

max_new_tokens = 64

parameters = NextTokenChooserParameters(
    watermark=False,
    temperature=1.0,
    repetition_penalty=0.0,
    do_sample=False,
    typical_p=1.0,
    top_k = 0,
    top_p = 1.0,
)

stopping_parameters = StoppingCriteriaParameters(
    max_new_tokens=max_new_tokens
)

def make_n_requests(n=1):
    requests = []
    for i in range(n):
        request = Request(
            id=request_idx,
            inputs=sequence,
            truncate=False,
            parameters=parameters,
            stopping_parameters=stopping_parameters,
            prefill_logprobs=False
        )
        requests.append(request)
    return requests

batch_size = 2
requests = make_n_requests(n=batch_size)

batch = Batch(
    id = 0,
    requests = requests,
    size=len(requests),
)

ds_batch = DeepSparseCausalLMBatch.from_pb(
    pb=batch, 
    tokenizer=tokenizer, 
    dtype=torch.float32,
    device="cpu"
)

In [None]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "True"
os.environ["WAND_OPT_FLAGS"] = "default,~pyramids"

In [None]:
from server.text_generation_server.models.deepsparse_model import DeepSparseDecoderModel, DeepSparsePastKeyValues
from transformers import AutoTokenizer

model_path = "/home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/deployment"
onnx_path = "/home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/model.onnx/model.onnx"
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [None]:
ds_decoder_model = DeepSparseDecoderModel(
    onnx_file_path = onnx_path,
    sequence_length = 128,
    multitoken_length = 4,
    # singletoken_engine = ds_decoder_model.singletoken_engine,
    # multitoken_engine = ds_decoder_model.multitoken_engine
)

In [None]:
from deepsparse import Pipeline
pipeline = Pipeline.create(
    task="text-generation", 
    model_path="zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none",
    use_deepsparse_cache=False,
    prompt_processing_sequence_length=4,
    max_generated_tokens=64,
    sequence_length=128
)

In [None]:
sequence = "Finish the following function for computing a fibonacci sequence: \n\n fib(n):"

In [None]:
import numpy as np
def sample_token(logits):
    assert(logits.shape[0] == 1)
    return np.argmax(logits[0,-1,:])

In [None]:
print("testing DeepSparseDecoderModel:\n")

engine_inputs = pipeline.process_inputs(pipeline.parse_inputs(sequences=sequence))[0]
tokens = engine_inputs[0][engine_inputs[1].nonzero()].tolist()

past_key_values = DeepSparsePastKeyValues()
logits, past_key_values = ds_decoder_model.prefill(tokens, past_key_values)
tokens.append(sample_token(logits))

while len(tokens) < 64:
    logits, past_key_values = ds_decoder_model.decode(tokens, past_key_values)
    tokens.append(sample_token(logits))

print(pipeline.tokenizer.decode(tokens))

In [None]:
multitoken_length = 4

def prefill_pipeline(pipeline, tokens):
    num_tokens_processed = 0
    for engine_inputs in pipeline.engine_inputs_for_prefill(tokens):
        _, logits = pipeline.multitoken_engine(engine_inputs)
        num_tokens_processed += multitoken_length

    if num_tokens_processed > 0:
        pipeline.engine.transfer_cache_state(cache=pipeline.multitoken_engine.kv_cache)

    run_tokens = [] if num_tokens_processed == 0 else tokens[:num_tokens_processed]
    for token in tokens[num_tokens_processed:]:
        run_tokens.append(token)
        new_token, logits = pipeline.autoregressive_inference(run_tokens)
    return logits
    
pipeline._reset_engines_cache()
engine_inputs = pipeline.process_inputs(pipeline.parse_inputs(sequences=sequence))[0]
tokens = engine_inputs[0][engine_inputs[1].nonzero()].tolist()

logits = prefill_pipeline(pipeline, tokens)
tokens.append(sample_token(logits))

while len(tokens) < 64:
    _, logits = pipeline.autoregressive_inference(tokens)
    tokens.append(sample_token(logits))

print(pipeline.tokenizer.decode(tokens))

In [None]:
print(f"{sequence}{pipeline(sequences=sequence).sequences[0]}")