In [1]:
import onnxruntime_genai as og
import numpy as np
import os
import time
import psutil

In [2]:
# Folder containing the ONNX build of DeepSeek-R1-Distill-Qwen-1.5B.
# The string below is a placeholder: replace it with the real local path before running.
model_folder = "Your DeepSeek-R1-Distill-Qwen-1.5B-ONNX location"

In [3]:

# Handle on the current Python process; later cells reuse it to sample memory.
process = psutil.Process(os.getpid())

# Baseline resident set size, scaled to MB, taken before the model is loaded.
mem_before_load = process.memory_info().rss / 1024 ** 2

In [4]:
# Build the ONNX Runtime GenAI model object from the folder configured above.
model = og.Model(model_folder)

In [5]:

# Second memory sample (MB), taken once the model object exists, and a report of
# how much the resident set grew relative to the pre-load baseline.
mem_after_load = process.memory_info().rss / 1024 ** 2
print(f"Memory before model loading: {mem_before_load:.2f} MB, Memory after loading: {mem_after_load:.2f} MB, Added: {mem_after_load - mem_before_load:.2f} MB")

Memory before model loading: 80.07 MB, Memory after loading: 1083.63 MB, Added: 1003.56 MB


In [6]:
# Tokenizer built from the loaded model; used below to encode the prompt.
tokenizer = og.Tokenizer(model)
# Stream object created from the tokenizer; presumably meant for incremental,
# token-by-token decoding during generation.
tokenizer_stream = tokenizer.create_stream()

In [7]:
# Generation settings; unpacked later as keyword arguments to
# params.set_search_options(**search_options).
search_options = {
    # presumably an upper bound on sequence length in tokens — confirm whether it includes the prompt
    'max_length': 2048,
    'past_present_share_buffer': False,
}

In [8]:
# Prompt template; the user's text is substituted for {input} with str.format.
# NOTE(review): confirm these role markers match the chat template expected by the model.
chat_template = "<|user|>{input}<|assistant|>"

In [9]:
# The question put to the model.
text = (
    "Find all pairwise different isomorphism groups of order 147 "
    "that do not contain elements of order 49"
)

In [10]:
# Fill the template's {input} slot with the question to get the final prompt string.
prompt = chat_template.format(input=text)

In [11]:
# Encode the formatted prompt with the model's tokenizer.
input_tokens = tokenizer.encode(prompt)

In [12]:
# Parameter object for generation, created from the loaded model.
params = og.GeneratorParams(model)

In [13]:
# Apply the search settings defined earlier (passed as keyword arguments).
params.set_search_options(**search_options)
# Attach the encoded prompt as the generator's input.
params.input_ids = input_tokens

In [14]:
# Generator that drives the token-by-token loop below, built from the model and its parameters.
generator = og.Generator(model, params)

In [15]:

# Bookkeeping for the generation loop.
first_token_time = None  # filled in when the first token is produced
token_count = 0          # number of tokens generated so far

# Reference timestamp for the latency and throughput figures. Any pause between
# running this cell and the generation loop is included in those measurements.
start_time = time.time()

In [16]:
# Generate tokens one at a time until the generator reports completion, printing
# each decoded piece as it arrives and timing the arrival of the first token.
while not generator.is_done():
    generator.compute_logits()
    generator.generate_next_token()

    # Index 0: presumably the only sequence in the batch — confirm if batching is added.
    new_token = generator.get_next_tokens()[0]

    # Decode through the tokenizer stream rather than tokenizer.decode() on a lone
    # token. Decoding tokens in isolation can split a multi-byte UTF-8 character
    # across two tokens, which then prints as replacement characters ("��").
    token_text = tokenizer_stream.decode(new_token)

    if token_count == 0:
        # First-token latency is measured from start_time, set in the previous cell.
        first_token_time = time.time()
        first_response_latency = first_token_time - start_time
        print(f"firstly token delpay: {first_response_latency:.4f} s")

    print(token_text, end='', flush=True)
    token_count += 1

firstly token delpay: 0.5723 s
Yes, I can help. Let me start by recalling some group theory concepts.

First, I know that 147 is equal to 3 times 49, which is 3 times 7 squared. So, 147 = 3 * 7^2. This is useful because it tells me that the group of order 147 is a semi-direct product of a group of order 3 and a group of order 49. 

Wait, but 49 is 7 squared, so the group of order 49 is an abelian group since 7 is a prime. Therefore, the group of order 147 is a semi-direct product of a group of order 3 and an abelian group of order 49. 

So, the first step is to figure out all possible semi-direct products of these groups. The number of such semi-direct products depends on the number of automorphisms of the group of order 49. 

Since 49 is 7 squared, the automorphism group of the group of order 49 is isomorphic to the multiplicative group modulo 49. That is, Aut(C49) ≅ (Z/49Z)*. The order of this group is φ(49) = φ(7^2) = 7^2 - 7 = 49 - 7 = 42. So, Aut(C49) has order 42. 

Therefore, t

In [17]:
# Stop the clock and summarise the run: token count and elapsed wall-clock time
# measured from start_time.
end_time = time.time()
total_time = end_time - start_time
print("\n--- Inference ends ---")
print(f"A total of  {token_count} tokens were output, taking {total_time:.4f} second")

# Throughput is only reported when at least one token was produced.
if token_count:
    average_speed = token_count / total_time
    print(f"Average generation rate: {average_speed:.2f} tokens/second")

# Final memory sample (MB), compared against the post-load figure.
mem_after_inference = process.memory_info().rss / 1024 ** 2
print(f"Memory usage after inference: {mem_after_inference:.2f} MB, change from after loading {mem_after_inference - mem_after_load:.2f} MB")


--- Inference ends ---
A total of  2014 tokens were output, taking 284.1676 second
Average generation rate: 7.09 tokens/second
Memory usage after inference: 1366.44 MB, change from after loading 282.80 MB
