### Run `BitNet`

In [None]:
# Run below command to run `BitNet`
# python run_inference.py -m models/Llama3-8B-1.58-100B-tokens/ggml-model-i2_s.gguf -p "Daniel went back to the the the garden. Mary travelled to the kitchen. Sandra journeyed to the kitchen. Sandra went to the hallway. John went to the bedroom. Mary went back to the garden. Where is Mary?\nAnswer:" -n 6 -temp 0

#### Simple Usage

In [4]:
import os
import sys
import platform
import subprocess

def run_command(command, shell=False):
    """Run a system command and ensure it succeeds."""
    try:
        subprocess.run(command, shell=shell, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error occurred while running command: {e}")
        sys.exit(1)

def run_inference(model, n_predict, prompt, threads=2, ctx_size=2048, temperature=0.8):
    build_dir = "build"
    if platform.system() == "Windows":
        main_path = os.path.join(build_dir, "bin", "Release", "llama-cli.exe")
        if not os.path.exists(main_path):
            main_path = os.path.join(build_dir, "bin", "llama-cli")
    else:
        main_path = os.path.join(build_dir, "bin", "llama-cli")
    
    command = [
        f'{main_path}',
        '-m', model,
        '-n', str(n_predict),
        '-t', str(threads),
        '-p', prompt,
        '-ngl', '0',
        '-c', str(ctx_size),
        '--temp', str(temperature),
        "-b", "1"
    ]
    run_command(command)
    print(f"Command:\n{' '.join(command)}")

In [5]:
# Set up the variables
model = "models/Llama3-8B-1.58-100B-tokens/ggml-model-i2_s.gguf"
n_predict = 6
prompt = "Daniel went back to the the the garden. Mary travelled to the kitchen. Sandra journeyed to the kitchen. Sandra went to the hallway. John went to the bedroom. Mary went back to the garden. Where is Mary?\nAnswer:"
temperature = 0

In [6]:
# Run the inference
run_inference(model, n_predict, prompt, temperature=temperature)

build: 3947 (406a5036) with Apple clang version 15.0.0 (clang-1500.3.9.4) for arm64-apple-darwin23.4.0
main: llama backend init
main: load the model and apply lora adapter, if any
llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from models/Llama3-8B-1.58-100B-tokens/ggml-model-i2_s.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = Llama3-8B-1.58-100B-tokens
llama_model_loader: - kv   2:                          llama.block_count u32              = 32
llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   5:                  llama

Daniel went back to the the the garden. Mary travelled to the kitchen. Sandra journeyed to the kitchen. Sandra went to the hallway. John went to the bedroom. Mary went back to the garden. Where is Mary?
Answer: Mary is in the garden.


Command:
build/bin/llama-cli -m models/Llama3-8B-1.58-100B-tokens/ggml-model-i2_s.gguf -n 6 -t 2 -p Daniel went back to the the the garden. Mary travelled to the kitchen. Sandra journeyed to the kitchen. Sandra went to the hallway. John went to the bedroom. Mary went back to the garden. Where is Mary?
Answer: -ngl 0 -c 2048 --temp 0 -b 1


llama_perf_sampler_print:    sampling time =       0.24 ms /    54 runs   (    0.00 ms per token, 225000.00 tokens per second)
llama_perf_context_print:        load time =     511.93 ms
llama_perf_context_print: prompt eval time =    3079.14 ms /    48 tokens (   64.15 ms per token,    15.59 tokens per second)
llama_perf_context_print:        eval time =     320.55 ms /     5 runs   (   64.11 ms per token,    15.60 tokens per second)
llama_perf_context_print:       total time =    3400.64 ms /    53 tokens
ggml_metal_free: deallocating


### Encapsulate the `BitNet`

In [6]:
import os
import sys
import psutil
import platform
import subprocess

class BitNet:
    def __init__(self, model_path, build_dir="build"):
        self.model_path = model_path
        self.build_dir = build_dir
        self.main_path = self._get_main_path()
        self.process = psutil.Process(os.getpid())

    def _get_main_path(self):
        if platform.system() == "Windows":
            main_path = os.path.join(self.build_dir, "bin", "Release", "llama-cli.exe")
            if not os.path.exists(main_path):
                main_path = os.path.join(self.build_dir, "bin", "llama-cli")
        else:
            main_path = os.path.join(self.build_dir, "bin", "llama-cli")
        return main_path

    def _get_memory_usage(self):
        return self.process.memory_info().rss / (1024 * 1024)  # Convert to MB

    def _run_command(self, command, shell=False):
        """Run a system command and ensure it succeeds."""
        try:
            result = subprocess.run(command, shell=shell, check=True, capture_output=True, text=True)
            return result.stdout
        except subprocess.CalledProcessError as e:
            print(f"Error occurred while running command: {e}")
            print(f"Error output: {e.stderr}")
            sys.exit(1)

    def run_inference(self, prompt, n_predict=20, threads=2, ctx_size=2048, temperature=0.8):
        initial_memory = self._get_memory_usage()

        command = [
            self.main_path,
            '-m', self.model_path,
            '-n', str(n_predict),
            '-t', str(threads),
            '-p', prompt,
            '-ngl', '0',
            '-c', str(ctx_size),
            '--temp', str(temperature),
            "-b", "1"
        ]
        output = self._run_command(command)

        final_memory = self._get_memory_usage()
        memory_used = final_memory - initial_memory

        print(f"Memory usage: Initial: {initial_memory:.2f} MB, Final: {final_memory:.2f} MB")
        print(f"Memory used for inference: {memory_used:.2f} MB")
        return output

model_path = "models/Llama3-8B-1.58-100B-tokens/ggml-model-i2_s.gguf"
bitnet = BitNet(model_path)

prompt = "Hi My name is Joonhyung Lee"
result = bitnet.run_inference(prompt, n_predict=6, temperature=1.0)
print(result)

Memory usage: Initial: 61.98 MB, Final: 62.00 MB
Memory used for inference: 0.02 MB
Hi My name is Joonhyung Lee My age is 23 I


