In [None]:
# https://livebook.manning.com/book/build-a-reasoning-model-from-scratch/chapter-2/v-3#16 
# https://github.com/rasbt/reasoning-from-scratch.git

In [3]:
!pip install -r https://raw.githubusercontent.com/rasbt/reasoning-from-scratch/refs/heads/main/requirements.txt

Collecting reasoning-from-scratch>=0.1.2 (from -r https://raw.githubusercontent.com/rasbt/reasoning-from-scratch/refs/heads/main/requirements.txt (line 1))
  Downloading reasoning_from_scratch-0.1.12-py3-none-any.whl.metadata (23 kB)
Collecting jupyterlab>=4.4.7 (from reasoning-from-scratch>=0.1.2->-r https://raw.githubusercontent.com/rasbt/reasoning-from-scratch/refs/heads/main/requirements.txt (line 1))
  Downloading jupyterlab-4.5.2-py3-none-any.whl.metadata (16 kB)
Collecting matplotlib>=3.10.7 (from reasoning-from-scratch>=0.1.2->-r https://raw.githubusercontent.com/rasbt/reasoning-from-scratch/refs/heads/main/requirements.txt (line 1))
  Downloading matplotlib-3.10.8-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (52 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.8/52.8 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Collecting async-lru>=1.0.0 (from jupyterlab>=4.4.7->reasoning-from-scratch>=0.1.2->-r https://raw.githubusercontent.com/r

In [4]:
from reasoning_from_scratch.qwen3 import download_qwen3_small
# Fetch the base-model files into ./qwen3. tokenizer_only=True presumably skips the
# model weights here; they are downloaded in a later cell with tokenizer_only=False.
download_qwen3_small(kind="base", tokenizer_only=True, out_dir="qwen3")

In [5]:
from pathlib import Path
from reasoning_from_scratch.qwen3 import Qwen3Tokenizer
 
# Tokenizer definition file inside the local "qwen3" download directory.
tokenizer_path = Path("qwen3", "tokenizer-base.json")
tokenizer = Qwen3Tokenizer(tokenizer_file_path=tokenizer_path)

In [6]:
# Encode a sample prompt into a list of integer token ids.
prompt = "Explain large language models."
input_token_ids_list = tokenizer.encode(prompt)

In [7]:
# Round-trip check: decoding the ids should give back the original prompt text.
text = tokenizer.decode(input_token_ids_list)
print(text)

Explain large language models.


In [8]:
# Decode each id on its own to see which piece of text every token covers.
for i in input_token_ids_list:
    print(f"{[i]} --> {tokenizer.decode([i])}")

[840] --> Ex
[20772] --> plain
[3460] -->  large
[4128] -->  language
[4119] -->  models
[13] --> .


In [9]:
# Repeat the experiment with a French sentence to see how non-English text is split.
prompt =  "Christ est seigneur, que le monde l'accept ou pas"
input_token_ids_list = tokenizer.encode(prompt)

In [10]:
# Per-token view of the French prompt; some words are split into several tokens.
for i in input_token_ids_list:
    print(f"{[i]} --> {tokenizer.decode([i])}")

[19315] --> Christ
[1788] -->  est
[511] -->  se
[622] --> ign
[12559] --> eur
[11] --> ,
[1709] -->  que
[512] -->  le
[37800] -->  monde
[326] -->  l
[6] --> '
[10330] --> accept
[5908] -->  ou
[6368] -->  pas


In [11]:
# Decoding the full id list reproduces the French sentence unchanged.
text = tokenizer.decode(input_token_ids_list)
print(text)

Christ est seigneur, que le monde l'accept ou pas


In [12]:
# Loading pre-trained models

In [15]:
import torch
 
# Report the PyTorch version and which accelerator (if any) it can see.
print(f"PyTorch version {torch.__version__}")
if torch.cuda.is_available():
    print("CUDA GPU")
# Use torch.backends.mps.is_available(), the same MPS check get_device() relies on,
# instead of torch.mps.is_available(), so both cells agree on the detection API.
elif torch.backends.mps.is_available():
    print("Apple Silicon GPU")
else:
    print("Only CPU")

PyTorch version 2.9.0+cu126
CUDA GPU


In [16]:
def get_device(enable_tensor_cores=True):
    """Select the preferred available torch device and announce the choice.

    Backends are tried in this order: NVIDIA CUDA, Apple MPS, Intel XPU, CPU.

    Args:
        enable_tensor_cores: When True and CUDA is selected, switch float32
            matmuls and cuDNN convolutions to TF32.

    Returns:
        The selected ``torch.device``.
    """
    # torch.xpu does not exist in every PyTorch build; look it up defensively so
    # the CPU fallback is still reachable instead of raising AttributeError.
    xpu = getattr(torch, "xpu", None)

    if torch.cuda.is_available():
        device = torch.device("cuda")
        print("Using NVIDIA CUDA GPU")

        if enable_tensor_cores:
            # The TF32 switches were renamed in PyTorch 2.9; pick the API that
            # matches the installed version.
            major, minor = map(int, torch.__version__.split(".")[:2])
            if (major, minor) >= (2, 9):
                torch.backends.cuda.matmul.fp32_precision = "tf32"
                torch.backends.cudnn.conv.fp32_precision = "tf32"
            else:
                torch.backends.cuda.matmul.allow_tf32 = True
                torch.backends.cudnn.allow_tf32 = True

    elif torch.backends.mps.is_available():
        device = torch.device("mps")
        print("Using Apple Silicon GPU (MPS)")

    elif xpu is not None and xpu.is_available():
        device = torch.device("xpu")
        print("Using Intel GPU")

    else:
        device = torch.device("cpu")
        print("Using CPU")

    return device

In [17]:
# Choose the device once; later cells move the model and input tensors onto it.
device = get_device()

Using NVIDIA CUDA GPU


In [18]:
# Same download helper as before, now with tokenizer_only=False so the model
# weights file is fetched into ./qwen3 as well (see the progress line below).
download_qwen3_small(kind="base", tokenizer_only=False, out_dir="qwen3")

qwen3-0.6B-base.pth: 100% (1433 MiB / 1433 MiB)


In [19]:
from reasoning_from_scratch.qwen3 import Qwen3Model, QWEN_CONFIG_06_B
 
model_path = Path("qwen3") / "qwen3-0.6B-base.pth"
model = Qwen3Model(QWEN_CONFIG_06_B)
# map_location="cpu" makes loading independent of the device the checkpoint was
# saved from; weights_only=True restricts unpickling to tensors/plain containers,
# which matters for a file downloaded from the internet on PyTorch versions where
# it is not yet the default.
model.load_state_dict(
    torch.load(model_path, map_location="cpu", weights_only=True)
)
# Move the parameters to the selected device; as the last expression this also
# displays the module structure.
model.to(device)

Qwen3Model(
  (tok_emb): Embedding(151936, 1024)
  (trf_blocks): ModuleList(
    (0-27): 28 x TransformerBlock(
      (att): GroupedQueryAttention(
        (W_query): Linear(in_features=1024, out_features=2048, bias=False)
        (W_key): Linear(in_features=1024, out_features=1024, bias=False)
        (W_value): Linear(in_features=1024, out_features=1024, bias=False)
        (out_proj): Linear(in_features=2048, out_features=1024, bias=False)
        (q_norm): RMSNorm()
        (k_norm): RMSNorm()
      )
      (ff): FeedForward(
        (fc1): Linear(in_features=1024, out_features=3072, bias=False)
        (fc2): Linear(in_features=1024, out_features=3072, bias=False)
        (fc3): Linear(in_features=3072, out_features=1024, bias=False)
      )
      (norm1): RMSNorm()
      (norm2): RMSNorm()
    )
  )
  (final_norm): RMSNorm()
  (out_head): Linear(in_features=1024, out_features=151936, bias=False)
)

In [20]:
# Encode the prompt and report how many tokens it produced.
prompt = "Explain large language models."
input_token_ids_list = tokenizer.encode(prompt)
print(f"Number of input tokens: {len(input_token_ids_list)}")
 
# Add a leading batch dimension and move the ids to the same device as the model.
input_tensor = torch.tensor(input_token_ids_list)
input_tensor_fmt = input_tensor.unsqueeze(0)
input_tensor_fmt = input_tensor_fmt.to(device)
 
# Forward pass; squeeze(0) removes the batch dimension again, leaving one row of
# vocabulary scores per input token (see the printed shape).
output_tensor = model(input_tensor_fmt)
output_tensor_fmt = output_tensor.squeeze(0)
print(f"Formatted Output tensor shape: {output_tensor_fmt.shape}")

Number of input tokens: 6
Formatted Output tensor shape: torch.Size([6, 151936])


In [21]:
# Despite the name, this is the row of vocabulary scores for the last input
# position, not a token id. detach() drops the autograd graph reference.
last_token = output_tensor_fmt[-1].detach()
print(last_token)

tensor([ 7.3438,  1.9766,  7.9375,  ..., -2.3750, -2.3750, -2.3750],
       device='cuda:0', dtype=torch.bfloat16)


In [22]:
# Display all rows for comparison; this tensor is still attached to the autograd
# graph (note the grad_fn in the output), unlike the detached last row above.
output_tensor_fmt

tensor([[ 7.4062, 11.4375,  9.2500,  ...,  3.7344,  3.7344,  3.7344],
        [ 9.3125, 10.6250,  7.1875,  ...,  3.2344,  3.2344,  3.2344],
        [10.8125, 10.0625,  7.4688,  ...,  0.1484,  0.1484,  0.1484],
        [ 7.0938,  9.1875,  6.2188,  ..., -2.0000, -2.0000, -2.0000],
        [11.5625, 13.5625, 10.2500,  ...,  1.0156,  1.0156,  1.0156],
        [ 7.3438,  1.9766,  7.9375,  ..., -2.3750, -2.3750, -2.3750]],
       device='cuda:0', dtype=torch.bfloat16, grad_fn=<SqueezeBackward1>)

In [23]:
# The index of the largest score is the id of the greedy next-token choice.
print(last_token.argmax(dim=-1, keepdim=True))

tensor([20286], device='cuda:0')


In [24]:
# Derive the id from the logits instead of hard-coding the value printed by the
# previous cell, so this cell stays correct if the prompt or model changes.
next_token_id = last_token.argmax(dim=-1).item()
print(tokenizer.decode([next_token_id]))

 Large


In [None]:
@torch.inference_mode()
def generate_text_basic(model, token_ids, max_new_tokens, eos_token_id=None):
    """Greedy decoding that re-runs the model on the full sequence every step.

    Args:
        model: Callable module; ``model(token_ids)`` must return scores whose
            second dimension indexes sequence positions.
        token_ids: 2-D tensor of prompt ids (batch first).
        max_new_tokens: Upper bound on the number of generated tokens.
        eos_token_id: Optional id that ends generation early. The check uses
            ``.item()``, so it only works when a single sequence is generated.

    Returns:
        Only the newly generated ids (the prompt is sliced off). The EOS token
        itself is never included.
    """
    prompt_len = token_ids.shape[1]
    model.eval()
    stop_on_eos = eos_token_id is not None

    for _step in range(max_new_tokens):
        # Scores at the last position decide the next token.
        last_logits = model(token_ids)[:, -1]
        picked = last_logits.argmax(dim=-1, keepdim=True)

        if stop_on_eos and picked.item() == eos_token_id:
            break

        token_ids = torch.cat((token_ids, picked), dim=1)

    return token_ids[:, prompt_len:]
 

In [26]:
# Build a single-sequence batch of prompt ids directly on the target device.
prompt = "Explain large language models in a single sentence."
input_token_ids_tensor = torch.tensor(
    tokenizer.encode(prompt),
    device=device
    ).unsqueeze(0)
 
# No eos_token_id is passed, so generation always runs for max_new_tokens steps
# and the text continues past <|endoftext|> (visible in the output below).
max_new_tokens = 100
output_token_ids_tensor = generate_text_basic(
    model=model,
    token_ids=input_token_ids_tensor,
    max_new_tokens=max_new_tokens,
)
output_text = tokenizer.decode(
    output_token_ids_tensor.squeeze(0).tolist()
)
print(output_text)

 Large language models are artificial intelligence systems that can understand, generate, and process human language, enabling them to perform a wide range of tasks, from answering questions to writing articles, and even creating creative content.<|endoftext|>Human language is a complex and dynamic system that has evolved over millions of years to enable effective communication and social interaction. It is composed of a vast array of symbols, including letters, numbers, and symbols, which are used to convey meaning and express thoughts and ideas. The evolution of language has


In [28]:
# Same greedy generation, but passing the tokenizer's end-of-sequence id makes
# the loop stop as soon as the model emits it.
output_token_ids_tensor = generate_text_basic(
    model=model,
    token_ids=input_token_ids_tensor,
    max_new_tokens=max_new_tokens,
    eos_token_id=tokenizer.eos_token_id
)
output_text = tokenizer.decode(
    output_token_ids_tensor.squeeze(0).tolist()
)
print(output_text)

 Large language models are artificial intelligence systems that can understand, generate, and process human language, enabling them to perform a wide range of tasks, from answering questions to writing articles, and even creating creative content.


In [40]:
@torch.inference_mode()
def generate_text_basic_stream(model, token_ids, max_new_tokens, eos_token_id=None):
    """Generator variant of greedy decoding.

    After every accepted token this yields ALL ids generated so far (prompt
    excluded), not just the newest one, so each yielded tensor is one column
    longer than the previous.

    Args:
        model: Callable module; ``model(token_ids)`` must return scores whose
            second dimension indexes sequence positions.
        token_ids: 2-D tensor of prompt ids (batch first).
        max_new_tokens: Upper bound on the number of generated tokens.
        eos_token_id: Optional id that ends generation early; checked with
            ``.item()``, so it requires a single generated sequence.

    Yields:
        Tensor slice holding the ids generated up to the current step.
    """
    prompt_len = token_ids.shape[1]
    model.eval()
    stop_on_eos = eos_token_id is not None

    for _step in range(max_new_tokens):
        last_logits = model(token_ids)[:, -1]
        picked = last_logits.argmax(dim=-1, keepdim=True)

        # The EOS token ends the stream and is never yielded.
        if stop_on_eos and picked.item() == eos_token_id:
            break

        token_ids = torch.cat((token_ids, picked), dim=1)
        yield token_ids[:, prompt_len:]

In [51]:
# Each item yielded by the generator holds every token generated so far, so the
# decoded text printed here grows by one token per line.
for generated_ids in generate_text_basic_stream(
    model=model,
    token_ids=input_token_ids_tensor,
    max_new_tokens=max_new_tokens,
    eos_token_id=tokenizer.eos_token_id
):
    token_list = generated_ids.squeeze(0).tolist()
    output_text = tokenizer.decode(token_list)
    print(output_text)

 Large
 Large language
 Large language models
 Large language models are
 Large language models are artificial
 Large language models are artificial intelligence
 Large language models are artificial intelligence systems
 Large language models are artificial intelligence systems that
 Large language models are artificial intelligence systems that can
 Large language models are artificial intelligence systems that can understand
 Large language models are artificial intelligence systems that can understand,
 Large language models are artificial intelligence systems that can understand, generate
 Large language models are artificial intelligence systems that can understand, generate,
 Large language models are artificial intelligence systems that can understand, generate, and
 Large language models are artificial intelligence systems that can understand, generate, and process
 Large language models are artificial intelligence systems that can understand, generate, and process human
 Larg

In [52]:
def generate_stats(output_token_ids, tokenizer, start_time,
                   end_time, print_tokens=True):
    """Print timing, throughput and peak accelerator memory for a generation run.

    Args:
        output_token_ids: Tensor of generated ids; its element count is used as
            the number of generated tokens.
        tokenizer: Object with a ``decode(list_of_ids)`` method.
        start_time: Timestamp taken before generation, in seconds.
        end_time: Timestamp taken after generation, in seconds.
        print_tokens: When True, also print the decoded text.

    Side effects:
        Resets the peak-memory statistics of every available CUDA/XPU backend
        so the next call reports a fresh peak.
    """
    total_time = end_time - start_time
    print(f"Time: {total_time:.2f} sec")
    if total_time > 0:
        print(f"{int(output_token_ids.numel() / total_time)} tokens/sec")
    else:
        # Identical timestamps (coarse clock, empty run) would otherwise raise
        # ZeroDivisionError; a rate cannot be computed in that case.
        print("n/a tokens/sec")

    # getattr guards against PyTorch builds that lack one of these backends.
    for name, backend in (("CUDA", getattr(torch, "cuda", None)),
                          ("XPU", getattr(torch, "xpu", None))):
        if backend is not None and backend.is_available():
            max_mem_bytes = backend.max_memory_allocated()
            max_mem_gb = max_mem_bytes / (1024 ** 3)
            print(f"Max {name} memory allocated: {max_mem_gb:.2f} GB")
            backend.reset_peak_memory_stats()

    if print_tokens:
        output_text = tokenizer.decode(output_token_ids.squeeze(0).tolist())
        print(f"\n{output_text}")

In [53]:
import time
 
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump when the system clock is adjusted.
start_time = time.perf_counter()
output_token_ids_tensor = generate_text_basic(
    model=model,
    token_ids=input_token_ids_tensor,
    max_new_tokens=max_new_tokens,
    eos_token_id=tokenizer.eos_token_id
)
end_time = time.perf_counter()
generate_stats(output_token_ids_tensor, tokenizer, start_time, end_time)

Time: 1.69 sec
24 tokens/sec
Max CUDA memory allocated: 1.53 GB

 Large language models are artificial intelligence systems that can understand, generate, and process human language, enabling them to perform a wide range of tasks, from answering questions to writing articles, and even creating creative content.


In [55]:
# With KV caching: intermediate results computed for earlier tokens are stored in a cache and reused, so each generation step only has to process the newest token.
from reasoning_from_scratch.qwen3 import KVCache
 
@torch.inference_mode()
def generate_text_basic_cache(model, token_ids, max_new_tokens, eos_token_id=None):
    """Greedy decoding that passes a KVCache to the model.

    The full prompt is run through the model once; afterwards only the newly
    chosen token is fed in, together with the cache object.

    Args:
        model: Module exposing ``cfg["n_layers"]``, ``reset_kv_cache()`` and a
            forward call that accepts ``cache=``.
        token_ids: 2-D tensor of prompt ids (batch first).
        max_new_tokens: Upper bound on the number of generated tokens.
        eos_token_id: Optional id that ends generation early; checked with
            ``.item()``, so it requires a single generated sequence.

    Returns:
        Only the newly generated ids (the prompt is sliced off). The EOS token
        itself is never included.
    """
    prompt_len = token_ids.shape[1]
    model.eval()

    # Fresh cache per call, plus a reset of whatever state the model keeps itself.
    cache = KVCache(n_layers=model.cfg["n_layers"])
    model.reset_kv_cache()

    # Prime the cache with the whole prompt.
    last_logits = model(token_ids, cache=cache)[:, -1]
    stop_on_eos = eos_token_id is not None

    for _step in range(max_new_tokens):
        picked = last_logits.argmax(dim=-1, keepdim=True)

        if stop_on_eos and picked.item() == eos_token_id:
            break

        token_ids = torch.cat((token_ids, picked), dim=1)
        # Only the new token goes through the model from here on.
        last_logits = model(picked, cache=cache)[:, -1]

    return token_ids[:, prompt_len:]
 

In [58]:
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump when the system clock is adjusted.
start_time = time.perf_counter()
output_token_ids_tensor = generate_text_basic_cache(
    model=model,
    token_ids=input_token_ids_tensor,
    max_new_tokens=max_new_tokens,
    eos_token_id=tokenizer.eos_token_id,
)
end_time = time.perf_counter()
generate_stats(output_token_ids_tensor, tokenizer, start_time, end_time)

Time: 1.67 sec
24 tokens/sec
Max CUDA memory allocated: 1.47 GB

 Large language models are artificial intelligence systems that can understand, generate, and process human language, enabling them to perform a wide range of tasks, from answering questions to writing articles, and even creating creative content.


In [59]:
# Compile the model with torch.compile

# Parse "major.minor" from the version string (e.g. "2.9.0+cu126" -> (2, 9)).
major, minor = map(int, torch.__version__.split(".")[:2])
if (major, minor) >= (2, 8):
    # This avoids retriggering model recompilations
    # in PyTorch 2.8 and newer
    # if the model contains code like self.pos = self.pos + 1
    # NOTE(review): this sets a private torch._dynamo option; confirm it still
    # exists when upgrading PyTorch.
    torch._dynamo.config.allow_unspec_int_on_nn_module = True
 
# The compiled model is kept under a separate name so the original `model`
# stays available for comparison in other cells.
model_compiled = torch.compile(model)

In [60]:
# The first iteration is reported separately as a warm-up because it is far
# slower than the following runs (see the timings below).
for i in range(3):
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # elapsed time; time.time() can jump when the system clock is adjusted.
    start_time = time.perf_counter()
    output_token_ids_tensor = generate_text_basic(
        model=model_compiled,
        token_ids=input_token_ids_tensor,
        max_new_tokens=max_new_tokens,
        eos_token_id=tokenizer.eos_token_id
    )
    end_time = time.perf_counter()

    if i == 0:
        print("Warm-up run")
    else:
        print(f"Timed run {i}:")
    generate_stats(output_token_ids_tensor, tokenizer, start_time, end_time)

    print(f"\n{30*'-'}\n")

Warm-up run
Time: 145.99 sec
0 tokens/sec
Max CUDA memory allocated: 1.80 GB

 Large language models are artificial intelligence systems that can understand, generate, and process human language, enabling them to perform a wide range of tasks, from answering questions to writing articles, and even creating creative content.

------------------------------

Timed run 1:
Time: 0.51 sec
79 tokens/sec
Max CUDA memory allocated: 1.49 GB

 Large language models are artificial intelligence systems that can understand, generate, and process human language, enabling them to perform a wide range of tasks, from answering questions to writing articles, and even creating creative content.

------------------------------

Timed run 2:
Time: 0.51 sec
80 tokens/sec
Max CUDA memory allocated: 1.49 GB

 Large language models are artificial intelligence systems that can understand, generate, and process human language, enabling them to perform a wide range of tasks, from answering questions to writing ar

In [62]:
# Benchmark the compiled model with the KV-cache generation function; the first
# iteration is labelled as a warm-up run.
for i in range(3):
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # elapsed time; time.time() can jump when the system clock is adjusted.
    start_time = time.perf_counter()
    output_token_ids_tensor = generate_text_basic_cache(
        model=model_compiled,
        token_ids=input_token_ids_tensor,
        max_new_tokens=max_new_tokens,
        eos_token_id=tokenizer.eos_token_id
    )
    end_time = time.perf_counter()

    if i == 0:
        print("Warm-up run")
    else:
        print(f"Timed run {i}:")
    # Both branches reported the same stats, so the call is made once here.
    generate_stats(output_token_ids_tensor, tokenizer, start_time, end_time)

    print(f"\n{30*'-'}\n")
 

Warm-up run
Time: 0.59 sec
69 tokens/sec
Max CUDA memory allocated: 1.47 GB

 Large language models are artificial intelligence systems that can understand, generate, and process human language, enabling them to perform a wide range of tasks, from answering questions to writing articles, and even creating creative content.

------------------------------

Timed run 1:
Time: 0.58 sec
70 tokens/sec
Max CUDA memory allocated: 1.47 GB

 Large language models are artificial intelligence systems that can understand, generate, and process human language, enabling them to perform a wide range of tasks, from answering questions to writing articles, and even creating creative content.

------------------------------

Timed run 2:
Time: 0.59 sec
70 tokens/sec
Max CUDA memory allocated: 1.47 GB

 Large language models are artificial intelligence systems that can understand, generate, and process human language, enabling them to perform a wide range of tasks, from answering questions to writing art