In [None]:
from vllm import LLM, SamplingParams
import torch

import gc

# Start recording CUDA allocator history, keeping at most 100000 entries.
torch.cuda.memory._record_memory_history(max_entries=100000)

# Everything between "start recording" and "dump snapshot" runs inside
# try/finally: if engine construction or generation raises (for example a
# CUDA out-of-memory error), the snapshot is still written and the recorder is
# still switched off instead of being left enabled for the rest of the kernel
# session.
try:
    # NOTE(review): trust_remote_code=True lets the model repository supply
    # Python code that is executed locally; only keep it for trusted sources.
    llm = LLM(model="microsoft/Phi-4-mini-instruct", trust_remote_code=True)

    messages = [
        {"role": "system", "content": "You are a helpful AI assistant."},
        {"role": "user", "content": "Can you provide ways to eat combinations of bananas and dragonfruits?"},
        {"role": "assistant", "content": "Sure! Here are some ways to eat bananas and dragonfruits together: 1. Banana and dragonfruit smoothie: Blend bananas and dragonfruits together with some milk and honey. 2. Banana and dragonfruit salad: Mix sliced bananas and dragonfruits together with some lemon juice and honey."},
        {"role": "user", "content": "What about solving an 2x + 3 = 7 equation?"},
    ]

    # temperature=0.0 with a 500-token cap on the generated answer.
    sampling_params = SamplingParams(
        max_tokens=500,
        temperature=0.0,
    )

    output = llm.chat(messages=messages, sampling_params=sampling_params)
    print("x" * 50)
    print(output[0].outputs[0].text)
finally:
    try:
        # Save the recorded history to disk.
        torch.cuda.memory._dump_snapshot("dump_snapshot.pickle")
    finally:
        # Stop recording even if writing the snapshot itself failed.
        torch.cuda.memory._record_memory_history(enabled=None)

# Optional cleanup (left disabled): drop the engine and release cached memory.
# del llm
# gc.collect()
# torch.cuda.empty_cache()


INFO 08-04 06:14:37 config.py:208] Replacing legacy 'type' key with 'rope_type'
INFO 08-04 06:14:37 config.py:549] This model supports multiple tasks: {'embed', 'score', 'reward', 'classify', 'generate'}. Defaulting to 'generate'.
INFO 08-04 06:14:37 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.3) with config: model='microsoft/Phi-4-mini-instruct', speculative_config=None, tokenizer='microsoft/Phi-4-mini-instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=131072, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_exe

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 08-04 06:14:39 model_runner.py:1115] Loading model weights took 7.1452 GB
INFO 08-04 06:14:44 worker.py:267] Memory profiling takes 4.26 seconds
INFO 08-04 06:14:44 worker.py:267] the current vLLM instance can use total_gpu_memory (79.15GiB) x gpu_memory_utilization (0.90) = 71.24GiB
INFO 08-04 06:14:44 worker.py:267] model weights take 7.15GiB; non_torch_memory takes 0.00GiB; PyTorch activation peak memory takes 8.25GiB; the rest of the memory reserved for KV Cache is 55.84GiB.
INFO 08-04 06:14:44 executor_base.py:111] # cuda blocks: 28588, # CPU blocks: 2048
INFO 08-04 06:14:44 executor_base.py:116] Maximum concurrency for 131072 tokens per request: 3.49x
INFO 08-04 06:14:44 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_u

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:15<00:00,  2.21it/s]

INFO 08-04 06:15:00 model_runner.py:1562] Graph capturing finished in 16 secs, took 0.05 GiB
INFO 08-04 06:15:00 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 20.52 seconds





INFO 08-04 06:15:00 chat_utils.py:332] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this.


Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.23s/it, est. speed input: 86.21 toks/s, output: 104.10 toks/s]


xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
To solve the equation 2x + 3 = 7, follow these steps:

1. Subtract 3 from both sides of the equation to isolate the term with the variable (x) on one side:
   2x + 3 - 3 = 7 - 3
   2x = 4

2. Divide both sides of the equation by 2 to solve for x:
   2x / 2 = 4 / 2
   x = 2

So, the solution to the equation 2x + 3 = 7 is x = 2.


In [None]:
import torch
from rich.console import Console
from rich.table import Table

# Flat dict of caching-allocator statistics for GPU 0.
stats = torch.cuda.memory_stats(device=0)

# The dict mixes byte totals (keys containing "_bytes") with plain counters
# (allocation counts, retry counts, ...). Only the byte totals may be scaled
# to MiB; dividing a counter by 1024**2 produces a meaningless number.
# Byte metrics are listed first, each group ordered from largest to smallest.
items = sorted(stats.items(), key=lambda kv: ("_bytes" not in kv[0], -kv[1]))

table = Table(title="CUDA Allocator Stats", expand=False, show_lines=False)
table.add_column("Metric", justify="left")
# The rows include current/peak/allocated/freed figures, so the column is
# labelled generically and the unit is given per row.
table.add_column("Value", justify="right")
table.add_column("Unit", justify="left")

for k, v in items:
    if "_bytes" in k:
        table.add_row(k, f"{v/1024**2:,.1f}", "MiB")
    else:
        # Not a byte total: show the raw value unscaled.
        table.add_row(k, f"{v:,}", "raw")

console = Console()
console.print(table)

In [4]:
import torch

def torch_memory_available(device=0):
    """Collect several views of free/used CUDA memory for ``device``.

    All figures are in bytes.

    Args:
        device: CUDA device passed through to the ``torch.cuda`` queries.

    Returns:
        dict with the keys, in this order:
            "free_os": free memory reported by ``torch.cuda.mem_get_info``.
            "free_inside": reserved minus allocated, i.e. memory held by the
                caching allocator but not backing a tensor.
            "fragmented": the ``inactive_split_bytes.all.current`` statistic
                (0 when the key is absent), treated here as fragmentation.
            "usable_by_pytorch": "free_inside" minus "fragmented", floored
                at 0.
            "reserved": ``torch.cuda.memory_reserved``.
            "allocated": ``torch.cuda.memory_allocated``.
    """
    # Device-level free memory; the total is returned too but is not needed.
    os_free_bytes, _os_total_bytes = torch.cuda.mem_get_info(device)
    # Memory the caching allocator has reserved.
    reserved_bytes = torch.cuda.memory_reserved(device)
    # Memory currently occupied by tensors.
    allocated_bytes = torch.cuda.memory_allocated(device)
    split_bytes = torch.cuda.memory_stats(device).get(
        "inactive_split_bytes.all.current", 0
    )

    # Space inside the allocator's pool that is not in use.
    pool_slack = reserved_bytes - allocated_bytes

    # NOTE(review): this treats inactive split blocks as unusable; confirm that
    # is the intended interpretation of the statistic.
    reusable = pool_slack - split_bytes
    if reusable < 0:
        reusable = 0

    report = {}
    report["free_os"] = os_free_bytes
    report["free_inside"] = pool_slack
    report["fragmented"] = split_bytes
    report["usable_by_pytorch"] = reusable
    report["reserved"] = reserved_bytes
    report["allocated"] = allocated_bytes
    return report

# Query device 0 and print every figure converted from bytes to MiB.
d = torch_memory_available(0)
print(dict((name, size / 1024**2) for name, size in d.items()))

{'free_os': 78042.5625, 'free_inside': 58.875, 'fragmented': 58.875, 'usable_by_pytorch': 0.0, 'reserved': 108.0, 'allocated': 49.125}
