-
Notifications
You must be signed in to change notification settings - Fork 298
Closed
Description
Describe the bug
I have a test script that fills my VRAM with data and shows how much of it is in use.
I cannot allocate more than 4 GB, even though the card has 16 GB (A770).
What should I do?
I have already changed the ReBAR setting in the BIOS.
Versions
This is the code:
import torch
from torch import xpu
import os
# Presumably lifts the runtime's per-allocation size cap (going by the
# variable name) — TODO confirm against the runtime documentation.
# NOTE(review): this runs after `import torch`; verify the runtime still
# reads the variable at this point, or move the assignment above the import.
os.environ['UR_L0_USE_RELAXED_ALLOCATION_LIMITS'] = '1'
def xpu_memory_test():
if not xpu.is_available():
print("XPU not available!")
return
device = torch.device("xpu")
print(f"\n=== XPU Memory Test ===")
try:
print(f"Device Name: {torch.xpu.get_device_name(device)}")
max_alloc = torch.xpu.max_memory_allocated(device) / (1024**3)
total_mem = torch.xpu.get_device_properties(device).total_memory / (1024**3)
print(f"\nDevice Memory: {total_mem:.2f}GB total")
print(f"Max allocated during session: {max_alloc:.2f}GB")
size_step = 0.1
current_size = 1.0
last_success = 0
while current_size <= total_mem:
tensor_size = int(current_size * (1024**3 / 4))
print(f"\nAttempting to allocate {current_size}GB tensor...")
try:
test_tensor = torch.empty(tensor_size, dtype=torch.float32, device=device)
torch.xpu.synchronize(device)
allocated = torch.xpu.memory_allocated(device) / (1024**3)
print(f"Success! Current allocated: {allocated:.2f}GB")
os.system("free -h")
del test_tensor
torch.xpu.empty_cache()
last_success = current_size
current_size += size_step
except RuntimeError as e:
print(f"\nAllocation failed at {current_size}GB (last success: {last_success}GB)")
print(f"Error message: {str(e)}")
os.system("free -h")
break
except Exception as e:
print(f"\nError during memory test: {str(e)}")
finally:
allocated = torch.xpu.memory_allocated(device) / (1024**3)
print(f"\n! Final allocated memory: {allocated:.2f}GB")
print("Test completed.")
if __name__ == "__main__":
    # Script entry point: run the allocation probe, then release whatever the
    # caching allocator is still holding.
    xpu_memory_test()
    torch.xpu.empty_cache()
Metadata
Metadata
Assignees
Labels
No labels