In [2]:
import torch
# 首先确认您的 PyTorch 版本是否支持 Metal 后端（Apple GPU 加速）
print(torch.version.__version__)  # 应显示2.0.0或更高
print(torch.backends.mps.is_available())  # 应显示True
print(torch.backends.mps.is_built())     # 应显示True

# 设备配置 - 启用AMD Radeon GPU加速
# 设置设备，优先使用GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
print(f"使用训练设备: {device}")

2.2.2
True
True
使用训练设备: mps


In [2]:
import torch
import time

def test_performance(shape, device, use_fp16=True):
    """测试矩阵乘法性能（不使用Dynamo）"""
    device = torch.device(device)
    dtype = torch.float16 if use_fp16 else torch.float32
    
    # 创建并重用张量（减少内存分配）
    a = torch.randn(shape, device=device, dtype=dtype)
    b = torch.randn(shape, device=device, dtype=dtype)
    test_epoch = 3 # 测试周期数
    epoch = 5  #周期 
    
    # 预热
    for _ in range(test_epoch):
        c = a @ b
    if device.type == 'mps':
        torch.mps.synchronize()
    
    # 计时
    start = time.time()
    for _ in range(epoch):
        c = a @ b
    if device.type == 'mps':
        torch.mps.synchronize()
    end = time.time()
    
    return (end - start) / epoch

# 测试不同尺寸
matrix_sizes = [100, 1000, 5000, 10000]
print(f"{'矩阵尺寸':<12}{'设备':<8}{'数据类型':<8}{'平均时间(s)':<16}")
print("-" * 50)

# 测试CPU
for n in matrix_sizes:
    shape = (n, n)
    cpu_time = test_performance(shape, "cpu", use_fp16=False)
    print(f"{n}x{n:<8}{'CPU':<8}{'FP32':<8}{cpu_time:.6f}")

# 测试MPS
if torch.backends.mps.is_available():
    for n in matrix_sizes:
        shape = (n, n)
        mps_time = test_performance(shape, "mps", use_fp16=True)
        print(f"{n}x{n:<8}{'MPS':<8}{'FP16':<8}{mps_time:.6f}")
else:
    print("MPS不可用")

矩阵尺寸        设备      数据类型    平均时间(s)         
--------------------------------------------------
100x100     CPU     FP32    0.000021
1000x1000    CPU     FP32    0.006491
5000x5000    CPU     FP32    0.719043
10000x10000   CPU     FP32    9.737196
100x100     MPS     FP16    0.001767
1000x1000    MPS     FP16    0.007567
5000x5000    MPS     FP16    0.997615
10000x10000   MPS     FP16    7.624653


In [1]:
import torch
import time

def test_compiled_performance(shape, device, use_fp16=True, use_compile=True):
    """测试编译后的矩阵乘法性能"""
    device = torch.device(device)
    dtype = torch.float16 if use_fp16 else torch.float32
    
    # 创建张量
    a = torch.randn(shape, device=device, dtype=dtype)
    b = torch.randn(shape, device=device, dtype=dtype)
    
    # 定义计算函数
    def matmul(a, b):
        return a @ b
    
    # 编译函数
    if use_compile:
        matmul = torch.compile(matmul, backend="inductor")
    
    # 预热
    for _ in range(2):
        c = matmul(a, b)
    if device.type == 'mps':
        torch.mps.synchronize()
    
    # 计时
    start = time.perf_counter()
    for _ in range(5):
        c = matmul(a, b)
    if device.type == 'mps':
        torch.mps.synchronize()
    end = time.perf_counter()
    
    return (end - start) / 5

# 测试不同尺寸
matrix_sizes = [5000, 10000]
print(f"{'矩阵尺寸':<12}{'设备':<8}{'数据类型':<8}{'是否编译':<8}{'平均时间(s)':<16}")
print("-" * 60)

# 测试MPS性能（编译与未编译对比）
if torch.backends.mps.is_available():
    for n in matrix_sizes:
        shape = (n, n)
        
        # 未编译
        time_base = test_compiled_performance(shape, "mps", use_fp16=True, use_compile=False)
        print(f"{n}x{n:<8}{'MPS':<8}{'FP16':<8}{'否':<8}{time_base:.6f}")
        
        # 编译
        time_compiled = test_compiled_performance(shape, "mps", use_fp16=True, use_compile=True)
        print(f"{n}x{n:<8}{'MPS':<8}{'FP16':<8}{'是':<8}{time_compiled:.6f}")
        print(f"加速比: {time_base/time_compiled:.2f}x")

矩阵尺寸        设备      数据类型    是否编译    平均时间(s)         
------------------------------------------------------------
5000x5000    MPS     FP16    否       0.971096


RuntimeError: Dynamo is not supported on Python 3.12+