In [1]:
import time
import torch
import numpy as np
from torchvision import models

In [2]:
mode_list = "default reduce-overhead max-autotune".split()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 实验一：sin函数

In [14]:
def sin_func(x):
    return torch.sin(x) + torch.cos(x)

run_times = 100000
i_data = torch.tensor(1).to(device)
for mode in mode_list:
    torch.cuda.synchronize()
    time_0 = time.time()
    module_compiled = torch.compile(sin_func, mode=mode)
    torch.cuda.synchronize()
    time_1 = time.time()
    
    # warmup
    sin_func(i_data)
    module_compiled(i_data)
    
    torch.cuda.synchronize()
    time_2 = time.time()
    for i in range(run_times):
        sin_func(i_data)
        
    torch.cuda.synchronize()
    time_3 = time.time()
    for i in range(run_times):
        module_compiled(i_data)
    torch.cuda.synchronize()
    time_4 = time.time()
    
    compile_time = time_1 - time_0
    pre_time = time_3 - time_2
    post_time = time_4 - time_3
    speedup_ratio = (pre_time - post_time)/pre_time
    
    print(f"mode: {mode}, 编译耗时:{compile_time:.2f}，编译前运行耗时:{pre_time:.2f}, 编译后运行耗时:{post_time:.2f}，速度提升比例:{speedup_ratio:.2%}")

W0806 10:28:53.301000 10632 torch\_dynamo\convert_frame.py:1009] WON'T CONVERT sin_func C:\Users\Lenovo\AppData\Local\Temp\ipykernel_8016\3035411914.py line 1 
W0806 10:28:53.301000 10632 torch\_dynamo\convert_frame.py:1009] due to: 
W0806 10:28:53.301000 10632 torch\_dynamo\convert_frame.py:1009] Traceback (most recent call last):
W0806 10:28:53.301000 10632 torch\_dynamo\convert_frame.py:1009]   File "G:\Anaconda3\envs\pytorch2.4\Lib\site-packages\torch\_dynamo\convert_frame.py", line 948, in __call__
W0806 10:28:53.301000 10632 torch\_dynamo\convert_frame.py:1009]     result = self._inner_convert(
W0806 10:28:53.301000 10632 torch\_dynamo\convert_frame.py:1009]              ^^^^^^^^^^^^^^^^^^^^
W0806 10:28:53.301000 10632 torch\_dynamo\convert_frame.py:1009]   File "G:\Anaconda3\envs\pytorch2.4\Lib\site-packages\torch\_dynamo\convert_frame.py", line 472, in __call__
W0806 10:28:53.301000 10632 torch\_dynamo\convert_frame.py:1009]     return _compile(
W0806 10:28:53.301000 10632 torc

mode: default, 编译耗时:0.00，编译前运行耗时:2.91, 编译后运行耗时:3.75，速度提升比例:-28.79%
mode: reduce-overhead, 编译耗时:0.00，编译前运行耗时:2.87, 编译后运行耗时:3.65，速度提升比例:-26.91%
mode: max-autotune, 编译耗时:0.00，编译前运行耗时:2.89, 编译后运行耗时:3.71，速度提升比例:-28.32%


# 实验二：resnet18

In [7]:
resnet18 = models.resnet18().to(device)
resnet18.eval()
fake_img = torch.randn(16, 3, 224, 224).to(device)

In [8]:
run_times = 100
with torch.no_grad():
    for mode in mode_list:
        torch.cuda.synchronize()
        time_0 = time.time()
        module_compiled = torch.compile(resnet18, mode=mode)
        torch.cuda.synchronize()
        time_1 = time.time()
        
        # warmup 非常关键！
        resnet18(fake_img)
        module_compiled(fake_img)
        
        #
        torch.cuda.synchronize()
        time_2 = time.time()
        for i in range(run_times):
            resnet18(fake_img)
        
        torch.cuda.synchronize()
        time_3 = time.time()
        for i in range(run_times):
            module_compiled(fake_img)
        
        torch.cuda.synchronize()
        time_4 = time.time()

        compile_time = time_1 - time_0
        pre_time = time_3 - time_2
        post_time = time_4 - time_3
        speedup_ratio = (pre_time - post_time)/pre_time

        print(f"mode: {mode}, 编译耗时:{compile_time:.2f}，编译前运行耗时:{pre_time:.2f}, 编译后运行耗时:{post_time:.2f}，速度提升比例:{speedup_ratio:.2%}")

W0806 09:48:56.029000 10632 torch\_dynamo\convert_frame.py:1009] WON'T CONVERT forward G:\Anaconda3\envs\pytorch2.4\Lib\site-packages\torchvision\models\resnet.py line 284 
W0806 09:48:56.029000 10632 torch\_dynamo\convert_frame.py:1009] due to: 
W0806 09:48:56.029000 10632 torch\_dynamo\convert_frame.py:1009] Traceback (most recent call last):
W0806 09:48:56.029000 10632 torch\_dynamo\convert_frame.py:1009]   File "G:\Anaconda3\envs\pytorch2.4\Lib\site-packages\torch\_dynamo\convert_frame.py", line 948, in __call__
W0806 09:48:56.029000 10632 torch\_dynamo\convert_frame.py:1009]     result = self._inner_convert(
W0806 09:48:56.029000 10632 torch\_dynamo\convert_frame.py:1009]              ^^^^^^^^^^^^^^^^^^^^
W0806 09:48:56.029000 10632 torch\_dynamo\convert_frame.py:1009]   File "G:\Anaconda3\envs\pytorch2.4\Lib\site-packages\torch\_dynamo\convert_frame.py", line 472, in __call__
W0806 09:48:56.029000 10632 torch\_dynamo\convert_frame.py:1009]     return _compile(
W0806 09:48:56.0290

mode: default, 编译耗时:0.00，编译前运行耗时:2.81, 编译后运行耗时:2.80，速度提升比例:0.27%
mode: reduce-overhead, 编译耗时:0.00，编译前运行耗时:2.78, 编译后运行耗时:2.81，速度提升比例:-0.95%
mode: max-autotune, 编译耗时:0.00，编译前运行耗时:2.86, 编译后运行耗时:2.79，速度提升比例:2.29%


# 实验三：BERT

In [10]:
from transformers import BertModel, BertTokenizer
import time

bert = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [11]:
# 准备一批输入数据
input_text = "Here is some text to encode"
inputs = tokenizer(input_text, return_tensors='pt', padding=True, truncation=True)
inputs = {k: v.to(device) for k, v in inputs.items()}
bert.to(device)
bert.eval()

run_times = 100
with torch.no_grad():
    for mode in mode_list:
        
        # 编译
        torch.cuda.synchronize()
        time_0 = time.time()
        bert_compiled = torch.compile(bert, mode=mode)
        torch.cuda.synchronize()
        time_1 = time.time()
        
        # warmup 非常关键！
        bert(**inputs)
        bert_compiled(**inputs)

        torch.cuda.synchronize()
        time_2= time.time()
        for _ in range(run_times): 
            _ = bert(**inputs)

        torch.cuda.synchronize()
        time_3= time.time()
        for _ in range(run_times):
            _ = bert_compiled(**inputs)
        
        torch.cuda.synchronize()
        time_4= time.time()
        
        compile_time = time_1 - time_0
        pre_time = time_3 - time_2
        post_time = time_4 - time_3
        speedup_ratio = (pre_time - post_time)/pre_time
        
        
        print(f"mode: {mode}, 编译耗时:{compile_time:.2f}，编译前运行耗时:{pre_time:.2f}, 编译后运行耗时:{post_time:.2f}，速度提升比例:{speedup_ratio:.2%}")

  attn_output = torch.nn.functional.scaled_dot_product_attention(
W0806 09:58:13.110000 10632 torch\_dynamo\convert_frame.py:1009] WON'T CONVERT forward G:\Anaconda3\envs\pytorch2.4\Lib\site-packages\transformers\models\bert\modeling_bert.py line 996 
W0806 09:58:13.110000 10632 torch\_dynamo\convert_frame.py:1009] due to: 
W0806 09:58:13.110000 10632 torch\_dynamo\convert_frame.py:1009] Traceback (most recent call last):
W0806 09:58:13.110000 10632 torch\_dynamo\convert_frame.py:1009]   File "G:\Anaconda3\envs\pytorch2.4\Lib\site-packages\torch\_dynamo\convert_frame.py", line 948, in __call__
W0806 09:58:13.110000 10632 torch\_dynamo\convert_frame.py:1009]     result = self._inner_convert(
W0806 09:58:13.110000 10632 torch\_dynamo\convert_frame.py:1009]              ^^^^^^^^^^^^^^^^^^^^
W0806 09:58:13.110000 10632 torch\_dynamo\convert_frame.py:1009]   File "G:\Anaconda3\envs\pytorch2.4\Lib\site-packages\torch\_dynamo\convert_frame.py", line 472, in __call__
W0806 09:58:13.110000 1063

mode: default, 编译耗时:0.00，编译前运行耗时:0.86, 编译后运行耗时:0.92，速度提升比例:-7.90%
mode: reduce-overhead, 编译耗时:0.00，编译前运行耗时:0.78, 编译后运行耗时:0.91，速度提升比例:-16.75%
mode: max-autotune, 编译耗时:0.00，编译前运行耗时:0.78, 编译后运行耗时:0.90，速度提升比例:-15.76%


# 实验四 numpy

In [13]:
run_times = 100

def numpy_fn2(X: np.ndarray, Y: np.ndarray) -> np.ndarray:
    return np.sum(X[:, :, None] * Y[:, None, :], axis=(-2, -1))

def numpy_fn(X: np.ndarray, Y: np.ndarray) -> np.ndarray:
    # Step 1: Normalize the input arrays to have zero mean and unit variance
    X_mean, X_std = X.mean(axis=0), X.std(axis=0)
    Y_mean, Y_std = Y.mean(axis=0), Y.std(axis=0)
    
    # Avoid division by zero in case of zero standard deviation
    X_std[X_std == 0] = 1
    Y_std[Y_std == 0] = 1
    
    X_normalized = (X - X_mean) / X_std
    Y_normalized = (Y - Y_mean) / Y_std
    
    # Step 2: Perform the tensor product followed by sum over last two dimensions
    intermediate_result = np.sum(X_normalized[:, :, None] * Y_normalized[:, None, :], axis=(-2, -1))
    
    # Step 3: Apply thresholding to clip values outside of [-1, 1]
    intermediate_result = np.clip(intermediate_result, -1, 1)
    
    # Step 4: Apply exponential function for non-linearity
    result = np.exp(intermediate_result)
    
    # Step 5: Add a small regularization term to avoid overfitting
    regularization_term = 0.001 * np.sum(X_normalized ** 2 + Y_normalized ** 2, axis=1)
    result += regularization_term
    
    return result

x = np.random.randn(1024, 640)
y = np.random.randn(1024, 640)

for mode in mode_list:
    torch.cuda.synchronize()
    time_0 = time.time()
    numpy_fn_compiled = torch.compile(numpy_fn, mode=mode)
    torch.cuda.synchronize()
    time_1 = time.time()

    # warmup 非常关键！
    numpy_fn(x, y)
    numpy_fn_compiled(x, y)

    #
    torch.cuda.synchronize()
    time_2 = time.time()
    for i in range(run_times):
        numpy_fn(x, y)

    torch.cuda.synchronize()
    time_3 = time.time()
    for i in range(run_times):
        numpy_fn_compiled(x, y)

    torch.cuda.synchronize()
    time_4 = time.time()

    compile_time = time_1 - time_0
    pre_time = time_3 - time_2
    post_time = time_4 - time_3
    speedup_ratio = (pre_time - post_time)/pre_time

    print(f"mode: {mode}, 编译耗时:{compile_time:.2f}，编译前运行耗时:{pre_time:.2f}, 编译后运行耗时:{post_time:.2f}，速度提升比例:{speedup_ratio:.2%}")

W0806 10:14:59.652000 10632 torch\_dynamo\convert_frame.py:1009] WON'T CONVERT numpy_fn C:\Users\Lenovo\AppData\Local\Temp\ipykernel_8016\3604335613.py line 6 
W0806 10:14:59.652000 10632 torch\_dynamo\convert_frame.py:1009] due to: 
W0806 10:14:59.652000 10632 torch\_dynamo\convert_frame.py:1009] Traceback (most recent call last):
W0806 10:14:59.652000 10632 torch\_dynamo\convert_frame.py:1009]   File "G:\Anaconda3\envs\pytorch2.4\Lib\site-packages\torch\_dynamo\convert_frame.py", line 948, in __call__
W0806 10:14:59.652000 10632 torch\_dynamo\convert_frame.py:1009]     result = self._inner_convert(
W0806 10:14:59.652000 10632 torch\_dynamo\convert_frame.py:1009]              ^^^^^^^^^^^^^^^^^^^^
W0806 10:14:59.652000 10632 torch\_dynamo\convert_frame.py:1009]   File "G:\Anaconda3\envs\pytorch2.4\Lib\site-packages\torch\_dynamo\convert_frame.py", line 472, in __call__
W0806 10:14:59.652000 10632 torch\_dynamo\convert_frame.py:1009]     return _compile(
W0806 10:14:59.652000 10632 torc

mode: default, 编译耗时:0.00，编译前运行耗时:139.91, 编译后运行耗时:140.94，速度提升比例:-0.74%
mode: reduce-overhead, 编译耗时:0.00，编译前运行耗时:134.90, 编译后运行耗时:137.07，速度提升比例:-1.61%
mode: max-autotune, 编译耗时:0.00，编译前运行耗时:136.81, 编译后运行耗时:137.11，速度提升比例:-0.22%
