In [1]:
from pathlib import Path
from typing import Tuple, Union
import os
import subprocess
import tempfile

import torch 
import torch.nn as nn
import numpy as np
from torch.export import export, ExportedProgram
from executorch.exir import EdgeProgramManager, to_edge
from executorch.exir.backend.backend_api import LoweredBackendModule, to_backend
from executorch.backends.xnnpack.utils.configs import get_xnnpack_edge_compile_config
from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
from executorch.exir.backend.test.backend_with_compiler_demo import (  # noqa
    BackendWithCompilerDemo,
)
from executorch.devtools import Inspector
from profiling.profiler import Profiler, ProfilingResults, extract_stats
from quickstart import LocalPyProfiler

<executorch.exir.program._program.EdgeProgramManager object at 0x7fd7828003d0>
<executorch.exir.program._program.ExecutorchProgramManager object at 0x7fd783366650>
ProfilingResults(raw=[0.007537, 0.000714, 0.000456, 0.000432], p10=0.0004392, p50=0.000585, p90=0.005490100000000001, min=0.000432, avg=0.00228475, max=0.007537)


In [2]:
def runBenchmark(module, example_args, output="test.pte"):
    """Export ``module``, lower it with XNNPACK, save the program and profile it.

    Args:
        module: Module handed to ``torch.export.export``.
        example_args: Tuple of example inputs. It is used to trace the export
            and is passed again to the profiler as the run inputs.
        output: Destination path of the serialized ExecuTorch program. Parent
            directories that do not exist yet are created.

    The runner binary path is read from the ``PTE_RUNNER_PATH`` environment
    variable and defaults to ``runner/linux/pte_runner``. The lowered program
    manager, the ExecuTorch program manager and the profiling result are
    printed; the function returns ``None``.
    """
    aten_dialect_program: ExportedProgram = export(module, example_args)

    edge_config = get_xnnpack_edge_compile_config()
    edge_program: EdgeProgramManager = to_edge(aten_dialect_program, compile_config=edge_config)

    # Lower the module. ``to_backend`` on an EdgeProgramManager yields another
    # EdgeProgramManager (see the printed repr), not a LoweredBackendModule.
    lowered_program: EdgeProgramManager = edge_program.to_backend(XnnpackPartitioner())
    print(lowered_program)
    et_program = lowered_program.to_executorch()
    print(et_program)

    # Serialize and save it to a file. Create the target directory first so
    # that paths such as "ptes/model.pte" work on a fresh checkout.
    save_path = output
    save_dir = os.path.dirname(save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    with open(save_path, "wb") as f:
        f.write(et_program.buffer)

    pte_runner_path = os.environ.get("PTE_RUNNER_PATH", 'runner/linux/pte_runner')
    profiler = LocalPyProfiler(pte_runner_path)
    profiling_result = profiler.profile(save_path, example_args, repeats=4)
    print(profiling_result)

In [3]:
import fused_fftconv

from kernels.test_models import FFTConvModule

# Dimensions: the input is (B, C, N) and the filter is (C, K). Adjust as needed.
B, C, N, K = (4, 8, 128, 32)

# Random float32 example tensors (input first, then filter).
example_input = torch.randn((B, C, N), dtype=torch.float32)
example_filter = torch.randn((C, K), dtype=torch.float32)

# Module under test.
module = FFTConvModule()

# Positional arguments handed to the export / benchmark step.
example_args = example_input, example_filter

In [26]:
# FFT convolution benchmark: runs on the current `module` / `example_args`
# and writes the serialized program to ptes/fftconv.pte.
runBenchmark(
    module,
    example_args,
    output="ptes/fftconv.pte",
)

<executorch.exir.program._program.EdgeProgramManager object at 0x7f6464be1b90>
<executorch.exir.program._program.ExecutorchProgramManager object at 0x7f645b78de90>
ProfilingResults(raw=[0.004835, 0.000428, 0.000271, 0.000255], p10=0.0002598, p50=0.0003495, p90=0.003512900000000001, min=0.000255, avg=0.0014472500000000002, max=0.004835)


In [4]:
import generic_winograd_conv
from kernels.test_models import WinogradConvModule

# Dimensions: the input is (B, C, N) and the filter is (C, K). Adjust as needed.
B, C, N, K = (4, 8, 128, 32)

# Random float32 example tensors (input first, then filter).
example_input = torch.randn((B, C, N), dtype=torch.float32)
example_filter = torch.randn((C, K), dtype=torch.float32)

# Module under test; rebinds `module` and `example_args` for the next cell.
module = WinogradConvModule()
example_args = example_input, example_filter

In [5]:
# Winograd convolution benchmark: runs on the current `module` /
# `example_args` and writes the serialized program to ptes/winograd.pte.
# NOTE(review): the recorded run of this cell fails inside torch.export with a
# TorchRuntimeError while tracing multiwinograd.generic_winograd_conv.out on
# FakeTensors; the message suggests wrapping the kernel as an opaque custom
# op. Confirm the op registration before relying on this cell.
runBenchmark(
    module,
    example_args,
    output="ptes/winograd.pte",
)

E0216 04:11:42.562000 144112 torch/export/_trace.py:1030] always_classified is unsupported.
E0216 04:11:42.563000 144112 torch/export/_trace.py:1030] always_classified is unsupported.


TorchRuntimeError: Failed running call_function multiwinograd.generic_winograd_conv.out(*(FakeTensor(..., size=(4, 8, 128)), FakeTensor(..., size=(8, 32))), **{'out': FakeTensor(..., size=(4, 8, 97))}):
The tensor has a non-zero number of elements, but its data is not allocated yet.
If you're using torch.compile/export/fx, it is likely that we are erroneously tracing into a custom kernel. To fix this, please wrap the custom kernel into an opaque custom op. Please see the following for details: https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html
If you're using Caffe2, Caffe2 uses a lazy allocation, so you will need to call mutable_data() or raw_mutable_data() to actually allocate memory.

from user code:
   File "/home/kaloca/Stanford/treehacks25/treehacks_challenge/kernels/test_models.py", line 25, in forward
    torch.ops.multiwinograd.generic_winograd_conv.out(x, filter, out=out)

Set TORCH_LOGS="+dynamo" and TORCHDYNAMO_VERBOSE=1 for more information
