Add PYTORCH_NVFUSER_DUMP options to save PTX and CUBIN (pytorch#1916)
Also cleaned up the relevant code path a little
naoyam committed Aug 17, 2022
1 parent 5eefa9a commit 6e8f953
Showing 4 changed files with 56 additions and 86 deletions.
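
For context on how the new options are used: per the parsing added in utils.cpp and the helper added in executor_utils.cpp below, setting the environment variable PYTORCH_NVFUSER_DUMP to a comma-separated list containing "ptx" and/or "cubin" (e.g. PYTORCH_NVFUSER_DUMP=ptx,cubin) makes nvrtcCompile write each compiled kernel to __tmp_kernel<fusion_id>.ptx / __tmp_kernel<fusion_id>.cubin in the current working directory. The cubin option additionally asserts that the kernel was compiled to SASS (compile_to_sass), since no CUBIN exists for a PTX-only compile.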
122 changes: 41 additions & 81 deletions torch/csrc/jit/codegen/cuda/executor_utils.cpp
@@ -943,6 +943,34 @@ void initializeCudaContext() {
   }
 }
 
+namespace {
+
+// Dump PTX or CUBIN to a file
+void dumpCompiledCodeToFile(
+    const nvrtcProgram& program,
+    int fusion_id,
+    bool dump_cubin) {
+  const auto getSize = dump_cubin
+      ? at::globalContext().getNVRTC().nvrtcGetCUBINSize
+      : at::globalContext().getNVRTC().nvrtcGetPTXSize;
+  const auto getCode = dump_cubin ? at::globalContext().getNVRTC().nvrtcGetCUBIN
+                                  : at::globalContext().getNVRTC().nvrtcGetPTX;
+  size_t size = 0;
+  AT_CUDA_NVRTC_CHECK(getSize(program, &size));
+  std::vector<char> code(size);
+  AT_CUDA_NVRTC_CHECK(getCode(program, code.data()));
+  std::stringstream file_name;
+  file_name << "__tmp_kernel" << fusion_id << "."
+            << (dump_cubin ? "cubin" : "ptx");
+  std::cout << "PRINTING: " << file_name.str() << std::endl;
+  std::ofstream out(file_name.str());
+  TORCH_INTERNAL_ASSERT(out.is_open());
+  out.write(code.data(), size);
+  out.close();
+}
+
+} // namespace
+
 std::pair<NvrtcFunction, std::string> nvrtcCompile(
     const std::string& code,
     const std::string& func_name,
@@ -1183,92 +1211,24 @@ std::pair<NvrtcFunction, std::string> nvrtcCompile(
     AT_CUDA_NVRTC_CHECK(getFunc(program, ptx.data()));
   }
 
-  NvrtcFunction compiled_kernel_;
-
-  // TODO: We do go through different code path, should investigate whether this
-  // has an impact on generated binary.
-#ifndef __HIP_PLATFORM_HCC__
-  const char* prefix_env = getenv("PYTORCH_NVFUSER_CUBIN");
-  if (prefix_env) {
-    FUSER_PERF_SCOPE("executor_utils::Nvrtc::LoadCUBIN");
-
-    // Output ptx file
-    std::stringstream output_file_name;
-    output_file_name << prefix_env << "_" << id
-                     << (compile_to_sass ? ".cubin" : ".ptx");
-    std::ofstream outputFile(output_file_name.str().c_str(), std::ios::out);
-    if (outputFile.is_open()) {
-      outputFile.write(ptx.data(), ptx.size());
-      outputFile.close();
-    }
-
-    if (compile_to_sass) {
-      FUSER_PERF_SCOPE("executor_utils::Nvrtc::LoadPTX");
-
-      // load sass directly
-      AT_CUDA_DRIVER_CHECK(at::globalContext().getNVRTC().cuModuleLoadDataEx(
-          &(compiled_kernel_.module),
-          ptx.data(),
-          options.size(),
-          options.data(),
-          option_vals.data()));
-    } else {
-      // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-      CUlinkState linkState;
-
-      AT_CUDA_DRIVER_CHECK(at::globalContext().getNVRTC().cuLinkCreate(
-          // 0, nullptr, nullptr, &linkState));
-          options.size(),
-          options.data(),
-          option_vals.data(),
-          &linkState));
-
-      AT_CUDA_DRIVER_CHECK(at::globalContext().getNVRTC().cuLinkAddData(
-          linkState,
-          CU_JIT_INPUT_PTX,
-          ptx.data(),
-          ptx_size,
-          "compiling PTX",
-          0,
-          nullptr,
-          nullptr));
-
-      if (isDebugDumpEnabled(DebugDumpOption::PrintPtxasLog)) {
-        std::cout << info_log.data() << std::endl;
-      }
-
-      // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-      size_t cubinSize;
-      // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-      void* cubin;
-      AT_CUDA_DRIVER_CHECK(at::globalContext().getNVRTC().cuLinkComplete(
-          linkState, &cubin, &cubinSize));
+  if (isDebugDumpEnabled(DebugDumpOption::Ptx)) {
+    dumpCompiledCodeToFile(program, id, false);
+  }
 
-      // Output binary file
-      std::stringstream cubin_file_name;
-      cubin_file_name << prefix_env << "_" << id << ".cubin";
+  if (isDebugDumpEnabled(DebugDumpOption::Cubin)) {
+    TORCH_INTERNAL_ASSERT(
+        compile_to_sass,
+        "CUBIN not available as the kernel was compiled only to PTX");
+    dumpCompiledCodeToFile(program, id, true);
+  }
 
-      std::ofstream myCubinFile(
-          cubin_file_name.str().c_str(), std::ios::out | std::ios::binary);
+  NvrtcFunction compiled_kernel_;
 
-      if (myCubinFile.is_open()) {
-        myCubinFile.write(static_cast<const char*>(cubin), cubinSize);
-        myCubinFile.close();
-      }
-      // load compiled cubin
-      // AT_CUDA_DRIVER_CHECK(at::globalContext().getNVRTC().cuModuleLoadData(
-      //     &(compiled_kernel_.module), cubin));
-      AT_CUDA_DRIVER_CHECK(at::globalContext().getNVRTC().cuModuleLoadDataEx(
-          &(compiled_kernel_.module),
-          cubin,
-          options.size(),
-          options.data(),
-          option_vals.data()));
-    }
-  } else {
+#ifndef __HIP_PLATFORM_HCC__
+  {
     FUSER_PERF_SCOPE("executor_utils::Nvrtc::LoadPTX");
 
-    // load ptx directly
+    // load ptx or cubin directly
     AT_CUDA_DRIVER_CHECK(at::globalContext().getNVRTC().cuModuleLoadDataEx(
         &(compiled_kernel_.module),
         ptx.data(),
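As a side note, the new helper goes through PyTorch's at::globalContext().getNVRTC() wrapper; the same dump can be written against the raw NVRTC API. Below is a minimal sketch (function and file names are illustrative, not part of this patch); nvrtcGetCUBIN/nvrtcGetCUBINSize additionally require CUDA 11.1+ and compilation for a concrete sm_XX architecture, which matches the compile_to_sass assertion above.

    // Sketch only: dump the PTX of an nvrtcProgram using raw NVRTC calls.
    // Assumes `prog` was already compiled via nvrtcCompileProgram.
    #include <nvrtc.h>
    #include <fstream>
    #include <stdexcept>
    #include <string>
    #include <vector>

    void dumpPtxToFile(nvrtcProgram prog, const std::string& file_name) {
      size_t size = 0;
      if (nvrtcGetPTXSize(prog, &size) != NVRTC_SUCCESS) {
        throw std::runtime_error("nvrtcGetPTXSize failed");
      }
      std::vector<char> ptx(size); // size includes the trailing '\0'
      if (nvrtcGetPTX(prog, ptx.data()) != NVRTC_SUCCESS) {
        throw std::runtime_error("nvrtcGetPTX failed");
      }
      std::ofstream out(file_name);
      if (!out.is_open()) {
        throw std::runtime_error("could not open " + file_name);
      }
      out.write(ptx.data(), size);
    }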
8 changes: 5 additions & 3 deletions torch/csrc/jit/codegen/cuda/runtime/random_numbers.cu
@@ -56,10 +56,12 @@ __device__ double uniform(unsigned int x, unsigned int y) {
   return z * kRan2Pow53Inv + (kRan2Pow53Inv / 2.0);
 }
 
-__device__ double randLike(const uint4 &rng_result, int rng_component) {
-  return uniform((&rng_result.x)[rng_component * 2], (&rng_result.x)[rng_component * 2 + 1]);
+__device__ double randLike(const uint4& rng_result, int rng_component) {
+  return uniform(
+      (&rng_result.x)[rng_component * 2],
+      (&rng_result.x)[rng_component * 2 + 1]);
 }
 
-__device__ float randLikef(const uint4 &rng_result, int rng_component) {
+__device__ float randLikef(const uint4& rng_result, int rng_component) {
   return uniformf((&rng_result.x)[rng_component]);
 }
10 changes: 8 additions & 2 deletions torch/csrc/jit/codegen/cuda/utils.cpp
@@ -38,7 +38,9 @@ auto parseDebugDumpOptions() {
       {DebugDumpOption::Halo, false},
       {DebugDumpOption::PerfDebugVerbose, false},
      {DebugDumpOption::TransformPropagator, false},
-      {DebugDumpOption::InlinePropagator, false}};
+      {DebugDumpOption::InlinePropagator, false},
+      {DebugDumpOption::Cubin, false},
+      {DebugDumpOption::Ptx, false}};
 
   if (const char* dump_options = std::getenv("PYTORCH_NVFUSER_DUMP")) {
     c10::string_view options_view(dump_options);
@@ -91,6 +93,10 @@ auto parseDebugDumpOptions() {
       options_map[DebugDumpOption::TransformPropagator] = true;
     } else if (token == "inline_propagator") {
       options_map[DebugDumpOption::InlinePropagator] = true;
+    } else if (token == "cubin") {
+      options_map[DebugDumpOption::Cubin] = true;
+    } else if (token == "ptx") {
+      options_map[DebugDumpOption::Ptx] = true;
     } else {
       TORCH_CHECK(
           false,
@@ -102,7 +108,7 @@
           "\tkernel_args, dump_eff_bandwidth, draw_segmented_fusion,\n",
           "\tscheduler_params, parallel_dimensions, buffer_reuse_verbose,\n",
           "\tptxas_verbose, halo, segmenter_logging, perf_debug_verbose\n",
-          "\ttransform_propagator, inline_propagator\n");
+          "\ttransform_propagator, inline_propagator, cubin, ptx\n");
     }
     options_view = (end_pos != c10::string_view::npos)
         ? options_view.substr(end_pos + 1)
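For readers unfamiliar with the surrounding loop, only partially visible in this hunk: PYTORCH_NVFUSER_DUMP is split on commas and each token toggles one DebugDumpOption. A standalone approximation, using std::string in place of c10::string_view and skipping the TORCH_CHECK on unknown tokens (helper name hypothetical):

    #include <cstdlib>
    #include <set>
    #include <string>

    // Sketch only: collect the comma-separated tokens of PYTORCH_NVFUSER_DUMP,
    // e.g. PYTORCH_NVFUSER_DUMP=ptx,cubin -> {"cubin", "ptx"}.
    std::set<std::string> parseDumpTokens() {
      std::set<std::string> tokens;
      if (const char* dump_options = std::getenv("PYTORCH_NVFUSER_DUMP")) {
        std::string options_view(dump_options);
        size_t pos = 0;
        while (pos != std::string::npos) {
          const size_t end_pos = options_view.find(',', pos);
          tokens.insert(options_view.substr(pos, end_pos - pos));
          pos = (end_pos == std::string::npos) ? end_pos : end_pos + 1;
        }
      }
      return tokens;
    }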
2 changes: 2 additions & 0 deletions torch/csrc/jit/codegen/cuda/utils.h
@@ -50,6 +50,8 @@ enum class DebugDumpOption {
                        //! path and replay result
   InlinePropagator, //! When running InlinePropagator, print propagation
                     //! path and inlining result
+  Cubin, //! Dump compiled CUBIN
+  Ptx //! Dump compiled PTX
 };
 
 TORCH_CUDA_CU_API bool isDebugDumpEnabled(DebugDumpOption option);
