Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
13faa33
add cpuruntime dialect
Menooker May 14, 2024
161848e
format
Menooker May 14, 2024
447ef12
add dependency
Menooker May 14, 2024
a73dcc1
fix new MLIR
Menooker May 14, 2024
1cfede8
add
Menooker May 15, 2024
57ba92e
Merge remote-tracking branch 'origin/main' into yijie/cpuruntime
Menooker May 15, 2024
4d25de6
Merge branch 'yijie/cpuruntime' into yijie/pipeline
Menooker May 15, 2024
475faf8
update
Menooker May 15, 2024
0ac087d
fix
Menooker May 15, 2024
74b0d34
remove at exit
Menooker May 16, 2024
2cebba9
fix lint
Menooker May 16, 2024
d1b35a1
Merge branch 'yijie/cpuruntime' into yijie/pipeline
Menooker May 16, 2024
34d10ea
Add kmp_* wrapper for gomp environment
Menooker May 16, 2024
55c1043
Merge remote-tracking branch 'origin' into yijie/pipeline
Menooker May 16, 2024
e1490bb
Merge branch 'yijie/fake_omp' into yijie/pipeline
Menooker May 16, 2024
80a597f
fix
Menooker May 16, 2024
0b4332b
fix
Menooker May 16, 2024
c43f481
Merge branch 'main' into yijie/fake_omp
Menooker May 23, 2024
382171b
fix lint
Menooker May 23, 2024
ef75da8
Merge branch 'yijie/fake_omp' of https://github.com/intel/graph-compi…
Menooker May 23, 2024
f1fd0ae
fix
Menooker May 23, 2024
a773ea6
f
Menooker May 23, 2024
84933c2
fix
Menooker May 23, 2024
4cca4df
add reference
Menooker May 23, 2024
70c5e97
Merge branch 'main' of https://github.com/intel/graph-compiler into y…
Menooker May 28, 2024
1e06c98
fix license.py
Menooker May 28, 2024
3f656b7
Merge branch 'yijie/fake_omp' into yijie/pipeline
Menooker May 28, 2024
381677a
fix comments
Menooker May 28, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions include/gc/Transforms/Passes.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,34 @@
#include "mlir/Pass/Pass.h"

namespace mlir {

namespace LLVM {
class LLVMDialect;
}

namespace scf {
class SCFDialect;
}

namespace openmp {
class OpenMPDialect;
}

namespace linalg {
class LinalgDialect;
}

namespace MemRef {
class MemRefDialect;
}

class PassManager;

namespace gc {

void populateFrontendPasses(mlir::PassManager &);
void populateCPUPipeline(mlir::PassManager &);

#define GEN_PASS_DECL
#include "gc/Transforms/Passes.h.inc"

Expand Down
13 changes: 13 additions & 0 deletions include/gc/Transforms/Passes.td
Original file line number Diff line number Diff line change
Expand Up @@ -31,4 +31,17 @@ def ConvertOneDNNGraphToLinalg : Pass<"convert-onednn-graph-to-linalg"> {
];
}

def GCCPUPipeline: Pass<"gc-cpu-pipeline"> {
let summary = "All-in-one pipeline for GC for CPU";
let dependentDialects = ["onednn_graph::OneDNNGraphDialect",
"tensor::TensorDialect",
"memref::MemRefDialect",
"linalg::LinalgDialect",
"LLVM::LLVMDialect",
"scf::SCFDialect",
"bufferization::BufferizationDialect",
"omp::OpenMPDialect",
"vector::VectorDialect"];
}

#endif // GC_DIALECT_GC_PASSES
3 changes: 2 additions & 1 deletion lib/gc/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,5 @@ include(functions)

add_subdirectory(CAPI)
add_subdirectory(Dialect)
add_subdirectory(Transforms)
add_subdirectory(Transforms)
add_subdirectory(ExecutionEngine)
1 change: 1 addition & 0 deletions lib/gc/ExecutionEngine/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
add_subdirectory(CPURuntime)
15 changes: 15 additions & 0 deletions lib/gc/ExecutionEngine/CPURuntime/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
find_package(OpenMP REQUIRED)

if ("iomp" IN_LIST OpenMP_C_LIB_NAMES OR "omp" IN_LIST OpenMP_C_LIB_NAMES OR "omp5" IN_LIST OpenMP_C_LIB_NAMES)
else()
add_definitions("-DGC_NEEDS_OMP_WRAPPER=1")
endif()

set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fopenmp")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
add_mlir_library(GCCpuRuntime
SHARED
Parallel.cpp

EXCLUDE_FROM_LIBMLIR
)
188 changes: 188 additions & 0 deletions lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
//===-- Parallel.cpp - parallel ---------------------------------*- C++ -*-===//
//
// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include <assert.h>
#include <atomic>
#include <chrono>
#include <immintrin.h>
#include <omp.h>
#include <stdarg.h>

#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)

#define WEAK_SYMBOL __attribute__((weak))

namespace {
struct barrier_t {
alignas(64) std::atomic<int32_t> pending_;
std::atomic<int32_t> rounds_;
uint64_t total_;
// pad barrier to size of cacheline to avoid false sharing
char padding_[64 - 4 * sizeof(int32_t)];
};

using barrier_idle_func = uint64_t (*)(std::atomic<int32_t> *remaining,
int32_t expected_remain, int32_t tid,
void *args);
} // namespace

extern "C" {
int gc_runtime_keep_alive = 0;
void gc_arrive_at_barrier(barrier_t *b, barrier_idle_func idle_func,
void *idle_args) {
auto cur_round = b->rounds_.load(std::memory_order_acquire);
auto cnt = --b->pending_;
assert(cnt >= 0);
if (cnt == 0) {
b->pending_.store(b->total_);
b->rounds_.store(cur_round + 1);
} else {
if (idle_func) {
if (cur_round != b->rounds_.load()) {
return;
}
idle_func(&b->rounds_, cur_round + 1, -1, idle_args);
}
while (cur_round == b->rounds_.load()) {
_mm_pause();
}
}
}

static_assert(sizeof(barrier_t) == 64, "size of barrier_t should be 64-byte");

void gc_init_barrier(barrier_t *b, int num_barriers, uint64_t thread_count) {
for (int i = 0; i < num_barriers; i++) {
b[i].total_ = thread_count;
b[i].pending_.store(thread_count);
b[i].rounds_.store(0);
}
}

#if GC_NEEDS_OMP_WRAPPER
void WEAK_SYMBOL __kmpc_barrier(void *loc, int32_t global_tid) {
#pragma omp barrier
}

int WEAK_SYMBOL __kmpc_global_thread_num(void *loc) {
return omp_get_thread_num();
}

// The implementation was extracted and simplified from LLVM libomp
// at openmp/runtime/src/kmp_sched.cpp
void WEAK_SYMBOL __kmpc_for_static_init_8u(void *loc, int32_t gtid,
int32_t schedtype,
int32_t *plastiter, uint64_t *plower,
uint64_t *pupper, int64_t *pstride,
int64_t incr, int64_t chunk) {
if (unlikely(schedtype != 34)) {
std::abort();
}
const int32_t FALSE = 0;
const int32_t TRUE = 1;
using UT = uint64_t;
// using ST = int64_t;
/* this all has to be changed back to TID and such.. */
uint32_t tid = gtid;
uint32_t nth = omp_get_num_threads();
UT trip_count;

/* special handling for zero-trip loops */
if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
if (plastiter != nullptr)
*plastiter = FALSE;
/* leave pupper and plower set to entire iteration space */
*pstride = incr; /* value should never be used */
return;
}

if (nth == 1) {
if (plastiter != nullptr)
*plastiter = TRUE;
*pstride =
(incr > 0) ? (*pupper - *plower + 1) : (-(*plower - *pupper + 1));
return;
}

/* compute trip count */
if (incr == 1) {
trip_count = *pupper - *plower + 1;
} else if (incr == -1) {
trip_count = *plower - *pupper + 1;
} else if (incr > 0) {
// upper-lower can exceed the limit of signed type
trip_count = (UT)(*pupper - *plower) / incr + 1;
} else {
trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
}
if (trip_count < nth) {
if (tid < trip_count) {
*pupper = *plower = *plower + tid * incr;
} else {
// set bounds so non-active threads execute no iterations
*plower = *pupper + (incr > 0 ? 1 : -1);
}
if (plastiter != nullptr)
*plastiter = (tid == trip_count - 1);
} else {
UT small_chunk = trip_count / nth;
UT extras = trip_count % nth;
*plower += incr * (tid * small_chunk + (tid < extras ? tid : extras));
*pupper = *plower + small_chunk * incr - (tid < extras ? 0 : incr);
if (plastiter != nullptr)
*plastiter = (tid == nth - 1);
}
*pstride = trip_count;
}

void WEAK_SYMBOL __kmpc_for_static_fini(void *ptr, int32_t v) {}

static thread_local int next_num_threads = 0;

/*!
@ingroup PARALLEL
The type for a microtask which gets passed to @ref __kmpc_fork_call().
The arguments to the outlined function are
@param global_tid the global thread identity of the thread executing the
function.
@param bound_tid the local identity of the thread executing the function
@param ... pointers to shared variables accessed by the function.
*/
using kmpc_micro = void (*)(int32_t *global_tid, int32_t *bound_tid, ...);
void WEAK_SYMBOL __kmpc_fork_call(void *loc, int32_t argc, void *pfunc, ...) {
if (unlikely(argc != 1 && argc != 0)) {
std::abort();
}
va_list ap;
va_start(ap, pfunc);
void *c = va_arg(ap, void *);
int32_t global_tid = 0;
if (unlikely(next_num_threads)) {
#pragma omp parallel num_threads(next_num_threads)
{
kmpc_micro func = (kmpc_micro)(pfunc);
func(&global_tid, nullptr, c);
}
next_num_threads = 0;
} else {
#pragma omp parallel
{
kmpc_micro func = (kmpc_micro)(pfunc);
func(&global_tid, nullptr, c);
}
}
va_end(ap);
}

void WEAK_SYMBOL __kmpc_push_num_threads(void *loc, int32_t global_tid,
int32_t num_threads) {
next_num_threads = num_threads;
}
#endif
}
1 change: 1 addition & 0 deletions lib/gc/Transforms/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ gc_set_mlir_link_components(MLIR_LINK_COMPONENTS

add_mlir_library(GCPasses
OneDNNGraphToLinalg.cpp
Pipeline.cpp
TileNamed.cpp

ADDITIONAL_HEADER_DIRS
Expand Down
Loading