-
Notifications
You must be signed in to change notification settings - Fork 88
/
patch_hostside.h
234 lines (192 loc) · 10.9 KB
/
patch_hostside.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
// Copyright Hugh Perkins 2016, 2017
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*
What this does / how this works
===============================
The input to this module is hostside llvm ir code, probably from a *-hostraw.ll file. This file contains
calls to NVIDIA® CUDA™ kernel launch commands, such as cudaSetupArgument. What we are going to do is
to re-route these into the CUDA-on-CL runtime, performing any appropriate transformations first.
We can receive, amongst other things, arguments such as:
- int32
- int8
- float
- float *
- int32 *
- struct SomeStruct
- struct SomeStruct *
- a struct, containing pointers...
How these are going to be handled. So, in all cases, what we are going to do is to patch into the LLVM IR code
calls to methods in hostside_opencl_funcs.cpp. Then, depending on the type, we're going to handle the value in the IR
code slightly differently:
- int32, int8, float, any primitives basically:
- a call to one of the methods such as setKernelArgInt64 or setKernelArgFloat will be added to the bytecode
- the actual value of the primitive will be directly passed, as a parameter in this method
- we do need a slight hack to get this value, since the original cudaSetupArgument method will pass a pointer, whereas
we are going to pass the underlying value
- we get the underlying value by hunting for the upstream alloca
Note: I've moved this to a class, to force me to declare all the methods here :-) It's a stateless class though, everything
is static :-)
Ok, so the doc is mostly below, inside the class declaration, at the bottom of this file. So go there now :-)
*/
#pragma once
#include "struct_clone.h"
#include <string>
#include <vector>
#include <memory>
#include "llvm/IR/Module.h"
// #include "llvm/IR/Verifier.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Constants.h"
// #include "llvm/IR/ValueSymbolTable.h"
#include "llvm/IR/Instructions.h"
namespace cocl {
class ParamInfo {
public:
// ParamInfo() {}
// ParamInfo(const ParamInfo&) = delete;
// ParamInfo(llvm::Type *type, llvm::Value *value, llvm::Value pointer, bool isByVal) :
// type(type), value(value), pointer(pointer), isByVal(isByVal) {
// }
llvm::Type *typeHostsideFn = 0; // type of the arg in the hostside bytecode function declaration
llvm::Type *typeDevicesideFn = 0; // how this param is defined in device bytecode function declaration
bool hostsideByVal = false; // in hostside arg, is it byvalue?
bool devicesideByVal = false; // in hostside arg, is it byvalue?
llvm::Value *value = 0; // from the first arg to the bitcast feeding into cudaSetupArgument
llvm::Value *pointer = 0; // from cudaSetupArgument
llvm::Argument *hostsideArg = 0;
llvm::Argument *devicesideArg = 0;
int size = 0; // from CudaLaunch function call declaration
int paramIndex = 0; // in the original function, nto in our hacked around kernel function
};
// noncopyable(const noncopyable&) =delete;
// noncopyable& operator=(const noncopyable&) =delete;
class LaunchCallInfo {
public:
LaunchCallInfo() {
grid_xy_value = 0;
grid_z_value = 0;
block_xy_value = 0;
block_z_value = 0;
}
LaunchCallInfo(const LaunchCallInfo&) = delete;
LaunchCallInfo &operator=(const LaunchCallInfo&) = delete;
std::string kernelName = "";
std::vector<std::unique_ptr<ParamInfo> > params;
llvm::Value *stream;
llvm::Value *grid_xy_value;
llvm::Value *grid_z_value;
llvm::Value *block_xy_value;
llvm::Value *block_z_value;
};
class GenericCallInst {
// its children can hold a CallInst or an InvokeInst
// These instructions do almost the same thing, but their methods are not on a common
// base/interface class. So, we wrap them here, provide such a common base/interface class
public:
// GenericCallInst() {}
virtual ~GenericCallInst() {}
// The following methods create specialized GenericCallInst methods, encapsulating an InvokeInst
// or a CallInst
static std::unique_ptr<GenericCallInst> create(llvm::InvokeInst *inst);
static std::unique_ptr<GenericCallInst> create(llvm::CallInst *inst);
virtual llvm::Value *getArgOperand(int idx) = 0;
virtual llvm::Value *getOperand(int idx) = 0;
virtual llvm::Module *getModule() = 0;
virtual llvm::Instruction *getInst() = 0;
virtual void dump() = 0;
};
std::ostream &operator<<(std::ostream &os, const LaunchCallInfo &info);
std::ostream &operator<<(std::ostream &os, const PointerInfo &pointerInfo);
class GenericCallInst_Call : public GenericCallInst {
public:
GenericCallInst_Call(llvm::CallInst *inst) : inst(inst) {}
llvm::CallInst *inst;
virtual llvm::Value *getArgOperand(int idx) override { return inst->getArgOperand(idx); }
virtual llvm::Value *getOperand(int idx) override { return inst->getArgOperand(idx); }
virtual llvm::Module *getModule() override { return inst->getModule(); }
virtual llvm::Instruction *getInst() override { return inst; }
virtual void dump() override { inst->dump(); }
};
class GenericCallInst_Invoke : public GenericCallInst {
public:
GenericCallInst_Invoke(llvm::InvokeInst *inst) : inst(inst) {}
llvm::InvokeInst *inst;
virtual llvm::Value *getArgOperand(int idx) override { return inst->getArgOperand(idx); }
virtual llvm::Value *getOperand(int idx) override { return inst->getArgOperand(idx); }
virtual llvm::Module *getModule() override { return inst->getModule(); }
virtual llvm::Instruction *getInst() override { return inst; }
virtual void dump() override { inst->dump(); }
};
class PatchHostside {
public:
// returns path without the directory name (I think). this should go into some utiity class really...
static std::string getBasename(std::string path);
// handle primitive ints and floats (not arrays):
static llvm::Instruction *addSetKernelArgInst_int(llvm::Instruction *lastInst, llvm::Value *value, llvm::IntegerType *intType);
static llvm::Instruction *addSetKernelArgInst_float(llvm::Instruction *lastInst, llvm::Value *value);
// handle generic pointers. This will arrive in hostside_opencl_funcs at runtime, as a pointer to char,
// containing a virtual memory pointer (possibly with an offset added)
// the corresponding hostside_opencl_funcs method will:
// - look up the virtual mem, to get cl_mem object, and offset
// - check if it's already received a cl_mem, and if not add it to the list of cl_mems to send to the kernel
// - record the index number of this cl_mem, into the list of distinct/unique cl_mems
//
// The corresponding kernel parameters for this will be:
// - there will be a cl_mem pointer added, if this cl_mem hasnt already been received at least once
// - an offset integer parameter will be added
//
// the corresponding hostside_opencl_funcs method is:
// setKernelArgGpuBuffer(char *memory_as_charstar, int32_t elementSize)
//
// The hostside_opencl_funcs method needs no information other than the (virtual) pointer really
// currently we are providing the elementsize too, to mitigate against a crash on beignet, but I might
// remove that, whilst I get Mac radeon working :-P (since structs might not have useful element sizes)
static llvm::Instruction *addSetKernelArgInst_pointer(llvm::Instruction *lastInst, llvm::Value *value);
// this will add bytecode to pass a pointer to the cpu-side struct object to hostside_opencl_funcs, at runtime
// the entry point into hostside_opencl_funcs will be:
// setKernelArgHostsideBuffer(char *pCpuStruct, int structAllocateSize)
//
// The hostside_opencl_funcs method setKernelArgHostsideBuffer expects to receive a host-side allocated struct/buffer
// setKernelArgHostsideBuffer will:
// - allocate a gpu buffer
// - queue an opencl command, to copy the host-side struct into the gpu buffer
// - add the clmem, and a zero-offset, to the list of kernel parameters
//
// Note that the setKernelArgHostsideBuffer method needs no information on the structure of the struct
//
// this patch_hostside addSetKernelArgInst_byvaluestruct function is going to handle walking the struct, and sending
// the other pointers through using additional method calls, likely to setKernelArgGpuBuffer
static llvm::Instruction *addSetKernelArgInst_byvaluestruct(
llvm::Instruction *lastInst, cocl::ParamInfo *paramInfo, llvm::Value *valueAsPointerInstr);
// this needs to do the same as addSetKernelArgInst_byvaluestruct , but it passes the struct pointer into the
// setKernelArgGpuBuffer function, rather than the setKernelArgHostsideBuffer
// but it still needs to walk the struct, at patching time, compile time, and add calls to pass any pointers
// in the struct through too
// hmmmm. actually. I think we'll forbid pointers in gpuside structs for now. unless we have to
// why? because, how are we going to get those pointers, if they're stored on the gpu :-P
// like, how are we going to clone it, first issue. Possible to to do, but a bunch of work, unless we have to
static llvm::Instruction *addSetKernelArgInst_pointerstruct(llvm::Instruction *lastInst, llvm::Value *structPointer);
static llvm::Instruction *addSetKernelArgInst_byvaluevector(llvm::Instruction *lastInst, llvm::Value *structPointer);
// all setKernelArgs pass through addSetKernelArgInst, which dispatches to other functions
static llvm::Instruction *addSetKernelArgInst(
llvm::Instruction *lastInst, ParamInfo *paramInfo);
static void getLaunchTypes(llvm::Module *M, const llvm::Module *MDevice, GenericCallInst *inst, LaunchCallInfo *info);
// given a bytecode that calls the cudaSetupArgument method, obtains information
// about this, such as the type of the argument being set, and an instruction that represents
// its value, stores that, in info, along with the arguments there already
static void getLaunchArgValue(GenericCallInst *inst, LaunchCallInfo *info, ParamInfo *paramInfo);
static void patchCudaLaunch(
llvm::Module *M, const llvm::Module *MDevice,
llvm::Function *F, GenericCallInst *inst, std::vector<llvm::Instruction *> &to_replace_with_zero);
static void patchFunction(llvm::Module *M, const llvm::Module *MDevice, llvm::Function *F); // patch all kernel launch commands in function F
static void patchModule(llvm::Module *M, const llvm::Module *MDevice); // main entry point. Scan through module M, and rewrite kernel launch commands
};
} // namespace cocl