/
test_vadd.c
206 lines (183 loc) · 5.73 KB
/
test_vadd.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#ifdef __cilk
#include <cilk/cilk.h>
#endif
#ifdef _OPENMP
#include <omp.h>
#endif
#if USE_GFX
#include <gfx/gfx_rt.h>
#endif
#define RESTRICT
double vdiff(int n, const float * RESTRICT a, const float * RESTRICT b)
{
double d = 0.0;
for(int i = 0; i < n; i++) {
d += (a[i] - b[i]);
}
return d;
}
void vadd0(int n, float * RESTRICT a, float * RESTRICT b, float * RESTRICT c)
{
for(int i = 0; i < n; i++)
c[i] = a[i] + b[i];
}
void vadd1(int n, float * RESTRICT a, float * RESTRICT b, float * RESTRICT c)
{
#if defined(_OPENMP) && (_OPENMP >= 201307)
#pragma omp parallel for simd
#elif defined(_OPENMP)
#warning No OpenMP simd support!
#pragma omp parallel for
#else
#warning No OpenMP support!
#endif
for(int i = 0; i < n; i++)
c[i] = a[i] + b[i];
}
void vadd2(int n, float * RESTRICT a, float * RESTRICT b, float * RESTRICT c)
{
#if defined(_OPENMP) && (_OPENMP >= 201307)
#pragma omp target map(to:n,a[0:n],b[0:n]) map(from:c[0:n])
# if defined(__INTEL_COMPILER) && defined(__INTEL_OFFLOAD)
#pragma omp parallel for simd
# else
#pragma omp teams distribute parallel for simd
# endif
#else
#warning No OpenMP target/simd support!
#pragma omp parallel for
#endif
for(int i = 0; i < n; i++)
c[i] = a[i] + b[i];
}
void vadd3(int n, float * RESTRICT a, float * RESTRICT b, float * RESTRICT c)
{
#ifdef __cilk
_Cilk_for(int i = 0; i < n; i++)
#else
#warning No Cilk support. Using sequential for loop.
for(int i = 0; i < n; i++)
#endif
c[i] = a[i] + b[i];
}
void vadd4(int n, float * RESTRICT a, float * RESTRICT b, float * RESTRICT c)
{
#ifdef __cilk
#if defined(__INTEL_COMPILER) && defined(__INTEL_OFFLOAD)
#pragma offload target(gfx) in(a,b : length(n)) out(c : length(n)) //pin(a, b, c : length(n))
#else
#warning No Cilk offload support!
#endif
_Cilk_for(int i = 0; i < n; i++)
#else
#warning No Cilk support. Using sequential for loop.
for(int i = 0; i < n; i++)
#endif
c[i] = a[i] + b[i];
}
#if USE_GFX
/* The following works for Linux but not Windows. */
__attribute__((target(gfx_kernel)))
void gfx_vadd5(int n, float * RESTRICT a, float * RESTRICT b, float * RESTRICT c)
{
_Cilk_for(int i = 0; i < n; i++)
c[i] = a[i] + b[i];
}
void vadd5(int n, float * RESTRICT a, float * RESTRICT b, float * RESTRICT c)
{
int rc = 0;
rc = _GFX_share(a, sizeof(float)*n);
if (rc) printf("_GFX_share returned %#06x\n", -_GFX_get_last_error());
rc = _GFX_share(b, sizeof(float)*n);
if (rc) printf("_GFX_share returned %#06x\n", -_GFX_get_last_error());
rc = _GFX_share(c, sizeof(float)*n);
if (rc) printf("_GFX_share returned %#06x\n", -_GFX_get_last_error());
GfxTaskId id = _GFX_offload("gfx_vadd5", a, b, c, n);
rc = _GFX_wait(id,1e9);
if (rc) printf("_GFX_wait returned %#06x\n", -_GFX_get_last_error());
rc = _GFX_unshare(a);
if (rc) printf("_GFX_unshare returned %#06x\n", -_GFX_get_last_error());
rc = _GFX_unshare(b);
if (rc) printf("_GFX_unshare returned %#06x\n", -_GFX_get_last_error());
rc = _GFX_unshare(c);
if (rc) printf("_GFX_unshare returned %#06x\n", -_GFX_get_last_error());
}
#endif /* USE_GFX */
int main(int argc, char * argv[])
{
int n = (argc > 1 ) ? atoi(argv[1]) : 1000;
float * x = calloc(n,sizeof(float)); assert(x !=NULL);
float * y = calloc(n,sizeof(float)); assert(y !=NULL);
float * z0 = calloc(n,sizeof(float)); assert(z0!=NULL);
float * z1 = calloc(n,sizeof(float)); assert(z1!=NULL);
float * z2 = calloc(n,sizeof(float)); assert(z2!=NULL);
float * z3 = calloc(n,sizeof(float)); assert(z3!=NULL);
float * z4 = calloc(n,sizeof(float)); assert(z4!=NULL);
#if USE_GFX
float * z5 = calloc(n,sizeof(float)); assert(z5!=NULL);
#endif
#if 0 && defined(_OPENMP) && (_OPENMP >= 201307)
int nthrd = omp_get_max_threads();
int ndevs = omp_get_num_devices();
printf("OpenMP threads = %d devices = %d\n", nthrd, ndevs);
#endif
for (int i=0; i<n; i++) {
x[i] = (float)i;
}
for (int i=0; i<n; i++) {
y[i] = (float)i;
}
for (int iter=0; iter<10; iter++) {
double t0 = omp_get_wtime();
vadd0(n,x,y,z0);
double t1 = omp_get_wtime();
vadd1(n,x,y,z1);
double t2 = omp_get_wtime();
vadd2(n,x,y,z2);
double t3 = omp_get_wtime();
vadd3(n,x,y,z3);
double t4 = omp_get_wtime();
vadd4(n,x,y,z4);
double t5 = omp_get_wtime();
#if USE_GFX
vadd5(n,x,y,z5);
double t6 = omp_get_wtime();
#endif
printf("%20s time = %lf \n", "for", t1-t0);
printf("%20s time = %lf (error=%lf) \n", "OpenMP for", t2-t1, vdiff(n,z0,z1));
printf("%20s time = %lf (error=%lf) \n", "OpenMP offload for", t3-t2, vdiff(n,z0,z2));
#ifdef __cilk
printf("%20s time = %lf (error=%lf) \n", "_Cilk_for", t4-t3, vdiff(n,z0,z3));
printf("%20s time = %lf (error=%lf) \n", "offload _Cilk_for", t5-t4, vdiff(n,z0,z4));
#endif
#if USE_GFX
printf("%20s time = %lf (error=%lf) \n", "GFX RT offload _Cilk_for", t6-t5, vdiff(n,z0,z5));
#if 0
for (int i=0; i<n; i++) {
printf("%d z0=%f z5=%f\n", i, z0[i], z5[i]);
}
#endif
#endif
/* prevent compiler from optimizing away anything */
double junk = t0+t1+t2+t3+t4+t5;
for (int i=0; i<n; i++) {
junk += z0[i] + z1[i] + z2[i] + z3[i] + z4[i]; // + z5[i];
}
printf("junk=%lf\n", junk);
}
#if USE_GFX
free(z5);
#endif
free(z4);
free(z3);
free(z2);
free(z1);
free(z0);
free(y);
free(x);
printf("Success\n");
return 0;
}