In [6]:
from time import time

import numpy as np
import pandas as pd
import pyopencl as cl

import src.py.config as config
import src.py.dataset as dataset

In [2]:
%load_ext pyopencl.ipython_ext

In [418]:
ctx = cl.create_some_context(interactive=True)
queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)

Choose platform:
[0] <pyopencl.Platform 'Apple' at 0x7fff0000>
Choice [0]:0
Choose device(s):
[0] <pyopencl.Device 'Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz' on 'Apple' at 0xffffffff>
[1] <pyopencl.Device 'Iris Pro' on 'Apple' at 0x1024500>
[2] <pyopencl.Device 'AMD Radeon R9 M370X Compute Engine' on 'Apple' at 0x1021c00>
Choice, comma-separated [0]:3
Set the environment variable PYOPENCL_CTX='0:3' to avoid being asked again.


In [419]:
for device in ctx.get_info(cl.context_info.DEVICES):
  print("Device name:", device.name)
  print("Device type:", cl.device_type.to_string(device.type))
  print("Device memory: ", device.global_mem_size//1024//1024, 'MB')
  print("Device max clock speed:", device.max_clock_frequency, 'MHz')
  print("Device compute units:", device.max_compute_units)

Device name: AMD Radeon R9 M370X Compute Engine
Device type: GPU
Device memory:  2048 MB
Device max clock speed: 80 MHz
Device compute units: 10


In [420]:
%%cl_kernel -o "-cl-fast-relaxed-math"

__kernel
void mdot(__global const uchar* a, 
          __global const float* b, 
          __global float* c,
          __global long* offsets,
          const int n, 
          const int p, 
          const int m,
          __local float* temp)
{
    int glid = get_global_id(0); 
    int pos = glid;
    long offset = offsets[glid];
    if(glid == 0){
        for(int i = 0; i < m; i++){
            temp[i] = 0;
        }
    }
    for(int i = 0; i < n; i++)
    {
        temp[i%50 /*+ a[pos]*/] += b[i];
        pos += p;
    }
    if(glid == 0){
        for(int i = 0; i < m; i++){
            c[i] = temp[i];
        }
    }
}



In [421]:
%%cl_kernel -o "-cl-fast-relaxed-math"

__kernel
void mdot2(__global const uchar* a, 
          __global const float* b, 
          __global float* c,
          __global int* columns,
          __global uchar* values,
          const int n, 
          const int p)
{
    int glid = get_global_id(0); 
    int pos = columns[glid];
    uchar v = values[glid];
    float r = 0;
    for(int i = 0; i < n; i++)
    {
        r += (a[pos] == v) ? b[i] : 0;
        pos += p;
    }
    c[glid] = r;
}

In [422]:
cfg = config.Config('./config/model_nbsin_ddea.cfg')
label = cfg.label
path = cfg.get_result_path()
ds = dataset.Dataset(cfg.get_dataset_filename())
ds.load()
includes = set(ds.features) - set(cfg.excludes)
includes = [ds.get_feature_index(i) for i in includes]
includes.sort()
n = ds.size
p = ds.count_features()
m = ds.count_modalities()
n, p, m

(4459542, 156, 1813)

In [423]:
X = dataset.load_data('./dataset/mrh_ddea/features.dat', 'uint8', (n, p))
Xt = dataset.load_data('./dataset/test/mt.dat', 'uint8', (p, n))
exposure = dataset.load_data('./dataset/mrh_ddea/column_anpol.dat', 'float32', (n,))
Y = dataset.load_data('./dataset/mrh_ddea/column_nbsinDDE.dat', 'float32', (n,))

columns = np.zeros(m, 'int32')
values = np.zeros(m, 'uint8')

k = 0
for i in range(p):
    f = ds.features[i]
    for j in range(len(ds.get_modalities(f))):
        columns[k] = i
        values[k] = j
        k += 1

R = np.zeros(m).astype(np.float32)

X_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=X)
Y_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=Y)
R_buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, R.nbytes)
Offset_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=ds.get_offsets())
Columns_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=columns)
Values_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=values)
Temp_buf = cl.LocalMemory(np.zeros(m).nbytes)

mdot.set_args(X_buf, Y_buf, R_buf, Offset_buf, np.int32(n), np.int32(p), np.int32(m), Temp_buf)
mdot2.set_args(X_buf, Y_buf, R_buf, Columns_buf, Values_buf, np.int32(n), np.int32(p))

In [424]:
ev = cl.enqueue_nd_range_kernel(queue, mdot2, (p,), None)
ev.wait()
gpu_start_time = time()  # Get the GPU start time
ev = cl.enqueue_nd_range_kernel(queue, mdot2, (m,), None)
cl.enqueue_read_buffer(queue, R_buf, R)
queue.finish()
gpu_end_time = time()  # Get the GPU end time
elapsed = 1e-9 * (ev.profile.end - ev.profile.start)  # Calculate the time it took to execute the kernel
print("GPU Kernel Time: {0} s".format(elapsed))  # Print the time it took to execute the kernel
print("GPU Time: {0} s".format(gpu_end_time - gpu_start_time))  # Print the time the GPU program took, including 
                                                                #both memory copies
print((n * p)/(gpu_end_time - gpu_start_time) / (1024 * 1024 * 1024), "GFlops")

  """


GPU Kernel Time: 3.738766972 s
GPU Time: 3.7440741062164307 s
0.17304958048330388 GFlops


In [425]:
for i in range(p):
    print(i, ds.features[i], Y[X[:, i] == 1].sum() - R[ds.get_offsets()[i] + 1])

0 POL_mtcapass 0.0
1 EXT_superficie_m2_num 0.0
2 EXT_dens_pop 0.0
3 EXT_part_respr_1p 0.0
4 EXT_part_respr_5p 0.0
5 EXT_part_secocc 0.0
6 EXT_part_surf_inf40 0.0
7 EXT_part_surf_plus100 0.0
8 EXT_part_rev_15K29k 0.0
9 EXT_part_men_lochlm 0.0
10 EXT_part_men_pror 0.0
11 EXT_part_pop_18a24a 0.0
12 EXT_part_pop_65a79a 0.0
13 EXT_part_respr_chauf_aut 0.0
14 EXT_part_respr_chauf_col 0.0
15 EXT_part_respr_chauf_ind 0.0
16 EXT_part_respr_chauf_indelec 0.0
17 EXT_part_repsr_sdb1 0.0
18 EXT_part_repsr_sdb2 0.0
19 EXT_part_lgt_co 0.0
20 EXT_part_lgt_vac 0.0
21 EXT_part_pop_15dipsup 0.0
22 EXT_part_pop_15ssdip 0.0
23 EXT_gdeville 0.0
24 AUTV_prix_sra_max 0.0
25 AUTV_prix_sra_min 0.0
26 AUTV_rm 0.0
27 AUTV_segment_max 0.0
28 AUTV_segment_min 0.0
29 CLI_banque 0.0
30 POL_cdopmrh 0.0
31 POL_fract 0.0
32 POL_cdregion 0.0
33 HAB_cdresid 0.0
34 CLI_sex 0.0
35 POL_distrib 0.0
36 AUTV_energie_es 0.0
37 AUTV_energie_go 0.0
38 AUTV_k8000_n 0.0
39 AUTV_k8000_o 0.0
40 AUTV_prix_sra_ad 0.0
41 AUTV_prix_sra_ef

In [313]:
%%timeit
Y[X[:, 1] == 1].sum()

44.9 ms ± 2.93 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [30]:
cpu_start_time = time()  # Get the GPU start time
D = np.dot(A, B)
cpu_end_time = time()  # Get the GPU end time
print("CPU Time: {0} s".format(cpu_end_time - cpu_start_time))  
print((n * p * 2)/(cpu_end_time - cpu_start_time) / (1024 * 1024 * 1024), "GFlops")

CPU Time: 0.008141040802001953 s
6.8639079247935335 GFlops


In [31]:
C

array([24.567423, 22.538635, 23.54259 , ..., 25.078547, 25.717302,
       26.456945], dtype=float32)

In [123]:
cl.get_platforms()[0].get_devices(cl.device_type.ALL)[2].max_work_group_size #max_work_item_dimensions

256

In [32]:
np.dot(A, B)

array([24.567429, 22.538643, 23.542585, ..., 25.07854 , 25.717293,
       26.456944], dtype=float32)

In [33]:
(cpu_end_time - cpu_start_time) / (gpu_end_time - gpu_start_time)

0.23217041877163042

In [301]:
C

array([28.689331, 26.380375, 26.65357 , ..., 23.372551, 26.52175 ,
       27.61783 ], dtype=float32)

In [302]:
D

array([28.689327, 26.380373, 26.653566, ..., 23.372551, 26.52174 ,
       27.617823], dtype=float32)

In [303]:
import pyclblast

from pyopencl.array import Array

In [304]:
# Settings for this sample
dtype = 'float32'
alpha = 1
beta = 0

print("# Setting up Numpy arrays")
y = np.random.rand(n).astype(dtype=dtype)

print("# Setting up OpenCL arrays")
cla = Array(queue, A.shape, A.dtype)
clx = Array(queue, B.shape, B.dtype)
cly = Array(queue, y.shape, y.dtype)
cla.set(A)
clx.set(x)
cly.set(y)

# Setting up Numpy arrays
# Setting up OpenCL arrays


In [305]:
pyclblast.gemv(queue, n, p, cla, clx, cly, a_ld=p, alpha=alpha, beta=beta)
queue.finish()
print("# Result for vector y: %s" % cly.get())
#print("# Expected result:     %s" % (alpha * np.dot(A, B) + beta * y))

# Result for vector y: [27.239933 25.933067 31.403833 ... 24.658667 28.381613 26.393766]


In [306]:
(alpha * np.dot(A, B) + beta * y)

array([28.689327, 26.380373, 26.653566, ..., 23.372551, 26.52174 ,
       27.617823], dtype=float32)