In [1]:
# !pip install pyopencl
# !pip install pytorch
import pyopencl as cl
import numpy as np
import timeit
# # !pip3 install torch torchvision torchaudio
import torch

In [2]:
def sequential(a,res):
    for i in range(len(a)):
        res[i] = a[i] * a[i]
    return res

def parallel(a,res):
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    mf = cl.mem_flags
    a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a)
    dest_buf = cl.Buffer(ctx, mf.WRITE_ONLY, res.nbytes)
    prg = cl.Program(ctx, """
        __kernel void sq(__global const float *a,
        __global float *c)
        {
        int gid = get_global_id(0);
        c[gid] = a[gid] * a[gid];
        }
        """).build()

    prg.sq(queue, a.shape, None, a_buf, dest_buf)
    cl.enqueue_copy(queue, res, dest_buf)
    return res

In [3]:
N = 10**7
a = np.random.random(N)
res = np.zeros(N)
# a = np.arange(N).astype(np.float32)
# res = np.empty_like(a)
print(a,res,len(a)==len(res))

# gc off 
tx = timeit.Timer("sequential(a,res)",globals=globals()).timeit(number=3)
ty = timeit.Timer("parallel(a,res)",globals=globals()).timeit(number=3)
print(tx,ty)

# gc on
import gc
setup = "gc.enable()"
tx = timeit.timeit("sequential(a,res)",setup,globals=globals(),number=3)
ty = timeit.timeit("parallel(a,res)",setup,globals=globals(),number=3)
print(tx,ty)

[0.77429472 0.34442607 0.68356206 ... 0.30644037 0.65627373 0.09778403] [0. 0. 0. ... 0. 0. 0.] True
10.372143099999999 0.39606540000000123
9.402723599999998 0.11414449999999476
