# Lesson 2. Computational kernels and convolution operation

In [None]:
# Import the required modules
import cv2
from IPython.display import clear_output
import time
import PIL.Image
from io import BytesIO
import IPython.display
import numpy as np
from numba import cuda

In [None]:
#Use 'jpeg' instead of 'png' (~5 times faster)
def array_to_image(a, fmt='jpeg'):
    """Encode an image array and wrap it for display in the notebook.

    Args:
        a: Array passed to ``PIL.Image.fromarray``.
        fmt: Format name handed to ``PIL.Image.Image.save``; 'jpeg' by default.

    Returns:
        An ``IPython.display.Image`` built from the encoded bytes.
    """
    # Encode into an in-memory buffer; nothing is written to disk
    with BytesIO() as buffer:
        PIL.Image.fromarray(a).save(buffer, fmt)
        encoded = buffer.getvalue()

    return IPython.display.Image(data=encoded)

In [None]:
def get_frame(cam):
    """Grab one frame from ``cam`` and mirror it for natural viewing.

    Args:
        cam: Capture object whose ``read()`` returns ``(success, frame)``,
            e.g. a ``cv2.VideoCapture``.

    Returns:
        The captured frame flipped with ``cv2.flip(frame, 1)``.

    Raises:
        RuntimeError: If ``cam.read()`` reports failure or yields no frame.
    """
    # Capture a single frame; the success flag must be checked because a
    # failed read yields no image and cv2.flip would fail with an opaque error
    ok, frame = cam.read()
    if not ok or frame is None:
        raise RuntimeError("Could not read a frame from the camera")

    # Flip image for natural (mirror-like) viewing
    return cv2.flip(frame, 1)

In [None]:
@cuda.jit
def blur_kernel(x, out):
    """Box-blur a 3-channel image; one thread computes one output pixel.

    Each output value is the mean of the input values inside a
    (2*blur_size+1) x (2*blur_size+1) window centred on the pixel. Near the
    borders the window is clipped to the image, and the mean is taken over
    the pixels that actually exist.

    Args:
        x: Input array indexed as ``x[i, j, c]`` with ``c`` in 0..2.
        out: Output array indexed the same way; every ``out[i, j, c]`` with
            ``(i, j)`` inside ``x``'s first two dimensions is overwritten.
    """
    blur_size = 3
    i, j = cuda.grid(2)

    # Threads that fall outside the image have nothing to do
    if i >= x.shape[0] or j >= x.shape[1]:
        return

    # Clip the window to valid indices. The previous test admitted the
    # indices -1 and -2 and rejected valid pixels near the far edges.
    i_lo = max(i - blur_size, 0)
    i_hi = min(i + blur_size + 1, x.shape[0])
    j_lo = max(j - blur_size, 0)
    j_hi = min(j + blur_size + 1, x.shape[1])

    # Never zero: the window always contains the pixel (i, j) itself
    count = (i_hi - i_lo) * (j_hi - j_lo)

    for c in range(0, 3):
        # Sum in a local float so a narrow output dtype cannot overflow
        # and `out` is written exactly once per channel
        total = 0.0
        for r in range(i_lo, i_hi):
            for s in range(j_lo, j_hi):
                total += x[r, s, c]
        out[i, j, c] = total / count

In [None]:
@cuda.jit
def bw_kernel(x, out):
    """Collapse a 3-channel image to one channel with a weighted sum.

    One thread handles one pixel. Channels 0, 1 and 2 are weighted by 0.2126,
    0.7152 and 0.0722; these equal the Rec. 709 luma weights when the
    channels are ordered R, G, B.

    Args:
        x: Input array indexed as ``x[i, j, channel]``.
        out: 2-D output array receiving one value per pixel.
    """
    row, col = cuda.grid(2)

    # Skip threads that land outside the image
    if row >= x.shape[0] or col >= x.shape[1]:
        return

    ch0 = x[row, col, 0]
    ch1 = x[row, col, 1]
    ch2 = x[row, col, 2]
    out[row, col] = 0.2126 * ch0 + 0.7152 * ch1 + 0.0722 * ch2

In [None]:
@cuda.jit
def conv_kernel(x, kernel, out):
    """Apply a 2-D filter to a single-channel image with zero padding.

    One thread computes one output pixel as the sum of
    ``x[i+ii, j+jj] * kernel[ii+half_rows, jj+half_cols]`` over the kernel
    window. The kernel is applied as given (it is not flipped). Taps that
    fall outside the image contribute nothing.

    Args:
        x: 2-D input array.
        kernel: 2-D array of filter weights; the window radius along each
            axis is ``(size - 1) // 2``.
        out: 2-D output array; every ``out[i, j]`` inside ``x``'s shape is
            overwritten.
    """
    i, j = cuda.grid(2)

    # Threads that fall outside the image have nothing to do
    if i >= x.shape[0] or j >= x.shape[1]:
        return

    half_rows = (kernel.shape[0] - 1) // 2
    half_cols = (kernel.shape[1] - 1) // 2

    # Accumulate locally and store once instead of repeatedly
    # reading and writing the output array
    acc = 0.0
    for ii in range(-half_rows, half_rows + 1):
        r = i + ii
        # Valid rows are 0 .. shape-1. The previous test admitted negative
        # indices and rejected valid rows near the bottom edge.
        if r < 0 or r >= x.shape[0]:
            continue
        for jj in range(-half_cols, half_cols + 1):
            c = j + jj
            if c < 0 or c >= x.shape[1]:
                continue
            acc += x[r, c] * kernel[ii + half_rows, jj + half_cols]

    out[i, j] = acc

In [None]:
@cuda.jit
def max_pool(x, out):
    """Max-pool a 2-D array with stride 5; one thread per output element.

    Output element (i, j) is the largest value of ``x`` in the window that
    starts at (5*i, 5*j) and spans up to 6 rows and 6 columns, so neighbouring
    windows overlap by one row/column. The running maximum starts at 0, so
    the result is never negative.

    Args:
        x: 2-D input array.
        out: 2-D output array; its shape determines how many windows are
            evaluated.
    """
    i, j = cuda.grid(2)

    if i < out.shape[0] and j < out.shape[1]:
        # Clamp the window to the input. Without this the last window reads
        # index 5*out.shape[k], which is out of bounds whenever the input
        # size is a multiple of 5.
        row_end = min(5 * i + 6, x.shape[0])
        col_end = min(5 * j + 6, x.shape[1])

        max_brightness = 0

        for r in range(5 * i, row_end):
            for s in range(5 * j, col_end):
                if x[r, s] > max_brightness:
                    max_brightness = x[r, s]

        out[i, j] = max_brightness

In [None]:
# Candidate 3x3 convolution kernels to experiment with

# Weighted blur; the weights sum to 1
gauss_blur = np.array([
    [1, 2, 1],
    [2, 4, 2],
    [1, 2, 1],
]) / 16.0

# Edge detectors; the weights of each sum to 0
edge_detector_1 = np.array([
    [1, 0, -1],
    [0, 0, 0],
    [-1, 0, 1],
])
edge_detector_2 = np.array([
    [0, 1, 0],
    [1, -4, 1],
    [0, 1, 0],
])
edge_detector_3 = np.array([
    [-1, -1, -1],
    [-1, 8, -1],
    [-1, -1, -1],
])

# Sharpening kernel; the weights sum to 1
sharpen = np.array([
    [0, -1, 0],
    [-1, 5, -1],
    [0, -1, 0],
])

In [None]:
# Upload the edge-detection kernels once; they do not change between frames
d_conv_kernel_1 = cuda.to_device(edge_detector_1)
d_conv_kernel_2 = cuda.to_device(edge_detector_2)
d_conv_kernel_3 = cuda.to_device(edge_detector_3)

cam = cv2.VideoCapture(0)
try:
    # Use a first frame to size every GPU buffer
    frame = get_frame(cam)
    height, width = frame.shape[0:2]

    d_out = cuda.device_array((height, width))
    d_out_1 = cuda.device_array((height, width))
    d_out_2 = cuda.device_array((height, width))
    d_out_3 = cuda.device_array((height, width))

    # max_pool uses a stride of 5, so the pooled images are 5x smaller
    pooled_shape = (height // 5, width // 5)
    d_out_pool_1 = cuda.device_array(pooled_shape)
    d_out_pool_2 = cuda.device_array(pooled_shape)
    d_out_pool_3 = cuda.device_array(pooled_shape)

    # Launch configuration: enough 16x16 blocks to cover the whole frame,
    # whatever resolution the camera delivers (the former fixed 30x40 grid
    # only covered 480x640)
    blockdim = 16, 16
    griddim = (
        (height + blockdim[0] - 1) // blockdim[0],
        (width + blockdim[1] - 1) // blockdim[1],
    )

    d = IPython.display.display("", display_id=1)
    d2 = IPython.display.display("", display_id=2)

    while True:
        t1 = time.perf_counter()

        # Capture frame-by-frame
        frame = get_frame(cam)

        # Convert the image from OpenCV BGR format to RGB format
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frame = frame.astype(np.uint32)

        # Send to GPU
        d_frame = cuda.to_device(frame)

        # Grayscale first, then one convolution per edge detector
        bw_kernel[griddim, blockdim](d_frame, d_out)
        conv_kernel[griddim, blockdim](d_out, d_conv_kernel_1, d_out_1)
        conv_kernel[griddim, blockdim](d_out, d_conv_kernel_2, d_out_2)
        conv_kernel[griddim, blockdim](d_out, d_conv_kernel_3, d_out_3)

        # Apply max_pooling layer (the kernel ignores threads beyond the
        # pooled shape, so the full-frame grid can be reused)
        max_pool[griddim, blockdim](d_out_1, d_out_pool_1)
        max_pool[griddim, blockdim](d_out_2, d_out_pool_2)
        max_pool[griddim, blockdim](d_out_3, d_out_pool_3)

        # Send back to CPU. Filter responses can exceed 255, so clip before
        # narrowing to uint8 instead of relying on an out-of-range cast
        frame_GPU_1 = np.clip(d_out_pool_1.copy_to_host(), 0, 255).astype(np.uint8)
        frame_GPU_2 = np.clip(d_out_pool_2.copy_to_host(), 0, 255).astype(np.uint8)
        frame_GPU_3 = np.clip(d_out_pool_3.copy_to_host(), 0, 255).astype(np.uint8)

        # Concatenate three resulting layers side by side
        frame_GPU = np.concatenate((frame_GPU_1, frame_GPU_2, frame_GPU_3), axis=1)

        im = array_to_image(frame_GPU)

        d.update(im)

        t2 = time.perf_counter()

        # Guard against a zero-length interval on coarse clocks
        elapsed = t2 - t1
        fps = int(1 / elapsed) if elapsed > 0 else 0
        s = f"""{fps} FPS"""
        d2.update( IPython.display.HTML(s) )
except KeyboardInterrupt:
    # Interrupting the cell is the normal way to stop the stream
    print()
    IPython.display.clear_output()
    print ("Stream stopped")
finally:
    # Release the camera on every exit path, not only on KeyboardInterrupt
    cam.release()