### Similarity metrics for words in a vector space model using RAPIDS and Numba on the GPU

In [None]:
import cudf
import numpy as np
from numba import cuda, float32
import math

Download the GloVe vectors

In [None]:
# download glove vectors from web
#!wget http://nlp.stanford.edu/data/glove.6B.zip
#!unzip glove.6B.zip    

In [2]:
def name_dtype_gen(dim_size=50):
    """Build the column names and dtypes for a word-vector text file.

    The first column is the token itself ('word', dtype 'str'); it is
    followed by ``dim_size`` float64 columns named 'dim_1' ... 'dim_<dim_size>'.

    Returns a ``(names, dtypes)`` pair of lists with ``dim_size + 1`` entries each.
    """
    dim_names = ['dim_' + str(k) for k in range(1, dim_size + 1)]
    names = ['word'] + dim_names
    dtypes = ['str'] + [np.float64] * dim_size
    return names, dtypes

In [3]:
# Column layout for the 50-dimensional vectors file: 'word' followed by dim_1 ... dim_50.
names, dtypes = name_dtype_gen(50)

Read the word vectors with `cudf.read_csv()`, then process them on the GPU using Numba.

In [4]:
# Each line of the file is a token followed by its whitespace-separated vector components.
# NOTE(review): `dtypes` from name_dtype_gen is not passed here, so column types
# are left to read_csv's own inference.
df = cudf.read_csv("glove.6B.50d.txt", delim_whitespace=True, names=names, quoting=3)  # quoting=3 is csv.QUOTE_NONE, so tokens such as `"` are read as plain text

In [5]:
# Preview the loaded frame: one row per token, the 'word' column followed by the dim_* columns.
print(df)

    word                dim_1                dim_2                dim_3                 dim_4                dim_5                 dim_6 ...                 dim_50
0   the  0.41800000000000004              0.24968 -0.41242000000000006   0.12170000000000002              0.34527 -0.044456999999999997 ...               -0.78581
1     ,             0.013441              0.23682 -0.16899000000000003   0.40951000000000004   0.6381200000000001               0.47709 ...     0.3039200000000001
2     .              0.15164  0.30177000000000004             -0.16763   0.17684000000000002              0.31719   0.33973000000000003 ...    0.10216000000000001
3    of              0.70853   0.5708799999999999              -0.4716               0.18048              0.54449    0.7260300000000001 ...    -0.8037500000000001
4    to              0.68047            -0.039263   0.3018600000000001  -0.17792000000000002  0.42962000000000006              0.032246 ...               -0.26044
5   and  0.2681800000

In [6]:
# (rows, columns): rows are tokens; columns are 'word' plus the vector dimensions.
print(df.shape)

(400000, 51)


In [7]:
# Keep the token column separately so a row position can later be mapped back to its word.
mappings = df['word']

In [8]:
# One entry per row of df.
mappings.shape

(400000,)

In [9]:
# Remove the token column so that only the numeric vector columns remain in df.
# drop_column mutates df in place, so the guard keeps this cell safe to re-run
# after 'word' has already been removed.
if 'word' in df.columns:
    df.drop_column('word')

In [10]:
# Pack the remaining (numeric) columns into a single 2-D matrix for the CUDA kernel:
# one row per word, one column per vector dimension.
mat = df.as_gpu_matrix()

In [11]:
# Sanity check before launching the kernel: matrix shape, element dtype
# (the kernel signature expects float64), and the word lookup Series.
print(mat.shape)
print(mat.dtype)
print(mappings)

(400000, 50)
float64
0    the
1      ,
2      .
3     of
4     to
5    and
6     in
7      a
8      "
9     's
[399990 more rows]
Name: word, dtype: object


---
Numba CUDA device function for calculating the `Euclidean distance` between two vectors.

In [12]:
@cuda.jit(device=True)
def ecludean_dist(a,b, dim_size):
    """Euclidean (L2) distance over the first ``dim_size`` entries of ``a`` and ``b``.

    Declared with ``device=True``, i.e. meant to be called from other
    CUDA-jitted functions rather than launched as a kernel.
    """
    sq_total = 0
    for k in range(dim_size):
        # accumulate the squared component-wise difference
        sq_total = sq_total + (a[k] - b[k]) ** 2
    return math.sqrt(sq_total)

Numba CUDA device function for calculating the `dot product` of two vectors.

In [13]:
@cuda.jit(device=True)
def dot(a, b, dim_size):
    """Dot product of the first ``dim_size`` entries of ``a`` and ``b``.

    Declared with ``device=True``, i.e. meant to be called from other
    CUDA-jitted functions rather than launched as a kernel.
    """
    acc = 0
    for k in range(dim_size):
        acc = acc + a[k] * b[k]
    return acc

Numba CUDA device function for calculating the `cosine similarity` between two vectors.

In [14]:
@cuda.jit(device=True)
def cosine_sim(a, b, dim_size):
    """Cosine similarity of ``a`` and ``b`` over their first ``dim_size`` entries.

    Computed as dot(a, b) / (|a| * |b|), with each norm obtained as the
    square root of the vector's dot product with itself.
    """
    numerator = dot(a, b, dim_size)
    norm_a = math.sqrt(dot(a, a, dim_size))
    norm_b = math.sqrt(dot(b, b, dim_size))
    return numerator / (norm_a * norm_b)

Numba CUDA kernel that finds, for every vector, its nearest other vector using:
* Euclidean distance
* cosine similarity

In [15]:
@cuda.jit('void(float64[:,:], int32[:], int32[:], int32, int32)')
def find_nearest(mat, out_1, out_2, dim_size, n):
    """For every row of ``mat``, find its nearest *other* row under two metrics.

    Each thread handles one row ``idx`` and scans rows ``0 .. n-1`` (brute force).

    mat      : 2-D float64 array; rows 0..n-1 are the vectors to compare.
    out_1    : int32 output; out_1[idx] receives the index of the row with the
               smallest Euclidean distance to row idx.
    out_2    : int32 output; out_2[idx] receives the index of the row with the
               largest cosine similarity to row idx.
    dim_size : number of leading components of each row to use.
    n        : number of rows to process.

    Both comparisons are inclusive, so ties resolve to the highest row index.
    If no other row is ever selected (e.g. n == 1), the outputs hold idx itself.
    """
    idx = cuda.threadIdx.x + cuda.blockDim.x * cuda.blockIdx.x
    if idx >= n:
        # surplus threads when the grid holds more threads than rows
        return

    # Loop invariants: this thread's own row and its norm are the same for
    # every candidate, so fetch/compute them once instead of once per iteration.
    query = mat[idx]
    query_norm = math.sqrt(dot(query, query, dim_size))

    # Infinite sentinels so that the first valid candidate always wins,
    # whatever the scale of the data (no magic upper/lower bounds).
    best_dist = math.inf
    best_dist_i = idx

    best_sim = -math.inf
    best_sim_i = idx

    # here is room for improvement using shared memory
    for i in range(n):
        if i == idx:
            continue
        cand = mat[i]
        dist = ecludean_dist(query, cand, dim_size)
        # Same formula as cosine_sim(query, cand, dim_size), with the query norm hoisted.
        csim = dot(query, cand, dim_size) / (query_norm * math.sqrt(dot(cand, cand, dim_size)))
        if dist <= best_dist:
            best_dist_i = i
            best_dist = dist
        if csim >= best_sim:
            best_sim_i = i
            best_sim = csim

    out_1[idx] = best_dist_i
    out_2[idx] = best_sim_i

Configuration parameters for invoking the kernel on the GPU.

In [16]:
# Problem size comes straight from the matrix: rows are words, columns are dimensions.
n, dim_size = mat.shape[0], mat.shape[1]
device = cuda.get_current_device()

# Launch geometry: one thread per row, one warp's worth of threads per block,
# and enough blocks (ceiling division) to cover all n rows.
tpb = device.WARP_SIZE              # threads per block
bpg = (n + tpb - 1) // tpb          # blocks per grid
(tpb, bpg)

(32, 12500)

Output arrays that store the index of each word's nearest neighbour under both approaches.

In [17]:
# One int32 slot per row for each metric's nearest-neighbour index;
# int32 matches the int32[:] output parameters in the kernel signature.
out_1, out_2 = (cuda.device_array(shape=n, dtype=np.int32) for _ in range(2))

In [18]:
%%time
# Launch with bpg blocks of tpb threads; out_1 / out_2 receive the neighbour indices.
find_nearest[bpg,tpb](mat, out_1, out_2, dim_size, n)
# Wait for the device to finish so the cell's timing covers the kernel run itself.
cuda.synchronize()

CPU times: user 1min 25s, sys: 2min 57s, total: 4min 23s
Wall time: 4min 23s


In [19]:
# Nearest-neighbour row indices by Euclidean distance, copied back to the host.
out_1.copy_to_host()

array([    42,      2,    215, ..., 394441, 390139, 304408], dtype=int32)

In [20]:
# Nearest-neighbour row indices by cosine similarity, copied back to the host.
out_2.copy_to_host()

array([    42,      2,    215, ..., 395706, 186361, 304408], dtype=int32)

In [23]:
# First Series: the words in row order; second Series: for each of those rows,
# the word at the index the kernel wrote to out_1 (Euclidean distance).
print(mappings, mappings.iloc[out_1])

0    the
1      ,
2      .
3     of
4     to
5    and
6     in
7      a
8      "
9     's
[399990 more rows]
Name: word, dtype: object 42    which
2        .
215     same
42    which
190     take
143     well
25     from
29       an
1408        “
31      has
[399990 more rows]
Name: word, dtype: object


In [24]:
# First Series: the words in row order; second Series: for each of those rows,
# the word at the index the kernel wrote to out_2 (cosine similarity).
print(mappings, mappings.iloc[out_2])

0    the
1      ,
2      .
3     of
4     to
5    and
6     in
7      a
8      "
9     's
[399990 more rows]
Name: word, dtype: object 42      which
2          .
215       same
42      which
190       take
143       well
25       from
170    another
1408          “
31        has
[399990 more rows]
Name: word, dtype: object
