### Similarity metrics of words in vector space model using rapids and numba on GPU

In [1]:
import cudf
import numpy as np
from numba import cuda, float32
import math

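Download the GloVe word vectors (run the cell below once; it is commented out so that re-running the notebook does not download the archive again).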

In [2]:
# download glove vectors from web
#!wget http://nlp.stanford.edu/data/glove.6B.zip
#!unzip glove.6B.zip    

In [3]:
# Peek at the first record: a token followed by its space-separated vector components.
!head -1 glove.6B.50d.txt

the 0.418 0.24968 -0.41242 0.1217 0.34527 -0.044457 -0.49688 -0.17862 -0.00066023 -0.6566 0.27843 -0.14767 -0.55677 0.14658 -0.0095095 0.011658 0.10204 -0.12792 -0.8443 -0.12181 -0.016801 -0.33279 -0.1552 -0.23131 -0.19181 -1.8823 -0.76746 0.099051 -0.42125 -0.19526 4.0071 -0.18594 -0.52287 -0.31681 0.00059213 0.0074449 0.17778 -0.15897 0.012041 -0.054223 -0.29871 -0.15749 -0.34758 -0.045637 -0.44251 0.18785 0.0027849 -0.18411 -0.11514 -0.78581


In [4]:
def name_dtype_gen(dim_size=50):
    """Build the column names and dtypes describing a word-vector text file.

    The first column is the token itself ('word', dtype 'str'); it is followed
    by `dim_size` float64 columns named 'dim_1' .. 'dim_<dim_size>'.

    Parameters
    ----------
    dim_size : int
        Number of vector components per word.

    Returns
    -------
    (list, list)
        Two parallel lists: column names and column dtypes.
    """
    names = ['word'] + ['dim_' + str(k) for k in range(1, dim_size + 1)]
    dtypes = ['str'] + [np.float64] * dim_size
    return names, dtypes

In [5]:
# Column layout for the 50-dimensional file: 'word' followed by dim_1..dim_50.
# NOTE(review): only `names` is passed to read_csv below; `dtypes` is not used there.
names, dtypes = name_dtype_gen(50)

Reading the word vectors using `cudf.read_csv()` and then processing on GPU using numba. 

In [6]:
# Parse the whitespace-delimited vectors file with cudf. quoting=3 is the value of
# csv.QUOTE_NONE, so tokens that are themselves quote characters (e.g. '"') are kept as data.
%time df = cudf.read_csv("glove.6B.50d.txt", delim_whitespace=True, names=names, quoting=3)  #ignore quoting 

CPU times: user 353 ms, sys: 765 ms, total: 1.12 s
Wall time: 1.13 s


In [7]:
# Text preview of the loaded frame: one row per token, one column per vector component.
print(df)

    word                dim_1                dim_2                dim_3                 dim_4                dim_5                 dim_6 ...                 dim_50
0   the  0.41800000000000004              0.24968 -0.41242000000000006   0.12170000000000002              0.34527 -0.044456999999999997 ...               -0.78581
1     ,             0.013441              0.23682 -0.16899000000000003   0.40951000000000004   0.6381200000000001               0.47709 ...     0.3039200000000001
2     .              0.15164  0.30177000000000004             -0.16763   0.17684000000000002              0.31719   0.33973000000000003 ...    0.10216000000000001
3    of              0.70853   0.5708799999999999              -0.4716               0.18048              0.54449    0.7260300000000001 ...    -0.8037500000000001
4    to              0.68047            -0.039263   0.3018600000000001  -0.17792000000000002  0.42962000000000006              0.032246 ...               -0.26044
5   and  0.2681800000

In [8]:
# Sanity check: (number of tokens, 1 word column + number of vector dimensions).
print(df.shape)

(400000, 51)


In [9]:
# Keep the token column aside; it is used later to turn neighbour row indices back into words.
mappings = df['word']

In [10]:
# One entry per row of df.
mappings.shape

(400000,)

In [11]:
# Remove the token column so that only the numeric vector columns remain in df.
# NOTE(review): this mutates df in place (the matrix built from it afterwards has 50 columns),
# so the cell is presumably not re-runnable without reloading df — confirm.
%time df.drop_column('word')

CPU times: user 32 µs, sys: 22 µs, total: 54 µs
Wall time: 62.2 µs


In [12]:
# Pack the remaining numeric columns into a single 2-D array that is handed to the numba kernel.
%time mat = df.as_gpu_matrix()

CPU times: user 37.7 ms, sys: 2.3 ms, total: 40 ms
Wall time: 38.7 ms


In [13]:
# Confirm the kernel inputs: (rows, dims) of the matrix, its element type
# (the kernel signature below requires float64), and the row -> word mapping.
print(mat.shape)
print(mat.dtype)
print(mappings)

(400000, 50)
float64
0    the
1      ,
2      .
3     of
4     to
5    and
6     in
7      a
8      "
9     's
[399990 more rows]
Name: word, dtype: object


---
Numba CUDA device function for calculating the `Euclidean distance` between two vectors.

In [14]:
@cuda.jit(device=True)
def ecludean_dist(a,b, dim_size):
    """Return the Euclidean (L2) distance between vectors `a` and `b`.

    Only the first `dim_size` components of each vector are read.
    Device function: callable from CUDA kernels / other device functions.
    """
    squared_total = 0
    for k in range(dim_size):
        delta = a[k] - b[k]
        squared_total += delta ** 2
    return math.sqrt(squared_total)

Numba cuda function for calculating the `dot product` of two vectors

In [15]:
@cuda.jit(device=True)
def dot(a, b, dim_size):
    """Return the dot product of the first `dim_size` components of `a` and `b`.

    Device function: callable from CUDA kernels / other device functions.
    """
    acc = 0
    for k in range(dim_size):
        product = a[k] * b[k]
        acc += product
    return acc

Numba CUDA device function for calculating the `cosine similarity` between two vectors

In [16]:
@cuda.jit(device=True)
def cosine_sim(a, b, dim_size):
    """Return the cosine similarity of `a` and `b`: a.b / (|a| * |b|).

    Only the first `dim_size` components are used. No guard is applied for
    zero-length vectors (the denominator is then zero).
    """
    numerator = dot(a, b, dim_size)
    norm_a = math.sqrt(dot(a, a, dim_size))
    norm_b = math.sqrt(dot(b, b, dim_size))
    return numerator / (norm_a * norm_b)

Numba CUDA kernel for finding, for every word, the nearest other word using:
* Euclidean distance (smallest distance wins)
* cosine similarity (largest similarity wins)

In [17]:
@cuda.jit('void(float64[:,:], int32[:], int32[:], int32, int32)')
def find_nearest(mat, out_1, out_2, dim_size, n):
    """Brute-force nearest-neighbour search, one thread per row of `mat`.

    Thread `idx` compares row `idx` against every other row and records:
      * out_1[idx] - index of the row with the smallest Euclidean distance
      * out_2[idx] - index of the row with the largest cosine similarity

    Parameters
    ----------
    mat : float64[:, :]
        Matrix of vectors, one vector per row.
    out_1, out_2 : int32[:]
        Output index arrays; element `idx` is written by thread `idx`.
    dim_size : int32
        Number of components of each row to use.
    n : int32
        Number of rows to process; threads with idx >= n do nothing.

    Ties are resolved in favour of the highest row index because the
    comparisons use `<=` / `>=`. If no candidate is accepted (e.g. n == 1),
    the row's own index is written.
    """
    idx = cuda.threadIdx.x + cuda.blockDim.x * cuda.blockIdx.x
    # The grid may be larger than n; surplus threads must not touch memory.
    if idx >= n:
        return

    # View of this thread's own vector, taken once instead of on every iteration.
    row = mat[idx]

    # Start from +/- infinity rather than finite magic numbers, so that any
    # finite distance / similarity replaces the initial value. With finite
    # sentinels, a distance above the sentinel, or a similarity that rounds to
    # just below -1.0, would never be selected.
    e = math.inf
    e_i = idx

    c = -math.inf
    c_i = idx

    # here is room for improvement using shared memory
    for i in range(n):
        # A vector is trivially its own nearest neighbour; skip it.
        if i == idx:
            continue
        other = mat[i]
        dist = ecludean_dist(row, other, dim_size)
        csim = cosine_sim(row, other, dim_size)
        if dist <= e:
            e_i = i
            e = dist
        if csim >= c:
            c_i = i
            c = csim

    out_1[idx] = e_i
    out_2[idx] = c_i

Configuration parameters for invoking the kernel on the GPU:
* defining the number of threads per block (`tpb`)
* computing the number of blocks per grid (`bpg`) by dividing the number of words by `tpb` and rounding up, so that every word gets its own thread


In [18]:
# Problem size: one row per word, one column per vector component.
n, dim_size = mat.shape[0], mat.shape[1]
device = cuda.get_current_device()

# Threads per block: one warp.
tpb = device.WARP_SIZE
# Blocks per grid: enough blocks to give every row its own thread (rounded up).
bpg = math.ceil(n / tpb)
(tpb, bpg)

(32, 12500)

Output arrays that store, for every word, the index of its nearest word under each of the two metrics.

In [19]:
# Uninitialised int32 output buffers with one slot per word; the int32 dtype
# matches the kernel signature. The kernel writes the Euclidean result to
# out_1 and the cosine result to out_2.
out_1 = cuda.device_array(shape=n, dtype=np.int32)
out_2 = cuda.device_array(shape=n, dtype=np.int32)

In [20]:
%%time
# Launch with bpg blocks of tpb threads (bpg * tpb >= n, one thread per word).
# Block until the GPU has finished so that the measured time covers the kernel
# execution rather than only the launch call.
find_nearest[bpg,tpb](mat, out_1, out_2, dim_size, n)
cuda.synchronize()

CPU times: user 1min 18s, sys: 3min 3s, total: 4min 22s
Wall time: 4min 23s


In [20]:
#out_1.copy_to_host()

In [21]:
#out_2.copy_to_host()

In [21]:
# Decode the neighbour indices back into words: row i holds word i and its
# nearest word under each metric.
result_df = cudf.DataFrame({'word':mappings})
# NOTE(review): out_1/out_2 hold row positions; this assumes the column assignment
# is positional (no index alignment on the gathered series) — confirm for the
# cudf version in use.
result_df['ecludean'] = mappings.iloc[out_1]
result_df['cosine']= mappings.iloc[out_2]

In [22]:
# Convert the result frame to pandas for display.
result_df.to_pandas()

Unnamed: 0,word,ecludean,cosine
0,the,which,which
1,",",.,.
2,.,same,same
3,of,which,which
4,to,take,take
5,and,well,well
6,in,from,from
7,a,an,another
8,"""",“,“
9,'s,has,has
