In [1]:
import nvstrings
import nvcategory
import cudf
import numpy as np
from numba import cuda, float32
import ctypes
import math
import cupy
import time 

from torch import nn, optim
from torch.autograd import Variable
import torch
from torch.utils.data import TensorDataset, DataLoader
torch.cuda.is_available()

True

In [2]:
pre_df = cudf.read_csv("datasets/glove.6B.50d.txt", header=None, delim_whitespace=True, quoting=3)  #ignore quoting

mappings = pre_df['0']

pre_df.drop_column('0')
for c in pre_df.columns:
    pre_df[c] = pre_df[c].astype(np.float32)
mat = pre_df.as_gpu_matrix()

print(mat.shape)
print(mat.dtype)
print(mappings)


(400000, 50)
float32
0    the
1      ,
2      .
3     of
4     to
5    and
6     in
7      a
8      "
9     's
[399990 more rows]
Name: 0, dtype: object


In [3]:
@cuda.jit(device=True)
def dot(a, b, dim_size):
    summ = 0
    for i in range(dim_size):
        summ += (a[i]*b[i])
    return summ

@cuda.jit(device=True)
def cosine_sim(a, b, dim_size):
    return dot(a,b, dim_size) / ( math.sqrt(dot(a, a, dim_size)) * math.sqrt(dot(b, b, dim_size)) )

@cuda.jit('void(float32[:,:], int32[:], int32, int32)')
def find_nearest(mat, out, dim_size, n):
    idx = cuda.threadIdx.x + cuda.blockDim.x * cuda.blockIdx.x
    if idx >= n:
        return
    c = -1.0 
    c_i = idx
    # here is room for improvement using shared memory 
    for i in range(n):
        if i == idx:
            continue
        csim = cosine_sim(mat[idx], mat[i], dim_size)
        if csim >= c:
            c_i = i
            c = csim
    
    out[idx] = c_i


In [4]:
n = mat.shape[0]
dim_size = mat.shape[1]
device = cuda.get_current_device()

tpb = 128 #device.WARP_SIZE    #blocksize or thread per block
bpg = int(np.ceil((n)/tpb))  # block per grid
print( "kernel launch configuraion: ", tpb, bpg)

out = cuda.device_array(shape=n, dtype=np.int32)

st = time.time()
find_nearest[bpg,tpb](mat, out, dim_size, n)
cuda.synchronize()

print("time taken {} mins".format((time.time()-st)/60))

result_df = cudf.DataFrame({'word':mappings})
result_df['nearest']= mappings.iloc[out]

result_df.head(20).to_pandas()


kernel launch configuraion:  128 3125
time taken 4.021586120128632 mins


Unnamed: 0,word,nearest
0,the,which
1,",",.
2,.,same
3,of,which
4,to,take
5,and,well
6,in,from
7,a,another
8,"""",“
9,'s,has


In [5]:
# del result_df, mat, out
pre_df['0'] = mappings


In [6]:
def clean_sents(gstr):
    gstr = gstr.replace(r"[^A-Za-z0-9(),!?\'\`]", " ")
    gstr = gstr.replace(r"\'s", " \'s")
    gstr = gstr.replace(r"\'ve", " \'ve")
    gstr = gstr.replace(r"n\'t", " n\'t")
    gstr = gstr.replace(r"\'re", " \'re")
    gstr = gstr.replace(r"\'d", " \'d")
    gstr = gstr.replace(r"\'ll", " \'ll")
    gstr = gstr.replace(r",", " , ")
    gstr = gstr.replace(r"!", " ! ")
    gstr = gstr.replace(r"\(", " \( ")
    gstr = gstr.replace(r"\)", " \) ")
    gstr = gstr.replace(r"\?", " \? ")
    gstr = gstr.replace(r"\s{2,}", " ")
    return gstr.strip().lower()


In [7]:
sents = cudf.read_csv("/a0k00jd/datasets/train.csv", quoting=3, skiprows=1, names=['review', 'label'])
y_train = sents['label'].astype('float32').to_gpu_array()
gstr = sents['review'].data


In [8]:
MAX_LEN = 20
num_sents = gstr.size()
gstr = clean_sents(gstr)

#generate the tokens
seq = gstr.split_record(' ')

for i in range(len(seq)):
    l = seq[i].size()
    seq[i] = seq[i].add_strings(nvstrings.to_device((MAX_LEN-l)*['PAD'])) if l <=MAX_LEN else seq[i].remove_strings(list(range(MAX_LEN,l)))

#generating the indices corresponding each token 
c = nvcategory.from_strings_list(seq)
print(len(c.values()))
print(len(c.keys()))


20000
2707


In [9]:
# generating unique tokens 
# print(c.keys())
sent_df = cudf.DataFrame({'tokens':c.keys()})
print(sent_df)

# preparing the X_train 
X_train = cuda.device_array((num_sents, MAX_LEN), dtype=np.float32)
c.values(X_train.device_ctypes_pointer.value)
print(X_train.shape)



      tokens
0         !
1    'cover
2        'd
3    'film'
4       'll
5    'must'
6       're
7        's
8       'so
9  'stagey'
[2697 more rows]
(1000, 20)


In [10]:
vocab_df = sent_df.merge(pre_df,left_on='tokens', right_on='0', how='left')
vocab_df.drop_column('0')
vocab_df.drop_column('tokens')

all_token = vocab_df.shape[0]
print(all_token)
#calculating the number of tken not found in GloVe 
not_found = vocab_df['1'].null_count
print(not_found)

# filling the not found tokens with random vector, [now with -1]
for c in vocab_df.columns:
    vocab_df[c] = vocab_df[c].fillna(cupy.random.normal(size=all_token)).astype(np.float32)
vocab = vocab_df.as_gpu_matrix(order='C')


2707
50


In [11]:
def create_emb_layer(weights_matrix, non_trainable=False):
    num_embeddings, embedding_dim = weights_matrix.shape
    emb_layer = nn.Embedding(num_embeddings, embedding_dim)
    # emb_layer.load_state_dict({'weight': weights_matrix})
    emb_layer.weight = nn.Parameter(weights_matrix)
    if non_trainable:
        emb_layer.weight.requires_grad = False

    return emb_layer, num_embeddings, embedding_dim

class ToyLSTM(nn.Module):
    def __init__(self, weights_matrix, hidden_size, output_size, num_layers):
        super(ToyLSTM, self).__init__()
        self.embedding, num_embeddings, embedding_dim = create_emb_layer(weights_matrix, True)
         
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers

        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, hidden_size//2)
        self.out = nn.Linear(hidden_size//2, output_size)
        self.relu = nn.ReLU()
        
    def forward(self, inp):
        h_embedding = self.embedding(inp) 
        h_lstm, _ = self.lstm(h_embedding)
        max_pool, _ = torch.max(h_lstm, 1)
        linear = self.relu(self.linear(max_pool)) 
        out = self.out(linear) 
        return out



In [12]:
def devndarray2tensor(dev_arr, dtyp='float32'):
    dmap = {'float32':torch.float32, 'long':torch.long}
    t = torch.empty(size=dev_arr.shape, dtype=dmap[dtyp]).cuda()
    ctx = cuda.cudadrv.driver.driver.get_context()
    
    # constant value of #bytes in float32 = 4
    mp = cuda.cudadrv.driver.MemoryPointer(ctx, ctypes.c_ulong(t.data_ptr()), t.numel()*4)
    tmp_arr = cuda.cudadrv.devicearray.DeviceNDArray(t.size(), [i*4 for i in t.stride()], np.dtype(dtyp), 
                                            gpu_data=mp, stream=torch.cuda.current_stream().cuda_stream)
    tmp_arr.copy_to_device(dev_arr)
    return t


In [13]:
toy_lstm = ToyLSTM(weights_matrix=devndarray2tensor(vocab), hidden_size=10, output_size=1, num_layers=3).cuda()
toy_lstm


ToyLSTM(
  (embedding): Embedding(2707, 50)
  (lstm): LSTM(50, 10, num_layers=3, batch_first=True)
  (linear): Linear(in_features=10, out_features=5, bias=True)
  (out): Linear(in_features=5, out_features=1, bias=True)
  (relu): ReLU()
)

In [14]:
X_train.flags
y_train.flags

{'F_CONTIGUOUS': True, 'C_CONTIGUOUS': True}

In [15]:
train = TensorDataset(devndarray2tensor(X_train).to(torch.int64), devndarray2tensor(y_train))
trainloader = DataLoader(train, batch_size=128)

In [16]:
loss_function = nn.BCEWithLogitsLoss(reduction='mean')
optimizer = optim.Adam(toy_lstm.parameters())

In [17]:
for epoch in range(1, 10):
    train_loss, valid_loss = [], []

    # training part
    toy_lstm.train()
    for data, target in trainloader:
        optimizer.zero_grad()
        output = toy_lstm(data)
        loss = loss_function(output, target.view(-1,1))
        loss.backward()
        optimizer.step()
        train_loss.append(loss.item())
    
    ## evaluation part 
    toy_lstm.eval()
    for data, target in trainloader:
        output = toy_lstm(data)


In [18]:
train_loss


[0.6914850473403931,
 0.6998982429504395,
 0.6914698481559753,
 0.6933820843696594,
 0.6966598033905029,
 0.69050133228302,
 0.692365288734436,
 0.6894925236701965]