In [1]:
from transformers import AutoTokenizer
from collection import SparseCollection, SparseCollectionCSR
from backend import TYPE
import json
import psutil
from text2vec import BagOfWords
import torch
from collections import defaultdict
from tqdm import tqdm
from transformations import BM25Transform

In [2]:
hf_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
print(hf_tokenizer.vocab_size)


bow = BagOfWords(hf_tokenizer, hf_tokenizer.vocab_size)

30522


In [3]:
PATH_TO_MSMARCO = "../syn-question-col-analysis/datasets/msmarco/corpus_L8841823.jsonl"

def get_title_abstract(string):
    data = json.loads(string)
    title, abstract = data["title"], data["abstract"]
    return f"{title} {abstract}"

In [4]:
with open(PATH_TO_MSMARCO) as f:
    collection_iterator = map(get_title_abstract,f)
    
    sparseCSR_collection = SparseCollectionCSR.from_text_iterator(collection_iterator,
                                                             collection_maxsize=1_000_000,#8841823,
                                                             text_to_vec=bow,
                                                             dtype=TYPE.float32,
                                                             indices_dtype=TYPE.int32,) 

Size estimation: 100%|██████████| 1000/1000 [00:00<00:00, 4858.68it/s]


We estimate that the collection matrix will have density of 0.0023, which requires 0.557624 GB. Plus 0.5GB for overheads.
Expected number of elements 69203000 for a shape (1000000, 30522)


Creating sparse matrix:   1%|          | 8555/999000 [00:02<04:25, 3736.36it/s]

In [None]:
sparseCSR_collection.transform(BM25Transform())
sparseCSR_collection.sparse_vecs

Converting to BM25 matrix: 100%|██████████| 1000000/1000000 [00:36<00:00, 27678.88it/s]


(tensor([       0,       40,       67,  ..., 47904868, 47904922, 47904978],
        dtype=torch.int32),
 tensor([ 1012,  1025,  1996,  ..., 10651, 10995, 15728], dtype=torch.int32),
 tensor([0.0214, 3.0450, 0.2211,  ..., 7.6188, 5.6052, 9.8994]))

In [4]:
#sparseCSR_collection.save_to_file("csr_msmarco.safetensors")
# load tensors
from safetensors import safe_open

tensors = {}
with safe_open("csr_msmarco.safetensors", framework="pt", device="cpu") as f:
   for key in f.keys():
        tensors[key] = f.get_tensor(key)
       
tensors = [tensors["vec_0"], tensors["vec_1"], tensors["vec_2"]]


In [5]:
csr_matrix_cpu = torch.sparse_csr_tensor(*tensors, (8841823, 30522))


  csr_matrix_cpu = torch.sparse_csr_tensor(*tensors, (8841823, 30522))


In [6]:
def text_to_dense_torch(text, dim):
    b = bow(text)
    return torch.sparse_coo_tensor([list(b.keys())], list(b.values()), (dim,), dtype=torch.float32).to_dense()


In [7]:
csr_matrix_gpu = csr_matrix_cpu.to("cuda:1")


In [8]:
PATH_TO_QUESTIONS = "../syn-question-col-analysis/datasets/msmarco/relevant_pairs.jsonl"


In [20]:

with open(PATH_TO_QUESTIONS) as f:
    questions = {line["question"] for line in map(json.loads,f)}

questions = [text_to_dense_torch(q, csr_matrix_cpu.shape[1]) for q in questions]
queries_gpu = torch.stack(questions[:20], -1).to("cuda:1")

#sort, idx = (csr_matrix_gpu @ queries_gpu).sort(descending=True, dim=0)
#result = idx[:10].cpu()
result = torch.topk(csr_matrix_gpu @ queries_gpu, k=10, dim=0).indices.T.cpu()

In [22]:
result

tensor([[8617271, 7607669, 5466810, 1379245, 1379240, 5466807, 1664523, 8617274,
          547444,  269428],
        [1929910, 3572695, 7839904, 1288938, 4842897, 2507917, 7839906, 3572702,
         5359212, 3937200],
        [2533260, 5012351, 8121380,  719552, 5291683, 7952865, 5291686,  719550,
         6528714, 7088568],
        [8049577, 1433123,  669004, 4563960, 1584254, 3410067, 2055598, 8049578,
           16845, 3552218],
        [8635981, 7267248,  527698, 3260688, 1837110, 1958102, 1958100, 7267243,
         8199361, 7367407],
        [8760867, 8760864, 3641634, 2787508, 4788864, 2157456, 8760868, 8760873,
         3620983, 3342992],
        [7778351, 2868845, 6436703, 7778348, 4164404, 4337532, 2197526, 2997653,
         7670593, 2160853],
        [7447941, 8433858, 6654655, 8433854, 7896211, 2747492, 2365660, 5638740,
         4704978, 2702419],
        [8160520, 3838645,  554521, 4511137,  398442, 4575877, 5218014, 8160527,
         1901881, 8478604],
        [ 536176, 3

In [13]:
del queries_gpu
del result

In [13]:
import time

time_list = []
for _ in range(20):
    
    with open(PATH_TO_QUESTIONS) as f:
        questions = {line["question"] for line in map(json.loads,f)}

    questions = [text_to_dense_torch(q, csr_matrix_cpu.shape[1]) for q in questions]
    queries_gpu = torch.stack(questions, -1).to("cuda:1")

    start_t = time.time()
    #sort, idx = (csr_matrix_gpu @ queries_gpu).sort(descending=True, dim=0)
    #result = idx[:10].cpu()
    result = torch.topk(csr_matrix_gpu @ queries_gpu, k=100000, dim=0).indices.cpu()
    time_list.append(time.time()-start_t)
    
    for q in questions:
        del q
    del questions
    del queries_gpu
    del result 
    #del sort
    #del idx

In [14]:
time_list

[0.4787881374359131,
 0.44898533821105957,
 0.4485352039337158,
 0.4484899044036865,
 0.44817113876342773,
 0.4484434127807617,
 0.4484429359436035,
 0.44898128509521484,
 0.44836854934692383,
 0.44843435287475586,
 0.4483344554901123,
 0.4485807418823242,
 0.44898390769958496,
 0.44857192039489746,
 0.44869017601013184,
 0.448455810546875,
 0.44840002059936523,
 0.4485960006713867,
 0.4486415386199951,
 0.4485054016113281]

In [46]:
csr_matrix_gpu = csr_matrix_cpu.to("cuda:1")

In [68]:
queries = [text_to_dense_torch("what is the meaning of life?", csr_matrix_cpu.shape[1]),
           text_to_dense_torch("what time it is?", csr_matrix_cpu.shape[1]),
            ]

queries = torch.stack(queries, -1)
queries_gpu = queries.to("cuda:1")
    


tensor([[4.3629, 3.6962],
        [0.5774, 0.0000],
        [0.5461, 1.5242],
        ...,
        [0.5942, 0.0000],
        [0.5746, 2.5969],
        [1.1826, 0.5707]], device='cuda:1')

In [79]:
import time
time_list = []
for _ in range(20):
    
    queries = [text_to_dense_torch("what is the meaning of life?", csr_matrix_cpu.shape[1]),
           text_to_dense_torch("what time it is?", csr_matrix_cpu.shape[1]),
           text_to_dense_torch("what time it is? 2", csr_matrix_cpu.shape[1]),
           text_to_dense_torch("wagfsdf asdg asg ?", csr_matrix_cpu.shape[1]),
           #text_to_dense_torch("wwef faes ewq ta afewf s?", csr_matrix_cpu.shape[1]),
            ]

    queries = torch.stack(queries, -1)
    queries_gpu = queries.to("cuda:1")

    start_t = time.time()
    csr_matrix_gpu @ queries_gpu
    time_list.append(time.time()-start_t)

In [80]:
sum(time_list)/len(time_list)*1000


0.08742809295654297

In [88]:

with open(PATH_TO_QUESTIONS) as f:
    questions = {line["question"] for line in map(json.loads,f)}


In [91]:
queries_gpu = queries.to("cuda:1")

OutOfMemoryError: CUDA out of memory. Tried to allocate 1.42 GiB (GPU 1; 7.79 GiB total capacity; 6.26 GiB already allocated; 1.19 GiB free; 6.49 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [94]:
sum(time_list)/len(time_list)*1000

0.1820206642150879

In [None]:

%%timeit -n 1000
retrieve_gpu_nvprims_nvfuser(query_gpu)

In [47]:
query_gpu = query.to("cuda:1")
retrieve_gpu4(csr_matrix_gpu, query_gpu)

tensor([4.3629, 0.5774, 0.5461,  ..., 0.5942, 0.5746, 1.1826], device='cuda:1')

In [48]:
%%timeit -l 1000
retrieve_gpu4(csr_matrix_gpu, query_gpu)

11.1 ms ± 78.5 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [44]:
%%timeit
csr_matrix @ query

11.1 ms ± 88 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [53]:
%%timeit
torch.topk(csr_matrix@query,k=10)

190 ms ± 4.17 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


torch.Size([30522])

tensor(crow_indices=tensor([     0,     40,     67,  ..., 465841, 465881,
                            465917]),
       col_indices=tensor([ 1012,  1025,  1996,  ...,  9949, 17502, 26110]),
       values=tensor([2., 1., 6.,  ..., 1., 1., 3.]), size=(10000, 30522),
       nnz=465917, layout=torch.sparse_csr)

In [15]:
sparseCSR_collection.transform(BM25Transform())

Converting to BM25 matrix: 100%|██████████| 10000/10000 [00:00<00:00, 27992.58it/s]


In [16]:
sparseCSR_collection.sparse_vecs

(tensor([     0,     40,     67,  ..., 465841, 465881, 465917],
        dtype=torch.int32),
 tensor([ 1012,  1025,  1996,  ...,  9949, 17502, 26110], dtype=torch.int32),
 tensor([0.0000, 2.1513, 0.0000,  ..., 6.4540, 8.6054, 9.9709]))

In [17]:
torch.sparse_csr_tensor(*sparseCSR_collection.sparse_vecs, sparseCSR_collection.shape)

tensor(crow_indices=tensor([     0,     40,     67,  ..., 465841, 465881,
                            465917]),
       col_indices=tensor([ 1012,  1025,  1996,  ...,  9949, 17502, 26110]),
       values=tensor([0.0000, 2.1513, 0.0000,  ..., 6.4540, 8.6054, 9.9709]),
       size=(10000, 30522), nnz=465917, layout=torch.sparse_csr)