In [1]:
from transformers import AutoTokenizer
from collections import Counter, defaultdict
import torch
from tqdm import tqdm
import json
import psutil

from utils import get_bytes_in_dtype, get_coo_space_GB, get_csr_or_csc_space_GB

  from .autonotebook import tqdm as notebook_tqdm
2023-09-06 16:37:29.489948: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-09-06 16:37:30.049759: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-09-06 16:37:30.049801: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


In [2]:
from IPython.core.magic import register_cell_magic

@register_cell_magic('ram_profiler')
def ram_profiler(line, cell):
    """Cell magic that runs a cell and prints the change in used RAM.

    Put ``%%ram_profiler`` on the first line of a cell to use it.

    Args:
        line: Text that follows the magic name on the first line (unused).
        cell: Source code of the cell body; it is executed with ``exec`` in
            the global namespace of the module that defines this function.

    Note:
        Both samples come from ``psutil.virtual_memory().used``, which is a
        system-wide figure, so other processes and memory released during the
        cell (e.g. an object being rebound) also influence the printed value.
    """
    bytes_per_gb = 1024 ** 3

    used_before_gb = psutil.virtual_memory().used / bytes_per_gb
    exec(cell, globals())
    used_after_gb = psutil.virtual_memory().used / bytes_per_gb

    ram_diff_gb = used_after_gb - used_before_gb
    print(f"Ram Profiler | Ram diff: {ram_diff_gb:.4f} GB")

In [3]:
# Load the pretrained "bert-base-uncased" tokenizer. Its vocabulary size is
# printed here because it is used below as the number of rows of the
# term-document matrix.
hf_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
print(hf_tokenizer.vocab_size)

30522


In [4]:
def get_bow(text, tokenizer):
    """Build a bag-of-words (token id -> occurrence count) for ``text``.

    Args:
        text: String to tokenize.
        tokenizer: Callable invoked as ``tokenizer(text, add_special_tokens=False)``
            whose result exposes an ``input_ids`` iterable of token ids.

    Returns:
        A ``defaultdict(float)`` mapping each token id found in ``text`` to the
        number of times it occurs, stored as a float.
    """
    token_ids = tokenizer(text, add_special_tokens=False).input_ids

    counts = defaultdict(float)
    for token_id in token_ids:
        counts[token_id] = counts[token_id] + 1.0
    return counts

# Path (relative to the notebook's working directory) of the JSON-lines corpus
# file; each line is parsed by get_title_abstract, which reads its "title" and
# "abstract" keys.
PATH_TO_MSMARCO = "../syn-question-col-analysis/datasets/msmarco/corpus_L8841823.jsonl"

def get_title_abstract(string):
    """Parse one JSON document and return its title and abstract.

    Args:
        string: JSON text of an object that has ``"title"`` and ``"abstract"`` keys.

    Returns:
        A ``(title, abstract)`` tuple.

    Raises:
        json.JSONDecodeError: If ``string`` is not valid JSON.
        KeyError: If either key is missing from the object.
    """
    record = json.loads(string)
    title = record["title"]
    abstract = record["abstract"]
    return title, abstract


#batch_i = 0
#batched_collection = [[]]

def get_matrix_estimations(collection_iterator, 
                      collection_maxsize, 
                      hf_tokenizer, 
                      dtype=torch.float32, 
                      max_files_for_estimation=1000):
    """Estimate the shape and density of the term-document matrix.

    Consumes up to ``max_files_for_estimation`` documents from
    ``collection_iterator`` and converts each one to a bag-of-words.

    Args:
        collection_iterator: Iterator yielding ``(title, abstract)`` pairs.
            It is advanced in place, so the caller continues after the
            sampled documents.
        collection_maxsize: Number of columns (documents) of the final matrix.
        hf_tokenizer: Tokenizer passed to ``get_bow``; its ``vocab_size`` gives
            the number of rows.
        dtype: Unused here; kept so the signature mirrors the caller's.
        max_files_for_estimation: Maximum number of documents to sample.

    Returns:
        ``(shape, density, list_bow)`` where ``shape`` is
        ``(vocab_size, collection_maxsize)``, ``density`` is the estimated
        fraction of non-zero cells and ``list_bow`` holds the bag-of-words of
        every sampled document, in reading order.

    Note:
        The numerator sums term frequencies, so a token repeated inside a
        document is counted once per occurrence. The result is therefore an
        upper bound on the true density of the sample; callers that size
        buffers from it get some headroom.
    """
    exhausted = object()  # unique sentinel: the iterator ran out of documents

    list_bow = []
    for _ in range(max_files_for_estimation):
        # A bare next() would leak StopIteration to the caller when the
        # collection is shorter than the requested sample.
        document = next(collection_iterator, exhausted)
        if document is exhausted:
            break
        title, abstract = document
        list_bow.append(get_bow(f"{title} {abstract}", hf_tokenizer))

    shape = (hf_tokenizer.vocab_size, collection_maxsize)
    dense_size = hf_tokenizer.vocab_size * len(list_bow)
    if dense_size == 0:
        # Nothing was sampled (empty collection or a zero-sized sample).
        density = 0.0
    else:
        density = sum(sum(bow.values()) for bow in list_bow) / dense_size

    return shape, density, list_bow


In [26]:
def _grow_coo_buffers(indices, values, used, needed):
    """Return larger index/value buffers that keep the first ``used`` entries."""
    old_capacity = values.shape[0]
    # Grow by at least 25% so repeated overflows do not reallocate every time.
    capacity = max(needed, old_capacity + old_capacity // 4)
    grown_indices = torch.zeros((2, capacity), dtype=indices.dtype)
    grown_values = torch.zeros((capacity,), dtype=values.dtype)
    grown_indices[:, :used] = indices[:, :used]
    grown_values[:used] = values[:used]
    return grown_indices, grown_values


def _append_bow(indices, values, used, bow, column):
    """Write one bag-of-words as column ``column``; return buffers and new fill level."""
    token_ids = sorted(bow.keys())
    end = used + len(token_ids)
    if end > values.shape[0]:
        indices, values = _grow_coo_buffers(indices, values, used, end)
    if token_ids:
        indices[0, used:end] = torch.tensor(token_ids, dtype=torch.int64)
        indices[1, used:end] = column
        values[used:end] = torch.tensor([bow[t] for t in token_ids], dtype=values.dtype)
    return indices, values, end


def create_coo_collection(collection_path, 
                          collection_maxsize, 
                          hf_tokenizer, 
                          dtype=torch.float32, 
                          max_files_for_estimation=1000):
    """Build a sparse COO term-document matrix from a JSON-lines collection.

    Rows are token ids, columns are documents in file order, and each stored
    value is the number of times the token occurs in the document's
    ``"<title> <abstract>"`` text.

    Args:
        collection_path: Path of the JSON-lines file; every line must be
            parseable by ``get_title_abstract``.
        collection_maxsize: Number of columns of the matrix; at most this many
            documents are read.
        hf_tokenizer: Tokenizer handed to ``get_bow``; ``vocab_size`` fixes
            the number of rows.
        dtype: Torch dtype of the stored values.
        max_files_for_estimation: Number of leading documents used to estimate
            the density (and thus the initial buffer size). Values larger than
            ``collection_maxsize`` are clamped.

    Returns:
        A ``torch.sparse_coo_tensor`` of shape ``(vocab_size, collection_maxsize)``.
    """
    from itertools import islice  # local import keeps this cell self-contained

    # Sampling more documents than there are columns would create column
    # indices outside the declared shape.
    max_files_for_estimation = min(max_files_for_estimation, collection_maxsize)

    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(collection_path, encoding="utf-8") as f:
        collection_iterator = map(get_title_abstract, f)

        shape, density, list_bow = get_matrix_estimations(collection_iterator, collection_maxsize, hf_tokenizer, dtype, max_files_for_estimation)

        # presumably the estimated COO storage size in GB -- helper lives in utils
        mem_needed = get_coo_space_GB(shape=shape, density=density, dtype=dtype)
        elements_expected = int(shape[0] * shape[1] * density)
        print(f"We estimate that the collection matrix will have density of {density:.4f}, which requires {mem_needed} GB. Plus 0.5GB for overheads.")

        # make a verification if it fits the GPU mem and CPU! plus add strategies if doesnt
        print(f"Expected number of elements {elements_expected} for a shape {shape}")

        # Pre-allocate from the estimate; _append_bow enlarges the buffers if
        # the real number of non-zeros turns out to be bigger.
        indices = torch.zeros((2, elements_expected), dtype=torch.int64)
        values = torch.zeros((elements_expected,), dtype=dtype)

        element_index = 0
        colum_index = 0

        # Documents already consumed for the estimation become the first columns.
        for bow in list_bow:
            indices, values, element_index = _append_bow(indices, values, element_index, bow, colum_index)
            colum_index += 1

        remaining_docs = max(0, collection_maxsize - len(list_bow))
        del list_bow  # release the sampled bags before the long loop

        # islice stops after the last needed document without reading one more.
        for title, abstract in tqdm(islice(collection_iterator, remaining_docs), total=remaining_docs):
            bow = get_bow(f"{title} {abstract}", hf_tokenizer)
            indices, values, element_index = _append_bow(indices, values, element_index, bow, colum_index)
            colum_index += 1

    # Drop the unused tail of the buffers.
    indices = indices[:, :element_index]
    values = values[:element_index]

    return torch.sparse_coo_tensor(indices, values, shape)
            
        

In [27]:
%%ram_profiler

# Build the sparse term-document matrix with 1,000,000 document columns.
space_coo = create_coo_collection(PATH_TO_MSMARCO, 1_000_000, hf_tokenizer) #1000 its/s

We estimate that the collection matrix will have density of 0.0023, which requires 1.107248 GB. Plus 0.5GB for overheads.
Expected number of elements 69203000 for a shape (30522, 1000000)


100%|██████████| 999000/999000 [04:17<00:00, 3884.01it/s]


Ram Profiler | Ram diff: 0.0832 GB


In [13]:
# Display the sparse tensor's repr (indices, values, size and nnz).
space_coo

tensor(indices=tensor([[  1012,   1025,   1996,  ...,  10651,  10995,  15728],
                       [     0,      0,      0,  ..., 999999, 999999, 999999]]),
       values=tensor([2., 1., 6.,  ..., 3., 1., 2.]),
       size=(30522, 1000000), nnz=47904978, layout=torch.sparse_coo)

In [13]:
#sorted(bow.keys()) 47904904

[1996, 1997, 2003, 2015, 2023, 2033, 2507, 2546, 4933, 14383, 24646]

In [9]:
import gc  # not imported by the notebook's import cell; needed on a fresh kernel

# Force a garbage-collection pass; the cell output is the value returned by
# gc.collect().
gc.collect()

48

In [11]:
# NOTE(review): no visible cell defines a notebook-level `indices` (it only
# exists as a local inside create_coo_collection), so this cell appears to rely
# on leftover kernel state and would raise NameError after Restart & Run All
# unless it is defined in a cell not shown here.
indices.shape

torch.Size([2, 46247])

In [12]:
# Sanity check: a nested list of two one-element rows yields a (2, 1) tensor.
torch.tensor( [[1],[2]]).shape

torch.Size([2, 1])