In [1]:
from transformers import AutoTokenizer
from collections import Counter, defaultdict
import torch
from tqdm import tqdm
import json
import psutil
import gc

  from .autonotebook import tqdm as notebook_tqdm
2023-09-06 16:18:59.130375: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-09-06 16:18:59.706228: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-09-06 16:18:59.706271: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


In [2]:
from IPython.core.magic import (Magics, magics_class, line_magic,
                                cell_magic, line_cell_magic)

from IPython.core.magic import register_cell_magic

@register_cell_magic('ram_profiler')
def ram_profiler(line, cell):
    ram_before = psutil.virtual_memory().used / 1024 / 1024 / 1024
    exec(cell,  globals())
    ram_now = psutil.virtual_memory().used / 1024 / 1024 / 1024
    print(f"Ram Profiler | Ram diff: {ram_now-ram_before:.4f} GB")
    #return line, cell



In [3]:
hf_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
print(hf_tokenizer.vocab_size)

30522


In [4]:



def get_ram():
    try:
        ram_info = psutil.virtual_memory()
        print(f"Total: {ram_info.total / 1024 / 1024 / 1024:.2f} GB")
        print(f"Available: {ram_info.available / 1024 / 1024 / 1024:.2f} GB")
        print(f"Used: {ram_info.used / 1024 / 1024 / 1024:.2f} GB")
        print(f"Percentage usage: {ram_info.percent}%")
    except FileNotFoundError:
        print("Ram info not available on this system")

def get_bow(text, tokenizer):
    bow = defaultdict(float)
    for t in tokenizer(text, add_special_tokens=False).input_ids:
        bow[t]+=1.0
    return bow

def get_bow_tensor(text, tokenizer, dense=False, dtype=torch.float):
    bow = get_bow(text, tokenizer)
    #bow = Counter(tokenizer(text, add_special_tokens=False).input_ids)
    indices = list(bow.keys())
    indices2d = [indices,[0]*len(indices)]
    values = list(bow.values())
    if dense:
        return torch.sparse_coo_tensor(indices2d, values, (tokenizer.vocab_size, 1), dtype=dtype).to_dense()
    else:
        
        return torch.sparse_coo_tensor(indices2d, values, (tokenizer.vocab_size, 1), dtype=dtype)


In [5]:
# document collection
PATH_TO_MSMARCO = "../syn-question-col-analysis/datasets/msmarco/corpus_L8841823.jsonl"

def get_title_abstract(string):
    data = json.loads(string)
    return data["title"], data["abstract"]

SIZE = 1_000_000
#batch_i = 0
#batched_collection = [[]]

def load_collection(SIZE):
    collections = []
    with open(PATH_TO_MSMARCO) as f:
        
        for i,(title, abstract) in enumerate(tqdm(map(get_title_abstract,f), total=SIZE)):
            collections.append(get_bow_tensor(f"{title} {abstract}", hf_tokenizer))
            if i>SIZE:
                break
    
    return collections

def load_collection_to_tensor(path_to_collection, size=8841823, dtype=torch.float32):

    batch_size = 250_000
    batch_i = 0
    _temp_collections = []
    sparse_collections = None
    with open(path_to_collection) as f:
        
        for i,(title, abstract) in enumerate(tqdm(map(get_title_abstract,f), total=size)):
            
            if i>=size:
                break
            
            sparse_vec = get_bow_tensor(f"{title} {abstract}", hf_tokenizer, dtype=dtype)
            
            if sparse_collections is None:
                sparse_collections = sparse_vec
            else:
                if batch_i<batch_size:
                    batch_i +=1
                    _temp_collections.append(sparse_vec)
                else:
                    batch_i = 0
                    _temp_collections.append(sparse_vec)
                    sparse_collections = torch.concat([sparse_collections]+_temp_collections, dim=-1)
                    _temp_collections = []
                    gc.collect()
            
            #collections.append(get_bow_tensor(f"{title} {abstract}", hf_tokenizer))

        sparse_collections = torch.concat([sparse_collections]+_temp_collections, dim=-1)
        _temp_collections = []
        gc.collect()
    
    return sparse_collections

def load_collection_to_coo(path_to_collection, hf_tokenizer, size=8841823, dtype=torch.float32):
    
    def get_max_values_for_coo_format(mem_available, num_bytes_per_value = 4):
        overhead = 1 * 1e6
        return int((mem_available-overhead)/(16+num_bytes_per_value))
    
    indice_row = []
    indice_col = []
    values = []
    
    with open(path_to_collection) as f:
        
        for i,(title, abstract) in enumerate(tqdm(map(get_title_abstract,f), total=size)):
            pass

In [6]:
%%ram_profiler

sparse_collection = load_collection_to_tensor(PATH_TO_MSMARCO, size=1_000_000,dtype=torch.float16)

 51%|█████     | 508916/1000000 [02:39<02:11, 3723.30it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (538 > 512). Running this sequence through the model will result in indexing errors
1000001it [05:11, 3206.68it/s]                            


Ram Profiler | Ram diff: 1.3721 GB


In [7]:
sparse_collection

tensor(indices=tensor([[   1996,    3739,    1997,  ...,    2011,    2039,
                           2322],
                       [      0,       0,       0,  ..., 1000000, 1000000,
                        1000000]]),
       values=tensor([6., 1., 5.,  ..., 1., 1., 1.]),
       size=(30522, 1000001), nnz=47905032, dtype=torch.float16,
       layout=torch.sparse_coo)

In [12]:
95959559/(30522*1999994)

0.0015719783578982478

In [7]:
%%ram_profiler

sparse_concat_collection = torch.concat(collections, dim=-1)

Ram Profiler | Ram diff: 0.8775 GB


In [8]:
%%ram_profiler
sparse_collection = sparse_concat_collection.to_sparse_csr()

Ram Profiler | Ram diff: 0.9530 GB




In [10]:
%%ram_profiler
del sparse_collection

Ram Profiler | Ram diff: -0.8960 GB


In [9]:
%%ram_profiler
del sparse_concat_collection

Ram Profiler | Ram diff: -0.8960 GB


In [12]:
%%ram_profiler
collections = []



Ram Profiler | Ram diff: -0.0021 GB


In [12]:
%%ram_profiler

del sparse_collection

In [83]:
bow_tensor("nothing", hf_tokenizer)

tensor(indices=tensor([[2498],
                       [   0]]),
       values=tensor([1.]),
       size=(30522, 1), nnz=1, layout=torch.sparse_coo)

In [5]:
a = get_bow_tensor("this is nothing", hf_tokenizer)
b = get_bow_tensor("this is nothing LOL", hf_tokenizer)

In [None]:
def bow_Counter_class(l_tokens):
    bow_c = Counter(l_tokens)
    return list(bow_c.keys()), list(bow_c.values())

In [49]:
indices = list(bow.keys())
indices2d = torch.tensor([indices,[0]*len(indices)])

values = torch.tensor(list(bow.values()), dtype=torch.float)

target = torch.zeros([hf_tokenizer.vocab_size,1], dtype=torch.float)  
target.index_put_(tuple([k for k in indices2d]), values)

tensor([[0.],
        [0.],
        [0.],
        ...,
        [0.],
        [0.],
        [0.]])

In [41]:

torch.sparse_coo_tensor(indices2d, bow.values(), (60_000,1), dtype=torch.float16)

TypeError: must be real number, not dict_values