In [1]:
import spare
from spare.metadata import MetaDataDocID
from transformers import AutoTokenizer


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Toy corpus: every document is a dict holding a unique "id" and its raw text
# under "contents".
docs = [
    {
        "id": "my first document",
        "contents": "This is my first document in my document collection",
    },
    {
        "id": "my second document",
        "contents": "This is another example of a shorter document",
    },
]

# Lazy stream of (id, text) pairs. This is a one-shot iterator: once it has
# been consumed it must be rebuilt before being used again.
collection_mapped = ((entry["id"], entry["contents"]) for entry in docs)

# Bag-of-words encoder: text -> token ids from the BERT tokenizer (without the
# special tokens), over a vector space as wide as the tokenizer vocabulary.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
bow = spare.BagOfWords(
    lambda x: tokenizer(x, add_special_tokens=False).input_ids,
    tokenizer.vocab_size,
)

In [3]:
# Build the sparse collection straight from the (id, text) stream; `bow` turns
# each text into a sparse vector.
# NOTE(review): the recorded output shows "Creating sparse matrix: 0it" and the
# later retrieval returns all-zero scores -- confirm that the one-shot
# `collection_mapped` iterator is not exhausted by the size-estimation pass.
collection = spare.SparseCollection.from_text_iterator(
    collection_mapped,
    collection_maxsize=len(docs),
    text_to_vec=bow,
    backend="torch",
    dtype=spare.float32,
)

  return torch._C._cuda_getDeviceCount() > 0
Size estimation: 100%|██████████| 2/2 [00:00<00:00, 435.59it/s]


We estimate that the collection matrix will have density of 0.0003, which requires 1.34e-07 GB. Plus 0.5GB for overheads.
Expected number of elements 15 for a shape (2, 30522)


Creating sparse matrix: 0it [00:00, ?it/s]


In [4]:
# Apply BM25 weighting (k1=1.2, b=0.75) to the collection. The return value is
# not captured, so this presumably updates `collection` in place -- TODO confirm.
collection.transform(
    spare.BM25Transform(b=0.75, k1=1.2)
)

Converting to BM25 weighted collection: 100%|██████████| 2/2 [00:00<00:00, 1158.97it/s]


In [5]:
# Retriever over the weighted collection, using the "iterative" algorithm with
# the "performance" objective.
sparse_retriver = spare.SparseRetriever(
    collection,
    objective="performance",
    algorithm="iterative",
)

Collection is already in BM25 weighting schema, using its parameters
Torch convert tensors from CSR to CSC

Runner configuration:

Hardware
  accelerators: ['cpu']
  memory per device: 119.76
Collection
  shape: (2, 30522)
  values dtype: spare.float32
  indices dtype: spare.int32
  memory required: 0.00
  memory required (safe margin): 0.00
Plan
  running mode: Single forward
  algorithm: iterative product
  objective: performance



  csc_tensor = torch.sparse_csr_tensor(*sparse_collection.sparse_vecs, sparse_collection.shape).to_sparse_csc()


In [6]:
# Sparse query vector: token id -> weight. 7820 and 6254 are the ids the
# tokenizer produces for "shorter document".
question = dict.fromkeys((7820, 6254), 1.0)

# Retrieve up to 10 documents for the single query, together with their scores.
sparse_retriver.retrieve([question], return_scores=True, top_k=10)

100%|██████████| 1/1 [00:00<00:00, 387.61it/s]

Retrieval time: 0.008450508117675781 QPS 118.33607944927209
Mem transference time: 0.0002300739288330078
Time to convert docs ids 0.00019478797912597656





RetrievalOutput(ids=array([['my first document', 'my second document']], dtype='<U18'), scores=tensor([[0, 0]], dtype=torch.uint8), timmings=(118.33607944927209, 0.0002300739288330078))

In [7]:
### From vector
# Rebuild the one-shot (id, text) stream, then pre-compute every document's
# bag-of-words vector as a plain dict so the collection can be built from
# vectors instead of raw text.
collection_mapped = ((entry["id"], entry["contents"]) for entry in docs)
bow_docs = [(doc_id, dict(bow(text))) for doc_id, text in collection_mapped]


In [8]:
# Build the collection from the pre-computed (id, vector) pairs; the vector
# width comes from the bag-of-words encoder.
# NOTE(review): no BM25 transform is applied to this rebuilt collection in the
# visible cells -- confirm whether the retriever applies the weighting itself.
collection = spare.SparseCollection.from_vec_iterator(
    iter(bow_docs),
    vec_dim=bow.dim,
    collection_maxsize=len(docs),
    backend="torch",
    dtype=spare.float32,
)

Size estimation: 100%|██████████| 2/2 [00:00<00:00, 13168.93it/s]


We estimate that the collection matrix will have density of 0.0003, which requires 1.34e-07 GB. Plus 0.5GB for overheads.
Expected number of elements 15 for a shape (2, 30522)


Creating sparse matrix: 0it [00:00, ?it/s]


Size estimation: 100%|██████████| 2/2 [00:00<00:00, 10922.67it/s]


We estimate that the collection matrix will have density of 0.0003, which requires 1.34e-07 GB. Plus 0.5GB for overheads.
Expected number of elements 15 for a shape (2, 30522)


Creating sparse matrix: 0it [00:00, ?it/s]


In [34]:
# Inspect the bag-of-words vector dimension (the value passed as `vec_dim`
# when building the collection from vectors).
bow.dim

30522

In [5]:
# Look up the token ids for the query text. This call uses the tokenizer's
# defaults, so the output also contains the special-token ids (101 and 102);
# only the inner ids (7820, 6254) are used to build the query vector.
tokenizer("shorter document")

{'input_ids': [101, 7820, 6254, 102], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}

In [7]:
# Recreate the retriever on top of the current `collection`, using the
# "iterative" algorithm with the "performance" objective.
sparse_retriver = spare.SparseRetriever(
    collection,
    objective="performance",
    algorithm="iterative",
)

Collection is already in BM25 weighting schema, using its parameters
Torch convert tensors from CSR to CSC

Runner configuration:

Hardware
  accelerators: ['cpu']
  memory per device: 119.76
Collection
  shape: (2, 30522)
  values dtype: spare.float32
  indices dtype: spare.int32
  memory required: 0.00
  memory required (safe margin): 0.00
Plan
  running mode: Single forward
  algorithm: iterative product
  objective: performance



In [8]:
# Sparse query vector: token id -> weight. 7820 and 6254 are the ids the
# tokenizer produces for "shorter document".
question = dict.fromkeys((7820, 6254), 1.0)

# Retrieve up to 10 documents for the single query, together with their scores.
sparse_retriver.retrieve([question], return_scores=True, top_k=10)

100%|██████████| 1/1 [00:00<00:00, 77.34it/s]

Retrieval time: 0.018156051635742188 QPS 55.07805441748083
Mem transference time: 0.0008349418640136719
Time to convert docs ids 0.00018835067749023438





RetrievalOutput(ids=array([['my first document', 'my second document']], dtype='<U18'), scores=tensor([[0, 0]], dtype=torch.uint8), timmings=(55.07805441748083, 0.0008349418640136719))