In [1]:
from datasets import load_dataset

# labelled PubMedQA subset, training split
pubmed = load_dataset('pubmed_qa', 'pqa_labeled', split='train')

# character budget that chunker() compares each joined passage against
limit = 384

def chunker(contexts: list, max_chars=None) -> list:
    """Split a record's context paragraphs into overlapping passages.

    The paragraphs are joined with spaces and naively split on '.'.
    Fragments are accumulated until a passage holds at least three of them
    and its joined length exceeds ``max_chars``; the passage is then emitted
    and its last two fragments are carried over as overlap into the next one.

    Args:
        contexts: list of paragraph strings.
        max_chars: soft character budget per passage (int). When omitted,
            the notebook-level ``limit`` is used.

    Returns:
        List of stripped passage strings. Empty if ``contexts`` contains no
        text. A trailing passage is only emitted when it holds text that was
        not already part of the previously emitted passage.
    """
    if max_chars is None:
        # fall back to the notebook-wide setting
        max_chars = limit

    chunks = []
    chunk = []
    # True once `chunk` holds non-blank text that has not been emitted yet;
    # the carried-over overlap alone does not count.
    has_new_text = False

    for sentence in ' '.join(contexts).split('.'):
        chunk.append(sentence)
        if sentence.strip():
            has_new_text = True
        over_budget = len(chunk) >= 3 and len('.'.join(chunk)) > max_chars
        # only flush when there is something new to emit, otherwise a blank
        # trailing fragment would re-emit the overlap as a duplicate passage
        if over_budget and has_new_text:
            chunks.append('.'.join(chunk).strip() + '.')
            # add some overlap between passages
            chunk = chunk[-2:]
            has_new_text = False

    # emit the remainder only if it contains text beyond the overlap
    if has_new_text:
        chunks.append('.'.join(chunk).strip())
    return chunks

# flatten every record into passage dicts keyed by "<pubid>-<passage number>"
data = []
for record in pubmed:
    chunks = chunker(record['context']['contexts'])
    data.extend(
        {'id': f"{record['pubid']}-{i}", 'context': context}
        for i, context in enumerate(chunks)
    )

from sentence_transformers import SentenceTransformer
import torch

# use the GPU when torch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
    # CPU-only run: warn and point Colab users at the runtime setting
    print(
        "==========\n"
        "WARNING: You are not running on GPU so this may be slow.\n"
        "If on Google Colab, go to top menu > Runtime > Change "
        "runtime type > Hardware accelerator > 'GPU' and rerun "
        "the notebook.\n=========="
    )

# dense encoder, loaded onto the device selected above
dense_model = SentenceTransformer('msmarco-bert-base-dot-v5', device=device)

# embedding width reported by the model, plus one encoded passage as a smoke test
dim = dense_model.get_sentence_embedding_dimension()
emb = dense_model.encode(data[0]['context'])

from splade.models.transformer_rep import Splade

# checkpoint identifier for the SPLADE model; the same id is reused further
# down to load the matching tokenizer
sparse_model_id = 'naver/splade-cocondenser-ensembledistil'

# build the sparse encoder with agg='max'
sparse_model = Splade(sparse_model_id, agg='max')
sparse_model.to(device)  # place the model on the device chosen above (GPU when available)
# last expression of the cell, so its return value is what the cell displays
# NOTE(review): presumably switches the model to inference mode — confirm against Splade
sparse_model.eval()

  from .autonotebook import tqdm as notebook_tqdm


If on Google Colab, go to top menu > Runtime > Change runtime type > Hardware accelerator > 'GPU' and rerun the notebook.


In [4]:
from transformers import AutoTokenizer

# tokenizer that matches the sparse model checkpoint
tokenizer = AutoTokenizer.from_pretrained(sparse_model_id)

# tokenize the first passage into PyTorch tensors
sample = data[0]['context']
splade_inputs = tokenizer(sample, return_tensors='pt')

# creates sparse vectors (no gradients needed for inference)
with torch.no_grad():
    doc_output = sparse_model(d_kwargs=splade_inputs.to(device))
    # document representation with singleton dimensions removed
    sparse_emb = doc_output['d_rep'].squeeze()



In [5]:
# inspect the shape of the squeezed document representation
sparse_emb.shape

torch.Size([30522])

In [7]:
# Pinecone expects a dictionary-style format for sparse vectors:
# {"indices": [...], "values": [...]}

# nonzero() on the 1-D embedding gives one row per non-zero entry; squeeze
# only the trailing axis so a single non-zero entry still produces a
# one-element list instead of a bare scalar (which would break len() below)
nz_positions = sparse_emb.nonzero().squeeze(-1)
non_zero_indices = nz_positions.cpu().tolist()
print(len(non_zero_indices))
# index with the tensor of positions so values line up with non_zero_indices
values = sparse_emb[nz_positions].cpu().tolist()
sparse = {"indices": non_zero_indices, "values": values}

174


In [8]:
# invert the tokenizer vocabulary (token -> id) into an id -> token lookup
idx2token = dict(
    (idx, token) for token, idx in tokenizer.get_vocab().items()
)

In [None]:
# Human-readable view of the sparse vector: token string -> weight rounded
# to two decimals. `non_zero_indices` and `values` are the parallel lists
# built from `sparse_emb`; `idx2token` maps a vocabulary id back to its token.
sparse_dict_tokens = {
    idx2token[idx]: round(weight, 2)
    for idx, weight in zip(non_zero_indices, values)
}

In [12]:
# spot-check one entry: (vocabulary id, weight) at position 100 of the parallel lists
non_zero_indices[100], values[100]

(9607, 1.0227527618408203)

In [13]:
# same entry, with the id mapped to its token string and the weight rounded to 2 decimals
{idx2token[non_zero_indices[100]]: round(values[100], 2)}

{'veins': 1.02}

In [9]:
# show the non-zero weights (same order as non_zero_indices)
# NOTE(review): this echoes every weight; a slice such as values[:10] would keep the output short
values

[0.6246443390846252,
 0.45678940415382385,
 0.3088974952697754,
 0.15812599658966064,
 0.07194814831018448,
 0.6496520042419434,
 0.9411975145339966,
 0.3161492645740509,
 0.759763777256012,
 1.9501705169677734,
 0.3237403333187103,
 0.3950244188308716,
 0.23536957800388336,
 0.2457110732793808,
 0.42533791065216064,
 1.9602453708648682,
 0.6289498805999756,
 0.42441168427467346,
 0.018046118319034576,
 0.19568762183189392,
 0.6684799790382385,
 0.8162305355072021,
 1.0954256057739258,
 0.1979701966047287,
 0.22766441106796265,
 0.013306856155395508,
 0.904829740524292,
 0.6024833917617798,
 0.6100096106529236,
 0.03979775682091713,
 0.12952309846878052,
 0.023475682362914085,
 0.3975697159767151,
 1.2144676446914673,
 0.7056940793991089,
 1.5106241703033447,
 0.5332852602005005,
 0.49861764907836914,
 0.4658374786376953,
 0.07503432780504227,
 1.6885474920272827,
 0.2525480091571808,
 0.03533470630645752,
 0.3232708275318146,
 1.3433905839920044,
 0.3039570748806,
 0.01360689383000135