# Indexing: load -> tokenize -> store

In this notebook we load our dataset, tokenize it, and store the result in a FAISS index, all in one go.

At the end, we save the index to a file.

In [1]:
from models_.building.pubmed_tokenizer import load as load_tokenizer
from rag.indexing import index
from storage.faiss_ import FaissStorage
from data.pubmed.from_json import FromJsonDataset
from data.pubmed.tokenized import TokenizedDataset

In [2]:
# Build the dataset from the PubMed JSON dump.
dataset = FromJsonDataset(
    json_file="../data/pubmed_500K.json",
)
# Show the first record to inspect the available fields.
dataset[0]

{'title': "[Biochemical studies on camomile components/III. In vitro studies about the antipeptic activity of (--)-alpha-bisabolol (author's transl)].",
 'content': '(--)-alpha-Bisabolol has a primary antipeptic action depending on dosage, which is not caused by an alteration of the pH-value. The proteolytic activity of pepsin is reduced by 50 percent through addition of bisabolol in the ratio of 1/0.5. The antipeptic action of bisabolol only occurs in case of direct contact. In case of a previous contact with the substrate, the inhibiting effect is lost.',
 'contents': "[Biochemical studies on camomile components/III. In vitro studies about the antipeptic activity of (--)-alpha-bisabolol (author's transl)]. (--)-alpha-Bisabolol has a primary antipeptic action depending on dosage, which is not caused by an alteration of the pH-value. The proteolytic activity of pepsin is reduced by 50 percent through addition of bisabolol in the ratio of 1/0.5. The antipeptic action of bisabolol only o

In [3]:
# Load the tokenizer through the project's pubmed_tokenizer helper.
tokenizer = load_tokenizer()
# Display its repr to see which tokenizer was loaded and how it is configured.
tokenizer

BertTokenizerFast(name_or_path='ncbi/MedCPT-Article-Encoder', vocab_size=30522, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [4]:
# Passed as max_length to TokenizedDataset and as dimension to FaissStorage below.
MAX_TOKEN_LENGTH = 800 # check train_dataset.ipynb for more details

In [5]:
# Wrap the raw dataset with the tokenizer, using MAX_TOKEN_LENGTH as max_length.
tokenized_dataset = TokenizedDataset(dataset=dataset, tokenizer=tokenizer, max_length=MAX_TOKEN_LENGTH)

In [6]:
# Inspect the first tokenized item (its "input_ids" entry is used for indexing below).
tokenized_dataset[0]

{'input_ids': tensor([[   12,    17,    17,    13,    17,  4595,    17,  7266,  3016,  1945,
          2258,    43,  3065,  2246,  9626, 13575,  4615,  6362,  1990,  9998,
            16,  2154,  1977,  2084,  4189,  2007,  1925, 10076,  1927,  1920,
          2087,    17,  3113,    18,  1920, 12712,  2455,  1927, 27979,  1977,
          3028,  2007,  2761,  3564,  2596,  2459,  1927,  7266,  3016,  1945,
          1922,  1920,  3400,  1927,    21,    19,    20,    18,    25,    18,
          1920,  2246,  9626, 13575,  4615,  1927,  7266,  3016,  1945,  2444,
          5110,  1922,  3087,  1927,  3083,  5217,    18,  1922,  3087,  1927,
            43,  2555,  5217,  1956,  1920,  4510,    16,  1920,  7918,  2495,
          1977,  7621,    18,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  

In [7]:
# Sanity check: decode the first item's token ids back to text, leaving out special tokens.
tokenizer.decode(
    tokenized_dataset[0]["input_ids"][0],
    skip_special_tokens=True,
)

'( - - ) - alpha - bisabolol has a primary antipeptic action depending on dosage, which is not caused by an alteration of the ph - value. the proteolytic activity of pepsin is reduced by 50 percent through addition of bisabolol in the ratio of 1 / 0. 5. the antipeptic action of bisabolol only occurs in case of direct contact. in case of a previous contact with the substrate, the inhibiting effect is lost.'

In [8]:
# The vector dimension is MAX_TOKEN_LENGTH because the vectors handed to the
# storage are the input_ids sequences themselves (see item_transform).
# NOTE(review): these are token ids cast to float32, not encoder embeddings --
# confirm this is the intended representation for retrieval.
# data_to_use_for_training presumably sets how many items are collected before
# the index is trained -- verify against FaissStorage.
storage = FaissStorage(data_to_use_for_training=10_000, dimension=MAX_TOKEN_LENGTH)

In [9]:
def item_transform(item):
    """
    Prepare one tokenized item for the storage system.

    Takes the first entry of ``item["input_ids"]`` and returns it as a
    float32 numpy array.
    """
    first_ids = item["input_ids"][0]
    # tensor -> numpy array, then cast to float32
    ids_array = first_ids.numpy()
    return ids_array.astype("float32")

In [10]:
# Feed the tokenized dataset to the storage, converting each item with item_transform.
# The return value of index() is rebound to `storage`.
# NOTE(review): re-running this cell presumably adds the same items to the
# storage again -- confirm before executing it twice on the same kernel.
storage = index(storage=storage, data=tokenized_dataset, data_transform=item_transform)

Indexed 0 items
Indexed 1000 items
Indexed 2000 items
Indexed 3000 items
Indexed 4000 items
Indexed 5000 items
Indexed 6000 items
Indexed 7000 items
Indexed 8000 items
Indexed 9000 items
Training the index with the training data...
(10000, 800)
Index trained successfully.
Indexed 10000 items
Indexed 11000 items
Indexed 12000 items
Indexed 13000 items
Indexed 14000 items
Indexed 15000 items
Indexed 16000 items
Indexed 17000 items
Indexed 18000 items
Indexed 19000 items
Indexed 20000 items
Indexed 21000 items
Indexed 22000 items
Indexed 23000 items
Indexed 24000 items
Indexed 25000 items
Indexed 26000 items
Indexed 27000 items
Indexed 28000 items
Indexed 29000 items
Indexed 30000 items
Indexed 31000 items
Indexed 32000 items
Indexed 33000 items
Indexed 34000 items
Indexed 35000 items
Indexed 36000 items
Indexed 37000 items
Indexed 38000 items
Indexed 39000 items
Indexed 40000 items
Indexed 41000 items
Indexed 42000 items
Indexed 43000 items
Indexed 44000 items
Indexed 45000 items
Indexed

In [13]:
from pathlib import Path

import faiss

# save the index to a file
index_path = Path("../outputs/store/pubmed_500K.index")
# Make sure the target directory exists before writing, so that a missing
# output folder cannot make the save fail after the long indexing run.
index_path.parent.mkdir(parents=True, exist_ok=True)
faiss.write_index(storage.index, str(index_path))