In [3]:
from models_.building.llama_tokenizer import load_tokenizer

from data.pubmed.from_json import FromJsonDataset
from data.pubmed.contents import ContentsDataset

from storage.faiss_ import FaissStorage

from rag.tokenization.llama import build_tokenizer_function
from rag.quering import build_querier

In [4]:
storage = FaissStorage(
    dimension=800,
)

storage.load("../../outputs/store/pubmed_500K.index")

In [5]:
tokenizer = load_tokenizer()
tokenizer_fn = build_tokenizer_function(tokenizer)

In [6]:
data_json = FromJsonDataset(json_file="../../data/pubmed_500K.json")
data = ContentsDataset(data_json)


In [7]:
data_json[40]

{'title': 'Bile acids. XLVII. 12alpha-Hydroxylation of precursors of allo bile acids by rabbit liver microsomes.',
 'content': 'Rabbit liver microsomal preparations fortified with 0.1 mM NADPH effectively promote hydroxylation of [3beta-3H]- or [24-14C]allochenodeoxycholic acid or [5alpha,6alpha-3H2]5alpha-cholestane-3alpha,7alpha-diol to their respective 12alpha-hydroxyl derivatives in yields of about 25 or 65% in 60 min. Minor amounts of other products are formed from the diol. The requirements for activity of rabbit liver microsomal 12alpha-hydroxylase resemble those of rat liver microsomes. Of a number of enzyme inhibitors studied only p-chloromercuribenzoate demonstrated a marked ability to inhibit the reaction with either tritiated substrate. There was no difference in the quantity of product produced from the tritiated acid or the 14C-labeled acid. No clear sex difference was found in activity of the enzyme, nor was an appreciable difference noted in activity of the enzyme betwe

In [8]:
data[436930]

'Ringed enamel hyperplasia. An apparently new disorder of enamel is briefly described.'

In [22]:
query = "some other query"

In [23]:
# search the index
k = 5
querier = build_querier(storage, data, tokenizer_fn)
D, I = querier(query, k)

Querying for key: [[ 15031.   1023.   3319. 128001. 128001. 128001. 128001. 128001. 128001.
  128001. 128001. 128001. 128001. 128001. 128001. 128001. 128001. 128001.
  128001. 128001. 128001. 128001. 128001. 128001. 128001. 128001. 128001.
  128001. 128001. 128001. 128001. 128001. 128001. 128001. 128001. 128001.
  128001. 128001. 128001. 128001. 128001. 128001. 128001. 128001. 128001.
  128001. 128001. 128001. 128001. 128001. 128001. 128001. 128001. 128001.
  128001. 128001. 128001. 128001. 128001. 128001. 128001. 128001. 128001.
  128001. 128001. 128001. 128001. 128001. 128001. 128001. 128001. 128001.
  128001. 128001. 128001. 128001. 128001. 128001. 128001. 128001. 128001.
  128001. 128001. 128001. 128001. 128001. 128001. 128001. 128001. 128001.
  128001. 128001. 128001. 128001. 128001. 128001. 128001. 128001. 128001.
  128001. 128001. 128001. 128001. 128001. 128001. 128001. 128001. 128001.
  128001. 128001. 128001. 128001. 128001. 128001. 128001. 128001. 128001.
  128001. 128001. 12

In [24]:
tokenized = tokenizer_fn(query)

In [25]:
storage.index.search(tokenized, k)

(array([[1.9379123e+11, 2.0661859e+11, 2.2285648e+11, 2.3180259e+11,
         2.3299378e+11]], dtype=float32),
 array([[436930, 173835, 267151, 375300, 314459]]))

In [26]:
I

['Ringed enamel hyperplasia. An apparently new disorder of enamel is briefly described.',
 'Sound-field hearing tests. Facilities for sound-field testing of hearing and calibration procedures are described.',
 'A 2 hand phantom. A technique for embedding hand and wrist bones in plastic resin is described.',
 'Primary splenic pregnancy. Case report. A patient with a primary splenic pregnancy is described.',
 'An expedient lid retracter. An expedient lid retractor may be fashioned from a paper clip.']

In [12]:
D

[204255199232.0,
 231838334976.0,
 247303438336.0,
 255331958784.0,
 257230209024.0]

In [13]:
# get the data from the index
for i in range(0, len(I)):
    print(I[i])

Ringed enamel hyperplasia. An apparently new disorder of enamel is briefly described.
Sound-field hearing tests. Facilities for sound-field testing of hearing and calibration procedures are described.
Primary splenic pregnancy. Case report. A patient with a primary splenic pregnancy is described.
A 2 hand phantom. A technique for embedding hand and wrist bones in plastic resin is described.
An expedient lid retracter. An expedient lid retractor may be fashioned from a paper clip.


In [14]:
import faiss
vectors = faiss.vector_to_array(storage.index.xb).reshape(storage.index.ntotal, storage.index.d)
print("Stored vectors:\n", vectors)

AttributeError: 'IndexFlatL2' object has no attribute 'xb'

In [26]:
index = storage.index
index.ntotal

490001

In [34]:
import torch

In [36]:
tokenizer.decode(
    torch.tensor(tokenized),
    skip_special_tokens=True
)

TypeError: argument 'ids': 'list' object cannot be interpreted as an integer

In [31]:
tokenized

array([[  8747.,  16629.,    323.,  13200.,   2191.,  12599.,  45202.,
         45202.,  45202.,  45202.,  45202.,  45202.,  45202.,  45202.,
         25013., 128001., 128001., 128001., 128001., 128001., 128001.,
        128001., 128001., 128001., 128001., 128001., 128001., 128001.,
        128001., 128001., 128001., 128001., 128001., 128001., 128001.,
        128001., 128001., 128001., 128001., 128001., 128001., 128001.,
        128001., 128001., 128001., 128001., 128001., 128001., 128001.,
        128001., 128001., 128001., 128001., 128001., 128001., 128001.,
        128001., 128001., 128001., 128001., 128001., 128001., 128001.,
        128001., 128001., 128001., 128001., 128001., 128001., 128001.,
        128001., 128001., 128001., 128001., 128001., 128001., 128001.,
        128001., 128001., 128001., 128001., 128001., 128001., 128001.,
        128001., 128001., 128001., 128001., 128001., 128001., 128001.,
        128001., 128001., 128001., 128001., 128001., 128001., 128001.,
      