In [1]:
import os
# Expose only device "0" to this process; this runs before `torch` is imported below.
os.environ.update({"CUDA_VISIBLE_DEVICES": "0"})

In [2]:
import torch
# Report the index of the currently selected CUDA device, then the name of device 0.
active_gpu_index = torch.cuda.current_device()
print(active_gpu_index)
gpu_zero_name = torch.cuda.get_device_name(0)
print(gpu_zero_name)

0
NVIDIA RTX A5000


In [3]:
from retriv import DenseRetriever
import pandas as pd
from src.utils.project_dirs import get_gen_dir_dataset, processed_data_dir, get_bm25_indexes_dir,get_minilm_index_dir


In [20]:
# Read the processed beauty2014 metadata (JSON records, one per line).
meta_filepath = processed_data_dir(dataset='beauty2014')
corpus_file = meta_filepath / 'meta_corpus.json'
meta_corpus = pd.read_json(corpus_file, orient='records', lines=True)
# Columns are renamed positionally.
# NOTE(review): assumes the file has exactly two fields, asin then title — confirm.
meta_corpus.columns = ['asin', 'Title']

# One row per item: its asin plus the serialized text "Title: <title>".
asins_compact = meta_corpus[['asin']].assign(nlang="Title: " + meta_corpus['Title'])

# asin -> serialized natural language string
nlang_by_asin = asins_compact.set_index('asin')['nlang']
asin_dict = nlang_by_asin.to_dict()

In [5]:
# Show the first 10 (asin, serialized text) pairs of asin_dict.
first_ten_items = list(asin_dict.items())[:10]
display(first_ten_items)


[('7806397051',
  'Title: WAWO 15 Color Professionl Makeup Eyeshadow Camouflage Facial Concealer Neutral Palette'),
 ('9759091062', 'Title: Xtreme Brite Brightening Gel 1oz.'),
 ('9788072216',
  'Title: Prada Candy By Prada Eau De Parfum Spray 1.7 Oz For Women'),
 ('9790790961',
  'Title: Versace Bright Crystal Eau de Toilette Spray for Women, 3 Ounce'),
 ('9790794231', 'Title: Stella McCartney Stella'),
 ('B00004TMFE',
  'Title: Avalon Biotin B-Complex Thickening Conditioner, 14 Ounce'),
 ('B00004TUBL', 'Title: Better Living Classic Two Chamber Dispenser, White'),
 ('B00004TUBV', 'Title: Better Living The Ulti-Mate Dispenser'),
 ('B00004U9V2',
  "Title: Crabtree &amp; Evelyn - Gardener's Ultra-Moisturising Hand Therapy Pump - 250g/8.8 OZ"),
 ('B00004U9UY',
  'Title: Crabtree &amp; Evelyn 2792 Gardeners Hand Therapy (100ml, 3.4 oz)')]

In [9]:
from transformers import AutoTokenizer
# Load the tokenizer that belongs to the all-MiniLM-L6-v2 checkpoint.
minilm_checkpoint = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(minilm_checkpoint)


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [11]:
## max string length (characters) over every serialized item; 0 if asin_dict is empty
max_strlen = max((len(text) for text in asin_dict.values()), default=0)
print(f"Max string length: {max_strlen}")


## max tokenized length; tokenizer.encode is called once per item
max_tokenized_len = max(
    (len(tokenizer.encode(text)) for text in asin_dict.values()),
    default=0,
)
print(f"Max tokenized length: {max_tokenized_len}")

# Trailing expression so the value is also shown as the cell result.
max_tokenized_len


Max string length: 239


Max tokenized length: 75


75

In [5]:
# Location of the MiniLM dense index for the beauty2014 dataset.
index_dir = get_minilm_index_dir()/'beauty2014'

# Create the retriever object. The following cells call `type(dr)`,
# `dr.index_file(...)` and `dr.save()`, so `dr` must be defined here;
# with the constructor commented out a fresh "Restart & Run All" fails
# with NameError.
dr = DenseRetriever(
  index_name=str(index_dir), # this should be the path where you wanna save the index
  model="sentence-transformers/all-MiniLM-L6-v2",
  normalize=True,
  max_length=256,
  use_ann=False,
)

In [7]:
# Sanity check: display the class of the object bound to `dr`.
type(dr)

retriv.dense_retriever.dense_retriever.DenseRetriever

In [8]:
def _to_retriv_doc(doc):
    """Map one raw corpus record to the {"id", "text"} dict passed to the indexer.

    The id is the record's ``asin``; the text is its ``title`` prefixed with
    "Title: ", the same serialization used for ``asin_dict``.
    """
    return {
        "id": doc["asin"],
        "text": "Title: " + doc['title'],
    }


# NOTE(review): an earlier cell reads 'meta_corpus.json' while this one indexes
# 'meta_corpus.jsonl' — confirm both files hold the same corpus.
corpus_path = str(meta_filepath/'meta_corpus.jsonl')

# NOTE(review): the retriv README lists use_gpu=False and batch_size=512 as the
# defaults, so the two values below look like overrides — confirm.
dr = dr.index_file(
  path=corpus_path,
  embeddings_path=None,
  use_gpu=True,
  batch_size=128,
  show_progress=True,
  callback=_to_retriv_doc,
)

Embedding documents: 100%|██████████| 12094/12094 [00:02<00:00, 5205.80it/s]

Loading embeddings...





In [None]:
# Persist the index; it is saved under the name passed as `index_name` to the constructor.
dr.save()

In [6]:
# Load the index stored under `index_dir` into a separate retriever object.
saved_index_name = str(index_dir)
drl = DenseRetriever.load(index_name=saved_index_name)

In [15]:
# Single-query search limited to the 5 best hits, without returning the documents.
query_text = "hair extension"
drl.search(query=query_text, cutoff=5, return_docs=False)

{'B00ECUPDYC': 0.66617227,
 'B00889VG6W': 0.66324776,
 'B00DPVH87Y': 0.6358326,
 'B00BSKU6OG': 0.6352354,
 'B008HZ5GUY': 0.63364077}

In [18]:
# Multi-query search: every query is a dict with an "id" (q_1, q_2, ...) and a "text".
query_texts = ["hair extension", "aloe vera"]
queries = [
    {"id": f"q_{position}", "text": text}
    for position, text in enumerate(query_texts, start=1)
]
drl.msearch(queries=queries, cutoff=5)

{'q_1': {'B00ECUPDYC': 0.66617227,
  'B00889VG6W': 0.66324776,
  'B00DPVH87Y': 0.6358326,
  'B00BSKU6OG': 0.6352354,
  'B008HZ5GUY': 0.63364077},
 'q_2': {'B000052YM0': 0.72061837,
  'B0080EEMCA': 0.7192248,
  'B00016WXEY': 0.71025586,
  'B00028OSI0': 0.7067425,
  'B0009VNI40': 0.6984346}}

In [22]:
# Serialized text of the best hit for "hair extension" and for "aloe vera".
tuple(asin_dict[asin] for asin in ('B00ECUPDYC', 'B000052YM0'))

('Title: NSSTAR Newfangled Fashionable Multicolor Gradually Varied One Piece Straight Synthetic Clip-on Hair Extension 60cm Length,Multiple Choice (Straight, Dark Purple to Rose Red)',
 'Title: Fruit Of The Earth 100% Aloe Vera 6oz. Gel Tube')

In [34]:
# Batch-Search is similar to Multi-Search but automatically generates batches
# of queries to evaluate and allows dynamic writing of the search results to
# disk in JSONL format. bsearch is handy for computing results for hundreds of
# thousands or even millions of queries without hogging your RAM.

queries = [
    {"id": f"rid_{row}", "text": text}
    for row, text in enumerate(["hair extension", "aloe vera"])
]

# Only the best hit per query, both queries in one batch, results returned
# directly (path=None).
bsearch_options = dict(
    cutoff=1,
    batch_size=2,
    show_progress=False,
    qrels=None,
    path=None,
)
drl.bsearch(queries=queries, **bsearch_options)

{'rid_0': {'B00ECUPDYC': 0.66617227}, 'rid_1': {'B000052YM0': 0.72061837}}

In [37]:
# Run msearch on the current `queries` list, keeping only the best hit per query.
top1_cutoff = 1
drl.msearch(queries=queries, cutoff=top1_cutoff)

{'rid_0': {'B00ECUPDYC': 0.66617227}, 'rid_1': {'B000052YM0': 0.72061837}}

In [30]:
# Show the serialized titles behind the two top-1 asins.
top_hit_asins = ('B00ECUPDYC', 'B000052YM0')
tuple(asin_dict[asin] for asin in top_hit_asins)

('Title: NSSTAR Newfangled Fashionable Multicolor Gradually Varied One Piece Straight Synthetic Clip-on Hair Extension 60cm Length,Multiple Choice (Straight, Dark Purple to Rose Red)',
 'Title: Fruit Of The Earth 100% Aloe Vera 6oz. Gel Tube')