## Mount Google Drive (local storage)

In [1]:
# Mount Google Drive at /content/drive so later cells can read and write files there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Enter the workspace

In [3]:
cd /content/drive/MyDrive/Etsy/QE_DEMO/Code

/content/drive/MyDrive/Etsy/QE_DEMO/Code


## Install toolkits

In [4]:
!pip install -U sentence-transformers rank_bm25

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Collecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m93.3 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.1.98-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m76.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl

## (Optional) Encode the dataset and cache the embeddings

In [None]:
import json
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import gzip
import os
import torch
import pickle

if not torch.cuda.is_available():
    print("Warning: No GPU found. Please add GPU to your notebook")

# Bi-encoder: embeds every passage so the corpus can be used for semantic search.
bi_encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
bi_encoder.max_seq_length = 256     # Truncate long passages to 256 tokens
top_k = 32                          # Number of passages to retrieve with the bi-encoder

# Cross-encoder: re-ranks the top_k passages returned by the bi-encoder to improve quality.
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

passages = []

# Dataset: Etsy sample files, read as one JSON object per line; the 'title' field is indexed.
data_path = '/content/drive/MyDrive/Etsy/samples/'
# os.listdir returns entries in arbitrary order; sorting keeps the passage order
# (and therefore the corpus ids stored in the cache) stable between runs.
file_list = sorted(os.listdir(data_path))
print(file_list)

for file_name in file_list:
  file_path = os.path.join(data_path, file_name)
  # Skip sub-directories instead of crashing in open().
  if not os.path.isfile(file_path):
    continue
  print(file_path)
  # Explicit UTF-8 so decoding does not depend on the runtime's default locale.
  with open(file_path, 'r', encoding='utf-8') as EtsyJson:
    for line in EtsyJson:
      line = line.strip()
      # Tolerate blank lines (e.g. a trailing newline at end of file).
      if not line:
        continue
      data = json.loads(line)
      passages.append(data['title'])
  # Running total after each file.
  print("Sub Passages:", len(passages))

print("Total Passages:", len(passages))

# Encode all passages into the vector space. This takes about 5 minutes (depends on your GPU speed).
corpus_embeddings = bi_encoder.encode(passages, convert_to_tensor=True, show_progress_bar=True)

# NOTE(review): the loading cell further down reads 'etsy-embeddings-gpu.pkl', not this
# file name — confirm which cache file is the intended one.
embedding_cache_path = 'etsy-embeddings-gpu-total.pkl'
print("Store file on disc")
with open(embedding_cache_path, "wb") as fOut:
  pickle.dump({'sentences': passages, 'embeddings': corpus_embeddings}, fOut)

## Load Models and Pre-computed Embeddings

In [5]:
import json
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import gzip
import os
import torch
import pickle

if not torch.cuda.is_available():
    print("Warning: No GPU found. Please add GPU to your notebook")

# Bi-encoder: maps text to embeddings; search() uses it to embed the query for semantic search.
bi_encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
bi_encoder.max_seq_length = 256    # Truncate long inputs to 256 tokens
top_k = 32                          # Number of passages to retrieve with the bi-encoder

# Cross-encoder: search() uses it to re-score the top_k passages returned by the bi-encoder.
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

print("Load pre-computed embeddings from disc")
# Relative path: resolved against the current working directory.
embedding_cache_path = 'etsy-embeddings-gpu.pkl'

# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load a cache file that you produced yourself.
with open(embedding_cache_path, "rb") as fIn:
  cache_data = pickle.load(fIn)
  # 'sentences' holds the passage texts; 'embeddings' presumably holds one
  # embedding per passage in the same order — TODO confirm against the cache writer.
  passages = cache_data['sentences']
  corpus_embeddings = cache_data['embeddings']

Downloading (…)5fedf/.gitattributes:   0%|          | 0.00/737 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)2cb455fedf/README.md:   0%|          | 0.00/11.5k [00:00<?, ?B/s]

Downloading (…)b455fedf/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)edf/data_config.json:   0%|          | 0.00/25.5k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)5fedf/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

Downloading (…)fedf/train_script.py:   0%|          | 0.00/13.8k [00:00<?, ?B/s]

Downloading (…)2cb455fedf/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)455fedf/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Load pre-computed embeddings from disc


## Load the main function of query expansion

In [23]:
# We also compare the results to lexical search (keyword search). Here, we use 
# the BM25 algorithm which is implemented in the rank_bm25 package.

from rank_bm25 import BM25Okapi
from sklearn.feature_extraction import _stop_words
import string
from tqdm.autonotebook import tqdm
import numpy as np


# We lower case our text and remove stop-words from indexing
def bm25_tokenizer(text):
    """Split ``text`` into lower-cased tokens for BM25 indexing.

    The text is lower-cased and split on whitespace, and each piece has its
    leading and trailing punctuation stripped. Pieces that end up empty, or
    that are English stop words, are dropped.
    """
    stripped = (piece.strip(string.punctuation) for piece in text.lower().split())
    return [
        word
        for word in stripped
        if word and word not in _stop_words.ENGLISH_STOP_WORDS
    ]

def _unique_passage_texts(hits, limit=1000):
    """Return the newline-free passage texts for ``hits``, in order, without duplicates."""
    texts = []
    seen = set()
    for hit in hits[:limit]:
        # Normalise once, so the duplicate check and the stored value always agree.
        text = passages[hit['corpus_id']].replace("\n", "")
        if text not in seen:
            seen.add(text)
            texts.append(text)
    return texts


def search(query):
    """Print query-expansion candidates for ``query`` from the notebook's passage corpus.

    Three candidate lists are collected in ``total_qe`` and printed:
      1. the top BM25 (lexical) passages, split on commas into individual terms,
      2. the bi-encoder (semantic search) hits ordered by bi-encoder score,
      3. the same hits re-ordered by cross-encoder score.

    Relies on the notebook-level ``bm25``, ``passages``, ``corpus_embeddings``,
    ``bi_encoder``, ``cross_encoder`` and ``top_k``. Returns ``None``.
    """
    print("Input query:", query)
    total_qe = []

    ##### BM25 search (lexical search) #####
    bm25_scores = bm25.get_scores(bm25_tokenizer(query))
    # argpartition raises when asked for more elements than exist, so cap at the corpus size.
    n_lexical = min(5, len(bm25_scores))
    top_n = np.argpartition(bm25_scores, -n_lexical)[-n_lexical:] if n_lexical else []
    bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
    bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)

    # Each lexical hit is treated as a comma-separated list of terms.
    sub_string = []
    for item in _unique_passage_texts(bm25_hits):
        sub_string.extend(item.split(","))
    total_qe.append(sub_string)

    ##### Semantic Search #####
    # Encode the query with the bi-encoder and find potentially relevant passages.
    query_embedding = bi_encoder.encode(query, convert_to_tensor=True)
    # Follow the corpus tensor's device instead of forcing CUDA, so the search
    # does not crash on a runtime without a GPU.
    if isinstance(corpus_embeddings, torch.Tensor):
        query_embedding = query_embedding.to(corpus_embeddings.device)
    hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=top_k)
    hits = hits[0]  # Get the hits for the first (and only) query

    ##### Re-Ranking #####
    # Score every retrieved passage against the query with the cross-encoder.
    cross_inp = [[query, passages[hit['corpus_id']]] for hit in hits]
    cross_scores = cross_encoder.predict(cross_inp) if cross_inp else []
    for hit, cross_score in zip(hits, cross_scores):
        hit['cross-score'] = cross_score

    # Bi-encoder retrieval hits, best bi-encoder score first.
    hits = sorted(hits, key=lambda x: x['score'], reverse=True)
    total_qe.append(_unique_passage_texts(hits))

    # Cross-encoder re-ranked hits, best cross-encoder score first.
    # This list is appended once; a second identical append only duplicated it in the output.
    hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
    total_qe.append(_unique_passage_texts(hits))

    # Total results
    print("E-Commerce Query Expansion Results: \n")
    print(total_qe)


# Tokenize every passage up front (with a progress bar), then build the BM25 index over them.
tokenized_corpus = [bm25_tokenizer(doc) for doc in tqdm(passages)]

bm25 = BM25Okapi(tokenized_corpus)


  0%|          | 0/1591069 [00:00<?, ?it/s]

In [21]:
# Demo: print the expansion candidates for a one-word query.
search(query="gift")

Input query: gift
E-Commerce Query Expansion Results: 

[['mom gift ', ' friend gift ', ' gift box ', ' anniversary gift ', ' christmas gift ', ' couple gift ', ' wedding gift ', ' gift for him ', ' gift for her ', 'Mini dumpling gift', ' cute gift', ' anniversary gift', ' i love you gift', ' birthday gift', ' friend gift', ' love gift', ' Mother’s Day gift', ' mum gift', ' gift', 'Father’s day gift', ' gift for him', ' gift for boyfriend', ' personalized gift', ' customizable gift', ' daddy gift', ' gift for man'], ['Gift Delivery', 'Gift Card', 'Gift Packaging', 'Thinking of you gift', 'Wedding Gift, Gift For Couples', 'boyfriend gift, gift for boyfriend', 'New business gift', 'Personalized Gift Sets of your choosing', 'Valentines Gift Box', 'Gift Box - Men', 'Personalised APPRECIATION Gift Box', 'Personalized Gifts for Him', 'Free Surprise Gift With Every Purchase / Add This too With Your Order and Get Exciting Gift / Surprise Gift / Mystery Gift', 'Sister Gift, Personalised gift fo