In [6]:
import os
import pickle

import hnswlib
import pandas as pd
from sentence_transformers import SentenceTransformer

from db.conn_db import db_query

## Load the Sentence Transformer model

In [4]:
# Load the SentenceTransformer identified by `model_name`.
model_name = './model'
model = SentenceTransformer(model_name_or_path=model_name)

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.71k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/545 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/319 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

### Import data from db

In [30]:
# Fetch the corpus through the project's db_query helper and preview the first five entries.
corpus = db_query()
corpus[:5]

['Interested in learning more about Dogecoin and how it works? Our guide will teach you a little more about the history of Dogecoin, how you can start investing in DOGE and how you can use Dogecoin after you invest.',
 '"In the 2017 peak, it hit 155 sats. And in 2021, it hit 200 sats, before correcting to approximately 100 sats," Chrysochou says. "In the most optimistic scenario for this bull cycle, Bitcoin reaches $300,000 and DOGE reaches the 300 sats mark, which is approximately 90 cents." Dogecoin, worth around 7 cents today, would still have to more than double in value relative to Bitcoin to reach the 90 cent mark. It\'s worth about 118 satoshis today.',
 'To prevent miner’s from fraudulently corrupting the blockchain, the Bitcoin protocol makes miners compete. A different miner is empowered to write each block, roughly every 10 minutes, and only valid blocks will be accepted by the rest of the mining community. Here’s how that works:',
 'Are Shiba Inus aggressive at all times? C

In [31]:
# Strip newline and tab characters from every passage in a single pass.
# NOTE(review): the characters are removed rather than replaced by a space, so
# words separated only by '\n' or '\t' end up joined — confirm this is intended.
corpus = [passage.replace('\n', '').replace('\t', '') for passage in corpus]
# Preview only the first few passages; echoing the whole list floods the output.
corpus[:5]

['Interested in learning more about Dogecoin and how it works? Our guide will teach you a little more about the history of Dogecoin, how you can start investing in DOGE and how you can use Dogecoin after you invest.',
 '"In the 2017 peak, it hit 155 sats. And in 2021, it hit 200 sats, before correcting to approximately 100 sats," Chrysochou says. "In the most optimistic scenario for this bull cycle, Bitcoin reaches $300,000 and DOGE reaches the 300 sats mark, which is approximately 90 cents." Dogecoin, worth around 7 cents today, would still have to more than double in value relative to Bitcoin to reach the 90 cent mark. It\'s worth about 118 satoshis today.',
 'To prevent miner’s from fraudulently corrupting the blockchain, the Bitcoin protocol makes miners compete. A different miner is empowered to write each block, roughly every 10 minutes, and only valid blocks will be accepted by the rest of the mining community. Here’s how that works:',
 'Are Shiba Inus aggressive at all times? C

In [32]:
# Number of passages in the cleaned corpus.
len(corpus)

62696

### Embed the text

In [33]:
# Embed every passage in `corpus`; the result is requested as a NumPy array.
corpus_embeddings = model.encode(
    corpus,
    convert_to_numpy=True,
    show_progress_bar=True,
)

Batches:   0%|          | 0/1960 [00:00<?, ?it/s]

### Save the embeddings into a pickle file

In [34]:
# Cache the passages together with their embeddings so they can be reloaded
# later without re-encoding the corpus.
embeddings_path = './embeddings/embeddings.pkl'
# Create the target directory first: open(..., "wb") raises FileNotFoundError
# when the parent directory does not exist.
os.makedirs(os.path.dirname(embeddings_path), exist_ok=True)
with open(embeddings_path, "wb") as fOut:
    pickle.dump({'sentences': corpus, 'embeddings': corpus_embeddings}, fOut)

In [None]:
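# Optional: uncomment to reload the cached sentences and embeddings instead of
# re-encoding the corpus. Only unpickle files you created yourself.
# Note: this restores the passages as `corpus_sentences`, while the cells below read `corpus`.
# with open('./embeddings/embeddings.pkl', "rb") as fIn:
#     cache_data = pickle.load(fIn)
#     corpus_sentences = cache_data['sentences']
#     corpus_embeddings = cache_data['embeddings']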

## HNSW

In [36]:
# Dimensionality of the vectors stored in the index. Read from the embeddings
# themselves so it always matches the loaded model instead of a hard-coded 768.
embedding_size = corpus_embeddings.shape[1]
# Number of nearest neighbours retrieved per query.
top_k_hits = 25

# File the HNSW index is saved to and restored from.
index_path = "./hnswlib.index"
# 'ip' selects hnswlib's inner-product space.
index = hnswlib.Index(space='ip', dim=embedding_size)

In [37]:
if os.path.exists(index_path):
    # Reuse the index saved by an earlier run.
    index.load_index(index_path)
    # An index built from a different corpus would return ids that no longer
    # line up with `corpus`, so fail loudly instead of mislabelling hits.
    if index.get_current_count() != len(corpus_embeddings):
        raise ValueError(
            f"{index_path} holds {index.get_current_count()} items but there are "
            f"{len(corpus_embeddings)} corpus embeddings; delete the stale index file "
            "and re-run from the cell that creates `index`."
        )
else:
    # Build the index from scratch; each embedding's id is its position in `corpus`.
    index.init_index(max_elements=len(corpus_embeddings), ef_construction=400, M=64)
    index.add_items(corpus_embeddings, list(range(len(corpus_embeddings))))
    index.save_index(index_path)

In [38]:
# Query-time `ef` setting for the HNSW index; 50 is above top_k_hits (25).
# NOTE(review): hnswlib's docs describe a higher ef as more accurate but slower — tune if recall matters.
index.set_ef(50)

### Queries

In [39]:
# Read one query per line, dropping surrounding whitespace and blank lines so
# that an empty line (e.g. a trailing newline) is never embedded as a query.
with open('queries.txt') as f:
    queries = [line.strip() for line in f if line.strip()]

# Collect the result rows in a plain list and build the DataFrame once at the
# end: DataFrame.append was removed in pandas 2.0 and copies the whole frame
# on every call.
rows = []
for query in queries:
    question_embedding = model.encode(query)

    # knn_query returns 2-D arrays (one row per query vector); row 0 is this query.
    corpus_ids, distances = index.knn_query(question_embedding, k=top_k_hits)
    # Turn each returned distance into a score as 1 - distance and rank the
    # hits from highest to lowest score.
    hits = [
        {'corpus_id': corpus_id, 'score': 1 - distance}
        for corpus_id, distance in zip(corpus_ids[0], distances[0])
    ]
    hits = sorted(hits, key=lambda x: x['score'], reverse=True)

    for hit in hits[0:top_k_hits]:
        rows.append({'label': query, 'text': corpus[hit['corpus_id']]})

# Explicit columns keep `label` and `text` present even when there are no rows.
df = pd.DataFrame(rows, columns=['label', 'text'])

In [40]:
# Display the collected (query, passage) rows.
df

Unnamed: 0,label,text
0,what is the best cryptocurrency,The best crypto credit...
1,what is the best cryptocurrency,Best Crypto & Blockchain Right Now
2,what is the best cryptocurrency,"There is no single best cryptocurrency, but th..."
3,what is the best cryptocurrency,The best cryptocurrency exchanges are those th...
4,what is the best cryptocurrency,Another one of the easiest cryptocurrencies to...
...,...,...
12195,will dogecoin ever be like bitcoin,"Moreover, take every altcoin claim with a grai..."
12196,will dogecoin ever be like bitcoin,Bitcoin now has a market cap of $1.1 trillion ...
12197,will dogecoin ever be like bitcoin,"Dogecoin will fail because it’s inflationary, ..."
12198,will dogecoin ever be like bitcoin,"Yes you can mine Dogecoin. Like Bitcoin, Dogec..."


In [41]:
# Keep only the two output columns, in a fixed order, and write them to data.csv.
# NOTE(review): the row index is written as an unnamed first column (pandas
# default); pass index=False if downstream readers do not need it.
df = df.loc[:, ['label', 'text']]
df.to_csv(path_or_buf='data.csv')