In [4]:
import pandas as pd
import minsearch
from elasticsearch import Elasticsearch
from openai import OpenAI
import json
import os

In [5]:
load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')

In [6]:
with open('../data/documents-with-ids.json','rb') as docs:
    documents = json.load(docs)

documents[0]

{'term': '34% Attack',
 'category': 'Security and Attacks',
 'description': 'A 34% attack occurs when an entity controls more than 34% of a blockchain’s network power or stake, potentially manipulating consensus mechanisms, especially in Proof-of-Stake networks. This level of control can disrupt the network, validate fraudulent transactions, or halt consensus, compromising the integrity of the blockchain.',
 'id': '4c1e419c'}

In [32]:
#!pip install sentence_transformers==2.7.0
#!pipenv lock

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Locking[0m [33m[packages][0m dependencies...[0m
[?25lBuilding requirements[33m...[0m
[2KResolving dependencies[33m...[0m
[2K✔ Success! Locking packages...
[2K[32m⠇[0m Locking packages...
[1A[2KLocking[0m [33m[dev-packages][0m dependencies...[0m
[1mUpdated Pipfile.lock (90a0c9b7b24e9c0f645a9dd5a950e16fc921aa053027386e737f08a8176d1173)![0m


In [9]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-mpnet-base-v2")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [10]:
len(model.encode("This is a simple sentence"))

768

In [12]:
operations = []
for doc in documents:
    # Transforming the title into an embedding using the model
    doc["desc_vector"] = model.encode(doc["description"]).tolist()
    operations.append(doc)

In [13]:
index = minsearch.Index(text_fields=["term", "category", "description"],
                        keyword_fields=[]
)

index.fit(documents)

<minsearch.Index at 0x7fd4d7de9a00>

In [14]:
query = 'I would like to know about DeFi?'

In [15]:
index.search(query=query,num_results=5)

[{'term': 'DeFi',
  'category': 'Decentralized Finance (DeFi) and Protocols',
  'description': 'Decentralized Finance (DeFi) refers to financial services built on blockchain technology that operate without traditional intermediaries. DeFi platforms offer various services like lending, borrowing, and trading through smart contracts on decentralized networks, aiming to democratize access to financial products and services.',
  'id': '1c88180b',
  'desc_vector': [0.014122126623988152,
   -0.0009202584624290466,
   -0.060102980583906174,
   -0.048643533140420914,
   0.011550026014447212,
   -0.005385324824601412,
   0.09019949287176132,
   0.028560083359479904,
   0.02741130441427231,
   0.05955938622355461,
   -0.004467854276299477,
   -0.019035760313272476,
   0.06759203225374222,
   0.10602173954248428,
   0.011459575966000557,
   -0.029335638508200645,
   0.037961553782224655,
   0.03133511543273926,
   -0.016510142013430595,
   -0.01851525716483593,
   0.0343853123486042,
   -0.036320

In [16]:
def search(q):
    #boost = {'term': 3.0, 'category': 0.5}
    results = index.search(
        query=q,
        #boost_dict=boost,
        num_results=5
    )
    return results

In [17]:
prompt_template = """
You're a crypto linguist. Answer the QUESTION based on the CONTEXT from our specially curated database.
Use only the facts from the CONTEXT when answering the QUESTION and do not add anything else. 
If context doesn't provide relevant information, mention that you need to
update the knowledge base to answer the user question.

QUESTION: {question}
CONTEXT: 
{context}
""".strip()

entry_template = """
id:{id}
term:{term}
category:{category}
description:{description}
""".strip()

def build_prompt(query, search_results):
    context = ""

    for doc in search_results:
        context = context + entry_template.format(**doc) + "\n\n"

    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt


In [18]:
prompt = build_prompt(query,search(query))
print(prompt)

You're a crypto linguist. Answer the QUESTION based on the CONTEXT from our specially curated database.
Use only the facts from the CONTEXT when answering the QUESTION and do not add anything else. 
If context doesn't provide relevant information, mention that you need to
update the knowledge base to answer the user question.

QUESTION: I would like to know about DeFi?
CONTEXT: 
id:1c88180b
term:DeFi
category:Decentralized Finance (DeFi) and Protocols
description:Decentralized Finance (DeFi) refers to financial services built on blockchain technology that operate without traditional intermediaries. DeFi platforms offer various services like lending, borrowing, and trading through smart contracts on decentralized networks, aiming to democratize access to financial products and services.

id:3a4e7d53
term:InstaDApp
category:Decentralized Finance (DeFi) and Protocols
description:InstaDApp is a DeFi management platform that aggregates multiple DeFi protocols into a single interface, simpli

In [19]:
def llm(prompt):
    client = OpenAI(api_key = api_key)
    response = client.chat.completions.create(model='gpt-4o-mini',
               messages = [{"role":"user","content":prompt}])
    return response.choices[0].message.content

In [20]:
def rag_response(q):
    context = search(q)
    prompt = build_prompt(q, context)
    answer = llm(prompt)
    return answer

In [21]:
es_client = Elasticsearch('http://localhost:9200')

In [22]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "term": {"type": "text"},
            "category": {"type": "text"},
            "description": {"type": "text"},
            "desc_vector": {"type": "dense_vector", "dims": 768, "index": True, "similarity": "cosine"},
        }
    }
}

index_name = "crypto-linguist"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'crypto-linguist'})

In [23]:
from tqdm.auto import tqdm
for doc in operations:
    try:
        es_client.index(index=index_name, document=doc)
    except Exception as e:
        print(e)

In [24]:
def elastic_search(query):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["term^3", "description", "category"],
                        "type": "best_fields"
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)
    
    result_docs = []
    
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return result_docs

In [25]:
def rag_es(query):
    search_results = elastic_search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [26]:
rag_es(query)

'Decentralized Finance (DeFi) refers to financial services built on blockchain technology that operate without traditional intermediaries. DeFi platforms offer various services like lending, borrowing, and trading through smart contracts on decentralized networks, aiming to democratize access to financial products and services.'

In [27]:
search_term = "which is better : centralized or decentralized?"
vector_search_term = model.encode(search_term)

query = {
    "field": "desc_vector",
    "query_vector": vector_search_term,
    "k": 5,
    "num_candidates": 10000, 
}

In [28]:
res = es_client.search(index=index_name, knn=query, source=["term", "category", "description"])
res["hits"]["hits"]

[{'_index': 'crypto-linguist',
  '_id': 'tNsh95EBVrWd7PydKTdM',
  '_score': 0.76883304,
  '_source': {'description': 'Decentralization is the distribution of control and decision-making across a network or system, rather than relying on a single central authority. In blockchain and cryptocurrency, decentralization enhances security, transparency, and resilience by spreading power among multiple participants or nodes.',
   'term': 'Decentralization',
   'category': 'Governance and Community'}},
 {'_index': 'crypto-linguist',
  '_id': 'tdsh95EBVrWd7PydKTdh',
  '_score': 0.70300704,
  '_source': {'description': 'A Decentralized Autonomous Organization (DAO) is a blockchain-based entity governed by smart contracts and decentralized decision-making. It operates without central control, with decisions made through member voting using governance tokens, enabling collective management and transparent, automated governance.',
   'term': 'Decentralized Autonomous Organization (DAO)',
   'categor

In [29]:
knn_query = {
    "field": "desc_vector",
    "query_vector": vector_search_term,
    "k": 5,
    "num_candidates": 10000
}

In [30]:
# Included "knn" in the search query (to perform a semantic search) along with the filter  
response = es_client.search(
    index=index_name,
    query={
        "match": {"category": "Blockchain Technology and Concepts"},
    },
    knn=knn_query,
    size=5
)

In [31]:
response['hits']['hits']['

[{'_index': 'crypto-linguist',
  '_id': 'fNsh95EBVrWd7PydIje9',
  '_score': 5.130733,
  '_source': {'term': 'Atomic Cross-Chain Swap',
   'category': 'Blockchain Technology and Concepts',
   'description': 'An atomic cross-chain swap is a smart contract-based technology that allows users to exchange cryptocurrencies across different blockchains without needing a third party. The "atomic" aspect ensures that the swap is either fully completed on both chains or not at all, preventing partial or failed transactions.',
   'id': 'a663cd4a',
   'desc_vector': [0.041870132088661194,
    -0.09208741039037704,
    0.018606960773468018,
    -0.018376681953668594,
    -0.044388268142938614,
    0.020891493186354637,
    0.009213759563863277,
    -0.022954480722546577,
    -0.028337344527244568,
    -0.02467108890414238,
    -0.054577942937612534,
    0.07934655994176865,
    0.000572738004848361,
    0.037030018866062164,
    0.0024878066033124924,
    0.006616340484470129,
    0.0546738132834434