In [1]:
import pandas as pd
import minsearch
from openai import OpenAI
from dotenv import load_dotenv
import hashlib
import json

In [2]:
import os
#load_dotenv() UNABLE TO FETCH THE KEY FROM .ENVRC FILE.

In [4]:
def generate_document_id(doc):
    combined = f"{doc['category']}-{doc['term']}-{doc['description'][:10]}"
    hash_object = hashlib.md5(combined.encode())
    hash_hex = hash_object.hexdigest()
    document_id = hash_hex[:8]
    return document_id

In [5]:
df = pd.read_csv('../data/crypto_linguist_data.csv')

cleaning data

In [6]:
df.columns = df.columns.str.lower()
df.drop_duplicates(subset='term').shape
#df.to_csv('../data/data.csv',index=False)

(301, 3)

In [7]:
df.head()

Unnamed: 0,term,category,description
0,34% Attack,Security and Attacks,A 34% attack occurs when an entity controls mo...
1,51% Attack,Security and Attacks,A 51% attack occurs when an entity controls mo...
2,Aave,Decentralized Finance (DeFi) and Protocols,Aave is a decentralized lending protocol that ...
3,Aavegotchi,Decentralized Finance (DeFi) and Protocols,Aavegotchi is a blockchain-based game that com...
4,Airdrop,Miscellaneous and Emerging Technologies,An airdrop is a distribution strategy where fr...


In [8]:
documents = df.to_dict(orient='records')
for doc in documents:
    doc['id'] = generate_document_id(doc)

In [9]:
documents[0]

{'term': '34% Attack',
 'category': 'Security and Attacks',
 'description': 'A 34% attack occurs when an entity controls more than 34% of a blockchain’s network power or stake, potentially manipulating consensus mechanisms, especially in Proof-of-Stake networks. This level of control can disrupt the network, validate fraudulent transactions, or halt consensus, compromising the integrity of the blockchain.',
 'id': '4c1e419c'}

In [10]:
index = minsearch.Index(text_fields=["term", "category", "description"],
                        keyword_fields=[]
)

index.fit(documents)

<minsearch.Index at 0x74a6f2b1acf0>

In [11]:
query = 'I would like to know about DeFi?'

In [12]:
index.search(query=query,num_results=5)

[{'term': 'DeFi',
  'category': 'Decentralized Finance (DeFi) and Protocols',
  'description': 'Decentralized Finance (DeFi) refers to financial services built on blockchain technology that operate without traditional intermediaries. DeFi platforms offer various services like lending, borrowing, and trading through smart contracts on decentralized networks, aiming to democratize access to financial products and services.',
  'id': '1c88180b'},
 {'term': 'InstaDApp',
  'category': 'Decentralized Finance (DeFi) and Protocols',
  'description': 'InstaDApp is a DeFi management platform that aggregates multiple DeFi protocols into a single interface, simplifying complex strategies like borrowing, lending, and leveraging. It provides users with tools to maximize DeFi opportunities without needing to interact with individual protocols. It integrates platforms like Maker, Compound, and Aave.',
  'id': '3a4e7d53'},
 {'term': 'Cream Finance',
  'category': 'Decentralized Finance (DeFi) and Proto

In [13]:
def search(q):
    #boost = {'term': 3.0, 'category': 0.5}

    results = index.search(
        query=q,
        #boost_dict=boost,
        num_results=5
    )
    return results

In [14]:
prompt_template = """
You're a crypto linguist. Answer the QUESTION based on the CONTEXT from our specially curated database.
Use only the facts from the CONTEXT when answering the QUESTION and do not add anything else. 
If context doesn't provide relevant information, mention that you need to
update the knowledge base to answer the user question.

QUESTION: {question}
CONTEXT: 
{context}
""".strip()

entry_template = """
id:{id}
term:{term}
category:{category}
description:{description}
""".strip()

def build_prompt(query, search_results):
    context = ""

    for doc in search_results:
        context = context + entry_template.format(**doc) + "\n\n"

    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt


In [15]:
prompt = build_prompt(query,search(query))
print(prompt)

You're a crypto linguist. Answer the QUESTION based on the CONTEXT from our specially curated database.
Use only the facts from the CONTEXT when answering the QUESTION and do not add anything else. 
If context doesn't provide relevant information, mention that you need to
update the knowledge base to answer the user question.

QUESTION: I would like to know about DeFi?
CONTEXT: 
id:1c88180b
term:DeFi
category:Decentralized Finance (DeFi) and Protocols
description:Decentralized Finance (DeFi) refers to financial services built on blockchain technology that operate without traditional intermediaries. DeFi platforms offer various services like lending, borrowing, and trading through smart contracts on decentralized networks, aiming to democratize access to financial products and services.

id:3a4e7d53
term:InstaDApp
category:Decentralized Finance (DeFi) and Protocols
description:InstaDApp is a DeFi management platform that aggregates multiple DeFi protocols into a single interface, simpli

In [25]:
def llm(prompt):
    client = OpenAI()
    response = client.chat.completions.create(model='gpt-4o-mini',
               messages = [{"role":"user","content":prompt}])
    return response.choices[0].message.content

In [26]:
def rag_response(q):
    context = search(q)
    prompt = build_prompt(q, context)
    answer = llm(prompt)
    return answer

In [17]:
query = 'explain the difference between proof of work and proof of stake?'
answer = rag_response(query)
answer

"Proof of Work (PoW) is a consensus algorithm where miners solve complex cryptographic puzzles to validate transactions and create new blocks, requiring significant computational power and energy. This approach ensures network security by making attacks costly and resource-intensive.\n\nIn contrast, Proof of Stake (PoS) is a consensus mechanism where validators are chosen to create new blocks based on the amount of cryptocurrency they hold and are willing to stake as collateral. PoS is more energy-efficient compared to PoW and incentivizes validators to act in the network's best interest.\n\nIn summary, the key differences are: PoW relies on computational power and energy consumption, while PoS is based on the amount of cryptocurrency held and staked, making it more energy-efficient."

In [18]:
query = 'What are the potential consequences for users if a 51% attack takes place on a cryptocurrency network?'
answer = rag_response(query)
print(answer)

If a 51% attack takes place on a cryptocurrency network, the potential consequences for users include the ability for the attacker to alter the blockchain's history, double-spend coins, and disrupt network operations, ultimately compromising the network's integrity.


with open('documents-with-ids.json', 'wt') as f_out:
    json.dump(documents, f_out, indent=2)

In [19]:
from elasticsearch import Elasticsearch

In [20]:
es_client = Elasticsearch('http://localhost:9200')

In [21]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "term": {"type": "text"},
            "category": {"type": "text"},
            "description": {"type": "text"}
        }
    }
}

index_name = "crypto-linguist"

es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'crypto-linguist'})

In [22]:
documents[0]

{'term': '34% Attack',
 'category': 'Security and Attacks',
 'description': 'A 34% attack occurs when an entity controls more than 34% of a blockchain’s network power or stake, potentially manipulating consensus mechanisms, especially in Proof-of-Stake networks. This level of control can disrupt the network, validate fraudulent transactions, or halt consensus, compromising the integrity of the blockchain.',
 'id': '4c1e419c'}

In [23]:
from tqdm.auto import tqdm
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/302 [00:00<?, ?it/s]

In [24]:
def elastic_search(query):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["term^3", "description", "category"],
                        "type": "best_fields"
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)
    
    result_docs = []
    
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return result_docs

In [30]:
def rag_es(query):
    search_results = elastic_search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [31]:
rag_es(query)

"If a 51% attack occurs on a cryptocurrency network, the potential consequences for users include the ability for the attacking entity to alter the blockchain's history, double-spend coins, or disrupt network operations. This could ultimately compromise the network's integrity and affect users' trust and security in the network."