# Ingestion

In [2]:
import pandas as pd
import minsearch
from openai import OpenAI
import os
from dotenv import load_dotenv

In [3]:
import json

In [4]:
with open('../data/documents-with-ids.json','rb') as f:
    documents = json.load(f)

In [5]:
documents[0]

{'term': '34% Attack',
 'category': 'Security and Attacks',
 'description': 'A 34% attack occurs when an entity controls more than 34% of a blockchain’s network power or stake, potentially manipulating consensus mechanisms, especially in Proof-of-Stake networks. This level of control can disrupt the network, validate fraudulent transactions, or halt consensus, compromising the integrity of the blockchain.',
 'id': '4c1e419c'}

In [6]:
index = minsearch.Index(text_fields=["term", "category", "description","id"],
                        keyword_fields=[]
)

index.fit(documents)

<minsearch.Index at 0x79a667a55d00>

# RAG flow

In [7]:
load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')

In [8]:
client = OpenAI(api_key=api_key)

In [22]:
def search(q):
    boost = {'term': 1.0, 'category': 1.0}

    results = index.search(
        query=q,
        boost_dict=boost,
        num_results=10
    )
    return results

prompt_template = """
You're a crypto linguist. Answer the QUESTION based on the CONTEXT from our specially curated database.
Use only the facts from the CONTEXT when answering the QUESTION and do not add anything else. 
If context doesn't provide relevant information, mention that you need to
update the knowledge base to answer the user question.

QUESTION: {question}
CONTEXT: 
{context}
""".strip()

entry_template = """
id:{id}
term:{term}
category:{category}
description:{description}
""".strip()

def build_prompt(query, search_results):
    context = ""

    for doc in search_results:
        context = context + entry_template.format(**doc) + "\n\n"

    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

def llm(prompt,model='gpt-4o-mini'):
    response = client.chat.completions.create(model=model,
               messages = [{"role":"user","content":prompt}])
    return response.choices[0].message.content


def rag_response(q,model='gpt-4o-mini'):
    context = search(q)
    prompt = build_prompt(q, context)
    answer = llm(prompt,model=model)
    return answer

In [9]:
query = 'explain the difference between proof of work and proof of stake?'
answer = rag_response(query)
answer

'Proof of Work (PoW) is a consensus algorithm where miners solve complex cryptographic puzzles to validate transactions and create new blocks, requiring significant computational power and energy. This mechanism ensures network security and prevents double-spending by making attacks costly and resource-intensive.\n\nIn contrast, Proof of Stake (PoS) is a consensus mechanism where validators are chosen to create new blocks based on the amount of cryptocurrency they hold and are willing to "stake" as collateral. PoS is energy-efficient compared to PoW and incentivizes validators to act in the network’s best interest.\n\nIn summary, the main differences are:\n\n- **Mechanism**: PoW relies on computational power and puzzle-solving, while PoS relies on the amount of cryptocurrency staked.\n- **Energy Consumption**: PoW requires significant energy, whereas PoS is more energy-efficient.'

# Retrieval Evaluation

In [11]:
df_eval = pd.read_csv('../data/crypto-guru-ground-truth-data.csv')

In [12]:
df_eval.columns = ['question','id']

In [13]:
ground_truth = df_eval.to_dict(orient='records')

In [14]:
ground_truth[0]

{'question': 'What happens during a 34% attack in a blockchain network?',
 'id': '4c1e419c'}

In [14]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

def minsearch_search(q):
    boost = {}

    results = index.search(
        query=q['question'],
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results
    
def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['id']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [28]:
from tqdm.auto import tqdm
evaluate(ground_truth, lambda q: minsearch_search(q))

# Finding the best parameters for best retrieval

In [16]:
df_valid = df_eval[:100]
df_test = df_eval[100:]

In [17]:
import random

def simple_optimize(param_ranges, objective_function, n_iterations=10):
    best_params = None
    best_score = float('-inf')  # Assuming we're minimizing. Use float('inf') if maximizing.

    for _ in range(n_iterations):
        # Generate random parameters
        current_params = {}
        for param, (min_val, max_val) in param_ranges.items():
            if isinstance(min_val, int) and isinstance(max_val, int):
                current_params[param] = random.randint(min_val, max_val)
            else:
                current_params[param] = random.uniform(min_val, max_val)
        
        # Evaluate the objective function
        current_score = objective_function(current_params)
        
        # Update best if current is better
        if current_score > best_score:  # Change to > if maximizing
            best_score = current_score
            best_params = current_params
    
    return best_params, best_score

In [18]:
gt_valid = df_valid.to_dict(orient='records')

In [19]:
def ms_search(q, boost=None):
    if boost is None:
        boost = {}

    results = index.search(
        query=q['question'],
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

In [20]:
param_ranges = {
    'term': (0.0, 3.0),
    'category': (0.0, 3.0),
    'descriptions':(0.0,3.0)
}

def objective(boost_params):
    def search_function(q):
        return ms_search(q, boost_params)

    results = evaluate(gt_valid, search_function)
    return results['mrr']

In [21]:
simple_optimize(param_ranges, objective, n_iterations=20)

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 334.76it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 339.61it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 354.50it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 273.78it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 271.43it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 262.99it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 342.55it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 328.90it/s]
100%|███████████████████████████

({'term': 0.39530280957136266,
  'category': 0.15201292701371005,
  'descriptions': 0.2296734857432715},
 0.9372619047619049)

In [23]:
# Evaluating entire ground truth dataset with optimized parameters on minsearch boosting
def minsearch_optimized(q):
    boost={'term':0.39,
           'category':0.15,
           'descriptions':0.23
          }

    results = index.search(
        query=q['question'],
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

In [25]:
evaluate(ground_truth, lambda q: minsearch_optimized(q))

100%|████████████████████████████████████████████████████████████████████████████████| 903/903 [00:02<00:00, 329.01it/s]


{'hit_rate': 0.9955703211517165, 'mrr': 0.9572905131044668}

# RAG Evaluation : gpt-4o-mini

In [15]:
prompt2_template = """
You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [16]:
len(ground_truth)

903

In [17]:
ground_truth[0]

{'question': 'What happens during a 34% attack in a blockchain network?',
 'id': '4c1e419c'}

In [18]:
question = ground_truth[0]['question']

In [19]:
rag_response(question)

"During a 34% attack in a blockchain network, an entity that controls more than 34% of the network's power or stake can manipulate consensus mechanisms. This level of control allows the entity to disrupt the network, validate fraudulent transactions, or halt consensus, ultimately compromising the integrity of the blockchain."

In [20]:
df_sample = df_eval.sample(n=100, random_state=1)

In [21]:
sample = df_sample.to_dict(orient='records')

In [37]:
evaluations = []

for record in tqdm(sample):
    question = record['question']
    answer_llm = rag_response(question) 

    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm
    )

    evaluation = llm(prompt)
    evaluation = json.loads(evaluation)

    evaluations.append((record, answer_llm, evaluation))

100%|█████████████████████████████████████████████████████████████████████████████████| 100/100 [06:12<00:00,  3.72s/it]


In [38]:
df_result = pd.DataFrame(evaluations, columns=['record', 'answer', 'evaluation'])

df_result['id'] = df_result.record.apply(lambda d: d['id'])
df_result['question'] = df_result.record.apply(lambda d: d['question'])

df_result['relevance'] = df_result.evaluation.apply(lambda d: d['Relevance'])
df_result['explanation'] = df_result.evaluation.apply(lambda d: d['Explanation'])

del df_result['record']
del df_result['evaluation']

In [40]:
df_result.relevance.value_counts(normalize=True)

relevance
RELEVANT           0.86
NON_RELEVANT       0.09
PARTLY_RELEVANT    0.05
Name: proportion, dtype: float64

In [42]:
df_result[df_result['relevance']=='NON_RELEVANT']

Unnamed: 0,answer,id,question,relevance,explanation
8,The context provided does not include any info...,dd4a43c6,How does the mechanism of 'Rebase' affect the ...,NON_RELEVANT,The generated answer does not address the ques...
14,I need to update the knowledge base to answer ...,791e80c7,Who created Polkadot and what were the main is...,NON_RELEVANT,The generated answer does not provide any info...
27,The context does not provide specific examples...,9a4b625f,What are some examples of security techniques ...,NON_RELEVANT,The generated answer does not address the ques...
38,I need to update the knowledge base to answer ...,9fccc278,What is the purpose of the Tron blockchain pla...,NON_RELEVANT,The generated answer does not address the ques...
50,The context provided does not contain informat...,30901c73,How does timestamping contribute to the securi...,NON_RELEVANT,The generated answer explicitly states that it...
63,The context does not provide relevant informat...,92adce1a,How does Enigma ensure the confidentiality of ...,NON_RELEVANT,The generated answer indicates that there is n...
78,The context does not provide specific informat...,e560504b,How does Trust Wallet ensure the security of m...,NON_RELEVANT,The generated answer explicitly states that th...
94,The context provided does not contain specific...,74e61101,In what way do smart contracts contribute to t...,NON_RELEVANT,The generated answer does not address the ques...
95,The context provided does not include any spec...,c7775f58,What role does community engagement play in th...,NON_RELEVANT,The generated answer fails to address the ques...


# RAG Evaluation : gpt-4o

In [29]:
# Cosine Similarity : can't perform as we don't have original answers 

In [24]:
rag_response(question,model='gpt-4o')

"During a 34% attack in a blockchain network, an entity that controls more than 34% of the network's power or stake can potentially manipulate the consensus mechanisms, particularly in Proof-of-Stake (PoS) networks. Such a level of control allows the entity to disrupt the network by validating fraudulent transactions or halting consensus. This compromises the integrity of the blockchain by enabling actions that can undermine its security and decentralization."

In [30]:
evaluations_gpt4o = []

for record in tqdm(sample):
    question = record['question']
    answer_llm = rag_response(question,model='gpt-4o') 

    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm
    )

    evaluation = llm(prompt)
    evaluation = json.loads(evaluation)

    evaluations_gpt4o.append((record, answer_llm, evaluation))

100%|█████████████████████████████████████████████████████████████████████████████████| 100/100 [05:08<00:00,  3.09s/it]


In [31]:
df_result_gpt4o = pd.DataFrame(evaluations_gpt4o, columns=['record', 'answer', 'evaluation'])

df_result_gpt4o['id'] = df_result_gpt4o.record.apply(lambda d: d['id'])
df_result_gpt4o['question'] = df_result_gpt4o.record.apply(lambda d: d['question'])

df_result_gpt4o['relevance'] = df_result_gpt4o.evaluation.apply(lambda d: d['Relevance'])
df_result_gpt4o['explanation'] = df_result_gpt4o.evaluation.apply(lambda d: d['Explanation'])

del df_result_gpt4o['record']
del df_result_gpt4o['evaluation']

In [32]:
df_result_gpt4o.relevance.value_counts(normalize=True)

relevance
RELEVANT           0.87
NON_RELEVANT       0.07
PARTLY_RELEVANT    0.06
Name: proportion, dtype: float64

In [33]:
df_result_gpt4o[df_result_gpt4o['relevance']=='NON_RELEVANT']

Unnamed: 0,answer,id,question,relevance,explanation
8,I need to update the knowledge base to answer ...,dd4a43c6,How does the mechanism of 'Rebase' affect the ...,NON_RELEVANT,The generated answer does not address the ques...
14,I need to update the knowledge base to answer ...,791e80c7,Who created Polkadot and what were the main is...,NON_RELEVANT,The generated answer does not address the ques...
38,I need to update the knowledge base to answer ...,9fccc278,What is the purpose of the Tron blockchain pla...,NON_RELEVANT,The generated answer does not provide any info...
50,I need to update the knowledge base to answer ...,30901c73,How does timestamping contribute to the securi...,NON_RELEVANT,The generated answer does not address the ques...
63,I need to update the knowledge base to answer ...,92adce1a,How does Enigma ensure the confidentiality of ...,NON_RELEVANT,The generated answer does not address the ques...
94,I need to update the knowledge base to answer ...,74e61101,In what way do smart contracts contribute to t...,NON_RELEVANT,The generated answer does not address the ques...
95,The context provided does not cover the role o...,c7775f58,What role does community engagement play in th...,NON_RELEVANT,The generated answer explicitly states that it...
