In [11]:
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv
import os
import json

In [3]:
load_dotenv('.envrc') #file with OPENAI_API_KEY
api_key = os.getenv('OPENAI_API_KEY')
client = OpenAI(api_key=api_key)

In [16]:
with open('documents-with-ids.json','rb') as f:
    documents = json.load(f)

In [17]:
documents[0]

{'term': '34% Attack',
 'category': 'Security and Attacks',
 'description': 'A 34% attack occurs when an entity controls more than 34% of a blockchain’s network power or stake, potentially manipulating consensus mechanisms, especially in Proof-of-Stake networks. This level of control can disrupt the network, validate fraudulent transactions, or halt consensus, compromising the integrity of the blockchain.',
 'id': '4c1e419c'}

In [18]:
prompt_template = """
You emulate a novice in cryptocurrencies and blockchain space and a user of our crypto-guru-ama-bot.
Formulate 3 questions this user might ask which can be answered using the description.
Make the questions specific to the term.
The record description should be enough to answer the questions. The questions should
be complete and not too short. Use as fewer words as possible from the record. 

The record:
id:{id}
term: {term}
category: {category}
description: {description}

Provide the output in parsable JSON without using code blocks:

{{"questions": ["question1", "question2", "question3"]}}
""".strip()

In [6]:
prompt = prompt_template.format(**documents[0])

In [7]:
print(prompt)

You emulate a novice in cryptocurrencies and blockchain space and a user of our crypto-guru-ama-bot.
Formulate 3 questions this user might ask which can be answered using the description.
Make the questions specific to the term.
The record description should be enough to answer the questions. The questions should
be complete and not too short. Use as fewer words as possible from the record. 

The record:
id:0
term: 34% Attack
category: Security and Attacks
description: A 34% attack occurs when an entity controls more than 34% of a blockchain’s network power or stake, potentially manipulating consensus mechanisms, especially in Proof-of-Stake networks. This level of control can disrupt the network, validate fraudulent transactions, or halt consensus, compromising the integrity of the blockchain.

Provide the output in parsable JSON without using code blocks:

{"questions": ["question1", "question2", "question3"]}


In [8]:
def llm(prompt):
    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [23]:
questions = llm(prompt)

In [24]:
#import json
json.loads(questions)

{'questions': ['What happens during a 34% attack on a blockchain?',
  "How does controlling 34% of a blockchain's network power affect consensus mechanisms?",
  'What are the potential consequences of a 34% attack in Proof-of-Stake networks?']}

In [19]:
from tqdm.auto import tqdm

In [20]:
def generate_questions(doc):
    prompt = prompt_template.format(**doc)

    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": prompt}]
    )

    json_response = response.choices[0].message.content
    return json_response

In [21]:
results = {}
for doc in tqdm(documents): 
    doc_id = doc['id']
    if doc_id in results:
        continue
    questions = generate_questions(doc)
    results[doc_id] = questions

  0%|          | 0/302 [00:00<?, ?it/s]

In [25]:
results['4c1e419c']

'{"questions": ["What happens during a 34% attack in a blockchain network?", "How can an entity disrupt a blockchain with a 34% attack?", "Why is controlling more than 34% of a network power significant in Proof-of-Stake systems?"]}'

In [26]:
parsed_result = {}

for doc_id, json_questions in results.items():
    parsed_result[doc_id] = json.loads(json_questions)

In [40]:
with open('ground-truth-results.json', 'wt') as f_out:
    json.dump(parsed_result, f_out, indent=2)

In [27]:
parsed_result

{'4c1e419c': {'questions': ['What happens during a 34% attack in a blockchain network?',
   'How can an entity disrupt a blockchain with a 34% attack?',
   'Why is controlling more than 34% of a network power significant in Proof-of-Stake systems?']},
 '078e9f04': {'questions': ['What happens when an entity achieves a 51% attack on a blockchain network?',
   "How can controlling over 50% of mining power affect a blockchain's integrity?",
   'What are the potential consequences of a 51% attack for coin transactions?']},
 '9afbd80e': {'questions': ['What is the primary function of the Aave protocol?',
   'Can you explain what a flash loan is in the context of Aave?',
   'How does Aave enable users to earn interest on their deposited cryptocurrencies?']},
 'f427b58e': {'questions': ['What are Aavegotchis and how do they relate to decentralized finance?',
   'How can users earn rewards in the Aavegotchi game?',
   'What unique features can users customize in their Aavegotchis?']},
 '1ab064

In [33]:
parsed_result['970cb970']['questions']

["What is zkSync and how does it relate to Ethereum's scalability?",
 'Can you explain how zkSync maintains security and decentralization while processing transactions off-chain?',
 'How does zkSync achieve low-cost transactions compared to the Ethereum mainnet?']

In [35]:
final_results = []

for doc_id, questions in parsed_result.items():
    for q in questions['questions']:
        final_results.append((q, doc_id))

In [36]:
final_results

[('What happens during a 34% attack in a blockchain network?', '4c1e419c'),
 ('How can an entity disrupt a blockchain with a 34% attack?', '4c1e419c'),
 ('Why is controlling more than 34% of a network power significant in Proof-of-Stake systems?',
  '4c1e419c'),
 ('What happens when an entity achieves a 51% attack on a blockchain network?',
  '078e9f04'),
 ("How can controlling over 50% of mining power affect a blockchain's integrity?",
  '078e9f04'),
 ('What are the potential consequences of a 51% attack for coin transactions?',
  '078e9f04'),
 ('What is the primary function of the Aave protocol?', '9afbd80e'),
 ('Can you explain what a flash loan is in the context of Aave?', '9afbd80e'),
 ('How does Aave enable users to earn interest on their deposited cryptocurrencies?',
  '9afbd80e'),
 ('What are Aavegotchis and how do they relate to decentralized finance?',
  'f427b58e'),
 ('How can users earn rewards in the Aavegotchi game?', 'f427b58e'),
 ('What unique features can users customi

In [37]:
df = pd.DataFrame(final_results, columns=['question', 'document'])

In [42]:
df.head()

Unnamed: 0,question,document
0,What happens during a 34% attack in a blockcha...,4c1e419c
1,How can an entity disrupt a blockchain with a ...,4c1e419c
2,Why is controlling more than 34% of a network ...,4c1e419c
3,What happens when an entity achieves a 51% att...,078e9f04
4,How can controlling over 50% of mining power a...,078e9f04


In [43]:
df.to_csv('crypto-guru-ground-truth-data.csv',index=False)