# Data collection

**[Johnnatan Messias](https://johnnatan-messias.github.io/), March 2025**

This notebook is used to gather data from dYdX, 1inch, Compound, Uniswap, SushiSwap, ENS, and others on the Ethereum Mainnet.
See also the two repositories below:

1. https://github.com/johnnatan-messias/ethereum-crawler/blob/main/1-dataset.ipynb
1. https://github.com/johnnatan-messias/chainlink-data-feed-crawler


In [1]:
import gzip
import pickle
from tqdm import tqdm
import requests as re

In [2]:
import sys
import os

# Make the project's local modules importable: they live in the sibling
# ``src`` directory, resolved relative to the current working directory.
code_dir = os.path.realpath(os.path.join(os.getcwd(), "..", "src"))

# Guard the append so that re-running this cell does not keep stacking
# duplicate entries onto sys.path.
if code_dir not in sys.path:
    sys.path.append(code_dir)

In [3]:
from ethereum import (get_contract,
                      get_all_events_from_contract,
                      get_batch_intervals, get_transactions)
from utils import Utils

In [4]:
# Per-contract crawl settings for Ethereum mainnet, keyed by contract name.
# The cells below read each entry's 'address', 'start' and optional
# 'abi-proxy' fields.
contract_settings = Utils.load_contract_settings(chain="mainnet")
configured_contracts = contract_settings.keys()
print(configured_contracts)

dict_keys(['ens', 'ens_register', 'dydx', 'dydx-merkle-distributor', 'compound', 'uniswap', 'sushi', '1inch', 'tornadocash', 'worldcoin', 'lido', 'lido-token', 'arkham-token', 'arkham-airdrop'])


In [5]:
# Absolute, symlink-resolved location of the ``data`` folder that sits two
# directory levels above the current working directory. A trailing path
# separator is appended to the resolved path.
parent_data_path = os.path.join(os.getcwd(), os.pardir, os.pardir, "data")
data_dir = os.path.realpath(parent_data_path) + os.sep

print(data_dir)

/Users/johnnatanmessias/research-projects/data/


In [6]:
# Output locations, relative to the current working directory. Both folders
# are created up front; existing folders are left as they are.
path_dir = '../data/'
path_plots = '../plots/'
for output_dir in (path_dir, path_plots):
    os.makedirs(output_dir, exist_ok=True)

In [7]:
# Highest block number to crawl. It is passed as `end_block` when collecting
# events below, so every contract is scanned up to this same block height.
last_block = 21_800_000

# Data gathering from archive node


In [8]:
from web3 import Web3

In [9]:
# The node endpoint and the Etherscan key come from the environment, so no
# credential is stored in the notebook; a missing variable raises KeyError.
eth_node = os.environ['ETHEREUM_NODE_URL']
etherscan_api_key = os.environ["ETHERSCAN_API_KEY"]

# NOTE: `re` is the `requests` package here (aliased in the import cell),
# not the standard-library regular-expression module.
# One HTTP session whose adapter is configured for a pool of 20 connections,
# registered for both plain and TLS URLs.
session = re.Session()
adapter = re.adapters.HTTPAdapter(pool_connections=20, pool_maxsize=20)
for url_prefix in ('http://', 'https://'):
    session.mount(url_prefix, adapter)

# Web3 client that talks to the node through the pooled session above.
provider = Web3.HTTPProvider(
    eth_node,
    session=session,
    request_kwargs={'timeout': 60},
)
w3 = Web3(provider)

# Quick sanity check that the node answers before any crawling starts.
node_is_reachable = w3.is_connected()
print("Is connected to Ethereum node: ", node_is_reachable)
latest_block_number = w3.eth.block_number
print("The most recent block is: ", latest_block_number)

Is connected to Ethereum node:  True
The most recent block is:  22174062


In [10]:
# Build one contract object per configured entry, keyed by contract name.
contracts = {}
for contract, settings in contract_settings.items():
    # 'abi-proxy' is optional in the settings; None is forwarded when absent.
    abi_contract = settings.get('abi-proxy')
    contracts[contract] = get_contract(
        w3,
        settings['address'],
        abi_contract_address=abi_contract,
    )

In [11]:
# Print every contract name followed by the names of the events it exposes,
# one event per tab-indented line.
for contract, contract_obj in contracts.items():
    print(contract)
    for event in contract_obj.events:
        print('\t', event.event_name)

ens
	 Approval
	 Claim
	 DelegateChanged
	 DelegateVotesChanged
	 MerkleRootChanged
	 OwnershipTransferred
	 Transfer
ens_register
	 NameRegistered
	 NameRenewed
	 OwnershipTransferred
dydx
	 Approval
	 DelegateChanged
	 DelegatedPowerChanged
	 OwnershipTransferred
	 Transfer
	 TransferAllowlistUpdated
	 TransfersRestrictedBeforeUpdated
dydx-merkle-distributor
	 AlwaysAllowClaimForUpdated
	 EpochScheduleUpdated
	 IpfsUpdatePeriodUpdated
	 IpnsNameUpdated
	 RewardsClaimed
	 RewardsOracleChanged
	 RewardsParametersUpdated
	 RoleAdminChanged
	 RoleGranted
	 RoleRevoked
	 RootProposed
	 RootUpdated
	 RootUpdatesPaused
	 RootUpdatesUnpaused
compound
	 Approval
	 DelegateChanged
	 DelegateVotesChanged
	 Transfer
uniswap
	 Approval
	 DelegateChanged
	 DelegateVotesChanged
	 MinterChanged
	 Transfer
sushi
	 Approval
	 DelegateChanged
	 DelegateVotesChanged
	 OwnershipTransferred
	 Transfer
1inch
	 Approval
	 OwnershipTransferred
	 Transfer
tornadocash
	 Allowed
	 Approval
	 Disallowed
	 Paused

In [None]:
# Crawl the events of every configured contract, from the contract's own
# 'start' block up to `last_block`, and store each result as a gzip-compressed
# pickle named events_<contract_name>.pkl.gz inside `path_dir`.
# NOTE(review): batch_size is presumably the number of blocks per request and
# events=None presumably means "all events" - confirm in
# ethereum.get_all_events_from_contract.
batch_size = 1000
max_workers = 10
for contract_name, settings in contract_settings.items():
    print(contract_name)
    events = get_all_events_from_contract(
        contracts[contract_name],
        start_block=settings['start'],
        end_block=last_block,
        batch_size=batch_size,
        max_workers=max_workers,
        events=None,
    )
    output_path = f"{path_dir}events_{contract_name}.pkl.gz"
    with gzip.open(output_path, 'wb') as f:
        pickle.dump(events, f)

In [10]:
# Overall block range of the crawl: the earliest configured 'start' block
# across all contracts, and the shared end block `last_block`.
# With no configured contract the values stay at inf and 0 respectively.
start_blocks = [settings['start'] for settings in contract_settings.values()]
block_min = min(start_blocks, default=float('inf'))
block_max = max(0, last_block) if start_blocks else 0
print("The min block and max block are: ", block_min, block_max)

The min block and max block are:  9601359 21800000


In [24]:
# Names of the contracts that already have an events file in `path_dir`,
# recovered by stripping the fixed "events_" prefix and ".pkl.gz" suffix
# from each matching file name.
events_prefix, events_suffix = "events_", ".pkl.gz"
contract_names = sorted(
    filename[len(events_prefix):-len(events_suffix)]
    for filename in os.listdir(path_dir)
    if filename.startswith(events_prefix) and filename.endswith(events_suffix)
)
contract_names

['1inch',
 'compound',
 'dydx',
 'ens',
 'ens_register',
 'lido',
 'lido-token',
 'sushi',
 'tornadocash',
 'uniswap',
 'worldcoin']

In [25]:
def load_dataset(contract_name):
    """Read back the pickled event data stored for one contract.

    Args:
        contract_name: Name of the contract; the file that gets read is
            ``events_<contract_name>.pkl.gz`` inside ``path_dir``.

    Returns:
        The object that was pickled into that file.

    Note:
        Unpickling can execute arbitrary code, so only use this on files
        that come from a trusted source.
    """
    dataset_path = ''.join((path_dir, 'events_', contract_name, '.pkl.gz'))
    with gzip.open(dataset_path, 'rb') as handle:
        return pickle.load(handle)

In [26]:
# Collect the hash of every transaction that emitted at least one stored event.
#
# Iterate over `contract_names` (the datasets actually present in `path_dir`)
# rather than every key of `contract_settings`, so that a configured contract
# without a saved events file does not abort the loop with FileNotFoundError.
txs = set()
n_event_txs = 0  # hashes counted before de-duplication (one per event log)
for contract_name in tqdm(contract_names):
    events = load_dataset(contract_name)
    for event in events:
        tx_hashes = [tx['transactionHash'].hex().lower()
                     for tx in events[event]]
        n_event_txs += len(tx_hashes)
        # Feeding the set per event avoids materialising one list that holds
        # every (duplicated) hash before de-duplicating.
        txs.update(tx_hashes)
print("There are {} transactions".format(n_event_txs))
print("There are {} unique transactions".format(len(txs)))

100%|██████████| 11/11 [03:37<00:00, 19.81s/it]


There are 26311951 transactions
There are 16440146 unique transactions


In [None]:
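# NOTE: `txs` is a set at this point and a set cannot be sliced, so it is
# turned into a sorted list first; the `txs/` sub-directory must also exist
# before the batches are written.
# txs_list = sorted(txs)
# os.makedirs(path_dir + 'txs/', exist_ok=True)
# c = 1
# for index, thumb in tqdm(get_batch_intervals(0, len(txs_list), 200_000)):
#     txs_data = get_transactions(w3, txs_list[index:thumb])
#     with gzip.open(path_dir + 'txs/txs_' + str(c) + '.pkl.gz', 'wb') as f:
#         pickle.dump(txs_data, f)
#     c += 1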