In [22]:
# Import relevant packages.

import os, json
import itertools
from typing import Any, Tuple, List, Set, Dict, Optional

import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt

In [80]:
# Directory that holds the input CSV/JSON files; it is also passed as the
# cache directory to get_same_num_transactions_clusters.
save_dir = "../data"
# Minimum confidence a matching deposit address needs before its deposit
# transactions are collected by same_num_of_transactions_heuristic.
MIN_CONF: float = 0.2

In [24]:
# File locations, all inside save_dir. The heuristic_4_* paths are not used by
# the cells visible here -- presumably they are where the results get saved
# (TODO confirm).
clusters_file: str = os.path.join(save_dir, 'heuristic_4_clusters.json')
tx2addr_file: str = os.path.join(save_dir, 'heuristic_4_tx2addr.json')
addr2conf_file: str = os.path.join(save_dir, 'heuristic_4_addr2conf.json')
address_file: str = os.path.join(save_dir, 'heuristic_4_address_sets.json')
# JSON file with the Tornado Cash pool addresses.
tornado_addrs_file: str = os.path.join(save_dir, 'tornado_pools.json')

In [25]:
def load_data(root) -> Tuple[pd.DataFrame, pd.DataFrame, Dict[str, Any]]:
    """
    Load the withdraw and deposit transaction tables plus the pool table.

    Reads `lighter_complete_withdraw_txs.csv`, `lighter_complete_deposit_txs.csv`
    and `tornado_pools.json` from the directory `root`.

    Processing applied:
      * withdraw `recipient_address` values are lower-cased;
      * `block_timestamp` values are converted to `pd.Timestamp` in both tables;
      * a `tx_counts` column is added holding, for every row, how many rows
        share its address (`recipient_address` for withdraws, `from_address`
        for deposits);
      * rows whose address has 2 or fewer transactions are dropped.

    Returns:
        (withdraw_df, deposit_df, tornado_addresses) where tornado_addresses is
        the parsed content of `tornado_pools.json`.
    """
    withdraw_df: pd.DataFrame = pd.read_csv(
        os.path.join(root, 'lighter_complete_withdraw_txs.csv'))
    deposit_df: pd.DataFrame = pd.read_csv(
        os.path.join(root, 'lighter_complete_deposit_txs.csv'))

    # Change recipient_address to lowercase.
    withdraw_df['recipient_address'] = withdraw_df['recipient_address'].str.lower()

    # Change block_timestamp field to be a timestamp object.
    for txs_df in (withdraw_df, deposit_df):
        txs_df['block_timestamp'] = txs_df['block_timestamp'].apply(pd.Timestamp)

    # Load TCash pool addresses data. The path is derived from `root` so the
    # function does not depend on a notebook-level global being defined.
    with open(os.path.join(root, 'tornado_pools.json')) as json_file:
        tornado_addresses: Dict[str, Any] = json.load(json_file)

    # Per-row number of transactions made by the row's address. Mapping through
    # the value_counts Series leaves NaN for missing addresses instead of
    # raising, and such rows are then removed by the filter below.
    withdraw_df['tx_counts'] = withdraw_df.recipient_address.map(
        withdraw_df.recipient_address.value_counts())
    deposit_df['tx_counts'] = deposit_df.from_address.map(
        deposit_df.from_address.value_counts())

    # Remove withdraw and deposit transactions of addresses with only 1 or 2
    # transactions.
    withdraw_df = withdraw_df[withdraw_df.tx_counts > 2]
    deposit_df = deposit_df[deposit_df.tx_counts > 2]

    return withdraw_df, deposit_df, tornado_addresses

In [129]:
def same_num_of_transactions_heuristic(
    withdraw_tx: pd.Series,
    withdraw_df: pd.DataFrame,
    addr2deposit: Dict[str, str],
    tornado_addresses: Dict[str, int],
) -> Tuple[bool, Optional[Dict[str, Any]]]:
    """
    Link one withdraw transaction to deposit addresses with matching activity.

    The per-pool withdraw counts of the withdraw's address are computed with
    get_num_of_withdraws, and every deposit address accepted by
    get_same_or_more_num_of_deposits becomes a candidate match.

    `addr2deposit` is used as a mapping address -> {pool: [deposit tx hashes]}.

    Returns (False, None) when the withdraws touch exactly one pool or when no
    deposit address matches. Otherwise returns (True, response) where response
    holds the keys withdraw_txs, deposit_txs, deposit_confs, withdraw_addr,
    deposit_addrs, withdraw_tx2addr, deposit_tx2addr and privacy_score.

    Only candidates whose confidence is at least MIN_CONF contribute to
    deposit_txs / deposit_tx2addr; deposit_addrs, deposit_confs and the
    privacy score cover every candidate.

    NOTE(review): withdraw_addr is read from `from_address`, while
    get_num_of_withdraws groups withdraws by `recipient_address` -- confirm
    this is intended.
    """
    pool_counts, pool_txs = get_num_of_withdraws(
        withdraw_tx, withdraw_df, tornado_addresses)

    # Single-pool activity is skipped: only multi-denominational patterns
    # are considered.
    if len(pool_counts) == 1:
        return (False, None)

    withdraw_addr: str = withdraw_tx.from_address
    # Flatten the per-pool hash lists and point every hash at the address.
    withdraw_txs: List[str] = [
        tx_hash for pool_hashes in pool_txs.values() for tx_hash in pool_hashes]
    withdraw_tx2addr = dict.fromkeys(withdraw_txs, withdraw_addr)

    # Candidate deposit addresses together with their confidence.
    conf_map = get_same_or_more_num_of_deposits(pool_counts, addr2deposit)
    deposit_addrs: List[str] = list(conf_map.keys())
    deposit_confs: List[float] = list(conf_map.values())

    deposit_txs: List[str] = []
    deposit_tx2addr: Dict[str, str] = {}

    for candidate in deposit_addrs:
        candidate_pools: Dict[str, List[str]] = addr2deposit[candidate]
        assert set(pool_txs.keys()) == set(candidate_pools.keys()), \
            "Set of keys do not match."

        if conf_map[candidate] >= MIN_CONF:
            # Every deposit of the candidate, regardless of pool, is recorded
            # and mapped back to the candidate address.
            for tx_hash in itertools.chain.from_iterable(candidate_pools.values()):
                deposit_txs.append(tx_hash)
                deposit_tx2addr[tx_hash] = candidate

    if not deposit_addrs:
        return (False, None)

    privacy_score: float = 1. - 1. / len(deposit_addrs)
    response_dict: Dict[str, Any] = {
        'withdraw_txs': withdraw_txs,
        'deposit_txs': deposit_txs,
        'deposit_confs': deposit_confs,
        'withdraw_addr': withdraw_addr,
        'deposit_addrs': deposit_addrs,
        'withdraw_tx2addr': withdraw_tx2addr,
        'deposit_tx2addr': deposit_tx2addr,
        'privacy_score': privacy_score,
    }
    return (True, response_dict)

In [140]:
# Spot-check the heuristic on a single withdraw (the second row of withdraw_txs).
# NOTE(review): `addr2deposit` is not assigned at top level in any cell visible
# here (it only exists as a local inside functions), and `withdraw_txs` is
# created by the load_data cell further down -- this cell depends on prior
# kernel state and will likely fail on Restart & Run All.
withdraw_tx = withdraw_txs.iloc[1]
same_num_of_transactions_heuristic(withdraw_tx, withdraw_txs, addr2deposit, tornado_addresses)

(True,
 {'withdraw_txs': ['0x6ca53e294a130d9bfca81a349b4a82a6cc9fee3925353562a3e30a4265ef7174',
   '0x4250ccb54a1ba509407740a9f36a6101aec3aeb61a0f8a75f241d442d89a123c',
   '0x3199144ba98021e523c5ec1e4f4dcf64446d69b825da7251d2d2a0a794038d0d'],
  'deposit_txs': ['0x563dfe970efdda6527328f1e001c1dd413d4f21f99a9258dfc0fd16bd67e430c',
   '0xd55f21d1d6f2ea3df4a16389695e1c87c007e54932bc58af89979ea20d18581c',
   '0xc5ddf149530bd3940b715414a9127be0b81b697aef06135e59fd52d60a1a0955'],
  'deposit_confs': [1.0],
  'withdraw_addr': '0xb77562124be8ac967cf7fc24573fe252aa39d95d',
  'deposit_addrs': ['0xad801483efced876060c34c7ed735282ad5f2021'],
  'withdraw_tx2addr': {'0x6ca53e294a130d9bfca81a349b4a82a6cc9fee3925353562a3e30a4265ef7174': '0xb77562124be8ac967cf7fc24573fe252aa39d95d',
   '0x4250ccb54a1ba509407740a9f36a6101aec3aeb61a0f8a75f241d442d89a123c': '0xb77562124be8ac967cf7fc24573fe252aa39d95d',
   '0x3199144ba98021e523c5ec1e4f4dcf64446d69b825da7251d2d2a0a794038d0d': '0xb77562124be8ac967cf7fc24573fe2

In [120]:
def get_num_of_withdraws(
    withdraw_tx: pd.Series, 
    withdraw_df: pd.DataFrame, 
    tornado_addresses: Dict[str, str],
) -> Tuple[Dict[str, int], Dict[str, List[str]]]:
    """
    Given a particular withdraw transaction and the withdraw transactions 
    DataFrame, gets the total withdraws the address made in each pool.

    Only withdraws to the same `recipient_address` whose `block_timestamp` is
    not later than that of `withdraw_tx` are considered; `withdraw_tx` itself
    is always counted in its own pool.

    Returns a pair of dictionaries keyed by pool (the value that
    `tornado_addresses` maps the pool address to):
      * pool -> number of withdraws,
      * pool -> list of the withdraw transaction hashes.

    Raises KeyError if a pool address is missing from `tornado_addresses`.
    """
    cur_withdraw_pool: str = tornado_addresses[withdraw_tx.tornado_cash_address]

    # Other withdraws to the same recipient up to (and including) the time of
    # the given transaction.
    same_recipient_so_far = (
        (withdraw_df.recipient_address == withdraw_tx.recipient_address)
        & (withdraw_df.block_timestamp <= withdraw_tx.block_timestamp)
        & (withdraw_df.hash != withdraw_tx.hash)
    )
    # assign() builds a new frame, so no column is written into a filtered
    # slice of withdraw_df (which would trigger SettingWithCopyWarning).
    subset_df: pd.DataFrame = withdraw_df[same_recipient_so_far].assign(
        tornado_pool=lambda df: df.tornado_cash_address.map(
            lambda x: tornado_addresses[x]))

    withdraw_txs: Dict[str, List[str]] = (
        subset_df.groupby('tornado_pool')['hash'].apply(list).to_dict())

    # add the current transaction to its own pool
    withdraw_txs.setdefault(cur_withdraw_pool, []).append(withdraw_tx.hash)

    # The counts are the lengths of the hash lists, so one groupby is enough.
    withdraw_count: Dict[str, int] = {
        pool: len(tx_hashes) for pool, tx_hashes in withdraw_txs.items()}

    return withdraw_count, withdraw_txs

In [137]:
def get_same_or_more_num_of_deposits(
    withdraw_counts: Dict[str, int], 
    addr2deposit: Dict[str, Dict[str, List[str]]], 
) -> Dict[str, float]:
    """
    Find the deposit addresses compatible with the given withdraw counts.

    An address from `addr2deposit` is kept when compare_transactions accepts
    its deposits against `withdraw_counts`. Its confidence is 1.0 when
    diff_transactions reports no difference, and 1 / difference otherwise.

    Returns a dictionary mapping each kept address to its confidence.
    """
    conf_mapping: Dict[str, float] = dict()
    for address, deposits in addr2deposit.items():
        if not compare_transactions(withdraw_counts, deposits):
            continue
        num_diff: int = diff_transactions(withdraw_counts, deposits)
        # An exact match gets full confidence; also avoids dividing by zero.
        conf_mapping[address] = 1.0 if num_diff == 0 else 1. / num_diff

    return conf_mapping

### Docs: compare_transactions
Given two dictionaries, `withdraw_counts_dict` and `deposit_dict`, describing 
the withdrawals and deposits made by an address to each TCash pool, 
respectively, first checks whether both have the same set of keys (pools). 
If they do, it then checks whether, for every pool, the number of deposits 
in `deposit_dict` is greater than or equal to the corresponding number of 
withdrawals in `withdraw_counts_dict`. Returns True only if both checks 
pass, and False otherwise.

In [30]:
def compare_transactions(
    withdraw_counts_dict: pd.DataFrame, 
    deposit_dict: pd.DataFrame,
) -> bool:
    """
    Tell whether a deposit record can cover a set of withdraw counts.

    `withdraw_counts_dict` is used as a mapping key -> number of withdraws and
    `deposit_dict` as a mapping key -> collection of deposits (only its length
    is used). Returns True when both have exactly the same keys and, for every
    key, there are at least as many deposits as withdraws; False otherwise.
    """
    # Both sides must involve exactly the same keys.
    if set(withdraw_counts_dict.keys()) != set(deposit_dict.keys()):
        return False
    # Each key needs at least as many deposits as withdraws.
    return all(
        len(deposit_dict[key]) >= withdraw_counts_dict[key]
        for key in withdraw_counts_dict.keys()
    )

def diff_transactions(
    withdraw_counts_dict: pd.DataFrame, 
    deposit_dict: pd.DataFrame,
) -> int:
    """
    Total absolute gap between deposit and withdraw counts.

    For every key of `withdraw_counts_dict`, takes the absolute difference
    between the number of deposits (length of `deposit_dict[key]`) and the
    number of withdraws, and returns the sum over all keys.
    """
    return sum(
        abs(len(deposit_dict[key]) - withdraw_counts_dict[key])
        for key in withdraw_counts_dict.keys()
    )

In [90]:
def get_address_deposits(
    deposit_df: pd.DataFrame,
    tornado_addresses: Dict[str, int],
) -> Dict[str, Dict[str, List[str]]]:
    
    """
    Given the deposit transactions DataFrame, returns a 
    dictionary with every address to the transactions they
    deposited, grouped by pool. Pools are named by looking the
    `tornado_cash_address` up in `tornado_addresses`; a missing pool
    address raises KeyError.

    Transaction hashes are de-duplicated and listed in the order in
    which they appear in `deposit_df`.

    Example:
    {
        '0x16e54b35d789832440ab47ae765e6a8098280676': 
            {
                '0.1 ETH': [...],
                '100 USDT': [...],
            },
        '0x35dd029618f4e1835008da21fd98850c776453f0': {
            '0.1 ETH': [...],
        },
        '0xe906442c11b85acbc58eccb253b9a55a20b80a56': {
            '0.1 ETH': [...],
        },
        '0xaf301de836c81deb8dff9dc22745e23c476155b2': {
            '1 ETH': [...],
            '0.1 ETH': [...],
            '10 ETH': [...],
        },
    }
    """
    print('building map from address to deposits made by address...')
    pair_cols: List[str] = ['from_address', 'tornado_cash_address']

    # The (address, pool address) pairs are visited in value_counts order
    # (most frequent first), which fixes the key order of the result.
    pair_order = deposit_df[pair_cols].value_counts().index

    # Collect the hashes of every pair with a single groupby instead of
    # re-filtering the whole DataFrame once per pair.
    hashes_by_pair: Dict[Tuple[str, str], List[str]] = (
        deposit_df.groupby(pair_cols, sort=False)['hash'].apply(list).to_dict())

    addr2deposit: Dict[str, Dict[str, List[str]]] = {}
    for address, pool_address in pair_order:
        # dict.fromkeys removes duplicates while keeping a deterministic order
        # (a plain set would give a different order from run to run).
        unique_hashes: List[str] = list(
            dict.fromkeys(hashes_by_pair[(address, pool_address)]))
        addr2deposit.setdefault(address, {})[
            tornado_addresses[pool_address]] = unique_hashes

    return addr2deposit

In [91]:
def get_same_num_transactions_clusters(
    deposit_df: pd.DataFrame, 
    withdraw_df: pd.DataFrame, 
    tornado_addresses: dict,
    data_dir: str,
):
    """
    Same Number of Transactions Heuristic.
    If there are multiple (say 12) deposit transactions coming from 
    a deposit address and later there are 12 withdraw transactions 
    to the same withdraw address, *then we can link all these deposit 
    transactions to the withdraw transactions*. 

    The address -> deposits mapping is cached as
    `same_num_txs_addr2deposit.json` inside `data_dir`: it is loaded when the
    file exists, otherwise built with get_address_deposits and written there.

    Every row of `withdraw_df` is passed to same_num_of_transactions_heuristic
    and the positive results are accumulated.

    Returns the tuple (tx_clusters, address_sets, tx2addr, addr2conf):
      * tx_clusters  - one set of linked transaction hashes per match;
      * address_sets - [withdraw_addr, deposit_addr] pairs (distinct addresses);
      * tx2addr      - transaction hash -> address;
      * addr2conf    - (withdraw_addr, deposit_addr) -> confidence.
    """
    cache_path: str = os.path.join(data_dir, 'same_num_txs_addr2deposit.json')
    if not os.path.isfile(cache_path):
        addr2deposit = get_address_deposits(deposit_df, tornado_addresses)
        with open(cache_path, "w") as outfile:
            json.dump(addr2deposit, outfile, indent=4)
    else:
        print('Found cached deposit mapping: loading...')
        with open(cache_path, 'r') as json_file:
            addr2deposit = json.load(json_file)

    tx_clusters: List[Set[str]] = []
    tx2addr: Dict[str, str] = {}
    address_sets: List[Set[str]] = []
    addr2conf: Dict[Tuple[str, str], float] = {}

    print('Processing withdraws')

    withdraw_rows = tqdm(
        withdraw_df.itertuples(), total=len(withdraw_df), mininterval=5)
    for withdraw_row in withdraw_rows:
        matched, response = same_num_of_transactions_heuristic(
            withdraw_row, withdraw_df, addr2deposit, tornado_addresses)
        if not matched:
            continue

        source_addr: str = response['withdraw_addr']

        # Record every (withdraw, deposit) address pair with its confidence,
        # skipping the case where both sides are the same address.
        for candidate_addr, candidate_conf in zip(
                response['deposit_addrs'], response['deposit_confs']):
            if candidate_addr == source_addr:
                continue
            address_sets.append([source_addr, candidate_addr])
            addr2conf[(source_addr, candidate_addr)] = candidate_conf

        # Transaction-level bookkeeping: hash -> address and the linked cluster.
        tx2addr.update(response['withdraw_tx2addr'])
        tx2addr.update(response['deposit_tx2addr'])
        tx_clusters.append(
            set(response['withdraw_txs'] + response['deposit_txs']))

    return tx_clusters, address_sets, tx2addr, addr2conf

In [None]:
# Load the filtered withdraw/deposit tables and the pool table from save_dir.
withdraw_txs, deposit_txs, tornado_addresses = load_data(save_dir)

In [141]:
# main
clusters, address_sets, tx2addr, addr2conf = get_same_num_transactions_clusters(
    deposit_txs, withdraw_txs, tornado_addresses, save_dir)

Found cached deposit mapping: loading...
Processing withdraws


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 53485/53485 [12:11<00:00, 73.11it/s]


In [144]:
# Inspect the transaction hash -> address mapping. This displays the whole
# dict, which can be very large.
tx2addr

{'0x6ca53e294a130d9bfca81a349b4a82a6cc9fee3925353562a3e30a4265ef7174': '0xb77562124be8ac967cf7fc24573fe252aa39d95d',
 '0x4250ccb54a1ba509407740a9f36a6101aec3aeb61a0f8a75f241d442d89a123c': '0xb77562124be8ac967cf7fc24573fe252aa39d95d',
 '0x3199144ba98021e523c5ec1e4f4dcf64446d69b825da7251d2d2a0a794038d0d': '0xb77562124be8ac967cf7fc24573fe252aa39d95d',
 '0x563dfe970efdda6527328f1e001c1dd413d4f21f99a9258dfc0fd16bd67e430c': '0xad801483efced876060c34c7ed735282ad5f2021',
 '0xd55f21d1d6f2ea3df4a16389695e1c87c007e54932bc58af89979ea20d18581c': '0xad801483efced876060c34c7ed735282ad5f2021',
 '0xc5ddf149530bd3940b715414a9127be0b81b697aef06135e59fd52d60a1a0955': '0xad801483efced876060c34c7ed735282ad5f2021',
 '0xc88addb452a33385ce282b973ccc7a72c9631d4fa3e46803eb758f6cbab68f69': '0x41a28335c5075c81502a97cebad597f28728a815',
 '0x49822ce5e04ed6984f7bfd71b79b817ea6da95e7a3a6984c5275e806f203323e': '0x7d3bb46c78b0c4949639ce34896bfd875b97ad08',
 '0x314d5c655dec318f303bd2d5d024650adbf0f95bdadf4f27f5055d24f4f0