# PARSING etherscan.io

I made an attempt to parse more full information about user's transactions.

In [13]:
%load_ext autoreload
%autoreload 2
%aimport

import requests
from bs4 import BeautifulSoup
from user_agent import generate_user_agent
from application.load_transaction_data import load_ether_data, load_token_data
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
import random
import time
import warnings

warnings.filterwarnings("ignore")

transactions_df = load_ether_data()\
    .query("Status != 'Error(0)'")
token_df = load_token_data()

transactions_transfers_df = pd.read_pickle("data/transactions_transfers_df")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Modules to reload:
all-except-skipped

Modules to skip:



In [79]:
def export_transfer_list(transfer: str):
    # try:
    #     amount = re.findall(r"(\d+\.\d+|\d+\,\d+)", transfer)[0]
    # except IndexError:
    #     amount = re.findall(r"\d+", transfer)[0]
    amount = float(transfer.split("TRANSFER")[1].split("Ether")[0].strip(" ").replace(",", ""))
    from_ = re.findall(r"From (.*?) To", transfer)[0]
    to_ = transfer.split("  ")[-1]
    return amount, from_, to_


def extract_token_info(token_url):
    result_dict = {}

    # Parse from and to
    try:
        result_dict['from'] =  token_url.find("span", class_='hash-tag text-truncate hash-tag-custom-from tooltip-address').text
    except Exception as e:
        result_dict['from'] = e
    try:
        result_dict['to'] = token_url.find("span", class_='hash-tag text-truncate hash-tag-custom-to tooltip-address').text
    except Exception as e:
        result_dict['to'] = e

    # Parse USD value. Check, where or not does USD value exist on this page?
    try:
        token1_usd = re.findall(r"\(\$.*?\)", str(token_url))[0]
        token1_usd = token1_usd.strip("()").replace(" (", "").replace("$", "")
        result_dict['usd'] = float(token1_usd.replace(",", "_"))
        result_dict['amount'] = token_url.find("span", 'data-toggle'=='tooltip').text
        result_dict['amount'] = float(result_dict['amount'].replace(",", ""))
    except (AttributeError, IndexError) as e:
        result_dict['usd'] = 'empty'
        result_dict['amount'] = token_url.find_all("span", class_='mr-1')[-1].text
        result_dict['amount'] = float(result_dict['amount'].replace(",", ""))

    # Parse token hash in order to parse full info later
    try:
        result_dict['token_hash'] = token_url.find("a").get("href").split("?")[0].split("/")[2]
    except Exception as e:
        result_dict['token_hash'] = e
    return result_dict

def extract_tokens_info(transaction_hash):
    req = requests.get(f"https://etherscan.io/tx/{transaction_hash}",
                   headers={"User-Agent": generate_user_agent()})
    content = req.content
    soup = BeautifulSoup(content)
    tokens_df = []
    bad_tokens = []
    errors = []
    all_tokens = soup.find_all("li", class_='media align-items-baseline mb-2')
    for index, tokens in enumerate(all_tokens):
        try:
            tokens_df.append(
                pd.DataFrame.from_dict(extract_token_info(all_tokens[index]), orient='index').T
            )
        except Exception as e:
            bad_tokens.append(index)
            errors.append(e)
    try:

        tokens_df = pd.concat(tokens_df)
    except ValueError:
        tokens_df = pd.DataFrame(data=[transaction_hash], columns=['Txhash'])

    tokens_df.index = np.arange(tokens_df.shape[0])
    # TRANSFER parsing
    transfer_text_list = []
    transfers = soup.find_all("li", class_='media align-items-baseline')
    if len(transfers) > 0:
        for transf in transfers:
            transfer_text_list.append(transf.text.replace("\xa0", ""))

        transfer_df = pd.DataFrame(list(map(export_transfer_list, transfer_text_list)), columns=['transfer_amount', 'transfer_from', 'transfer_to'])
        tokens_df = pd.concat([tokens_df, transfer_df], axis=1)

    tokens_df['Txhash'] = transaction_hash
    return tokens_df

all_tokens_df = []
transactions_list = set(transactions_df['Txhash'])
for tx_hash in tqdm(transactions_list):
    try:
        time.sleep(random.uniform(0, 1))
        all_tokens_df.append(extract_tokens_info(tx_hash))
    except Exception as e:
        print(e)
        print(f"Error on {tx_hash}")
    pd.concat(all_tokens_df).to_pickle("transactions_transfers_df")
# extract_tokens_info("0xdf08f1b6048a3c151737d797c5a5da5892cff66dfdfda319d4a260f358196c4b")

 15%|█▍        | 1110/7418 [19:24<2:01:13,  1.15s/it]

could not convert string to float: '1 wei From Wrapped'
Error on 0x90c67a86473a56f543d0984e569e60f4b70ea85a623342c822bc0b4d5821ce7e


 73%|███████▎  | 5398/7418 [2:30:23<1:22:55,  2.46s/it] 

could not convert string to float: '90 wei From SushiSwap: Router To  0x99fd1378ca799ed6772fe7bcdc9b30b389518962'
Error on 0x428c9d824d6aa0c2599ee48be07592efd37c5eac7ea66a8bce9678f537dbdd3d


100%|██████████| 7418/7418 [4:00:43<00:00,  1.95s/it]  


## Parsing token hash

In [9]:
transactions_transfers_df['token_hash'].dropna().map(len)

0    42
1    42
0    42
1    42
2    42
     ..
0    42
1    42
2    42
3    42
4    42
Name: token_hash, Length: 15740, dtype: int64

In [14]:
all_token_hash = set(transactions_transfers_df['token_hash'].dropna())
def parsing_tokens(token_hash):
    token_dict = {}
    token_dict['token_hash'] = token_hash
    req = requests.get(f"https://etherscan.io/token/{token_hash}",
                   headers={"User-Agent": generate_user_agent()})
    content = req.content
    soup = BeautifulSoup(content)
    try:
        token_dict['token_short_name'] = soup.find("span", class_='text-secondary small').text
    except Exception as e:
        token_dict['token_short_name'] = e
    try:
        token_dict['token_long_name'] = soup.find("a", class_='mb-1 mb-sm-0 u-label u-label--xs u-label--info').text
    except Exception as e:
        token_dict['token_long_name'] = e
    return token_dict

token_names_df = []
for token in tqdm(all_token_hash):
    time.sleep(random.uniform(0, 1))
    try:
        token_names_df.append(
                pd.DataFrame.from_dict(parsing_tokens(token), orient='index').T)
    except Exception as e:
        token_names_df.append(pd.DataFrame(data=[token], columns=['Txhash']))
token_names_df = pd.concat(token_names_df)
token_names_df.to_pickle("data/token_hash_name")

100%|██████████| 245/245 [03:34<00:00,  1.14it/s]


In [15]:
token_names_df

Unnamed: 0,token_hash,token_short_name,token_long_name
0,0xc7fd8dcee4697ceef5a2fd4608a7bd6a94c77480,Cream CRV,Cream.Finance
0,0x8cc94ccd0f3841a468184aca3cc478d2148e1757,Curve mUSD Pool yVault,'NoneType' object has no attribute 'text'
0,0xc11b1268c1a384e55c48c2391d8d480264a3a7f4,Compound Wrapped BTC,Compound
0,0x3041cbd36888becc7bbcbc0045e3b1f144466f5f,Uniswap USDC/USDT LP,'NoneType' object has no attribute 'text'
0,0x44fbebd2f576670a6c33f6fc0b00aa8c5753b322,Cream USD Coin,Cream.Finance
...,...,...,...
0,0x998ceb152a42a3eac1f555b1e911642bebf00fad,FARM_cDAI+cUSDC,'NoneType' object has no attribute 'text'
0,0x89ab32156e46f46d02ade3fecbe5fc4243b9aaed,pNetwork Token,'NoneType' object has no attribute 'text'
0,0x8e595470ed749b85c6f7669de83eae304c2ec68f,Yearn Dai Stablecoin,Cream.Finance
0,0x10dd17ecfc86101eab956e0a443cab3e9c62d9b4,Balancer Pool Token,'NoneType' object has no attribute 'text'
