# PARSING etherscan.io

I made an attempt to parse more full information about user's transactions.

In [7]:
%load_ext autoreload
%autoreload 2
%aimport

import requests
from bs4 import BeautifulSoup
from application.load_transaction_data import load_ether_data, load_token_data
import pandas as pd
import re
from tqdm import tqdm
import random
import time
import warnings

warnings.filterwarnings("ignore")

transactions_df = load_ether_data()
token_df = load_token_data()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Modules to reload:
all-except-skipped

Modules to skip:



# First parsing attempt

In [56]:
# Experiments
from user_agent import generate_user_agent
transaction_hash = "0x381753da98c0dd8abb5bd6aaaeba411d10ea0b708d72cb1964073ebd93bb586d"
req = requests.get(f"https://etherscan.io/tx/{transaction_hash}",
                   headers={"User-Agent": generate_user_agent()},
                   # proxies=proxies
                   )
content = req.content
soup = BeautifulSoup(content)
all_tokens = soup.find_all("li", class_='media align-items-baseline mb-2')
token_url = all_tokens[]
extract_token_info(token_url)

IndexError: list index out of range

In [72]:
# Check transfers parsed
transaction_hash = '0x381753da98c0dd8abb5bd6aaaeba411d10ea0b708d72cb1964073ebd93bb586d'
req = requests.get(f"https://etherscan.io/tx/{transaction_hash}",
               headers={"User-Agent": generate_user_agent()})
content = req.content
soup = BeautifulSoup(content)
tokens_df = []
bad_tokens = []
errors = []
all_tokens = soup.find_all("li", class_='media align-items-baseline mb-2')
for index, tokens in enumerate(all_tokens):
    try:
        tokens_df.append(
            pd.DataFrame.from_dict(extract_token_info(all_tokens[index]), orient='index').T
        )
    except Exception as e:
        bad_tokens.append(index)
        errors.append(e)
try:

    tokens_df = pd.concat(tokens_df)
except ValueError:
    tokens_df = pd.DataFrame(data=[transaction_hash], columns=['Txhash'])

# TRANSFER parsing
transfer_text_list = []
transfers = soup.find_all("li", class_='media align-items-baseline')
if len(transfers) > 0:
    for transf in transfers:
        transfer_text_list.append(transf.text.replace("\xa0", ""))
    try:
        tokens_df.at[0, 'TRANSFER'] = transfer_text_list
    except ValueError:
        tokens_df['TRANSFER'] = pd.Series(transfer_text_list)

tokens_df['Txhash'] = transaction_hash

In [73]:
tokens_df

Unnamed: 0,from,to,usd,amount,token_hash,TRANSFER,Txhash
0,0x99fd1378ca799ed6772fe7bcdc9b30b389518962,Uniswap V2: gKIMCHI,empty,99195.132212,0x4b7dfae2567181e54776337c840e142acb42aa1f,TRANSFER 37.082295734421439726 Ether From Unis...,0x381753da98c0dd8abb5bd6aaaeba411d10ea0b708d72...
0,Uniswap V2: Router 2,Uniswap V2: gKIMCHI,13962.57,37.082296,0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2,TRANSFER 37.082295734421439726 Ether From Unis...,0x381753da98c0dd8abb5bd6aaaeba411d10ea0b708d72...
0,Null Address: 0x000…000,0x99fd1378ca799ed6772fe7bcdc9b30b389518962,empty,1906.785085,0xcb525da7f6e8e990ba2c31ba09e1e3078fa0ace2,TRANSFER 37.082295734421439726 Ether From Unis...,0x381753da98c0dd8abb5bd6aaaeba411d10ea0b708d72...


In [70]:
tokens_df['TRANSFER'] = pd.Series(transfer_text_list)

In [71]:
tokens_df

Unnamed: 0,from,to,usd,amount,token_hash,TRANSFER
0,0x99fd1378ca799ed6772fe7bcdc9b30b389518962,Uniswap V2: gKIMCHI,empty,99195.132212,0x4b7dfae2567181e54776337c840e142acb42aa1f,TRANSFER 37.082295734421439726 Ether From Unis...
1,Uniswap V2: Router 2,Uniswap V2: gKIMCHI,13962.57,37.082296,0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2,TRANSFER 0.221741158794180602 Ether From Unisw...
2,Null Address: 0x000…000,0x99fd1378ca799ed6772fe7bcdc9b30b389518962,empty,1906.785085,0xcb525da7f6e8e990ba2c31ba09e1e3078fa0ace2,


In [None]:
def extract_token_info(token_url):
    result_dict = {}

    # Parse from and to
    try:
        result_dict['from'] =  token_url.find("span", class_='hash-tag text-truncate hash-tag-custom-from tooltip-address').text
    except Exception as e:
        result_dict['from'] = e
    try:
        result_dict['to'] = token_url.find("span", class_='hash-tag text-truncate hash-tag-custom-to tooltip-address').text
    except Exception as e:
        result_dict['to'] = e

    # Parse USD value. Check, where or not does USD value exist on this page?
    try:
        token1_usd = re.findall(r"\(\$.*?\)", str(token_url))[0]
        token1_usd = token1_usd.strip("()").replace(" (", "").replace("$", "")
        result_dict['usd'] = float(token1_usd.replace(",", "_"))
        result_dict['amount'] = token_url.find("span", 'data-toggle'=='tooltip').text
        result_dict['amount'] = float(result_dict['amount'].replace(",", ""))
    except (AttributeError, IndexError) as e:
        result_dict['usd'] = 'empty'
        result_dict['amount'] = token_url.find_all("span", class_='mr-1')[-1].text
        result_dict['amount'] = float(result_dict['amount'].replace(",", ""))

    # Parse token hash in order to parse full info later
    try:
        result_dict['token_hash'] = token_url.find("a").get("href").split("?")[0].split("/")[2]
    except Exception as e:
        result_dict['token_hash'] = e
    return result_dict

def extract_tokens_info(transaction_hash):
    req = requests.get(f"https://etherscan.io/tx/{transaction_hash}",
                   headers={"User-Agent": generate_user_agent()})
    content = req.content
    soup = BeautifulSoup(content)
    tokens_df = []
    bad_tokens = []
    errors = []
    all_tokens = soup.find_all("li", class_='media align-items-baseline mb-2')
    for index, tokens in enumerate(all_tokens):
        try:
            tokens_df.append(
                pd.DataFrame.from_dict(extract_token_info(all_tokens[index]), orient='index').T
            )
        except Exception as e:
            bad_tokens.append(index)
            errors.append(e)
    try:

        tokens_df = pd.concat(tokens_df)
    except ValueError:
        tokens_df = pd.DataFrame(data=[transaction_hash], columns=['Txhash'])

    # TRANSFER parsing
    transfer_text_list = []
    transfers = soup.find_all("li", class_='media align-items-baseline')
    if len(transfers) > 0:
        for transf in transfers:
            transfer_text_list.append(transf.text.replace("\xa0", ""))
            try:
                tokens_df.at[0, 'TRANSFER'] = transfer_text_list
            except ValueError:
                tokens_df['TRANSFER'] = pd.Series(transfer_text_list)

    tokens_df['Txhash'] = transaction_hash
    return tokens_df

all_tokens_df = []
# for tx_hash in tqdm(transactions_df['Txhash'].unique().tolist()):
for tx_hash in tqdm(transactions_parsed_user_df.query("amount_prep.isna()")['Txhash'].unique().tolist()):
    try:
        time.sleep(random.uniform(0, 2))
        all_tokens_df.append(extract_tokens_info(tx_hash))
    except Exception as e:
        print(e)
        print(f"Error on {tx_hash}")
    # pd.concat(all_tokens_df).to_pickle("transactions_parsed_df")
    pd.concat(all_tokens_df).to_pickle("transactions_parsed_df_new")

  4%|▍         | 57/1273 [01:24<32:18,  1.59s/it]

In [8]:
transactions_parsed_df = pd.read_pickle("transactions_parsed_df")
transactions_parsed_user_df = transactions_parsed_df[(transactions_parsed_df["from"] == '0x99fd1378ca799ed6772fe7bcdc9b30b389518962') |
                                                        (transactions_parsed_df['to'] == '0x99fd1378ca799ed6772fe7bcdc9b30b389518962')]
transactions_parsed_user_df['amount_prep'] = transactions_parsed_user_df['amount'].apply(lambda x: x.replace(",", "") if type(x) == str else x)
transactions_parsed_user_df['amount_prep'] = pd.to_numeric(transactions_parsed_user_df['amount_prep'], errors='coerce')

In [27]:
# Preprocess transfers
transactions_parsed_user_df['transfer_amount'] = transactions_parsed_user_df['TRANSFER'].apply(lambda x: re.findall(r"\d\.\d+", x)[0] if pd.isna(x) is False else x)
transactions_parsed_user_df['transfer_from'] = transactions_parsed_user_df['TRANSFER'].apply(lambda x: re.findall(r"From (.*?) To", x)[0] if pd.isna(x) is False else x)
transactions_parsed_user_df['transfer_to'] = transactions_parsed_user_df['TRANSFER'].apply(lambda x: x.split("  ")[-1] if pd.isna(x) is False else x)

['0x8cd9354db0841c1b306da298173a59d28212d95ab73c380203b8c9b28aceecce',
 '0x95a97d5cf7b07bc3c662b4921aa4f3618ba3482871519fb52a692d8efdd502fe',
 '0x64d7f4dfb76f6577a83434bcfabff4de629b8886fa1040de044edddb90914616',
 '0xf06916046a01055d5ef6284b033fc416c3182b7c514f0760c066398ddba36b33',
 '0xb138b2402d25b9b4ba1f8c61f93366d2fb90949ffd239c6ba44c0072bfde006f',
 '0x2abfbbac49c6e1e29cd4260758f336b1abc0590fb5a29fc83758ff9713af99dc',
 '0xb5bbdc4447b644090e891a2dfe03dbabd44daa455a6852a19ce74fc072ae3aee',
 '0x4617ec3b7d20f544aa7f6735554ac586981ffc3154605d18630abbf2c35d4f2d',
 '0x40bbf262d1b57ffc0676e6905d66f01d093089e725cc0a45495519f0d7a5e6d9',
 '0x732aa7aaf106df8d6da1c733c5ad3a84564f86e9de4ed5a08033061a3d3b60a8',
 '0xc777cb6fb387a7620b982fa4cafa57a93d5abd0461da919dc775606b6a5a87fc',
 '0xcdc7d7a1afdfef0584e161ff5b10019800c40c3f3a38425ebafadecabdbfa2b9',
 '0x6b0b076a5ddf8cb389f26201ba3ad401c3321bf11d1b111a40eb8904ae680193',
 '0x068830c1aa6744090b61953df8dd40615981d36e0db6968d094a7948fc7a28ec',
 '0x89

## Parsing token hash

In [67]:
all_token_hash = [i for i in set(all_tokens_user_df['token_hash'].unique()) if type(i) == str and len(i) == 42]
def parsing_tokens(token_hash):
    token_dict = {}
    token_dict['token_hash'] = token_hash
    req = requests.get(f"https://etherscan.io/token/{token_hash}",
                   headers={"User-Agent": generate_user_agent()})
    content = req.content
    soup = BeautifulSoup(content)
    try:
        token_dict['token_short_name'] = soup.find("span", class_='text-secondary small').text
    except Exception as e:
        token_dict['token_short_name'] = e
    try:
        token_dict['token_long_name'] = soup.find("a", class_='mb-1 mb-sm-0 u-label u-label--xs u-label--info').text
    except Exception as e:
        token_dict['token_long_name'] = e
    return token_dict

token_names_df = []
for token in tqdm(all_token_hash):
    time.sleep(random.uniform(0, 2))
    try:
        token_names_df.append(
                pd.DataFrame.from_dict(parsing_tokens(token), orient='index').T)
    except Exception as e:
        token_names_df.append(pd.DataFrame(data=[token], columns=['Txhash']))
token_names_df = pd.concat(token_names_df)
token_names_df.to_pickle("token_hash_name")

# Add token name
all_tokens_df_2 = all_tokens_df_2.merge(token_names_df[['token_hash', 'token_short_name']], how='left', on='token_hash')

100%|██████████| 241/241 [05:56<00:00,  1.48s/it]
