In [1]:
import os
import re
import pandas as pd
from bs4 import BeautifulSoup as bs

In [None]:
# Resolve <cwd>/../data/labels to a canonical absolute path.
data_dir = os.path.realpath(
    os.path.join(os.getcwd(), "..", "data", "labels")
)

# Ensure the labels directory is present; an existing directory is not an error.
os.makedirs(name=data_dir, exist_ok=True)

In [3]:
def clean_number(value: str) -> float:
    """Strip formatting from a numeric string and convert it to ``float``.

    Every character other than digits, ``.`` and ``-`` is removed (currency
    symbols, thousands separators, percent signs, whitespace, ...) before the
    conversion is attempted. The Unicode minus sign (U+2212) is mapped to an
    ASCII ``-`` first so that negative values keep their sign.

    Args:
        value: Raw text such as ``"$1,234.50"`` or ``"-3.91%"``. ``None`` is
            treated as unparseable; ``int``/``float`` input is passed through.

    Returns:
        The parsed number, or ``nan`` when no valid number remains.
    """
    if value is None:
        return float('nan')
    if isinstance(value, (int, float)):
        return float(value)

    # A typographic minus would otherwise be stripped below, flipping the sign.
    text = str(value).replace('\u2212', '-')
    # Remove non-numeric characters except . and -
    text = re.sub(r'[^\d.\-]', '', text)
    try:
        return float(text)
    except ValueError:
        return float('nan')


def parse_token_table(soup):
    """Extract one record per token row from the first ``<table>`` in ``soup``.

    Args:
        soup: Parsed HTML document (e.g. a BeautifulSoup object) exposing
            ``find``.

    Returns:
        A list of dicts with the keys ``Rank``, ``Name``, ``Symbol``,
        ``Address``, ``Price``, ``24h Change``, ``24h Volume`` and
        ``Market Cap``. The list is empty when the document has no table.
        Rows with fewer than six ``<td>`` cells are skipped. ``Address`` is
        an empty string when the name cell has no link with an ``href``.
    """
    table = soup.find("table")
    if table is None:
        return []

    # Some markup omits <tbody>; fall back to the rows directly under the table.
    body = table.find("tbody")
    if body is None:
        body = table

    data = []
    for row in body.find_all("tr"):
        cols = row.find_all("td")
        if len(cols) < 6:
            continue

        name_cell = cols[1]
        symbol_tag = name_cell.find("span")
        name_tag = name_cell.find("div")
        link_tag = name_cell.find("a")

        token_rank = cols[0].text.strip()
        token_symbol = symbol_tag.text.strip() if symbol_tag is not None else ""
        token_name = name_tag.text.strip() if name_tag is not None else ""

        # The address is the last path segment of the token link, when present.
        href = link_tag.get("href") if link_tag is not None else None
        token_address = href.split("/")[-1] if href else ""

        # Collapse every run of whitespace (including newlines) to one space.
        price = re.sub(r'\s+', ' ', cols[2].text.strip())
        change_24h = clean_number(cols[3].text.strip())
        volume_24h = clean_number(cols[4].text.strip())
        market_cap = clean_number(cols[5].text.strip())

        data.append({
            "Rank": token_rank,
            "Name": token_name.replace("\n", " "),
            "Symbol": token_symbol.replace("(", "").replace(")", ""),
            "Address": token_address.lower(),
            "Price": price,
            "24h Change": change_24h,
            "24h Volume": volume_24h,
            "Market Cap": market_cap
        })

    return data

In [4]:
# Parse every saved BaseScan page and collect the token rows into one table.
basescan_dir = os.path.join(data_dir, "basescan")


def _page_order(filename):
    """Sort key comparing embedded numbers numerically (page-2 before page-10)."""
    # re.split with a capturing group alternates text / digits, so odd
    # positions always hold the digit runs.
    return [int(part) if index % 2 else part
            for index, part in enumerate(re.split(r'(\d+)', filename))]


all_tokens = []
filenames = sorted(
    (filename for filename in os.listdir(basescan_dir)
     if filename.endswith('.html')),
    key=_page_order)

for filename in filenames:
    print(f"Processing {filename}...")
    filedir = os.path.join(basescan_dir, filename)
    # Read as UTF-8 explicitly; the platform default encoding may differ.
    with open(filedir, encoding="utf-8") as f:
        html = f.read()
    # Parsing happens after the file handle is closed.
    soup = bs(html, "html.parser")
    tokens = parse_token_table(soup)
    all_tokens.extend(tokens)

df = pd.DataFrame(all_tokens)
df.to_csv(os.path.join(basescan_dir, "basescan-labels.csv"), index=False)
df.head()

Processing basescan-page-1.html...
Processing basescan-page-2.html...
Processing basescan-page-3.html...
Processing basescan-page-4.html...
Processing basescan-page-5.html...


Unnamed: 0,Rank,Name,Symbol,Address,Price,24h Change,24h Volume,Market Cap
0,1,USDC (USDC),USDC,0x833589fcd6edb6e08f4c7c32d4f71b54bda02913,$0.9999 0.000330 ETH,,9091264000.0,63489000000.0
1,2,Wrapped BTC (WBTC),WBTC,0x0555e30da8f98308edb960aa94c0db47230d2b9c,"$122,373.00 40.385662 ETH",3.91,234146100.0,15767960000.0
2,3,Wrapped liquid staked Ether 2.0 (wstETH),wstETH,0xc1cba3fcea344f92d9239c08c0568f6f2f0ee452,"$3,673.51 1.212336 ETH",2.72,17456620.0,12672360000.0
3,4,Wrapped eETH (weETH.base),weETH.base,0x04c0599ae5a44757c0af6f9ec3b93da8976c150a,"$3,250.53 1.072743 ETH",2.46,4013222.0,8235197000.0
4,5,Wrapped Ether (WETH),WETH,0x4200000000000000000000000000000000000006,"$3,030.82 1.000234 ETH",2.31,337599000.0,7519547000.0


In [5]:
# Render the address -> token name lookup as a JSON object string.
df.set_index("Address").loc[:, "Name"].to_json()

'{"0x833589fcd6edb6e08f4c7c32d4f71b54bda02913":"USDC (USDC)","0x0555e30da8f98308edb960aa94c0db47230d2b9c":"Wrapped BTC (WBTC)","0xc1cba3fcea344f92d9239c08c0568f6f2f0ee452":"Wrapped liquid staked Ether 2.0 (wstETH)","0x04c0599ae5a44757c0af6f9ec3b93da8976c150a":"Wrapped eETH (weETH.base)","0x4200000000000000000000000000000000000006":"Wrapped Ether (WETH)","0x820c137fa70c8691f0e44dc420a5e53c168921dc":"USDS Stablecoin (USDS)","0xcbb7c0000ab88b473b1f5afd9ef808440eed33bf":"Coinbase Wrapped BTC (cbBTC)","0x50c5725949a6f0c72e6c4a641f24049a917db0cb":"Dai Stablecoin (DAI)","0x5d3a1ff2b6bab83b63cd9ad0787074081a52ef34":"USDe (USDe)","0x63706e401c06ac8513145b7687a14804d17f814b":"Aave Token (AAVE)","0x211cc4dd073734da055fbf44a2b4667d5e5fe5d2":"Staked USDe (sUSDe)","0x58538e6a46e07434d7e7375bc268d3cb839c0133":"ENA (ENA)","0x5875eee11cf8398102fdad704c9e96607675467a":"Savings USDS (sUSDS)","0x3128a0f7f0ea68e7b7c9b00afa7e41045828e858":"Spark USDC Vault (sUSDC)","0xc27468b12ffa6d714b1b5fbc87ef403f38b82ad