## Collecting labelled addresses for ETH

Fetches labelled Ethereum addresses from Etherscan (etherscan.io) and saves them to CSV files.

In [1]:
import bs4
import requests
import csv
import time
import math

In [2]:
def append_list_as_row(file_name, list_of_elem):
    """Append a single row to a CSV file, creating the file if needed.

    Args:
        file_name: Path of the CSV file to append to.
        list_of_elem: Sequence of cell values that make up the row.
    """
    # newline='' hands line-ending control to the csv module. The explicit
    # UTF-8 encoding keeps non-ASCII cell text from raising on platforms
    # whose default text encoding cannot represent it.
    with open(file_name, 'a', newline='', encoding='utf-8') as write_obj:
        csv.writer(write_obj).writerow(list_of_elem)

def write_list_as_row(file_name, list_of_elem):
    """Replace the contents of a CSV file with a single row.

    Any existing content is discarded. Passing an empty list leaves the
    file holding just one empty row (a bare line terminator).

    Args:
        file_name: Path of the CSV file to (re)create.
        list_of_elem: Sequence of cell values that make up the row.
    """
    # newline='' hands line-ending control to the csv module. The explicit
    # UTF-8 encoding keeps non-ASCII cell text from raising on platforms
    # whose default text encoding cannot represent it.
    with open(file_name, 'w', newline='', encoding='utf-8') as write_obj:
        csv.writer(write_obj).writerow(list_of_elem)

In [3]:
# Build the list of page URLs for a paginated listing
def generate_urls(prefix, suffix, r):
    """Return the URLs for pages 1..r of a listing.

    Each URL is ``prefix + <page number> + suffix``. When ``r`` is 0 the
    listing is treated as unpaginated and ``[prefix]`` is returned as-is.
    """
    if r == 0:
        return [prefix]

    return [prefix + str(page) + suffix for page in range(1, r + 1)]

In [4]:
def get_page(url, headers, timeout=30):
    """Download a URL and parse the response body as HTML.

    Args:
        url: Address to fetch with an HTTP GET.
        headers: Dict of HTTP headers sent with the request.
        timeout: Seconds to wait for the server before giving up. The
            finite default keeps a stalled connection from hanging the
            whole scrape; pass ``None`` to wait indefinitely.

    Returns:
        A ``bs4.BeautifulSoup`` tree built from the response text with the
        'html5lib' parser. The HTTP status code is not checked.

    Raises:
        requests.exceptions.RequestException: If the request fails, which
            includes a timeout.
    """
    resp = requests.get(url, headers=headers, timeout=timeout)
    return bs4.BeautifulSoup(resp.text, 'html5lib')

In [32]:
def get_rows(page):
    """Return every <tr> found under ``page.tbody``.

    Yields an empty list when ``page.tbody`` is missing (falsy).
    """
    body = page.tbody
    if not body:
        return []
    return body.find_all('tr')
    

In [6]:
def trim_rows(data, start, finish):
    """Return the slice ``data[start:finish]``."""
    window = slice(start, finish)
    return data[window]

In [7]:
def save_data_to_csv(file, data):
    """Append one CSV row to ``file`` for every table row in ``data``.

    Each element of ``data`` must offer ``find_all('td')``; the ``.text``
    of every cell found becomes one column of the written row.
    """
    for tr in data:
        cells = [td.text for td in tr.find_all('td')]
        append_list_as_row(file, cells)
        

In [39]:
# HTTP request headers handed to the scraper functions in this notebook.
# NOTE(review): the cookie is a hard-coded string that looks like a captured
# browser session; presumably it expires and should not be committed.
# TODO confirm whether the requests still succeed without it.
headers = {
    'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) snap Chromium/83.0.4103.97 Chrome/83.0.4103.97 Safari/537.36",
    'cookie': "__cfduid=d45fca5c2d9141aff89f4de19316344d21591611382; ASP.NET_SessionId=1aqgu2qpziantjigxlwtb4ay; _ga=GA1.2.811314642.1591611384; _gid=GA1.2.735645790.1591871016; _gat_gtag_UA_46998878_6=1"
}

In [9]:
def address_scraper(file, prefix, suffix, r, headers, urls=None):
    """Scrape a list of pages and append their table rows to a CSV file.

    Args:
        file: Path of the CSV file rows are appended to.
        prefix, suffix, r: Passed to ``generate_urls`` to build the page
            list when ``urls`` is not supplied (or is empty).
        headers: HTTP headers forwarded to ``get_page``.
        urls: Optional explicit list of URLs to scrape instead.
    """
    targets = urls or generate_urls(prefix, suffix, r)

    for target in targets:
        print(f"Fetching {target}")
        rows = get_rows(get_page(target, headers))
        save_data_to_csv(file, rows)
        print("page written to file.")
        # Pause between requests.
        time.sleep(5)

    print("finished.")

In [10]:
# #Exchanges
# ex_file = "exchanges.csv"
# ex_base = "https://etherscan.io/accounts/label/exchange/"
# ex_end = "?ps=100"
# pages = 3
# address_scraper(ex_file, ex_base, ex_end, pages, headers)

In [11]:
# #Exchange tokens
# ext_file = "exchanges_tokens.csv"
# ext_base = "https://etherscan.io/tokens/label/exchange?ps=100&p="
# ext_end = ""
# pages = 2
# address_scraper(ext_file, ext_base, ext_end, pages, headers)

In [12]:
# #DEXs
# dex_file = "dex.csv"
# dex_base = "https://etherscan.io/accounts/label/dex?ps=100"

# address_scraper(dex_file, dex_base, "", 0, headers)

In [13]:
# #DEX Tokens
# dext_file = "dex_tokens.csv"
# dext_base = "https://etherscan.io/tokens/label/dex"
# address_scraper(dext_file, dext_base, "", 0, headers)

In [14]:
# #ICO Wallets
# icow_file = "ico_wallets.csv"

# icow_base = "https://etherscan.io/accounts/label/ico-wallets/"
# icow_end = "?ps=100"
# pages = 2
# address_scraper(icow_file, icow_base, icow_end, pages, headers)

In [15]:
# #Miners
# m_file = "miners.csv"

# m_base = "https://etherscan.io/accounts/label/mining?ps=100"

# address_scraper(m_file, m_base, "", 0, headers)

In [16]:
# #Miners Tokens
# mt_file = "miners_tokens.csv"
# mt_base = "https://etherscan.io/tokens/label/mining?ps=100"

# address_scraper(mt_file, mt_base, "", 0, headers)

In [17]:
# #Tokens
# t_file = "tokens.csv"
# t_base = "https://etherscan.io/accounts/label/token-contract/"
# t_end = "?ps=100"
# pages = 27
# address_scraper(t_file, t_base, t_end, pages, headers)

In [18]:
def str_parser(i):
    """Split a label such as ``"Accounts (12)"`` into ``("accounts", 12)``.

    The first whitespace-separated token is lowercased and returned as the
    word. The last token is taken as the count, with surrounding
    parentheses and any thousands separators removed. Leading, trailing
    and repeated whitespace are ignored.

    Args:
        i: Label text of the form ``"<word> (<count>)"``.

    Returns:
        A ``(word, count)`` tuple of ``str`` and ``int``.

    Raises:
        ValueError: If the text has fewer than two tokens or the count is
            not an integer.
    """
    parts = i.split()
    if len(parts) < 2:
        raise ValueError(f"expected '<word> (<count>)', got {i!r}")

    word = parts[0].lower()
    # Tolerate counts rendered as "(1,234)".
    num = int(parts[-1].strip("()").replace(",", ""))
    return word, num

In [19]:
def scrape_categories(headers):
    """Scrape the Etherscan label cloud into a list of category dicts.

    Every ``div.secondary-container`` on the label-cloud page becomes one
    dict holding:

    * ``'name'`` - the ``aria-labelledby`` attribute of its dropdown menu,
    * ``'url'``  - the ``data-url`` attribute of its button,
    * one entry per link inside it, keyed by the lowercased first word of
      the link text (e.g. ``'accounts'``, ``'tokens'``) with the count
      parsed by ``str_parser`` as value. Links whose word is
      ``'transactions'`` are skipped.

    Args:
        headers: HTTP headers forwarded to ``get_page``.

    Returns:
        List of category dicts in page order.
    """
    soup = get_page("https://etherscan.io/labelcloud", headers)

    categories = []
    for container in soup.find_all('div', class_='secondary-container'):
        sub_c = {
            'name': container.find('div', class_="dropdown-menu")['aria-labelledby'],
            'url': container.button['data-url'],
        }

        # Search for the links once per container rather than once per link.
        for anchor in container.find_all('a'):
            word, num = str_parser(anchor.text)
            if word != 'transactions':
                sub_c[word] = num

        categories.append(sub_c)

    return categories

In [20]:
def split_categories(c):
    """Partition categories into legitimate and illegitimate ones.

    A category is illegitimate when any space-separated, lowercased word of
    its ``'name'`` exactly equals one of the keywords below. Matching is on
    whole words, so e.g. "phishing" does not match "phish".

    Args:
        c: Iterable of dicts that each have a ``'name'`` key.

    Returns:
        ``(legit, scams_and_hacks)`` - two lists preserving input order.
    """
    illegitimate_kw = {
        'hack', 'phish', 'scam', 'compromised', 'ponzi', 'exploit',
        'trapped', 'down', 'unstoppable', 'suspicious', 'unsafe', 'spam',
    }

    legit, scams_and_hacks = [], []
    for entry in c:
        name_words = entry['name'].lower().split(" ")
        if illegitimate_kw.isdisjoint(name_words):
            legit.append(entry)
        else:
            scams_and_hacks.append(entry)

    return legit, scams_and_hacks

In [21]:
def make_urls(typ, item):
    """Build the listing URL(s) for one category and listing type.

    Args:
        typ: Listing type used both as URL path segment and as the key in
            ``item`` that holds the entry count (e.g. 'accounts').
        item: Category dict with a ``'url'`` slug and a count under ``typ``.

    Returns:
        A single unpaginated URL when the count is below 100, otherwise one
        URL per page of 100 entries, numbered from 1.
    """
    root = "https://etherscan.io/" + typ + "/label/" + item['url']
    count = item[typ]

    if count < 100:
        return [root + "?ps=100"]

    page_total = math.ceil(count / 100)
    return [root + '/' + str(n) + "?ps=100" for n in range(1, page_total + 1)]

In [22]:
def data_generator(l):
    """Expand category dicts into ``[name, listing type, urls]`` entries.

    For each category, every key other than ``'name'`` and ``'url'`` is
    treated as a listing type and yields one entry whose URL list comes
    from ``make_urls``.
    """
    meta_keys = ('name', 'url')
    data = []

    for entry in l:
        listing_types = [k for k in entry if k not in meta_keys]
        data.extend(
            [entry['name'], kind, make_urls(kind, entry)]
            for kind in listing_types
        )

    return data

In [23]:
def mega_scraper(file, headers, d):
    """Scrape every URL of every category entry into one CSV file.

    Args:
        file: Path of the CSV file rows are appended to.
        headers: HTTP headers forwarded to ``get_page``.
        d: Iterable of ``[category name, listing type, urls]`` entries.
    """
    for category in d:
        name, kind = category[0], category[1]
        print(f"Fetching {name} {kind}")

        for url in category[2]:
            print(f"Fetching {url}")
            rows = get_rows(get_page(url, headers))
            save_data_to_csv2(file, rows, kind, name)
            print("page written to file.")
            # Pause between requests.
            time.sleep(5)

    print("finished.")

In [24]:
def save_data_to_csv2(file, data, typ, cat):
    """Append labelled rows from a scraped table to a CSV file.

    Each written row is ``[cell, cell, typ, cat, cell, cell]``. For
    ``typ == 'accounts'`` the four cells are the row's <td> 0-3; for any
    other type they are <td> 1-4 (the first cell is skipped).

    Args:
        file: Path of the CSV file rows are appended to.
        data: Iterable of table-row elements offering ``find_all('td')``.
        typ: Listing type written into the third column.
        cat: Category name written into the fourth column.
    """
    for tr in data:
        if len(tr) <= 2:
            continue  # No entries in table

        cells = tr.find_all('td')
        first = 0 if typ == 'accounts' else 1
        row = [
            cells[first].text,
            cells[first + 1].text,
            typ,
            cat,
            cells[first + 2].text,
            cells[first + 3].text,
        ]

        append_list_as_row(file, row)

In [25]:
#set up files
legit_csv = "legit.csv"
write_list_as_row(legit_csv, [])
dodgy_csv = "dodgy.csv"
write_list_as_row(dodgy_csv, [])

In [26]:
# Scrape all label categories, then split them by name keywords:
# l = categories treated as legitimate, s = scam/hack-style categories.
c = scrape_categories(headers)
l, s = split_categories(c)

In [29]:
#For each category generate a list: [Category, type [account/token], urls]
data_legit = data_generator(l)
data_illegit = data_generator(s)
print(data_legit)

[['0x', 'accounts', ['https://etherscan.io/accounts/label/0x?ps=100']], ['0x', 'tokens', ['https://etherscan.io/tokens/label/0x?ps=100']], ['0x Ecosystem', 'accounts', ['https://etherscan.io/accounts/label/0x-ecosystem?ps=100']], ['0x Ecosystem', 'tokens', ['https://etherscan.io/tokens/label/0x-ecosystem?ps=100']], ['0xUniverse', 'accounts', ['https://etherscan.io/accounts/label/0xuniverse?ps=100']], ['0xUniverse', 'tokens', ['https://etherscan.io/tokens/label/0xuniverse?ps=100']], ['Aave', 'accounts', ['https://etherscan.io/accounts/label/aave?ps=100']], ['Aave', 'tokens', ['https://etherscan.io/tokens/label/aave?ps=100']], ['ABCC', 'accounts', ['https://etherscan.io/accounts/label/abcc?ps=100']], ['ABCC', 'tokens', ['https://etherscan.io/tokens/label/abcc?ps=100']], ['Adult', 'tokens', ['https://etherscan.io/tokens/label/adult?ps=100']], ['Advertising', 'accounts', ['https://etherscan.io/accounts/label/advertising?ps=100']], ['Advertising', 'tokens', ['https://etherscan.io/tokens/lab

In [41]:
# Only the legitimate categories are scraped here; uncomment the next line to
# scrape the scam/hack categories into dodgy_csv as well.
# mega_scraper(dodgy_csv, headers, data_illegit)
mega_scraper(legit_csv, headers, data_legit)


Fetching 0x accounts
Fetching https://etherscan.io/accounts/label/0x?ps=100
page written to file.
Fetching 0x tokens
Fetching https://etherscan.io/tokens/label/0x?ps=100
page written to file.
Fetching 0x Ecosystem accounts
Fetching https://etherscan.io/accounts/label/0x-ecosystem?ps=100
page written to file.
Fetching 0x Ecosystem tokens
Fetching https://etherscan.io/tokens/label/0x-ecosystem?ps=100
page written to file.
Fetching 0xUniverse accounts
Fetching https://etherscan.io/accounts/label/0xuniverse?ps=100
page written to file.
Fetching 0xUniverse tokens
Fetching https://etherscan.io/tokens/label/0xuniverse?ps=100
page written to file.
Fetching Aave accounts
Fetching https://etherscan.io/accounts/label/aave?ps=100
page written to file.
Fetching Aave tokens
Fetching https://etherscan.io/tokens/label/aave?ps=100
page written to file.
Fetching ABCC accounts
Fetching https://etherscan.io/accounts/label/abcc?ps=100
page written to file.
Fetching ABCC tokens
Fetching https://etherscan.io

page written to file.
Fetching Bugs accounts
Fetching https://etherscan.io/accounts/label/bugs?ps=100
page written to file.
Fetching bZx accounts
Fetching https://etherscan.io/accounts/label/bzx?ps=100
page written to file.
Fetching bZx tokens
Fetching https://etherscan.io/tokens/label/bzx?ps=100
page written to file.
Fetching CACHE Gold accounts
Fetching https://etherscan.io/accounts/label/cache-gold?ps=100
page written to file.
Fetching CACHE Gold tokens
Fetching https://etherscan.io/tokens/label/cache-gold?ps=100
page written to file.
Fetching Celsius Network accounts
Fetching https://etherscan.io/accounts/label/celsius-network?ps=100
page written to file.
Fetching Celsius Network tokens
Fetching https://etherscan.io/tokens/label/celsius-network?ps=100
page written to file.
Fetching Chainlink accounts
Fetching https://etherscan.io/accounts/label/chainlink?ps=100
page written to file.
Fetching Charity accounts
Fetching https://etherscan.io/accounts/label/charity?ps=100
page written t

page written to file.
Fetching Decentraland accounts
Fetching https://etherscan.io/accounts/label/decentraland?ps=100
page written to file.
Fetching Decentraland tokens
Fetching https://etherscan.io/tokens/label/decentraland?ps=100
page written to file.
Fetching DeFi accounts
Fetching https://etherscan.io/accounts/label/defi?ps=100
page written to file.
Fetching DeFi tokens
Fetching https://etherscan.io/tokens/label/defi?ps=100
page written to file.
Fetching DeFi Saver accounts
Fetching https://etherscan.io/accounts/label/defi-saver?ps=100
page written to file.
Fetching Deflationary Token tokens
Fetching https://etherscan.io/tokens/label/deflationary-token?ps=100
page written to file.
Fetching Deprecated accounts
Fetching https://etherscan.io/accounts/label/deprecated?ps=100
page written to file.
Fetching Deprecated tokens
Fetching https://etherscan.io/tokens/label/deprecated?ps=100
page written to file.
Fetching Derivatives accounts
Fetching https://etherscan.io/accounts/label/derivat

page written to file.
Fetching Freelance tokens
Fetching https://etherscan.io/tokens/label/freelance?ps=100
page written to file.
Fetching FTX accounts
Fetching https://etherscan.io/accounts/label/ftx?ps=100
page written to file.
Fetching FTX tokens
Fetching https://etherscan.io/tokens/label/ftx?ps=100
page written to file.
Fetching Fund tokens
Fetching https://etherscan.io/tokens/label/fund?ps=100
page written to file.
Fetching Gambling accounts
Fetching https://etherscan.io/accounts/label/gambling?ps=100
page written to file.
Fetching Gambling tokens
Fetching https://etherscan.io/tokens/label/gambling?ps=100
page written to file.
Fetching Gaming accounts
Fetching https://etherscan.io/accounts/label/gaming?ps=100
page written to file.
Fetching Gaming tokens
Fetching https://etherscan.io/tokens/label/gaming/1?ps=100
page written to file.
Fetching https://etherscan.io/tokens/label/gaming/2?ps=100
page written to file.
Fetching Gas tokens
Fetching https://etherscan.io/tokens/label/gas?ps

page written to file.
Fetching Loans tokens
Fetching https://etherscan.io/tokens/label/loans?ps=100
page written to file.
Fetching Logistics tokens
Fetching https://etherscan.io/tokens/label/logistics?ps=100
page written to file.
Fetching Loopring accounts
Fetching https://etherscan.io/accounts/label/loopring?ps=100
page written to file.
Fetching Loopring tokens
Fetching https://etherscan.io/tokens/label/loopring?ps=100
page written to file.
Fetching LORDLESS accounts
Fetching https://etherscan.io/accounts/label/lordless?ps=100
page written to file.
Fetching LORDLESS tokens
Fetching https://etherscan.io/tokens/label/lordless?ps=100
page written to file.
Fetching Loyalty and Rewards tokens
Fetching https://etherscan.io/tokens/label/loyalty-and-rewards?ps=100
page written to file.
Fetching LUKSO Blockchain accounts
Fetching https://etherscan.io/accounts/label/lukso-blockchain?ps=100
page written to file.
Fetching LUKSO Blockchain tokens
Fetching https://etherscan.io/tokens/label/lukso-bl

page written to file.
Fetching Orchid tokens
Fetching https://etherscan.io/tokens/label/orchid?ps=100
page written to file.
Fetching Origin Protocol accounts
Fetching https://etherscan.io/accounts/label/origin-protocol?ps=100
page written to file.
Fetching Origin Protocol tokens
Fetching https://etherscan.io/tokens/label/origin-protocol?ps=100
page written to file.
Fetching Origo accounts
Fetching https://etherscan.io/accounts/label/origo?ps=100
page written to file.
Fetching Origo tokens
Fetching https://etherscan.io/tokens/label/origo?ps=100
page written to file.
Fetching OTC accounts
Fetching https://etherscan.io/accounts/label/otc?ps=100
page written to file.
Fetching OTC tokens
Fetching https://etherscan.io/tokens/label/otc?ps=100
page written to file.
Fetching Others tokens
Fetching https://etherscan.io/tokens/label/others?ps=100
page written to file.
Fetching Paribu accounts
Fetching https://etherscan.io/accounts/label/paribu?ps=100
page written to file.
Fetching Parity Bug acco

page written to file.
Fetching Smart Contract accounts
Fetching https://etherscan.io/accounts/label/smart-contract?ps=100
page written to file.
Fetching Smart Contract tokens
Fetching https://etherscan.io/tokens/label/smart-contract?ps=100
page written to file.
Fetching Social tokens
Fetching https://etherscan.io/tokens/label/social?ps=100
page written to file.
Fetching Social Media tokens
Fetching https://etherscan.io/tokens/label/social-media?ps=100
page written to file.
Fetching Software tokens
Fetching https://etherscan.io/tokens/label/software?ps=100
page written to file.
Fetching Sports tokens
Fetching https://etherscan.io/tokens/label/sports?ps=100
page written to file.
Fetching ST-20 tokens
Fetching https://etherscan.io/tokens/label/st-20?ps=100
page written to file.
Fetching Stablecoin tokens
Fetching https://etherscan.io/tokens/label/stablecoin?ps=100
page written to file.
Fetching Staking accounts
Fetching https://etherscan.io/accounts/label/staking?ps=100
page written to fi

page written to file.
Fetching TrustToken accounts
Fetching https://etherscan.io/accounts/label/trusttoken?ps=100
page written to file.
Fetching TrustToken tokens
Fetching https://etherscan.io/tokens/label/trusttoken?ps=100
page written to file.
Fetching Uniswap accounts
Fetching https://etherscan.io/accounts/label/uniswap/1?ps=100
page written to file.
Fetching https://etherscan.io/accounts/label/uniswap/2?ps=100
page written to file.
Fetching Upbit accounts
Fetching https://etherscan.io/accounts/label/upbit?ps=100
page written to file.
Fetching Vehicle tokens
Fetching https://etherscan.io/tokens/label/vehicle?ps=100
page written to file.
Fetching Video tokens
Fetching https://etherscan.io/tokens/label/video?ps=100
page written to file.
Fetching Vodi X accounts
Fetching https://etherscan.io/accounts/label/vodi-x?ps=100
page written to file.
Fetching Vodi X tokens
Fetching https://etherscan.io/tokens/label/vodi-x?ps=100
page written to file.
Fetching VPN tokens
Fetching https://ethersc

In [40]:
# Reset legit.csv: write_list_as_row truncates the file, so any rows scraped
# into it earlier are lost when this cell runs.
legit_csv = "legit.csv"
write_list_as_row(legit_csv, [])