In [1]:
# !pip install undetected-chromedriver
from bs4 import BeautifulSoup
import requests
from datetime import datetime
import time
import re
import undetected_chromedriver as uc
import pandas as pd
import clipboard as c

In [19]:
def fetch_profile_links(file, listing_url, scroll_times, save=True, verbose=True):
    urls = []
    df = pd.read_csv(file)
    driver = uc.Chrome(use_subprocess=True)
    driver.get(listing_url)
    profile_class = "styles__StyledLink-sc-l6elh8-0 ikuMIO Blockreact__Block-sc-1xf18x6-0 kdnPIp AccountLink--ellipsis-overflow"
    for i in range(scroll_times):
        driver.execute_script("window.scrollBy(0, 1000)")
        page_source = driver.page_source
        time.sleep(3)
        soup = BeautifulSoup(page_source)
        user_tags = soup.find_all("a", {"class": profile_class})
        urls.extend([f"{domain}/{i.get('href')}" for i in user_tags])
    new_urls = set(urls) - set(df.url.to_list())
    new_data = {"url": list(new_urls), "processed": ["no"]*len(new_urls)}
    new_df = pd.DataFrame(new_data)
    new_df = pd.concat([df, new_df]).reset_index(drop=True)
    if verbose:
        print(f"Total new unique fetched Urls: {len(set(urls))}\n",
              "Processing Summary:\n",
              f"\tTotal Urls: {len(new_df)}\n",
             f"\tProcessed Urls: {len(new_df[new_df.processed == 'yes'])}\n",
             f"\tUnprocessed Urls: {len(new_df[new_df.processed == 'no'])}\n")
    if save:
        new_df.to_csv(file, index=False)
    driver.close()
    return new_df


def fetch_profile_detail(driver, url, req_social):
    entities = []
    detail = {}
    driver.get(url)
    page_source = driver.page_source
    soup = BeautifulSoup(page_source)
    detail['profile_url'] = url
    detail['name'] = soup.find("div", {"class": "Overflowreact__OverflowContainer-sc-7qr9y8-0 jPSCbX AccountHeader--title"}).text
    info = soup.find_all("li", {"class": "Menureact__StyledListMenuItem-sc-1j0z9gq-3 hlufrI"})
    social_media_tag = soup.find_all("a", {"class": "styles__StyledLink-sc-l6elh8-0 ekTmzq Blockreact__Block-sc-1xf18x6-0 Buttonreact__StyledButton-sc-glfma3-0 kXZare kdWcfm ButtonGroupreact__StyledButton-sc-1skvztv-0 eztnHW AccountLinksBar--icon-button"})
    driver.find_element_by_xpath('//*[@id="main"]/div/div/div[1]/div[3]/div[3]/div/button').click() # To get address in clipboard
    time.sleep(1)
    detail["address"] = c.paste()
    
    if len(info) > 3:
        info = info[:3]
    for i in info:
        span = i.find_all('span')
        for s in span:
            entity = s.text
            if entity is not None:
                entities.append(entity)
        if len(span) < 2:
            entities.append("0")
    entities = dict(zip(entities[::2], entities[1::2]))
    detail.update(entities)
                    
    for tag in social_media_tag:
        url = tag.get("href")
        if url is not None:
            url_spl = re.split("/|.com", url)
            if len(url_spl) >= 2:
                domain = url_spl[2]
                if domain in req_social:
                    detail[domain] = url
                    
    return detail


def iterator(url_file, n_process, req_social, save=True, verbose=False):
    url_df = pd.read_csv(url_file)
    data = []
    driver = uc.Chrome(use_subprocess=True)
    count = 0
    try:
        for i, v in url_df.iterrows():
            if (count < n_process) and (v.processed == "no"):
                count += 1 
                url = v.url
                if verbose: print(count, "/", n_process)
                detail = fetch_profile_detail(driver, url, req_social)
                data.append(detail)
                url_df.loc[i, "processed"] = "yes"    
    except:
        if verbose: print(f"Error at url {i} i-e, {url}")
    df = pd.DataFrame(data)
    if save:
        current_datetime = datetime.now().strftime('%Y%m%d%H%M%S')
        url_df.to_csv(file, index=False)
        df.to_csv(f"Scrapped-Data-{current_datetime}.csv", index=False)
    if verbose:
        print("Processing Summary:\n",
              f"\tTotal Urls: {len(url_df)}\n",
             f"\tProcessed Urls: {len(url_df[url_df.processed == 'yes'])}\n",
             f"\tUnprocessed Urls: {len(url_df[url_df.processed == 'no'])}\n")
#     driver.close()
    return df
        

# Website detail

In [3]:
domain = 'https://opensea.io'
listing_url = domain + "//" + "activity"

# Extract unique profiles

In [4]:
file = "Scrapped-Profiles.csv"
urls_df = fetch_profile_links(file, listing_url, scroll_times=30, save=True, verbose=True)

Total new unique fetched Urls: 45
 Processing Summary:
 	Total Urls: 343
 	Processed Urls: 3
 	Unprocessed Urls: 340



# Extract profile data

In [20]:
req_social = ["twitter", "instagram"]
url_file = "Scrapped-Profiles.csv"
n_process = 4
df = iterator(url_file, n_process, req_social, save=True, verbose=True)

1 / 4




2 / 4
3 / 4
4 / 4
Processing Summary:
 	Total Urls: 343
 	Processed Urls: 7
 	Unprocessed Urls: 336



In [21]:
df

Unnamed: 0,profile_url,name,address,Collected,Created,Favorited,twitter
0,https://opensea.io//0xDD66C65f4aB47fD904850b93...,Unnamed,0xDD66C65f4aB47fD904850b9342236aDed3813Ec3,13,0,0,
1,https://opensea.io//0x2461,0x2461,verbose=True,4,0,35,
2,https://opensea.io//Mrjosco_Vault,Mrjosco_Vault,0xA90E6Ff084580B1194f7333a702De7641E33DFb9,18,0,2,https://twitter.com/@mrjosco
3,https://opensea.io//0x8DCeeb78462b002d71526ad0...,Unnamed,0x8DCeeb78462b002d71526ad0CeFB68bC3B001367,13,0,0,
