#### Import Dependencies

In [None]:
import requests
from bs4 import BeautifulSoup
import json, os
import pandas as pd
from tqdm import tqdm

# Notebook-wide pandas settings: lift the row/column truncation limits used
# when a frame is displayed, and set the chained-assignment mode to None
# (no warning or error on chained assignment).
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.set_option('mode.chained_assignment', None)

#### Collect Crypto Data
- Get data for the top 5k cryptocurrencies by market cap from coinmarketcap.com
- Save the data to a file so that later runs do not use up the API rate limit

In [None]:
def get_top_5k_cryptos(api_key: "str | None" = None, limit: int = 5000) -> "pd.DataFrame | None":
    """Fetch the latest CoinMarketCap listings, sorted by market cap.

    Parameters
    ----------
    api_key : str, optional
        CoinMarketCap Pro API key. When omitted, the key is read from the
        ``CMC_PRO_API_KEY`` environment variable so that no credential has
        to be stored in the notebook.
    limit : int, default 5000
        Number of listings to request, starting from rank 1.

    Returns
    -------
    pd.DataFrame or None
        A frame built from the ``data`` list of the response when the
        request returns HTTP 200. On any other status the error is printed
        and ``None`` is returned.

    Raises
    ------
    ValueError
        If no key is passed and ``CMC_PRO_API_KEY`` is unset or empty.
    """
    # SECURITY: a key used to be hardcoded here. It has been shared together
    # with the notebook, so it should be revoked and replaced.
    api_key = api_key or os.environ.get('CMC_PRO_API_KEY')
    if not api_key:
        raise ValueError(
            'No CoinMarketCap API key: pass api_key=... or set the '
            'CMC_PRO_API_KEY environment variable.'
        )

    url = 'https://pro-api.coinmarketcap.com/v1/cryptocurrency/listings/latest'

    headers = {
        'Accepts': 'application/json',
        'X-CMC_PRO_API_KEY': api_key,
    }

    params = {
        'start': '1',
        'limit': str(limit),
        'convert': 'USD',
        'sort': 'market_cap'
    }

    # Without a timeout a stalled connection would block the kernel forever.
    response = requests.get(url, headers=headers, params=params, timeout=30)

    # Error responses (e.g. from a gateway) may not carry a JSON body, so a
    # decoding failure must not mask the HTTP status we want to report.
    try:
        data = response.json()
    except ValueError:
        data = None

    if response.status_code == 200 and data is not None:
        return pd.DataFrame(data['data'])

    status = data.get('status') if isinstance(data, dict) else None
    message = status.get('error_message') if isinstance(status, dict) else None
    if message is None:
        message = 'response did not include a JSON error message'
    print(f"Error {response.status_code}: {message}")
    return None
        
# df = get_top_5k_cryptos()
# df.to_csv('../data/topk_crypto.csv', index=False)
# print('Collected top 5k crypto.')

#### Load the dataset

In [19]:
k = 1000  # number of leading rows of the saved listing file to keep
# nrows stops parsing after k rows instead of reading the whole file and
# discarding the rest with .head(k).
df = pd.read_csv('../data/topk_crypto.csv', nrows=k)
print(f'Dimensions of the dataset: {df.shape}')
# A fixed seed keeps the preview identical across Restart & Run All; the
# min() guard avoids an error when the file holds fewer than 3 rows.
display(df.sample(min(3, len(df)), random_state=42))


Dimensions of the sataset: (1000, 18)


Unnamed: 0,id,name,symbol,slug,num_market_pairs,date_added,tags,max_supply,circulating_supply,total_supply,infinite_supply,platform,cmc_rank,self_reported_circulating_supply,self_reported_market_cap,tvl_ratio,last_updated,quote
933,18662,Galeon,GALEON,galeon,17,2022-03-09T08:08:43.000Z,"['health', 'desci']",4000000000.0,689820500.0,2520083000.0,False,"{'id': 1027, 'name': 'Ethereum', 'symbol': 'ET...",934,689820500.0,20154840.0,,2024-06-06T04:13:00.000Z,"{'USD': {'price': 0.02921751764677821, 'volume..."
742,2866,Sentinel Protocol,UPP,sentinel-protocol,5,2018-06-18T00:00:00.000Z,"['ai-big-data', 'dwf-labs-portfolio']",,497834800.0,500000000.0,False,"{'id': 1027, 'name': 'Ethereum', 'symbol': 'ET...",743,,,,2024-06-06T04:12:00.000Z,"{'USD': {'price': 0.07449981615037422, 'volume..."
634,5024,ALL BEST ICO,ALLBI,all-best-ico,15,2019-12-10T00:00:00.000Z,['solana-ecosystem'],,1330205000.0,1534000000.0,False,"{'id': 5426, 'name': 'Solana', 'symbol': 'SOL'...",635,,,,2024-06-06T04:12:00.000Z,"{'USD': {'price': 0.04129149576958077, 'volume..."


#### Get whitepaper links for each of the topk cryptos

In [20]:
def get_whitepaper_link(slug, timeout=10,
                        div_class='sc-d1ede7e3-0 sc-7f0f401-0 gRSwoF gQoblf'):
    """Scrape the whitepaper URL from a coin's CoinMarketCap page.

    Parameters
    ----------
    slug : str
        Value substituted into ``https://coinmarketcap.com/currencies/{slug}/``.
    timeout : float, default 10
        Seconds passed to ``requests.get`` as its timeout.
    div_class : str
        ``class`` value of the ``<div>`` elements that are searched for the
        link. The default is the value this notebook was written against;
        NOTE(review): it looks auto-generated and may change when the site
        is redeployed - confirm it still matches before a new scrape.

    Returns
    -------
    str or None
        ``href`` of the first link found inside a matching ``<div>`` whose
        text contains "whitepaper" (case-insensitive). ``None`` when the
        slug is not a non-empty string, the request fails or does not
        return HTTP 200, or no such link exists.
    """
    # Missing slugs read back from CSV are NaN floats; do not request ".../nan/".
    if not isinstance(slug, str) or not slug:
        return None

    url = f'https://coinmarketcap.com/currencies/{slug}/'
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Best effort: one unreachable page must not abort a scrape that is
        # applied row by row over the whole frame.
        return None
    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    for div in soup.find_all('div', class_=div_class):
        if 'whitepaper' not in div.get_text().lower():
            continue

        # First anchor carrying an href inside this div, if any; otherwise
        # keep looking in the remaining divs.
        a_tag = div.find('a', href=True)
        if a_tag is not None:
            return a_tag['href']

    return None

# tqdm.pandas()
# df['whitepaper_link'] = df['slug'].progress_apply(get_whitepaper_link)
# df.to_csv(f'../data/top{k}_crypto_w_whitepaper_link.csv', index=False)


100%|██████████| 1000/1000 [03:30<00:00,  4.75it/s]


#### Loading dataset with whitepaper links

In [25]:
# Read the CSV that carries the extra whitepaper_link column, then report its
# shape, the number of distinct links, and a two-row preview.
df = pd.read_csv('../data/top1000_crypto_w_whitepaper_link.csv')
print('Shape of dataframe: ', df.shape)
print('Number of whitepaper links: ', df['whitepaper_link'].nunique())
display(df.iloc[:2])

Shape of dataframe:  (1000, 19)
Number of whitepaper links:  413


Unnamed: 0,id,name,symbol,slug,num_market_pairs,date_added,tags,max_supply,circulating_supply,total_supply,infinite_supply,platform,cmc_rank,self_reported_circulating_supply,self_reported_market_cap,tvl_ratio,last_updated,quote,whitepaper_link
0,1,Bitcoin,BTC,bitcoin,11101,2010-07-13T00:00:00.000Z,"['mineable', 'pow', 'sha-256', 'store-of-value...",21000000.0,19708080.0,19708080.0,False,,1,,,,2024-06-06T04:12:00.000Z,"{'USD': {'price': 70980.39962768357, 'volume_2...",https://bitcoin.org/bitcoin.pdf
1,1027,Ethereum,ETH,ethereum,9038,2015-08-07T00:00:00.000Z,"['pos', 'smart-contracts', 'ethereum-ecosystem...",,120149100.0,120149100.0,True,,2,,,,2024-06-06T04:12:00.000Z,"{'USD': {'price': 3859.267132868896, 'volume_2...",https://github.com/ethereum/wiki/wiki/White-Paper
