#### Import Dependencies

In [1]:
import requests
from bs4 import BeautifulSoup
import json, os
import pandas as pd
from tqdm import tqdm
import traceback
import PyPDF2

# Notebook-wide pandas display settings: passing None lifts each limit, so
# frames are rendered in full (all rows, all columns, untruncated cell text).
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
# Turn off pandas' chained-assignment (SettingWithCopy) warning for this session.
pd.options.mode.chained_assignment = None

#### Collect Crypto Data
- Fetch data for the top 5,000 cryptocurrencies by market cap from coinmarketcap.com
- Save the data to a file (so re-runs do not consume the API rate limit)

In [2]:
def get_top_5k_cryptos(api_key: "str | None" = None, limit: int = 5000) -> "pd.DataFrame | None":
    """Fetch the latest CoinMarketCap listings, sorted by market cap.

    Args:
        api_key: CoinMarketCap Pro API key. When omitted, the key is read from
            the ``CMC_PRO_API_KEY`` environment variable.
        limit: Number of listings to request (defaults to 5000).

    Returns:
        A DataFrame built from the ``data`` field of the JSON response, or
        ``None`` when the API answers with a non-200 status (the error is
        printed in that case).

    Raises:
        ValueError: If no API key is supplied and ``CMC_PRO_API_KEY`` is unset.
    """
    # Credentials must not live in the notebook source: they end up in version
    # control and in shared copies. Any key that was committed here previously
    # should be treated as leaked and revoked.
    api_key = api_key or os.environ.get('CMC_PRO_API_KEY')
    if not api_key:
        raise ValueError(
            'CoinMarketCap API key missing: pass api_key=... or set the '
            'CMC_PRO_API_KEY environment variable.'
        )

    url = 'https://pro-api.coinmarketcap.com/v1/cryptocurrency/listings/latest'

    headers = {
        'Accept': 'application/json',  # standard header name (was 'Accepts')
        'X-CMC_PRO_API_KEY': api_key,
    }

    params = {
        'start': '1',
        'limit': str(limit),
        'convert': 'USD',
        'sort': 'market_cap'
    }

    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(url, headers=headers, params=params, timeout=30)

    if response.status_code != 200:
        # Error bodies are not guaranteed to be JSON (e.g. a proxy error page),
        # so fall back to the raw text when the expected structure is absent.
        try:
            error_message = response.json()['status']['error_message']
        except (ValueError, KeyError, TypeError):
            error_message = response.text
        print(f"Error {response.status_code}: {error_message}")
        return None

    return pd.DataFrame(response.json()['data'])
        
# df = get_top_5k_cryptos()
# df.to_csv('../data/topk_crypto.csv', index=False)
# print('Collected top 5k crypto.')

#### Load the dataset

In [3]:
# Number of top-ranked cryptocurrencies to keep for the rest of the notebook.
k = 1000
# nrows=k reads only the first k data rows instead of loading the whole file
# and truncating it afterwards; the resulting frame is the same as .head(k).
df = pd.read_csv('../data/topk_crypto.csv', nrows=k)
print(f'Dimensions of the dataset: {df.shape}')
display(df.sample(1))


Dimensions of the sataset: (1000, 18)


Unnamed: 0,id,name,symbol,slug,num_market_pairs,date_added,tags,max_supply,circulating_supply,total_supply,infinite_supply,platform,cmc_rank,self_reported_circulating_supply,self_reported_market_cap,tvl_ratio,last_updated,quote
547,5821,Aleph.im,ALEPH,aleph-im,53,2020-07-08T00:00:00.000Z,"['mineable', 'cosmos-ecosystem', 'ai-big-data', 'distributed-computing', 'filesharing', 'storage', 'polkadot-ecosystem', 'avalanche-ecosystem', 'solana-ecosystem', 'injective-ecosystem', 'bnb-chain', 'ftx-bankruptcy-estate', 'depin']",,247220500.0,500000000.0,False,"{'id': 1027, 'name': 'Ethereum', 'symbol': 'ETH', 'slug': 'ethereum', 'token_address': '0x27702a26126e0b3702af63ee09ac4d1a084ef628'}",548,45968548.0,13725160.0,,2024-06-06T04:12:00.000Z,"{'USD': {'price': 0.2985771825407063, 'volume_24h': 958335.41423686, 'volume_change_24h': -10.6523, 'percent_change_1h': -0.03526277, 'percent_change_24h': -1.10923119, 'percent_change_7d': -6.25369944, 'percent_change_30d': -19.89867374, 'percent_change_60d': 52.17284833, 'percent_change_90d': -19.10340228, 'market_cap': 73814395.02515341, 'market_cap_dominance': 0.0028, 'fully_diluted_market_cap': 149288591.27, 'tvl': None, 'last_updated': '2024-06-06T04:12:00.000Z'}}"


#### Get whitepaper links for each of the top-k cryptos

In [4]:
def get_whitepaper_link(slug, timeout=30):
    """Scrape the whitepaper URL from a coin's CoinMarketCap page.

    Args:
        slug: CoinMarketCap slug of the currency (used to build the page URL).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``href`` of the first link found inside a matching container whose
        text mentions "whitepaper", or ``None`` if the request fails, the page
        does not return HTTP 200, or no such link is present.
    """
    url = f'https://coinmarketcap.com/currencies/{slug}/'
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as err:
        # This function is meant to be applied to many rows; one unreachable
        # page should yield "no link" rather than abort the whole run.
        print(f'Request failed for {slug}: {err}')
        return None
    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # NOTE(review): these look like auto-generated CSS class names and will
    # presumably change when the site is redeployed -- verify if nothing matches.
    divs = soup.find_all('div', class_='sc-d1ede7e3-0 sc-7f0f401-0 gRSwoF gQoblf')

    for div in divs:
        if 'whitepaper' not in div.get_text().lower():
            continue

        # First anchor with an href inside this container, if any; otherwise
        # keep scanning the remaining containers.
        a_tag = div.find('a', href=True)
        if a_tag is not None:
            return a_tag['href']

    return None

# tqdm.pandas()
# df['whitepaper_link'] = df['slug'].progress_apply(get_whitepaper_link)
# df.to_csv(f'../data/top{k}_crypto_w_whitepaper_link.csv', index=False)


#### Loading dataset with whitepaper links

In [5]:
# The file name is derived from `k` so it always matches the file written by
# the (commented-out) scraping step, which saves to `top{k}_...`.
df = (
    pd.read_csv(f'../data/top{k}_crypto_w_whitepaper_link.csv')
    .drop(columns=['tags', 'quote'])
    # Keep only rows for which a whitepaper link was found.
    .dropna(subset=['whitepaper_link'])
)
print('Shape of dataframe: ', df.shape)
print('Number of whitepaper links: ', df.whitepaper_link.nunique())
display(df.head(5))

Shape of dataframe:  (423, 17)
Number of whitepaper links:  413


Unnamed: 0,id,name,symbol,slug,num_market_pairs,date_added,max_supply,circulating_supply,total_supply,infinite_supply,platform,cmc_rank,self_reported_circulating_supply,self_reported_market_cap,tvl_ratio,last_updated,whitepaper_link
0,1,Bitcoin,BTC,bitcoin,11101,2010-07-13T00:00:00.000Z,21000000.0,19708080.0,19708080.0,False,,1,,,,2024-06-06T04:12:00.000Z,https://bitcoin.org/bitcoin.pdf
1,1027,Ethereum,ETH,ethereum,9038,2015-08-07T00:00:00.000Z,,120149100.0,120149100.0,True,,2,,,,2024-06-06T04:12:00.000Z,https://github.com/ethereum/wiki/wiki/White-Paper
2,825,Tether USDt,USDT,tether,87800,2015-02-25T00:00:00.000Z,,112392100000.0,115086100000.0,True,"{'id': 1027, 'name': 'Ethereum', 'symbol': 'ETH', 'slug': 'ethereum', 'token_address': '0xdac17f958d2ee523a2206206994597c13d831ec7'}",3,,,,2024-06-06T04:12:00.000Z,https://tether.to/wp-content/uploads/2016/06/TetherWhitePaper.pdf
4,5426,Solana,SOL,solana,674,2020-04-10T00:00:00.000Z,,459920200.0,577565700.0,True,,5,,,,2024-06-06T04:12:00.000Z,https://solana.com/solana-whitepaper.pdf
5,3408,USDC,USDC,usd-coin,19594,2018-10-08T00:00:00.000Z,,32601130000.0,32601130000.0,False,"{'id': 1027, 'name': 'Ethereum', 'symbol': 'ETH', 'slug': 'ethereum', 'token_address': '0xa0b86991c6218b36c1d19d4a2e9eb0ce3606eb48'}",6,,,,2024-06-06T04:12:00.000Z,https://f.hubspotusercontent30.net/hubfs/9304636/PDF/centre-whitepaper.pdf


#### Download PDFs

In [6]:
# Function to download PDF
def download_pdf(url, file_name, timeout=30):
    """Download ``url`` to ``file_name`` and flag the file if it is not a readable PDF.

    The response body is written to ``file_name``. If ``verify_pdf`` rejects the
    saved file, it is renamed by inserting ``_invalid`` before the extension.
    Failures are reported with ``print``; nothing is raised to the caller.

    Args:
        url: Address of the PDF to fetch.
        file_name: Destination path of the downloaded file.
        timeout: Seconds to wait for the server before giving up.
    """
    try:
        # Without a timeout a single unresponsive host can stall the whole loop.
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            # Make sure the destination folder exists on a fresh checkout.
            target_dir = os.path.dirname(file_name)
            if target_dir:
                os.makedirs(target_dir, exist_ok=True)

            with open(file_name, 'wb') as file:
                file.write(response.content)

            if not verify_pdf(file_name):
                # Only touch the extension (str.replace would rewrite every
                # '.pdf' occurring anywhere in the path).
                root, ext = os.path.splitext(file_name)
                invalid_file_name = f'{root}_invalid{ext}'
                # os.replace overwrites a leftover *_invalid file from an
                # earlier run on every platform; os.rename raises on Windows,
                # which would leave the bad file under its valid name.
                os.replace(file_name, invalid_file_name)

        else:
            print(f'Failed to download {file_name}, with url: {url}')
    except Exception as e:
        print(f'An error occurred while downloading {file_name}, {e}')
        # traceback.print_exc()
        
# Function to verify PDF structure
def verify_pdf(file_name):
    """Return True when ``file_name`` can be parsed as a PDF with at least one page.

    Any exception raised while opening or parsing the file is printed and
    reported as ``False``.
    """
    try:
        with open(file_name, 'rb') as pdf_stream:
            # Count the pages while the stream is still open.
            page_count = len(PyPDF2.PdfReader(pdf_stream).pages)
    except Exception as err:
        print(f'An error occurred while verifying {file_name}. Error: {err}')
        # traceback.print_exc()
        return False
    return page_count > 0
        

# Fetch every whitepaper whose link looks like a PDF and that is not already on disk.
for index, row in tqdm(df.iterrows(), total=len(df)):
    url = row['whitepaper_link']
    if '.pdf' in url:
        # File names are prefixed with the dataframe index (+1) and the coin slug.
        file_name = f"../data/whitepapers/{index + 1}_{row['slug']}.pdf"
        if not os.path.exists(file_name):
            download_pdf(url, file_name)
    
    

  1%|          | 4/423 [00:01<02:20,  2.98it/s]

An error occurred while verifying ../data/whitepapers/3_tether.pdf. Error: EOF marker not found


  7%|▋         | 31/423 [00:04<00:35, 10.92it/s]

An error occurred while verifying ../data/whitepapers/38_stellar.pdf. Error: EOF marker not found


  8%|▊         | 33/423 [00:05<00:41,  9.50it/s]

An error occurred while verifying ../data/whitepapers/39_monero.pdf. Error: EOF marker not found


 12%|█▏        | 51/423 [00:11<03:09,  1.96it/s]

An error occurred while verifying ../data/whitepapers/61_fetch.pdf. Error: EOF marker not found


 13%|█▎        | 54/423 [00:12<02:30,  2.45it/s]

An error occurred while verifying ../data/whitepapers/66_aave.pdf. Error: EOF marker not found


 13%|█▎        | 55/423 [00:12<02:34,  2.39it/s]

An error occurred while verifying ../data/whitepapers/67_sei.pdf. Error: EOF marker not found


 16%|█▌        | 68/423 [00:16<02:24,  2.46it/s]

An error occurred while verifying ../data/whitepapers/87_the-sandbox.pdf. Error: EOF marker not found


 18%|█▊        | 78/423 [00:19<02:00,  2.85it/s]

An error occurred while verifying ../data/whitepapers/100_gnosis-gno.pdf. Error: EOF marker not found


 19%|█▉        | 81/423 [00:19<01:37,  3.52it/s]

An error occurred while verifying ../data/whitepapers/104_nexo.pdf. Error: EOF marker not found


 21%|██        | 87/423 [00:20<00:58,  5.77it/s]

An error occurred while verifying ../data/whitepapers/112_dexe.pdf. Error: EOF marker not found


 21%|██        | 88/423 [00:21<01:17,  4.30it/s]

An error occurred while verifying ../data/whitepapers/113_iota.pdf. Error: EOF marker not found


 22%|██▏       | 91/423 [00:21<00:59,  5.55it/s]

Failed to download ../data/whitepapers/116_usdd.pdf, with url: https://usdd.network/USDD-en.pdf


 22%|██▏       | 93/423 [00:21<01:08,  4.84it/s]

Failed to download ../data/whitepapers/118_axelar.pdf, with url: https://axelar.network/wp-content/uploads/2021/07/axelar_whitepaper.pdf


 27%|██▋       | 116/423 [00:23<00:28, 10.93it/s]

Failed to download ../data/whitepapers/152_raydium.pdf, with url: https://raydium.io/Raydium-Litepaper.pdf
Failed to download ../data/whitepapers/153_0x.pdf, with url: https://0xproject.com/pdfs/0x_white_paper.pdf


 30%|███       | 129/423 [00:25<00:29,  9.95it/s]

An error occurred while verifying ../data/whitepapers/166_zcash.pdf. Error: EOF marker not found


 33%|███▎      | 140/423 [00:26<00:24, 11.34it/s]

An error occurred while verifying ../data/whitepapers/181_arcblock.pdf. Error: EOF marker not found


 37%|███▋      | 157/423 [00:31<00:30,  8.79it/s]

Failed to download ../data/whitepapers/200_centrifuge.pdf, with url: https://centrifuge.io/cfg_token_summary.pdf


 45%|████▌     | 191/423 [03:26<15:44,  4.07s/it]  

Failed to download ../data/whitepapers/257_telcoin.pdf, with url: https://www.telco.in/docs/whitepaper.pdf


 48%|████▊     | 202/423 [03:28<05:58,  1.62s/it]

Failed to download ../data/whitepapers/273_band-protocol.pdf, with url: https://bandprotocol.com/whitepaper-3.0.1.pdf


 49%|████▉     | 208/423 [03:31<04:08,  1.16s/it]

Failed to download ../data/whitepapers/279_rlc.pdf, with url: https://iex.ec/app/uploads/2017/04/iExec-WPv2.0-English.pdf


 52%|█████▏    | 219/423 [03:32<01:27,  2.32it/s]

Failed to download ../data/whitepapers/294_beldex.pdf, with url: https://beldex.io/whitepaper.pdf


 55%|█████▍    | 232/423 [03:35<00:54,  3.51it/s]

Failed to download ../data/whitepapers/308_prom.pdf, with url: https://prom.io/whitepaper.pdf


 57%|█████▋    | 241/423 [03:35<00:30,  5.90it/s]

An error occurred while verifying ../data/whitepapers/317_digibyte.pdf. Error: EOF marker not found
Failed to download ../data/whitepapers/318_propy.pdf, with url: https://tokensale.propy.com/Propy-White-Paper-17-Jul-2017.pdf


 58%|█████▊    | 244/423 [03:36<00:25,  6.90it/s]

An error occurred while downloading ../data/whitepapers/326_wavax.pdf, HTTPSConnectionPool(host='files.avalabs.org', port=443): Max retries exceeded with url: /papers/consensus.pdf (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x120069900>: Failed to resolve 'files.avalabs.org' ([Errno 8] nodename nor servname provided, or not known)"))


 60%|█████▉    | 252/423 [03:37<00:25,  6.81it/s]

Failed to download ../data/whitepapers/336_venus.pdf, with url: https://venus.io/Whitepaper.pdf


 60%|██████    | 255/423 [03:38<00:25,  6.70it/s]

An error occurred while downloading ../data/whitepapers/342_flex.pdf, HTTPSConnectionPool(host='coinflex.com', port=443): Max retries exceeded with url: /documents/CoinFLEX-Whitepaper.pdf (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x12006ac20>: Failed to resolve 'coinflex.com' ([Errno 8] nodename nor servname provided, or not known)"))


 64%|██████▎   | 269/423 [03:39<00:18,  8.46it/s]

Failed to download ../data/whitepapers/363_world-mobile-token.pdf, with url: https://worldmobiletoken.com/WhitePaper.pdf


 64%|██████▍   | 271/423 [03:40<00:27,  5.46it/s]

An error occurred while verifying ../data/whitepapers/365_vulcan-forged-pyr.pdf. Error: EOF marker not found


 68%|██████▊   | 286/423 [03:42<00:15,  8.79it/s]

An error occurred while verifying ../data/whitepapers/385_paxos-standard.pdf. Error: EOF marker not found


 69%|██████▊   | 290/423 [03:42<00:13,  9.83it/s]

Failed to download ../data/whitepapers/389_stratis-new.pdf, with url: https://www.stratisplatform.com/files/Stratis_Whitepaper.pdf


 69%|██████▉   | 292/423 [03:43<00:18,  7.13it/s]

Failed to download ../data/whitepapers/390_ark.pdf, with url: https://arkscic.com/Whitepaper.pdf


 70%|██████▉   | 294/423 [03:43<00:18,  6.84it/s]

An error occurred while verifying ../data/whitepapers/396_pundix-new.pdf. Error: EOF marker not found


 70%|███████   | 298/423 [03:43<00:16,  7.71it/s]

An error occurred while verifying ../data/whitepapers/403_status.pdf. Error: EOF marker not found


 71%|███████   | 301/423 [03:44<00:18,  6.58it/s]

Failed to download ../data/whitepapers/407_velo.pdf, with url: https://velo.org/doc/Velo_Whitepaper_EN.pdf


 72%|███████▏  | 304/423 [03:45<00:27,  4.34it/s]

An error occurred while verifying ../data/whitepapers/411_stargate-finance.pdf. Error: EOF marker not found


 76%|███████▌  | 321/423 [03:47<00:11,  9.13it/s]

An error occurred while verifying ../data/whitepapers/433_wink.pdf. Error: EOF marker not found


 78%|███████▊  | 328/423 [03:50<00:19,  4.84it/s]

An error occurred while verifying ../data/whitepapers/442_bazaars.pdf. Error: EOF marker not found


 80%|███████▉  | 338/423 [03:52<00:17,  4.73it/s]

Failed to download ../data/whitepapers/454_dkargo.pdf, with url: https://dkargo.io/resources/dkargo_wp_en_new.pdf
Failed to download ../data/whitepapers/460_mines-of-dalarnia.pdf, with url: https://www.minesofdalarnia.com/assets/MoD-Litepaper-updated-27-Oct.pdf


 80%|████████  | 340/423 [03:52<00:16,  5.06it/s]

An error occurred while verifying ../data/whitepapers/461_orchid.pdf. Error: EOF marker not found
An error occurred while verifying ../data/whitepapers/463_bluzelle.pdf. Error: EOF marker not found


 84%|████████▍ | 356/423 [03:54<00:08,  8.17it/s]

Failed to download ../data/whitepapers/484_medibloc.pdf, with url: https://medibloc-homepage.oss-us-west-1.aliyuncs.com/whitepaper/medibloc_whitepaper_en.pdf


 87%|████████▋ | 367/423 [03:58<00:12,  4.39it/s]

Failed to download ../data/whitepapers/502_energy-web-token.pdf, with url: https://www.energyweb.org/wp-content/uploads/2019/12/EnergyWeb-EWDOS-VisionPurpose-vFinal-20191211.pdf


 90%|█████████ | 382/423 [04:00<00:10,  3.76it/s]

Failed to download ../data/whitepapers/520_veruscoin.pdf, with url: https://veruscoin.io/downloads/papers/VerusVision.pdf


 93%|█████████▎| 392/423 [04:02<00:05,  5.70it/s]

Failed to download ../data/whitepapers/530_ren.pdf, with url: https://renproject.io/litepaper.pdf


 94%|█████████▍| 398/423 [05:44<03:54,  9.39s/it]

An error occurred while verifying ../data/whitepapers/543_memetoon.pdf. Error: EOF marker not found


100%|██████████| 423/423 [05:52<00:00,  1.20it/s]
