# Import Dependencies

In [48]:
import dotenv
import os
import pandas as pd
import requests
import json
from bs4 import BeautifulSoup
from tqdm import tqdm
from pprint import pprint

# Load variables from a local .env file into the process environment, then
# read the CoinMarketCap API key (stored under the name "API_KEY") from it.
dotenv.load_dotenv()
CMC_API_KEY = os.environ.get("API_KEY")

# Get Top N Cryptocurrencies from CoinMarketCap API

Store the result in a CSV file so later runs can read it from disk instead of hitting the API rate limits.


In [None]:
def get_top_n_currencies(n) -> "pd.DataFrame | None":
    """Fetch the latest listings for the top ``n`` cryptocurrencies from CoinMarketCap.

    Calls the ``/v1/cryptocurrency/listings/latest`` endpoint of the
    CoinMarketCap Pro API, authenticating with the module-level
    ``CMC_API_KEY`` and asking for quotes converted to USD.

    Args:
        n: Number of listings to request, starting from the first one.

    Returns:
        A DataFrame built from the ``data`` field of the JSON response, or
        ``None`` (after printing an error message) when the API answers with
        a non-200 status code.
    """
    url = "https://pro-api.coinmarketcap.com/v1/cryptocurrency/listings/latest"
    parameters = {
        'start': '1',
        'limit': str(n),
        'convert': 'USD',
    }
    headers = {
        # The standard HTTP content-negotiation header is "Accept", not "Accepts".
        'Accept': 'application/json',
        'X-CMC_PRO_API_KEY': CMC_API_KEY,
    }

    # The timeout (seconds) keeps the cell from hanging on a stalled connection.
    response = requests.get(url, params=parameters, headers=headers, timeout=30)

    # check if valid response
    if response.status_code != 200:
        # Error bodies are not guaranteed to be JSON with a `status` section,
        # so fall back to the HTTP reason phrase instead of raising here.
        try:
            error_message = response.json()['status']['error_message']
        except (ValueError, KeyError, TypeError):
            error_message = response.reason
        print(f"Error! Status code: {response.status_code}. {error_message}")
        return None

    # store response data in a DataFrame
    return pd.DataFrame(response.json()['data'])

# ## Call function and store DataFrame into a csv file -> only once!
# df = get_top_n_currencies(5000)
# df.to_csv('data/output.csv', index=True)


# With the top N cryptocurrencies saved, use BeautifulSoup to scrape a whitepaper link for each one

In [77]:
def get_whitepaper_links(crypto_name):
    """Scrape a coin's CoinMarketCap page for its whitepaper link.

    Args:
        crypto_name: Value substituted into the currency page URL
            (``https://coinmarketcap.com/currencies/<crypto_name>/``).

    Returns:
        The ``href`` of the first link inside the first matching ``div`` whose
        text mentions "whitepaper", or ``None`` when the page cannot be
        fetched (an error is printed) or no such link is present.
    """
    # make a GET request to fetch html data for the crypto
    url = f'https://coinmarketcap.com/currencies/{crypto_name}/'
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        # The timeout (seconds) stops one stalled page from hanging a whole batch.
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        # Report and skip, so one network failure does not abort a bulk apply.
        print(f"Error connecting to CoinMarketCap. Error Description: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error connecting to CoinMarketCap. Status code: {response.status_code}. Error Description: {response.reason}")
        return None

    # use BeautifulSoup to parse through response
    soup = BeautifulSoup(response.text, 'html.parser')

    # NOTE(review): these class names look auto-generated and may change when
    # the site is rebuilt — confirm the selector still matches.
    for div in soup.find_all('div', class_='sc-d1ede7e3-0 sc-7f0f401-0 gRSwoF gQoblf'):
        if 'whitepaper' not in div.get_text().lower():
            continue
        link = div.find('a', href=True)
        if link is not None:
            return link['href']
    return None

# iterate through earlier csv and get names of top n cryptos
# tqdm.pandas()
# df = pd.read_csv('data/output.csv').head(5) # for testing reason just use 5 for now
# df['whitepaper_link'] = df['slug'].progress_apply(get_whitepaper_links)

# df.to_csv('data/output_with_links.csv', index=False)

    

# Try using a different website to get crypto whitepapers

In [66]:
def get_whitepapers_v2(slug):
    """Look up a coin's whitepaper source link on whitepaper.io.

    Fetches ``https://whitepaper.io/coin/<slug>``, reads the JSON embedded in
    the ``<script id="__NEXT_DATA__">`` tag and returns the ``source`` of the
    first entry under ``props.pageProps.documents``.

    Args:
        slug: Coin identifier substituted into the whitepaper.io URL.

    Returns:
        The ``source`` value of the first listed document, or ``None`` when
        the page cannot be fetched (an error is printed) or the page carries
        no usable document entry.
    """
    url = f'https://whitepaper.io/coin/{slug}'
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        # The timeout (seconds) stops one stalled page from hanging a whole batch.
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        print(f"Error connecting to whitepaper.io for currency {slug}. Error description: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error connecting to whitepaper.io for currency {slug}. Error code: {response.status_code}. Error description: {response.reason}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    script_tag = soup.find('script', id='__NEXT_DATA__')
    # The tag can be absent or empty; treat that as "no whitepaper" rather
    # than failing on `None.string`.
    if script_tag is None or not script_tag.string:
        return None

    try:
        documents = json.loads(script_tag.string)['props']['pageProps']['documents']
        return documents[0]['source']
    except (ValueError, KeyError, IndexError, TypeError):
        # Malformed JSON, a missing key or an empty document list all mean
        # there is no link to return for this coin.
        return None


## TESTING
# get_whitepapers_v2('bitcoin')

# tqdm.pandas()
# df = pd.read_csv('data/output.csv').head(5) # for testing reason just use 5 for now
# df['whitepaper_link'] = df['slug'].progress_apply(get_whitepapers_v2)

# df.to_csv('data/output_with_links.csv', index=False)

# Version Three -> get the whitepaper.io site which houses the PDF

In [97]:
def get_whitepapers_v3(slug):
    """Find the PDF link shown on a coin's whitepaper.io page.

    Fetches ``https://whitepaper.io/coin/<slug>`` and inspects the first
    anchor of every ``<div class="hidden">``; the first anchor whose text
    contains "pdf" (case-insensitive) and that carries an ``href`` wins.

    Args:
        slug: Coin identifier substituted into the whitepaper.io URL.

    Returns:
        The ``href`` of the matching anchor, or ``None`` when the page cannot
        be fetched or no matching anchor is present.
    """
    url = f'https://whitepaper.io/coin/{slug}'
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        # The timeout (seconds) stops one stalled page from hanging a whole batch.
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        print(f"Error connecting to whitepaper.io for currency {slug}. Error description: {exc}")
        return None

    if response.status_code != 200:
        # 410 responses are skipped without logging.
        # NOTE(review): presumably whitepaper.io answers 410 for coins it does
        # not list — confirm.
        if response.status_code != 410:
            print(f"Error connecting to whitepaper.io for currency {slug}. Error code: {response.status_code}. Error description: {response.reason}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    for div in soup.find_all('div', class_='hidden'):
        a_tag = div.find('a')
        if a_tag is None or 'pdf' not in a_tag.get_text().lower():
            continue
        # An anchor without an href is skipped instead of raising KeyError.
        href = a_tag.get('href')
        if href:
            return href
    return None

### TESTING
# get_whitepapers_v3('bitcoin')

# tqdm.pandas()
# number_of_cryptos = 100
# df = pd.read_csv('data/output.csv').head(number_of_cryptos)
# df['whitepaper_link'] = df['slug'].progress_apply(get_whitepapers_v3)

# df.to_csv('data/output_with_links.csv', index=False)

In [86]:
# Report the frame's shape and how many rows have a non-null whitepaper link.
print(
    f"Dimensions of DataFrame: {df.shape}",
    f"Number of whitepaper links extracted out of {number_of_cryptos}: {df['whitepaper_link'].count()}",
    sep="\n",
)

Dimensions of DataFrame: (100, 20)
Number of whitepaper links extracted out of 100: 41


### Check if there is overlap between data set 1 and 2

In [91]:
# `progress_apply` only exists on pandas objects once tqdm's pandas integration
# has been registered, so register it here instead of relying on an earlier cell.
tqdm.pandas()

# NOTE(review): relies on `df` (with `slug` and `whitepaper_link` columns) left
# over from an earlier cell; the only visible definition is in commented-out
# code above — confirm before a Restart & Run All.
df['og_whitepaper_link'] = df['slug'].progress_apply(get_whitepaper_links)

# combine new whitepaper links with the original ones, prioritizing the new ones
df['combined_links'] = df['whitepaper_link'].combine_first(df['og_whitepaper_link'])

# write the df to the csv
df.to_csv('data/output_with_links.csv', index=False)


100%|███████████████████████████████████████████████████████████| 100/100 [00:30<00:00,  3.26it/s]


In [95]:
# Report the frame's shape and how many rows ended up with a link from either source.
print(f'Dimensions of new DataFrame with original links and combined: {df.shape}')  # expected 22 columns
# Double quotes outside, single quotes inside: reusing the same quote type
# inside an f-string is a SyntaxError before Python 3.12.
print(f"Number of whitepaper links combined out of {number_of_cryptos}: {df['combined_links'].count()}")

Dimensions of new DataFrame with original links and combined: (100, 22)
Number of whitepaper links combined out of 100: 84
