In [2]:
import requests
from bs4 import BeautifulSoup
import json, os
import pandas as pd
from tqdm import tqdm
import traceback
import math, pprint

# Show DataFrames in full when displayed: no row, column, or cell-width truncation.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None

# Turn off pandas' chained-assignment warning for the whole session.
pd.set_option('mode.chained_assignment', None)

# How many coins to collect from the listing API.
# NOTE(review): the name is misspelled ("CYRYPTOS") but other cells reference it as-is.
NUMBER_OF_CYRYPTOS = 1100

### Get the top 1100 crypto slugs and IDs from the API
- API URL: f'https://api-new.whitepaper.io/coins?page={page_number}'

In [4]:
def get_data_from_call(page_number, arr, timeout=30):
    """Fetch one page of the coin listing and append its ``[slug, id]`` pairs to ``arr``.

    Args:
        page_number: Page index placed in the ``page`` query parameter.
        arr: List that is extended in place with one ``[slug, id]`` list per coin.
        timeout: Seconds to wait for the server before abandoning the request.

    Returns:
        None. When the request fails, the status is not 200, or the body has no
        usable ``result`` list, a message is printed and ``arr`` is left untouched.

    Raises:
        KeyError: If a coin entry lacks ``slug`` or ``id``; ``arr`` is not modified
            in that case because the rows are built before being appended.
    """
    # construct url to parse
    url = f'https://api-new.whitepaper.io/coins?page={page_number}'

    # construct headers
    headers = {'User-Agent': 'Mozilla/5.0'}

    # make request; the timeout keeps one stalled connection from hanging the whole loop
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        print(f'Error. Request for page {page_number} failed: {exc}.')
        return

    if response.status_code != 200:
        print(f'Error. Status code: {response.status_code}. Description: {response.reason}.')
        return

    try:
        payload = response.json()
    except ValueError as exc:
        print(f'Error. Page {page_number} did not return valid JSON: {exc}.')
        return

    # A 200 response without a 'result' list (e.g. an error object) is reported, not raised.
    coins = payload.get('result') if isinstance(payload, dict) else None
    if not isinstance(coins, list):
        print(f'Error. Page {page_number} has no result list.')
        return

    # Build every row first so a malformed entry cannot leave arr half-updated.
    rows = [[coin['slug'], coin['id']] for coin in coins]
    arr.extend(rows)

    

# The listing API shows 10 cryptos per page, so round up to cover a final partial page.
num_pages = math.ceil(NUMBER_OF_CYRYPTOS / 10)

# Build the dataframe: each row holds 1) the slug and 2) the id.
# get_data_from_call appends one [slug, id] pair per coin to this list in place.
arr = []
columns = ['slug', 'id']

# Pages are requested as 1..num_pages inclusive.
for page_number in tqdm(range(1, num_pages + 1)):
    get_data_from_call(page_number, arr)

df = pd.DataFrame(arr, columns=columns)

# Persist the slug/id table; the next cell reads this file back.
df.to_csv('../data/wpio_output.csv', index=False)

# Expected (1100, 2); fewer rows means some page requests were skipped after an error.
print(f'dimensions of data frame: {df.shape}') # should print (1100, 2)
    

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 110/110 [00:53<00:00,  2.04it/s]

dimensions of data frame: (1100, 2)





#### Get the document key corresponding to each crypto from the API
- API URL: f'https://api-new.whitepaper.io/documents?id={coin_id}'

In [13]:
def _fetch_document_key(coin_id, timeout):
    """Return the ``key`` of the first document listed for ``coin_id``, or None if unavailable."""
    url = f'https://api-new.whitepaper.io/documents?id={coin_id}'
    headers = {'User-Agent': 'Mozilla/5.0'}

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        print(f'Error. Request for id {coin_id} failed: {exc}.')
        return None

    if response.status_code != 200:
        print(f'Error. Status code: {response.status_code}. Description: {response.reason}.')
        return None

    try:
        data = response.json()
    except ValueError as exc:
        print(f'Error. Response for id {coin_id} is not valid JSON: {exc}.')
        return None

    # check if the page has a valid coin: a non-empty list whose first entry is an object
    if isinstance(data, list) and data and isinstance(data[0], dict):
        return data[0].get('key')
    return None


def get_document_key(id, arr, timeout=30):
    """Look up the document key for one coin and append it to ``arr``.

    Exactly one value is appended per call -- the key, or None when the request
    fails, the status is not 200, or no document is listed -- so ``arr`` stays
    aligned row-for-row with the ids it was built from.

    Args:
        id: Coin id placed in the ``id`` query parameter. The name shadows the
            builtin inside this function only; it is kept for existing callers.
        arr: List that receives the key (or None) in place.
        timeout: Seconds to wait for the server before abandoning the request.

    Returns:
        None.
    """
    arr.append(_fetch_document_key(id, timeout))
    


# Left in place from the original cell; the loop below wraps the iterable with tqdm(...) directly.
tqdm.pandas()
df = pd.read_csv('../data/wpio_output.csv')
arr = []

# Iterate the id column itself rather than df.iterrows(): only this one column is
# needed, and the loop variable no longer rebinds the builtin `id` in the kernel.
for coin_id in tqdm(df['id'], total=df.shape[0]):
    get_document_key(coin_id, arr)

# arr must hold one entry per row; pandas raises ValueError here on a length mismatch.
df['document_key'] = arr
df.to_csv('../data/wpio_output_with_keys.csv', index=False)

# # TESTING
# test_id = '5ccb65e109b644000eadba30'
# test_arr = []
# get_document_key(test_id, test_arr)
# print(test_arr)




100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1100/1100 [08:25<00:00,  2.18it/s]


#### Download the document for each crypto. The link is of the form:
- f'https://api-new.whitepaper.io/documents/pdf?id={document_key}'
- Store each PDF at the path:
    - f'../whitepaper-io/{slug}.pdf'

In [28]:
def get_pdf_and_store(key, slug, timeout=60):
    """Download one whitepaper PDF and save it as ``../whitepaper-io/{slug}.pdf``.

    Args:
        key: Document key placed in the ``id`` query parameter of the PDF endpoint.
        slug: Coin slug used as the output file name (without the extension).
        timeout: Seconds to wait for the server before abandoning the request.

    Returns:
        None. When the request fails or the status is not 200, a message is
        printed and no file is written.
    """
    url = f'https://api-new.whitepaper.io/documents/pdf?id={key}'
    headers = {'User-Agent': 'Mozilla/5.0'}

    # The timeout keeps one stalled download from hanging the whole loop.
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        print(f'Error. Request for {slug} failed: {exc}.')
        return

    if response.status_code != 200:
        print(f'Error. Status code: {response.status_code}. Description: {response.reason}.')
        return

    # NOTE(review): slug is interpolated into the path unescaped -- confirm slugs
    # can never contain path separators or '..'.
    file_name = f'../whitepaper-io/{slug}.pdf'

    # Create the output directory on first use; open() would otherwise raise FileNotFoundError.
    os.makedirs(os.path.dirname(file_name), exist_ok=True)

    # write to a pdf
    with open(file_name, 'wb') as file:
        file.write(response.content)

df = pd.read_csv('../data/wpio_output_with_keys.csv')
missing_counter = 0

# Only the key and slug columns are needed, so walk them in parallel instead of df.iterrows().
for key, slug in tqdm(zip(df['document_key'], df['slug']), total=df.shape[0]):

    # checking if unable to fetch document: rows stored without a key are read back as NaN
    if pd.isna(key):
        print(f'{slug} pdf could not be found.')
        missing_counter += 1
        continue

    get_pdf_and_store(key, slug)

# Report against the rows actually processed; the CSV can hold fewer rows than
# NUMBER_OF_CYRYPTOS when listing pages were skipped. The recorded run shows 8 missing.
print(f'done! total documents missing: {missing_counter} out of {df.shape[0]}')


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1100/1100 [00:00<00:00, 19152.31it/s]

8



