#### Import Dependencies

In [None]:
import requests
from bs4 import BeautifulSoup
import json, os
import pandas as pd
from tqdm import tqdm
import traceback
import PyPDF2
import re

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.options.mode.chained_assignment = None

#### Collect Crypto Data
- Get data for the top 5k cryptocurrencies by market cap from coinmarketcap.com
- Save the data to a file (to avoid exhausting the API rate limit)

In [None]:

def get_top_5k_cryptos() -> "pd.DataFrame | None":
    """Fetch the top 5,000 cryptocurrency listings from the CoinMarketCap API.

    The API key is read from the ``API_KEY`` environment variable. Listings are
    requested sorted by market cap, with quotes converted to USD.

    Returns:
        A DataFrame built from the ``data`` field of the JSON response when the
        request succeeds with HTTP 200; ``None`` (after printing the error)
        otherwise.

    Raises:
        requests.exceptions.RequestException: if the request itself fails
            (connection error, timeout, ...).
    """
    api_key = os.getenv("API_KEY")
    url = 'https://pro-api.coinmarketcap.com/v1/cryptocurrency/listings/latest'

    # NOTE(review): the standard HTTP header is 'Accept'; 'Accepts' is kept
    # unchanged here — confirm against the CoinMarketCap API documentation.
    headers = {
        'Accepts': 'application/json',
        'X-CMC_PRO_API_KEY': api_key,
    }

    params = {
        'start': '1',
        'limit': '5000',  # You can adjust this to get more or fewer listings
        'convert': 'USD',
        'sort': 'market_cap'
    }

    # A timeout keeps a stalled connection from hanging the notebook cell forever.
    response = requests.get(url, headers=headers, params=params, timeout=30)

    # Error responses are not guaranteed to carry a JSON body, so decode
    # defensively instead of letting a decode error mask the HTTP status.
    try:
        data = response.json()
    except ValueError:
        data = None

    if response.status_code == 200 and data is not None:
        return pd.DataFrame(data['data'])

    status = data.get('status') if isinstance(data, dict) else None
    error_message = status.get('error_message') if isinstance(status, dict) else None
    if error_message is None:
        # No structured error available; fall back to the HTTP reason phrase.
        error_message = response.reason
    print(f"Error {response.status_code}: {error_message}")
    return None
        
# df = get_top_5k_cryptos()
# df['whitepaper_link'] = ''
# df.to_csv('../data/topk_crypto.csv', index=False)
# print('Collected top 5k crypto.')

#### Load the dataset

In [None]:

# Load the previously saved listings CSV into the notebook-wide `df`.
df = pd.read_csv('../data/topk_crypto_w_whitepaper_link.csv')
print(f'Dimensions of the dataset: {df.shape}')
# Show one randomly sampled row as a quick sanity check
# (`display` is provided by the IPython/Jupyter runtime, not imported here).
display(df.sample(1))


#### Get whitepaper links from CoinMarketCap (CMC) for each of the top-k cryptos

In [None]:
def get_whitepaper_link(slug):
    """Scrape the whitepaper URL from a coin's CoinMarketCap page.

    Args:
        slug: CoinMarketCap slug of the coin; it is interpolated into
            ``https://coinmarketcap.com/currencies/{slug}/``.

    Returns:
        The ``href`` of the first link found inside a matching ``div`` whose
        text mentions "whitepaper", or ``None`` when the page cannot be fetched,
        does not answer with HTTP 200, or contains no such link.
    """
    url = f'https://coinmarketcap.com/currencies/{slug}/'
    try:
        # Time-box the request: this function is applied row by row over the
        # whole dataset, so one stalled connection must not block the run.
        response = requests.get(url, timeout=30)
    except requests.exceptions.RequestException as e:
        # A network failure for one coin is reported and treated like a
        # missing link instead of aborting the whole apply().
        print(f'Failed to fetch {url}: {e}')
        return None

    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # NOTE(review): these look like build-generated CSS class names and will
    # presumably stop matching when the site is redeployed — verify.
    divs = soup.find_all('div', class_='sc-d1ede7e3-0 sc-7f0f401-0 gRSwoF gQoblf')

    for div in divs:
        if 'whitepaper' not in div.get_text().lower():
            continue

        # First anchor with an href inside the whitepaper section, if any;
        # otherwise keep scanning the remaining sections.
        a_tag = div.find('a', href=True)
        if a_tag is not None:
            return a_tag['href']

    return None

# Register tqdm's `progress_apply` on pandas objects (used by the link-collection cells).
tqdm.pandas()
# df['whitepaper_cmc_link'] = df['whitepaper_link'].apply(lambda x: x if pd.notnull(x) and str(x).strip() else None)
# df['whitepaper_cmc_link'] = df.progress_apply(lambda row: get_whitepaper_link(row['slug']) if pd.isnull(row['whitepaper_link']) else row['whitepaper_link'], axis=1)

#### Get whitepaper links from whitepaper.io for each of the topk cryptos

In [None]:
# Sentinel returned by `_fetch_json` on failure, so that a legitimate JSON
# `null` body (decoded as None) is not mistaken for a failed request.
_FETCH_FAILED = object()


def _fetch_json(url, label, timeout=30):
    """GET *url* and return its decoded JSON body, or ``_FETCH_FAILED`` on error.

    *label* ("coin" / "document") only customizes the printed error message.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch {label} data: {e}")
        return _FETCH_FAILED

    # Decode in a separate step: recent `requests` versions raise a JSON error
    # that is *also* a RequestException, which would otherwise be reported as
    # a fetch failure rather than a parse failure.
    try:
        return response.json()
    except ValueError:
        print(f"Failed to parse JSON response for {label} data")
        return _FETCH_FAILED


def get_whitepaper_pdf_link(slug):
    """Build the whitepaper.io PDF URL for the coin identified by *slug*.

    Two API calls are made: the coin lookup (to read its ``_id``) and the
    document lookup for that id (to read the ``key`` of the first document).

    Args:
        slug: coin slug passed to the ``coins?slug=`` endpoint.

    Returns:
        ``https://api-new.whitepaper.io/documents/pdf?id=<key>`` on success, or
        ``None`` (after printing the reason) when a request fails, a response
        is not valid JSON, or an expected attribute is missing.
    """
    # Step 1: look up the coin to obtain its ID
    coin = _fetch_json(f'https://api-new.whitepaper.io/coins?slug={slug}', 'coin')
    if coin is _FETCH_FAILED:
        return None
    if not isinstance(coin, dict) or '_id' not in coin:
        print("ID attribute not found in the coin response")
        return None
    coin_id = coin['_id']

    # Step 2: look up the coin's documents to obtain the document key
    documents = _fetch_json(f'https://api-new.whitepaper.io/documents?id={coin_id}', 'document')
    if documents is _FETCH_FAILED:
        return None
    if not (documents and isinstance(documents, list)):
        print("Document data is not in the expected format")
        return None

    first_document = documents[0]
    if not isinstance(first_document, dict) or 'key' not in first_document:
        print("Key attribute not found in the document response")
        return None
    document_key = first_document['key']

    # Step 3: construct the PDF link (pure string formatting; cannot fail)
    return f'https://api-new.whitepaper.io/documents/pdf?id={document_key}'



In [None]:
# Fill in whitepaper.io links for the first 500 rows only. Rows that already
# hold a non-blank link keep it, so re-running the cell does not repeat API calls.
#
# The write goes through a single `.loc` call: the chained form
# `df[col][:500] = ...` assigns into an intermediate object and is not
# guaranteed to update `df` (and never does under pandas Copy-on-Write).
first_500 = df.index[:500]
df.loc[first_500, 'whitepaper_io_link'] = df.iloc[:500].progress_apply(
    lambda row: row['whitepaper_io_link']
    if pd.notnull(row['whitepaper_io_link']) and str(row['whitepaper_io_link']).strip()
    else get_whitepaper_pdf_link(row['slug']),
    axis=1,
)

In [None]:
# Count the distinct non-null whitepaper.io links collected so far.
df['whitepaper_io_link'].nunique()

In [None]:
# Persist the collected links. This overwrites the same CSV the notebook loads
# its dataset from, so later runs start with the links gathered here.
df.to_csv('../data/topk_crypto_w_whitepaper_link.csv', index=False)

#### Loading dataset with whitepaper links

In [None]:
# Reload the listings together with the manually curated link corrections.
df = pd.read_csv('../data/topk_crypto_w_whitepaper_link.csv')
extend_df = pd.read_csv('../data/whitepaper_link_corrections.csv')

df = (
    # Left-join the corrections on slug; overlapping correction columns get
    # the '_extend' suffix.
    df.merge(extend_df, on='slug', how='left', suffixes=('', '_extend'))
    # A correction wins whenever present; otherwise the original link is kept.
    .assign(
        whitepaper_link=lambda frame: frame['whitepaper_link_extend'].combine_first(
            frame['whitepaper_link']
        )
    )
    # Remove the helper column and the columns not needed downstream.
    .drop(columns=['whitepaper_link_extend', 'tags', 'quote'])
    # Keep only coins that ended up with a whitepaper link.
    .dropna(subset=['whitepaper_link'])
)

print('Shape of dataframe: ', df.shape)
print('Number of whitepaper links: ', df['whitepaper_link'].nunique())
display(df.head())


#### Download Content

In [None]:
# Function to verify PDF structure
def verify_pdf(file_name):
    """Report whether *file_name* can be parsed as a PDF with at least one page.

    Any error while opening or parsing the file is printed and treated as
    "not a valid PDF".

    Returns:
        ``True`` if PyPDF2 reads the file and finds one or more pages,
        ``False`` otherwise.
    """
    try:
        with open(file_name, 'rb') as pdf_handle:
            # Count pages while the handle is still open.
            page_count = len(PyPDF2.PdfReader(pdf_handle).pages)
    except Exception as e:
        print(f'An error occurred while verifying {file_name}. Error: {e}')
        return False

    return page_count > 0
    
# Function to download PDF
def fetch_webpage_content(url, file_name):
    """Download *url* and store it at *file_name* as a PDF or as page text.

    When *file_name* ends with ``.pdf`` the raw response bytes are written and
    then checked with ``verify_pdf``; a file that fails the check is renamed to
    ``<name>_invalid.pdf``. For any other file name the response is parsed as
    HTML and only its text (one node per line) is written.

    Failures (non-200 status, network errors, file errors) are printed and
    swallowed so that one bad link does not stop a batch download.

    Args:
        url: address of the whitepaper.
        file_name: destination path; its extension selects PDF vs. text mode.
    """
    try:
        # Time-box the request so a dead host cannot stall the batch.
        response = requests.get(url, timeout=60)

        if response.status_code != 200:
            print(f'Failed to download {file_name}, with url: {url}')
            return

        content = response.content

        # Decide by extension: a substring test would misfire on any path or
        # slug that merely contains "pdf".
        if file_name.endswith('.pdf'):
            with open(file_name, 'wb') as file:
                file.write(content)

            if not verify_pdf(file_name):
                stem, extension = os.path.splitext(file_name)
                # os.replace overwrites a leftover "_invalid" file from an
                # earlier run on every platform (os.rename fails on Windows).
                os.replace(file_name, f'{stem}_invalid{extension}')
        else:
            soup = BeautifulSoup(content, 'html.parser', from_encoding="iso-8859-1")
            text = soup.get_text(separator='\n')

            # Explicit encoding: the platform default may be unable to encode
            # every character found in a page.
            with open(file_name, 'w', encoding='utf-8') as f:
                f.write(text)
    except Exception as e:
        print(f'An error occurred while downloading {file_name}, with url: {url}, Error is: {e}')
        
        

# Download every whitepaper in `df` into ../data/whitepapers.
# Make sure the target directory exists; otherwise every write would fail.
os.makedirs('../data/whitepapers', exist_ok=True)

for index, row in tqdm(df.iterrows(), total=df.shape[0]):
    url = row['whitepaper_link']
    # Links mentioning "pdf" are stored as PDFs, everything else as page text.
    # (Named `file_type`, not `type`, so the builtin is not shadowed for the
    # rest of the notebook.)
    file_type = 'pdf' if 'pdf' in url else 'txt'
    file_name = f"../data/whitepapers/{index + 1}_{row['slug']}.{file_type}"

    # Text extracts are always refreshed; an existing PDF is kept as-is.
    if file_type == 'txt' and os.path.exists(file_name):
        os.remove(file_name)

    if not os.path.exists(file_name):
        fetch_webpage_content(url, file_name)
        
        

    