# The Scraper

In [8]:
import datetime
import logging
from concurrent.futures import ThreadPoolExecutor

import pandas as pd
import requests
import tqdm
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor

In [9]:
# HTTP headers sent with every page request (browser-like User-Agent).
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.3",
}

# The coin id is appended directly to this URL to address a single coin page.
base_url = "https://coindb-prod.ocean.icm.edu.pl/AFE_PL/show_coin?coin_id="

# First and last coin id to fetch; both ends are included in the crawl.
start_id = 47
end_id = 42_968

In [10]:
def parse_coin_html_page(soup: BeautifulSoup, coin_id: int, link: str) -> dict:
    """Build one coin record from a parsed coin page.

    Args:
        soup: Parsed HTML of the coin page; only ``select_one`` is used.
        coin_id: Identifier of the coin, stored under ``find_id``.
        link: URL the page came from, stored under ``reference``.

    Returns:
        A dict with, in order: ``find_id``, the nine scraped fields
        (``admin_division``, ``place``, ``period``, ``denomination``,
        ``material``, ``issuer``, ``mint``, ``start_year``, ``end_year``),
        then ``imported`` (today's date as dd/mm/YYYY), ``owner`` and
        ``reference``. Each scraped field holds whatever the matched
        element's ``.get("value")`` returns.

    Raises:
        ValueError: If a field's selector matches nothing on the page. The
            message names the coin id, the field and the selector, instead
            of the opaque ``AttributeError`` a ``None`` dereference gives.
    """
    # Position (1-based ``tr:nth-child``) of each text input's row inside the
    # ``#container`` table; all of these fields share one selector shape.
    table_rows = {
        "admin_division": 1,
        "place": 2,
        "period": 4,
        "denomination": 5,
        "material": 6,
        "issuer": 7,
        "mint": 11,
    }
    css_selectors = {
        key: f"#container > table > tbody > tr:nth-child({row}) > td:nth-child(2) > input[type=text]"
        for key, row in table_rows.items()
    }
    # The two year fields are addressed by element id rather than by table row.
    css_selectors["start_year"] = "#date_from"
    css_selectors["end_year"] = "#date_to"

    coin_info = {"find_id": coin_id}
    for key, selector in css_selectors.items():
        element = soup.select_one(selector)
        if element is None:
            raise ValueError(
                f"coin {coin_id}: field {key!r} not found (selector {selector!r})"
            )
        coin_info[key] = element.get("value")

    coin_info["imported"] = datetime.date.today().strftime("%d/%m/%Y")
    coin_info["owner"] = "FRC-PL"
    coin_info["reference"] = link

    return coin_info

In [11]:
def get_coin_info(coin_id: int, timeout: float = 30.0):
    """Download one coin page and return its parsed record.

    Args:
        coin_id: Coin identifier; appended to ``base_url`` to form the URL.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            Without one a stalled connection would block the calling
            thread indefinitely.

    Returns:
        The dict produced by ``parse_coin_html_page`` for this page.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            4xx/5xx response (via ``raise_for_status``).
        Exception: Whatever ``parse_coin_html_page`` raises for a page it
            cannot parse.
    """
    url = base_url + str(coin_id)
    r = requests.get(url, headers=headers, timeout=timeout)
    # Fail on HTTP error statuses instead of trying to parse an error page.
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")
    return parse_coin_html_page(soup, coin_id, url)

In [12]:
def safe_get_coin_info(coin_id: int):
    """Best-effort variant of ``get_coin_info`` that never raises.

    Args:
        coin_id: Coin identifier forwarded to ``get_coin_info``.

    Returns:
        The record dict from ``get_coin_info``, or an empty dict ``{}`` if
        that call raised any ``Exception``. The empty dict is the marker
        for "no record for this id".
    """
    try:
        return get_coin_info(coin_id)
    except Exception:
        # Keep the skip-on-failure behaviour, but leave a trace: DEBUG is
        # silent under the default logging configuration and can be enabled
        # to tell network failures apart from pages without a coin.
        logging.getLogger(__name__).debug(
            "coin %s skipped", coin_id, exc_info=True
        )
        return {}

In [13]:
# Fetch all coin pages on a pool of 16 worker threads. ``executor.map``
# returns results in the same order as the ids, and tqdm wraps that iterator
# so the progress bar advances as each result is collected.
coin_ids = range(start_id, end_id + 1)
with ThreadPoolExecutor(max_workers=16) as executor:
    coins_info = list(
        tqdm.tqdm(
            executor.map(safe_get_coin_info, coin_ids),
            total=end_id - start_id + 1,
        )
    )

100%|██████████| 42922/42922 [31:30<00:00, 22.70it/s]  


In [14]:
# Ids that could not be scraped come back as {} from safe_get_coin_info;
# drop those falsy entries, then write the remaining records to coins.csv.
coins_info = list(filter(None, coins_info))
coins_df = pd.DataFrame(coins_info)
coins_df.to_csv("coins.csv", index=False, encoding="utf-8")