In [1]:
import random
import pandas as pd
from bs4 import BeautifulSoup
from tqdm.auto import tqdm
from scraper import Web, Site, parallel_map, find_urls

In [2]:
# Read the saved FINMA HTML source into memory.
# The encoding is passed explicitly so decoding does not depend on the
# platform's locale default (assumes the saved page is UTF-8 — TODO confirm).
with open('./sources/finma.html', 'r', encoding='utf-8') as finma_source:
    html = finma_source.read()
html[:20]

'<!DOCTYPE html>\n<!--'

In [3]:
soup = BeautifulSoup(html, 'lxml')
table = soup.find_all('table')[0]

# Gather the <td> cells of every <tr> in the first table; rows that contain
# no <td> at all are dropped.
rows = []
for table_row in table.find_all('tr'):
    cells = table_row.find_all('td')
    if cells:
        rows.append(cells)

# The link inside the first cell supplies both the displayed text (stored as
# the company name) and the href (stored as the report URL).
metadata = []
for cells in rows:
    link = cells[0].a
    metadata.append({
        'company_name': link.get_text(),
        'report_url': link['href'],
    })

metadata = metadata[:64] # so that we fit in memory limits
metadata[:4]

[{'company_name': '10CryptoMarket',
 {'company_name': '1A-CREDIT-now AG',
 {'company_name': '1APayment AG',
 {'company_name': '1oakmg Sagl',

In [4]:
def get_homepage_url(report_url):
    """Extract the homepage text from a report page.

    Reads the HTML tables at ``report_url`` with ``pd.read_html`` and takes,
    from the first table, the value in the column labelled ``1`` at the
    fourth row (presumably the row holding the website address — verify
    against the page layout).

    Args:
        report_url: Location of the report page, passed to ``pd.read_html``.

    Returns:
        The cell value when it is a string; otherwise ``''``. ``''`` is also
        returned when the page cannot be fetched or parsed, or when the
        expected table, column or row is missing.
    """
    try:
        result = pd.read_html(report_url)[0][1].iloc[3]
    except Exception:
        # Best-effort lookup: any ordinary failure counts as "no homepage".
        # Only Exception is caught so that KeyboardInterrupt / SystemExit
        # still propagate and a long scraping run can be interrupted.
        return ''
    # Non-string cells (e.g. NaN for an empty cell) count as missing.
    if not isinstance(result, str):
        return ''
    return result
    
# Try the lookup on a single report page and display the result.
sample_report_url = 'https://www.finma.ch/en/finma-public/warning-list/10cryptomarket/'
get_homepage_url(sample_report_url)

'https://www.10cryptomarket.com/'

In [5]:
# Look up the homepage text for every report page via parallel_map, then
# flatten whatever URLs find_urls yields from each text into a single list.
report_urls = [info['report_url'] for info in metadata]
homepage_texts = parallel_map(get_homepage_url, report_urls)
homepage_urls = [
    url
    for url_text in homepage_texts
    for url in find_urls(url_text)
]
homepage_urls[:4]

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=64.0), HTML(value='')))




['http://www.10cryptomarket.com/',
 'http://www.1a-credit-now.ch',
 'http://www.1apcoin.com',
 'http://www.1apayment.com']

In [6]:
def get_scam_web(homepage_url):
    """Build a ``Web`` starting from ``homepage_url``.

    Delegates to ``Web.from_url`` with ``max_links=32``, ``depth=2`` and
    ``parallel=False``, and returns its result unchanged.
    """
    # NOTE(review): parallel=False is presumably chosen because the caller
    # already fans out over homepages with parallel_map — confirm.
    crawl_options = dict(max_links=32, depth=2, parallel=False)
    return Web.from_url(homepage_url, **crawl_options)

# Build one Web per homepage via parallel_map, then merge them all into a
# single Web and display it.
scam_webs = parallel_map(get_scam_web, homepage_urls)
web = Web.merge(*scam_webs)
web

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=58.0), HTML(value='')))




Web with 197 sites

In [7]:
# Write the merged web to disk via Web.to_zip.
output_path = './intermediate/finma.gz'
web.to_zip(output_path)