In [1]:
from bs4 import BeautifulSoup
import pickle
from tqdm.auto import tqdm
from scraper import Web, Site, parallel_map

import plotly.graph_objects as go

In [2]:
with open('./sources/iosco.html', 'r') as iosco_source:
    html = iosco_source.read()
html[:20]

'<!DOCTYPE html>\n<htm'

In [3]:
soup = BeautifulSoup(html, 'lxml')
table = soup.find_all('table')[0]

rows = [
    row
    for row in [
        row.find_all('td')
        for row in table.find_all('tr')
    ]
    if len(row) > 0
]
metadata = [
    dict(
        company_name=next(row[0].children).strip(),
        jurisdiction=next(row[2].children).strip(),
        report_date=next(row[3].a.children),
        report_url=row[3].a['href'],
    )
    for row in rows
]
metadata[:4]

[{'company_name': 'COOPER MARKETS s.r.o',
  'jurisdiction': 'Czech Republic',
  'report_date': '27 Nov 2020',
  'report_url': 'https://www.cnb.cz/en/supervision-financial-market/consumer-protection-and-financial-literacy/consumer-protection/notices-about-activities/Notice-about-the-activities-of-COOPER-MARKETS-s.r.o./'},
 {'company_name': 'Zurich Markets Limited',
  'jurisdiction': 'Switzerland',
  'report_date': '27 Nov 2020',
 {'company_name': 'Inquot Investing Group',
  'jurisdiction': 'Switzerland',
  'report_date': '27 Nov 2020',
 {'company_name': 'iAlphagroup',
  'jurisdiction': 'Portugal',
  'report_date': '26 Nov 2020',
  'report_url': 'https://www.cmvm.pt/en/SDI/FinancialIntermediaries/Pages/20201126.aspx?v='}]

In [4]:
def get_site(url):
    site = Site.from_url(url, ignore_errors=True)
    return site

report_sites = [
    site
    for site in parallel_map(
        get_site,
        [info['report_url'] for info in metadata]
    )
    if site is not None
]
report_sites[:4]

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=482.0), HTML(value='')))




[Site at https://www.cnb.cz/en/supervision-financial-market/consumer-protection-and-financial-literacy/consumer-protection/notices-about-activities/Notice-about-the-activities-of-COOPER-MARKETS-s.r.o./ with 135 links,
 Site at https://www.cmvm.pt/en/SDI/FinancialIntermediaries/Pages/20201126.aspx?v= with 99 links]

In [5]:
def get_scam_web(report_site):
    return Web.merge(*[
        Web.from_url(link, depth=1, parallel=False)
        for link in report_site.external_links
    ])

web = Web.merge(*parallel_map(
    get_scam_web,
    [
        site
        for site in report_sites
        if len(site.external_links) < 64
    ]
))
web

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=472.0), HTML(value='')))




Web with 265 sites

# Limit results
So that we fit with GitHub and Vercel memory limits

In [6]:
polish_sites = [site for site in tqdm(web) if site.lang == 'pl']
len(polish_sites)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=265.0), HTML(value='')))




14

In [7]:
num_extra_sites = max(256 - len(polish_sites), 0)
web = Web(
    list(web.sites)[:num_extra_sites]
    + polish_sites
)
web

Web with 243 sites

In [8]:
web.to_zip('./intermediate/iosco.gz')