In [18]:
import spacy
import re
from tqdm.auto import tqdm
from scraper import Web, Site

In [2]:
# Load the spaCy pipeline used for all entity extraction below.
# If the model is not installed, run: python -m spacy download en_core_web_lg
model_name = 'en_core_web_lg'
nlp = spacy.load(model_name)

# Experiments on a single example site

In [3]:
# Build a Site from one example URL to experiment on; the bare name on the
# last line displays its repr.
eg_url = 'https://www.zurichmarkets.com/'
eg_site = Site.from_url(eg_url)
eg_site

Site at https://www.zurichmarkets.com/ with 30 links

In [4]:
# Text of the example page's <body>, followed by a sanity check that the
# company name is present in it.
# NOTE(review): get_text() is called without a separator, and the ORG output
# further down contains glued tokens such as 'CryptoEducation' and 'UsWhy' --
# presumably adjacent elements' text is concatenated; confirm and consider
# passing a separator.
eg_body = eg_site.soup.body
eg_text = eg_body.get_text()
'Zurich Markets' in eg_text

True

In [5]:
# Run the pipeline over the example text, then show the distinct entity
# labels it assigned.
doc = nlp(eg_text)
{span.label_ for span in doc.ents}

{'CARDINAL',
 'DATE',
 'GPE',
 'LANGUAGE',
 'NORP',
 'ORG',
 'PRODUCT',
 'WORK_OF_ART'}

In [6]:
# Distinct, whitespace-stripped texts of the entities labelled ORG.
{span.text.strip() for span in doc.ents if span.label_ == 'ORG'}

{'AML PolicyFunding MethodsBonus Offerings',
 'Android Trader',
 'CFD',
 'CryptoEducation',
 'FX',
 'Ipad Trader',
 'Iphone Trader',
 'Login',
 'Lowest',
 "More.- World's",
 'PolicyTerm & ConditionsDeposit Policy',
 'TraderAndroid TraderiPhone Trader',
 'UsWhy',
 'Zurich Markets',
 'Zurich Markets Limited',
 'Zurich Markets Web & Mobile Platforms',
 'Zurich Markets’s',
 'the Zurich Markets Trading Specialist-'}

In [7]:
# Distinct, whitespace-stripped texts of the entities labelled PERSON.
{span.text.strip() for span in doc.ents if span.label_ == 'PERSON'}

set()

In [8]:
# Distinct, whitespace-stripped texts of the entities labelled PRODUCT.
{span.text.strip() for span in doc.ents if span.label_ == 'PRODUCT'}

{'Crypto', 'Explorer', 'MT4'}

In [9]:
# Distinct, whitespace-stripped texts of the entities labelled LANGUAGE.
{span.text.strip() for span in doc.ents if span.label_ == 'LANGUAGE'}

{'English'}

In [10]:
# Check that the support e-mail address occurs somewhere in the page text.
eg_text.find('Support@zurichmarkets.com') != -1

True

In [None]:
# Pull e-mail addresses out of the page text.
# The domain must be dot-separated labels that END on a label, so punctuation
# after an address is not swallowed into the match: the looser pattern
# r'[\w\.-]+@[\w\.-]+' returns 'a@b.com.' for the text "mail a@b.com." and also
# accepts dot-less domains such as 'a@b'. '+' is allowed in the local part so
# tagged addresses (name+tag@host.tld) are captured whole.
email_pattern = re.compile(r'[\w.+-]+@[\w-]+(?:\.[\w-]+)+')
email_pattern.findall(eg_text)

# Full run: extract entities from every scraped site

In [11]:
# Load the Web collection from the archive on disk; the bare name on the last
# line displays its repr.
archive_path = './outputs/iosco.gz'
web = Web.from_zip(archive_path)
web

Web with 2605 sites

In [16]:
def _has_body(site):
    """Return True when `site.soup.body` is not None."""
    return site.soup.body is not None


# Keep only the sites that have a body; later cells read their text from
# `site.soup.body`. tqdm reports progress while the collection is iterated.
interesting_sites = list(filter(_has_body, tqdm(web)))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2605.0), HTML(value='')))




In [17]:
# Entity labels to collect across all sites.
interesting_labels = ['PERSON', 'PRODUCT', 'ORG']

# Stream the page texts through nlp.pipe() instead of calling nlp() once per
# site: spaCy batches the documents internally, which is its recommended way to
# process many texts. A modest batch_size keeps the progress bar advancing
# steadily and avoids grouping a large number of whole pages into one batch.
site_texts = (site.soup.body.get_text() for site in interesting_sites)
site_docs = nlp.pipe(site_texts, batch_size=32)

# Bind the result to a name so this long run is not lost when the cell output
# is cleared; the bare name on the last line still displays the same set.
# NOTE(review): ent.text is collected unstripped here (the experiment cells use
# .strip()), so entries can carry runs of newlines -- consider normalising
# whitespace before collecting.
interesting_entities = set(
    ent.text
    for site_doc in tqdm(site_docs, total=len(interesting_sites))
    for ent in site_doc.ents
    if ent.label_ in interesting_labels
)
interesting_entities

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2436.0), HTML(value='')))




{'Kuvera France',
 'est conçu pour vous permettre de profiter de tous vos',
 'Fundación Universitaria San Pablo',
 'SvenskaSV',
 'Handeln Sie mit',
 'practicar\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nLo que',
 'Golden Water Ventures Ltd',
 'White Rock Partners Ltd',
 'Mobile Money Ltd',
 'załatwianie sprawPowrótSposoby przyjmowania',
 'para invertir',
 'Financial and Consumer Affairs Authority of Saskatchewan',
 'Dlaczego',
 'przewidziane są specjalne bonusy',
 'bardzo dobrze',
 'das Münchner',
 'Zdraví\n\n\n\n\n\n\n\n\n\n',
 'Metale\n\n\n\n\n\n',
 'protéger contre les arnaquesSe préparer à',
 'mi się strona',
 'dado o de cualquier manera',
 'Platformę Transakcyjną',
 'Zweigniederlassung Frankfurt am Main',
 'przekonaniem stwierdzić',
 'Zostań Partnerem',
 'Platinum Cfd',
 'że na rachunku znajdują się wystarczające środki',
 'Latitude Investment Management Llp',
 'Gordan Ramsey',
 'Normativa',
 'Życzę InstaForex dalszego',
 'KomisjaSkład KomisjiZadania KomisjiDziennik',
 'GEGEN DIE',
 'Indi