In [None]:
# start with this when working in a disposable environment like google colab
!pip --quiet install https://github.com/scarfboy/wetsuite-dev/archive/refs/heads/main.zip

### Basic test

In [1]:
import bs4

import wetsuite.helpers.net
import wetsuite.phrases.abbreviations

In [2]:
# Unreasonably clean data, that also contains some less usual cases, so we can report what we might eventually want to deal with.
# NOTE(review): the output saved with this cell is `ValueError: 500`, presumably the server answering
#   with HTTP 500 when the cell was last run - confirm the URL still responds before relying on this cell.
html = wetsuite.helpers.net.download('https://organisaties.overheid.nl/Zelfstandige_bestuursorganen/')

# Name the parser explicitly: without it, BeautifulSoup picks whichever parser happens to be installed
# (and warns that it guessed), so the same page can parse differently on another machine.
soup = bs4.BeautifulSoup( html, 'html.parser' ) # parse the webpage into something we can query
for link in soup.select('.content .list--linked li a'):   # we are interested in the link text:
    found = False
    for ab, words in wetsuite.phrases.abbreviations.find_abbrevs( link.text ):
        print( 'FOUND  %s = %s'%( ab, words ) )
        found = True

    # Things we didn't find - more creative things that we _might_ want to consider
    if '(' in link.text and not found:    # (assuming bracket indicates there is an explained abbreviation in that link text)
        print( "MISS  ", link.text )

ValueError: 500

In [17]:
# Similar idea, different site
html = wetsuite.helpers.net.download('https://publications.europa.eu/code/nl/nl-5000400.htm')

# Name the parser explicitly: without it, BeautifulSoup picks whichever parser happens to be installed
# (and warns that it guessed), so the same page can parse differently on another machine.
soup = bs4.BeautifulSoup( html, 'html.parser' ) # parse the webpage into something we can query
for tr in soup.select('table.definitionsTable tr'):   # we are interested in the first two cells of each table row
    tds = tr.find_all('td')   # find_all is the current name of the legacy findAll
    if len(tds) < 2:          # e.g. header or spacer rows; indexing tds[1] below would raise IndexError
        continue

    # TODO: deal with the way it mentions multiple definitions
    text = '%s (%s)'%(tds[0].text.strip(), tds[1].text.strip())  # pretend we don't know this is good data and just put it next to each other

    found = False
    for ab, words in wetsuite.phrases.abbreviations.find_abbrevs( text ):
        print( 'FOUND  %s = %s'%( ab, words ) )
        found = True

    # Things we didn't find - more creative things that we _might_ want to consider
    if '(' in text and not found:    # always true for the text built above, which always contains brackets
        print( "MISS  ", text )



MISS   ABH (Agentschap voor Buitenlandse Handel (voorheen BDBH (Belgische Dienst voor Buitenlandse Handel)))
MISS   ABVV (Algemeen Belgisch Vakverbond)
MISS   ACS (staten in Afrika, het Caribisch gebied en de Stille Oceaan)
FOUND  ACV = ['Algemeen', 'Christelijk', 'Vakverbond']
MISS   ADB (1.
Afrikaanse Ontwikkelingsbank
(African Development Bank)
2.
Arabische Ontwikkelingsbank
(Arab Development Bank)
3.
Aziatische Ontwikkelingsbank
(Asian Development Bank))
MISS   ADN (Europese Overeenkomst betreffende het internationale vervoer van gevaarlijke goederen over de binnenwateren)
MISS   ADR (Europese Overeenkomst betreffende het internationale vervoer van gevaarlijke goederen over de weg)
MISS   Afnor (Frans Normalisatie-instituut
(Association française de normalisation))
MISS   ALO (algemene leningsovereenkomst)
MISS   Altener II (meerjarenprogramma ter bevordering van hernieuwbare energiebronnen in de Gemeenschap)
MISS   AKE (Agentschap voor Kernenergie (OESO))
FOUND  ANP = ['Algemeen',

In [13]:
# Similar idea, different site
html = wetsuite.helpers.net.download('https://www.rijksfinancien.nl/memorie-van-toelichting/2021/OWB/XIII/onderdeel/644956')

# Name the parser explicitly: without it, BeautifulSoup picks whichever parser happens to be installed
# (and warns that it guessed), so the same page can parse differently on another machine.
soup = bs4.BeautifulSoup( html, 'html.parser' ) # parse the webpage into something we can query
for tr in soup.select('.kio2 tr'):   # we are interested in the first two cells of each table row
    tds = tr.find_all('td')   # find_all is the current name of the legacy findAll
    if len(tds) < 2:          # e.g. header or spacer rows; indexing tds[1] below would raise IndexError
        continue

    text = '%s (%s)'%(tds[0].text.strip(), tds[1].text.strip())  # pretend we don't know this is good data and just put it next to each other

    found = False
    for ab, words in wetsuite.phrases.abbreviations.find_abbrevs( text ):
        print( 'FOUND  %s = %s'%( ab, words ) )
        found = True

    # Things we didn't find - more creative things that we _might_ want to consider
    if '(' in text and not found:    # always true for the text built above, which always contains brackets
        print( "MISS  ", text )



MISS    ()
MISS   ACM (Autoriteit Consument en Markt)
FOUND  ACT = ['Accelerating', 'CCS', 'Technologies']
MISS   ACVG (Adviescollege Veiligheid Groningen)
FOUND  ANBI = ['Algemeen', 'nut', 'beogende', 'instellingen']
FOUND  AT = ['Agentschap', 'Telecom']
FOUND  ATR = ['Adviescollege', 'toetsing', 'regeldruk']
MISS   AWTI (Adviesraad voor Wetenschap, Technologie en Innovatie)
MISS   BBE (Biobased Economy)
FOUND  BBP = ['Bruto', 'Binnenlands', 'Product']
MISS   BES (Bonaire, Sint Eustatius, Saba)
MISS   BIS (Basisinfrastructuur voor cultuur)
MISS   BIPM (Bureau International des Poids en Mesures)
MISS   BMKB (Borgstellingsregeling Midden en Kleinbedrijf)
FOUND  BNP = ['Bruto', 'Nationaal', 'Product']
FOUND  BOM = ['Brabantse', 'Ontwikkelings', 'Maatschappij']
MISS   BPM (Belasting van personenauto's en motorrijwielen)
MISS   BTW (Belasting over de toegevoegde waarde)
MISS   BZ (Ministerie van Buitenlandse Zaken)
MISS   BZK (Ministerie van Binnenlandse Zaken en Koninkrijksrelaties)
MISS 

In [9]:

# Similar idea, different site
html = wetsuite.helpers.net.download('https://juridisch-woordenboek.nl/afkortingen')

# Name the parser explicitly: without it, BeautifulSoup picks whichever parser happens to be installed
# (and warns that it guessed), so the same page can parse differently on another machine.
# NOTE(review): html.parser does not add a <tbody> that is absent from the markup, so the selector
#   below relies on the page itself containing one - confirm against the live page.
soup = bs4.BeautifulSoup( html, 'html.parser' ) # parse the webpage into something we can query
for tr in soup.select('table#afkortingen tbody tr'):   # we are interested in the first two cells of each table row
    tds = tr.find_all('td')   # find_all is the current name of the legacy findAll
    if len(tds) < 2:          # e.g. header or spacer rows; indexing tds[1] below would raise IndexError
        continue

    text = '%s (%s)'%(tds[0].text.strip(), tds[1].text.strip())  # pretend we don't know this is good data and just put it next to each other

    found = False
    for ab, words in wetsuite.phrases.abbreviations.find_abbrevs( text ):
        print( 'FOUND  %s = %s'%( ab, words ) )
        found = True

    # Things we didn't find - more creative things that we _might_ want to consider
    if '(' in text and not found:    # always true for the text built above, which always contains brackets
        print( "MISS  ", text )



FOUND  AA = ['Ars', 'Aequi']
FOUND  AA = ['Accountant', 'Administratieconsulent']
FOUND  AA = ['Advertising', 'Association']
MISS   a.a. (ad acta, bij de akten (wegleggen))
FOUND  AAA = ['American', 'Arbitration', 'Association']
MISS   AAC (Advies- en Arbitragecommissie)
MISS   AAf (Algemeen Arbeidsongeschiktheidsfonds)
MISS   AAR (Algemeen ambtenarenreglement)
MISS   AAR (Algemene Aanwijzingen voor de Rijksdienst)
FOUND  AAV = ['Algemene', 'administratieve', 'voorschriften']
MISS   AAW (Algemene Arbeidsongeschiktheidswet)
FOUND  AB = ['Administratiefrechterlijke', 'Beslissingen']
MISS   AB (Administratieve en Rechterlijke Beslissingen)
MISS   AB (Nederlandse Jurisprudentie Administratiefrechtelijke Beslissingen (sinds 1971))
MISS   AB (Wet Algemene Bepalingen)
FOUND  ABA = ['American', 'Bar', 'Association']
MISS   ABAR (Algemene bepalingen van administratief recht)
MISS   abbb (algemene beginselen van behoorlijk bestuur)
FOUND  ABP = ['Algemeen', 'Burgerlijk', 'Pensioenfonds']
MISS   

In [11]:
# Similar idea, different site

html = wetsuite.helpers.net.download('https://www.eur.nl/esl/campus/sanders-law-library/juridische-afkortingen')

# Name the parser explicitly: without it, BeautifulSoup picks whichever parser happens to be installed
# (and warns that it guessed), so the same page can parse differently on another machine.
soup = bs4.BeautifulSoup( html, 'html.parser' ) # parse the webpage into something we can query

for tr in soup.select('div.accordion table tr'):   # we are interested in the first two cells of each table row
    tds = tr.find_all('td')   # find_all is the current name of the legacy findAll
    if len(tds) < 2:          # e.g. header or spacer rows; indexing tds[1] below would raise IndexError
        continue

    # Note the order: unlike the other sites, the second cell goes in front and the first cell goes inside the brackets
    text = '%s (%s)'%(tds[1].text.strip(), tds[0].text.strip())  # pretend we don't know this is good data and just put it next to each other

    found = False
    for ab, words in wetsuite.phrases.abbreviations.find_abbrevs( text ):
        print( 'FOUND  %s = %s'%( ab, words ) )
        found = True

    # Things we didn't find - more creative things that we _might_ want to consider
    if '(' in text and not found:    # always true for the text built above, which always contains brackets
        print( "MISS  ", text )






MISS   anno, in het jaar (a°)
MISS   Algemene bepalingen (A)
MISS   Antwoord der regering naar aanleiding van het verslag (A)
MISS   Arbeid; afzonderlijk verschenen van 1946-1953 (A)
MISS   Atlantic Reporter second series (A.2d.)
MISS   Accountancy en Bedrijfskunde (A&B)
MISS   Aansprakelijkheid en Verzekering (A&V)
MISS   Ars Aequi. Juridisch studentenblad (AA of A.A. of AAe)
MISS   Accountant-Administratieconsulent (AAC)
FOUND  AA = ['Advertising', 'Association']
MISS   ad acta, bij de akten (wegleggen) (a.a)
FOUND  AAA = ['American', 'Arbitration', 'Association']
MISS   Algemeen aanduidingenbesluit (AAB)
MISS   Algemene aannemingsvoorwaarden voor bedrijfsgebouwen in de landbouw (AABL)
MISS   Advies- en Arbitragecommissie (AAC)
MISS   Ars Aequi. Juridisch studentenblad (A Ae)
MISS   Algemeen arbeidsongeschiktheidsfonds (AAF of Aaf)
MISS   Adem-alcoholgehalte (AAG)
MISS   Ars Aequi jurisprudentiebundel (AA-Jur)
MISS   Algemene aannemingsvoorwaarde voor de kassenbouw (AAK)
FOUND  AAK =

### Run on a bunch of free-form document text

In [18]:
import wetsuite.phrases.abbreviations
import wetsuite.datacollect.db
import wetsuite.helpers.etree

# Open a connection through the project's datacollect.db helper and create a cursor on it.
# The cells below run their SELECT queries through `curs`.
conn = wetsuite.datacollect.db.connect()
curs = conn.cursor()

In [19]:

# TODO: change to wetsuite.data.load() after creating a dataset
curs.execute('SELECT plaintext  FROM cvdr  LIMIT 10000')

# One find_abbrevs() result per fetched row; each row is a 1-tuple holding the plaintext column.
per_doc_results = [
    wetsuite.phrases.abbreviations.find_abbrevs(plaintext)
    for (plaintext,) in curs
]

### test the cleaning - report only things that were explained the same way in two or more documents
min_doc_occur = 2

abbrev_count = wetsuite.phrases.abbreviations.count_results( per_doc_results )

# abbrev_count is walked as {abbreviation: {words: count}};
# the point of that structure is being able to ignore the rarer explanations.
report = [
    (abbrev, count, ' '.join(words))
    for abbrev, words_count in abbrev_count.items()
    for words, count in words_count.items()
    if count >= min_doc_occur
]

# Highest count first. (Alternative: key=lambda row: (row[0], -row[1]) without reverse,
# to group by abbreviation alphabetically and then by count descending.)
report.sort(key=lambda row: row[1], reverse=True)
for abbrev, count, expl in report:
    print( '%10s   %3d:   %s'%( abbrev, count, expl ) )

       Awb   345:   Algemene wet bestuursrecht
      Wabo    88:   Wet algemene bepalingen omgevingsrecht
       Wmo    60:   Wet maatschappelijke ondersteuning
       Wlz    59:   Wet langdurige zorg
       APV    46:   Algemene Plaatselijke Verordening
       CAK    35:   Centraal Administratie Kantoor
       Wro    35:   Wet ruimtelijke ordening
       Wgs    34:   Wet gemeentelijke schuldhulpverlening
       LRK    33:   Landelijk Register Kinderopvang
       VOG    31:   verklaring omtrent gedrag
       AVG    31:   Algemene Verordening Gegevensbescherming
       APV    28:   Algemene plaatselijke verordening
       CIZ    25:   Centrum Indicatiestelling Zorg
       VNG    25:   van Nederlandse Gemeenten
       ZIN    25:   zorg in natura
       VOG    24:   Verklaring Omtrent Gedrag
       NHG    23:   Nationale Hypotheek Garantie
        Wm    22:   Wet milieubeheer
       Apv    19:   Algemene plaatselijke verordening
      WSNP    18:   Wet Schuldsanering Natuurlijke Personen


In [20]:

# TODO: change to wetsuite.data.load() after creating a dataset
curs.execute('SELECT plaintext  FROM bwb  LIMIT 10000')

# One find_abbrevs() result per fetched row; each row is a 1-tuple holding the plaintext column.
per_doc_results = [
    wetsuite.phrases.abbreviations.find_abbrevs(plaintext)
    for (plaintext,) in curs
]

### test the cleaning - report only things that were explained the same way in two or more documents
min_doc_occur = 2

abbrev_count = wetsuite.phrases.abbreviations.count_results( per_doc_results )

# abbrev_count is walked as {abbreviation: {words: count}};
# the point of that structure is being able to ignore the rarer explanations.
report = [
    (abbrev, count, ' '.join(words))
    for abbrev, words_count in abbrev_count.items()
    for words, count in words_count.items()
    if count >= min_doc_occur
]

# Highest count first. (Alternative: key=lambda row: (row[0], -row[1]) without reverse,
# to group by abbreviation alphabetically and then by count descending.)
report.sort(key=lambda row: row[1], reverse=True)
for abbrev, count, expl in report:
    print( '%10s   %3d:   %s'%( abbrev, count, expl ) )

      ETSI    17:   Europese Telecommunicatie Standaardisatie Instituut
      Cotg    14:   Centraal orgaan tarieven gezondheidszorg
       IMO     8:   Internationale Maritieme Organisatie
       IEC     8:   Internationale Electrotechnische Commissie
       VNG     7:   van Nederlandse Gemeenten
       NNI     7:   Nederlands Normalisatie Instituut
       ONP     5:   Open Network Provision
      BTZR     4:   Besluit tegemoetkoming ziektekosten rijkspersoneel
      AWVN     4:   Algemene Werkgeversvereniging VNO NCW
       NFP     4:   Nederlands Fellowships Programma
       AVR     4:   aanvraag vaststelling rijksvergoeding
       MAL     3:   Mededeling aan Luchtvarenden
      KNIL     3:   Koninklijk Nederlands Indonesisch Leger
       TNO     3:   Toegepast Natuurwetenschappelijk Onderzoek
       TNO     3:   toegepast natuurwetenschappelijk onderzoek
      ICPR     3:   Interdepartementale Coördinatievergadering Personeelsbeleid Rijksdienst
       IEC     3:   Internationale El

In [None]:

import wetsuite.datasets
# Ask wetsuite.datasets for the dataset named 'bwb-textonly'; the return value is not kept.
# NOTE(review): presumably the intended replacement for the direct SQL queries above (see their TODO) - confirm.
wetsuite.datasets.load('bwb-textonly')

# Same for 'cvdr-textonly'. As the last expression in the cell, its return value is what Jupyter would display.
wetsuite.datasets.load('cvdr-textonly')
