<a href="https://colab.research.google.com/github/scarfboy/wetsuite-dev/blob/main/examples/collocations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# For local installs you can install the package once.   In colab you get a disposable environment and will have to start with this install each time. 
!pip --quiet install https://github.com/scarfboy/wetsuite-dev/archive/refs/heads/main.zip

[K     - 217 kB 2.3 MB/s
[K     |████████████████████████████████| 53 kB 151 kB/s 
[K     |████████████████████████████████| 1.1 MB 65.7 MB/s 
[K     |████████████████████████████████| 4.7 MB 49.3 MB/s 
[K     |████████████████████████████████| 6.6 MB 43.9 MB/s 
[K     |████████████████████████████████| 182 kB 81.9 MB/s 
[?25h  Building wheel for wetsuite (setup.py) ... [?25l[?25hdone


In [1]:
import re

import bs4 # BeautifulSoup is a handy way of scraping some text from HTML or XML

import wetsuite.helpers.net
import wetsuite.helpers.strings
import wetsuite.phrases.collocation


In [2]:

# Fetch Burgerlijk Wetboek 7 from the KOOP repositories, in XML form.
#   NOTE: at ~60k words this is still somewhat small for this kind of analysis.
bwb7_xml = wetsuite.helpers.net.download('https://repository.officiele-overheidspublicaties.nl/bwb/BWBR0005290/2008-03-26_0/xml/BWBR0005290_2008-03-26_0.xml')

# Name the parser explicitly: without one, BeautifulSoup picks whichever parser happens to be
# installed (and warns about it), so the result could differ between machines.
#   NOTE(review): 'lxml' is presumably what was being picked implicitly, and presumably
#   installed alongside wetsuite - confirm. Its XML mode ('xml') may be the better fit for this document.
soup = bs4.BeautifulSoup(bwb7_xml, features='lxml')

# One string per <al> element found somewhere inside a <lid>: all of its text fragments, joined by spaces.
#   (string=True is the current name of the older text=True argument)
sents = [
    ' '.join( al.find_all(string=True) )
    for al in soup.select('lid al')
]

In [3]:
# Words that are not allowed at the edges of phrases (optional).
#   This keeps the n-gram count down, while still including phrases that _contain_ these words.
edge_words = 'de een het  dat die   van voor met in op bij om   en of   is   aan  ook   je ik we'.split()
coll = wetsuite.phrases.collocation.Collocation( connectors=edge_words )

# Feed each sentence in as tokens, counting sequences of 2 up to 5 tokens.
print( "Counting" )
for sentence in sents:
    tokens = wetsuite.helpers.strings.simple_tokenize(sentence)
    coll.consume_tokens( tokens, gramlens=(2,3,4,5) )

# Remove n-gram sequences that didn't occur very much, for cleaner results;
# the counts are shown before and after to make the effect visible.
print( "Cleanup")
counts_before = coll.counts()
print( '    before:', counts_before )
#coll.cleanup_unigrams(mincount=3)
coll.cleanup_grams(mincount=7)
counts_after = coll.counts()
print( '     after:', counts_after )

# Score what is left, and print the last `top` entries of the result as a table.
top = 250
print( "Scoring, showing top %d\n"%top)
scores = coll.score_grams( )

header_format = ' %9s   %55s    %12s %20s'
row_format    = ' %9.3f   %55s    %12s %20s=%d'
print( header_format%('score', 'n-gram', 'n-gram count', 'individual counts') )
for gram, gram_score, gram_count, word_counts in scores[-top:]:
    phrase  = ' '.join(gram)
    factors = '*'.join( map(str, word_counts) )   # e.g. 628*47, the count of each word in the phrase
    total   = wetsuite.phrases.collocation.product(word_counts)
    print( row_format%(gram_score, phrase, gram_count, factors, total) )

Counting
Cleanup
    before: {'from_tokens': 63861, 'uni': 4028, 'grams': 74489}
     after: {'from_tokens': 63861, 'uni': 4028, 'grams': 736}
Scoring, showing top 250

     score                                                    n-gram    n-gram count    individual counts
    11.994                                                     lid 5              17               628*47=29516
    12.122                                   Wetboek van Burgerlijke               9             9*3537*9=286497
    12.154                                                   319 lid               9               13*628=8164
    12.180                                              tijdstip zal               7                77*64=4928
    12.187                                             De grondkamer              19               576*63=36288
    12.288                                         worden uitgevoerd               9               425*19=8075
    12.491                                             

In [13]:
import random
import wetsuite.helpers.koop_parse
import wetsuite.helpers.etree
import wetsuite.datasets
# Load the dataset; its .data is used below like a mapping (presumably from document URL to XML bytes - confirm).
bwb_xml = wetsuite.datasets.load('bwb-mostrecent-xml')

SAMPLE_SIZE = 5000   # how many documents to draw
SAMPLE_SEED = 0      # fixed, so that a re-run draws the same documents from the same list of keys

# random.sample() needs a real sequence: handing it the result of a dict's .keys()
# raises a TypeError on Python 3.11+ (and was deprecated from 3.9), so make a list of it first.
bwb_urls = list( bwb_xml.data.keys() )
# Never ask for more items than there are, which would raise a ValueError.
sample_size  = min( SAMPLE_SIZE, len(bwb_urls) )
sampled_urls = random.Random(SAMPLE_SEED).sample( bwb_urls, sample_size )

# Parse each sampled document and keep the text that bwb_toestand_text() extracts from it.
bwb_text = []
for bwb_url in sampled_urls:
    bytestring = bwb_xml.data.get( bwb_url )
    tree = wetsuite.helpers.etree.fromstring( bytestring )
    text = wetsuite.helpers.koop_parse.bwb_toestand_text(tree)
    bwb_text.append( text )
print('DONE fetching text from %d items'%len(bwb_text))

  if artikel.find('lid'):


DONE fetching text from 5000 items


In [None]:
import tqdm
import wetsuite.phrases.collocation
import wetsuite.helpers.spacy
import wetsuite.helpers.strings   # for simple_tokenize(), used below

# spaCy refuses texts longer than its maximum length unless you raise that limit,
# because the memory it needs grows with the length of the text.
#   NOTE(review): 1000000 characters is spaCy's documented default; presumably the helper used below does not change it - confirm.
SPACY_MAX_CHARS = 1000000

coll = wetsuite.phrases.collocation.Collocation(
    # words to not allow at edges of phrases (optional) - this lets us keep the n-gram count down, while still including phrases that _contain_ them
    connectors='de een het  dat die   van voor met in op bij om   en of   is   aan  ook   je ik we'.split() 
)

# Split each document into sentences, and count sequences of 2 up to 5 tokens within each sentence.
for text in tqdm.tqdm(bwb_text):
    if len(text) >= SPACY_MAX_CHARS: # too long for spaCy as configured; skip rather than fail
        continue
    sents = wetsuite.helpers.spacy.sentence_split( text, as_plain_sents=True )
    for sent in sents:
        # simple_tokenize has to be referenced via its module: it is never imported as a bare name
        coll.consume_tokens( wetsuite.helpers.strings.simple_tokenize(sent), gramlens=(2,3,4,5) )

In [None]:
# Remove n-gram sequences that didn't occur very much, for cleaner results;
# the counts are shown before and after to make the effect visible.
print( "Cleanup")
counts_before = coll.counts()
print( '    before:', counts_before )
#coll.cleanup_unigrams(mincount=3)
coll.cleanup_grams(mincount=7)
counts_after = coll.counts()
print( '     after:', counts_after )

In [23]:
# Score the remaining n-grams, and print the last `top` entries of the result as a table.
top = 2000
print( "Scoring, showing top %d\n"%top)
scores = coll.score_grams( )

header_format = ' %9s   %55s    %12s %20s'
row_format    = ' %9.3f   %55s    %12s %20s=%d'
print( header_format%('score', 'n-gram', 'n-gram count', 'individual counts') )
for gram, gram_score, gram_count, word_counts in scores[-top:]:
    phrase  = ' '.join(gram)
    factors = '*'.join( map(str, word_counts) )   # e.g. 17*167, the count of each word in the phrase
    total   = wetsuite.phrases.collocation.product(word_counts)
    print( row_format%(gram_score, phrase, gram_count, factors, total) )

Scoring, showing top 2000

     score                                                    n-gram    n-gram count    individual counts
    97.085                                   deelnemend rechtsgebied              15               17*167=2839
    97.091                            Bureau Rijvaardigheidsbewijzen              47               593*47=27871
    97.452                                         ijkmerk voorziene              11               13*117=1521
    97.497                              gesecuritiseerde vorderingen              47               61*455=27755
    97.566                                            Opmerking Bijv              12               113*16=1808
    97.588                                       Inland AIS apparaat               7             8*23*117=21528
    97.645                                             getrouw beeld              22               33*184=6072
    97.687                                   inzake rijksbelastingen             249   