<a href="https://colab.research.google.com/github/scarfboy/wetsuite-dev/blob/main/examples/collocations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# For local installs you can install the package once.   In colab you get a disposable environment and will have to start with this install each time. 
!pip --quiet install https://github.com/scarfboy/wetsuite-dev/archive/refs/heads/main.zip

[K     - 217 kB 2.3 MB/s
[K     |████████████████████████████████| 53 kB 151 kB/s 
[K     |████████████████████████████████| 1.1 MB 65.7 MB/s 
[K     |████████████████████████████████| 4.7 MB 49.3 MB/s 
[K     |████████████████████████████████| 6.6 MB 43.9 MB/s 
[K     |████████████████████████████████| 182 kB 81.9 MB/s 
[?25h  Building wheel for wetsuite (setup.py) ... [?25l[?25hdone


In [3]:
import re

# Characters that separate words: whitespace, common ASCII punctuation, and a set of
# Unicode quotation-mark characters.
# These are raw string literals so that Python does not try to interpret regex escapes
# such as \s itself (that is an invalid string escape, which newer Python versions warn about);
# the re module interprets \s, \xNN, \uNNNN and \UNNNNNNNN on its own.
_TOKEN_SEPARATOR_RE = re.compile(
    r'[\s!@#$%^&*()"\':;/.,?'
    r'\xab\xbb'
    r'\u2018\u2019\u201a\u201b\u201c\u201d\u201e\u201f\u2039\u203a'
    r'\u2358\u275b\u275c\u275d\u275e\u275f\u2760\u276e\u276f'
    r'\u2e42\u301d\u301e\u301f\uff02'
    r'\U0001f676\U0001f677\U0001f678'
    r'-]+'  # the hyphen comes last in the class so that it is a literal, not a range
)


def simple_tokenize(text):
    """Split a string into words.

    Real NLP tokenizers are often more robust, but for a quick test this lets us
    avoid a big dependency.

    The text is split on runs of one or more separator characters (whitespace,
    punctuation, quote characters, hyphen); the separators themselves are dropped.

    Parameters:
        text: the string to split.

    Returns:
        A list of the non-empty pieces between separators, in their original order.
        An empty or separator-only string gives an empty list.
    """
    # re.split() yields empty strings when the text starts or ends with a separator; drop those
    return [piece for piece in _TOKEN_SEPARATOR_RE.split(text) if piece]

In [4]:
import bs4 # BeautifulSoup is a handy way of scraping some text from HTML or XML
import wetsuite.helpers.net

# Fetch Burgerlijk Wetboek 7 from the KOOP repositories, in XML form.   At ~60k words this is somewhat small for this kind of analysis.
bwb7_xml = wetsuite.helpers.net.download('https://repository.officiele-overheidspublicaties.nl/bwb/BWBR0005290/2008-03-26_0/xml/BWBR0005290_2008-03-26_0.xml')
soup = bs4.BeautifulSoup(bwb7_xml, features='xml')

# Collect one string per <al> element that sits somewhere under a <lid> element:
# all text nodes inside that element, joined with spaces.
# string=True selects every text node; it is the current name for the deprecated text=True argument.
sents = [
    ' '.join( al.find_all(string=True) )
    for al in soup.select('lid al')
]

In [10]:
import wetsuite.phrases.collocation
# Words to not allow at the edges of phrases (optional) - this lets us keep the n-gram count down,
# while still allowing phrases that contain them.
connector_words = 'de een het  dat die   van voor met in op bij om   en of   is   aan  ook   je ik we'.split()
coll = wetsuite.phrases.collocation.Collocation(connectors=connector_words)

print( "Counting")
gram_lengths = (2,3,4,5)  # count 2-, 3-, 4- and 5-grams
for sentence in sents:
    sentence_tokens = simple_tokenize(sentence)
    coll.consume_tokens( sentence_tokens, gramlens=gram_lengths )

print( "Cleanup")
print( '    before:', coll.counts() )
# Remove n-gram sequences that didn't occur very much, for cleaner results.
# (Unigrams could be pruned as well, with:  coll.cleanup_unigrams(mincount=3)  )
coll.cleanup_grams(mincount=7)
print( '     after:', coll.counts() )

top = 250
print( "Scoring, showing top %d\n"%top)
scores = coll.score_grams( )

header_format = ' %9s   %55s    %12s %20s'
row_format    = ' %9.3f   %55s    %12s %20s=%d'
print( header_format%('score', 'n-gram', 'n-gram count', 'individual counts') )
# Only the last `top` entries are shown.
# NOTE(review): the printed output looks sorted by ascending score, so these would be the highest-scoring n-grams - confirm against score_grams()
for gram, gram_score, gram_count, unigram_counts in scores[-top:]:
    phrase         = ' '.join(gram)
    counts_factors = '*'.join(str(n) for n in unigram_counts)
    counts_product = wetsuite.phrases.collocation.product(unigram_counts)
    print( row_format%(gram_score, phrase,   gram_count, counts_factors, counts_product) )


Counting
Cleanup
    before: {'from_tokens': 63861, 'uni': 4028, 'grams': 74489}
     after: {'from_tokens': 63861, 'uni': 4028, 'grams': 736}
Scoring, showing top 250

     score                                                    n-gram    n-gram count    individual counts
    11.994                                                     lid 5              17               628*47=29516
    12.122                                   Wetboek van Burgerlijke               9             9*3537*9=286497
    12.154                                                   319 lid               9               13*628=8164
    12.180                                              tijdstip zal               7                77*64=4928
    12.187                                             De grondkamer              19               576*63=36288
    12.288                                         worden uitgevoerd               9               425*19=8075
    12.491                                             