<a href="https://colab.research.google.com/github/scarfboy/wetsuite-dev/blob/main/examples/Collocation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip --quiet install https://github.com/scarfboy/wetsuite-dev/archive/refs/heads/main.zip

In [None]:
import re

# Separator pattern: one or more characters that are whitespace, common ASCII punctuation,
# or one of a range of Unicode quotation-mark characters.
# Written as a raw string so that \s and the \x / \u / \U escapes are handed to the re module as-is,
# instead of depending on Python passing the unrecognised string escape '\s' through unchanged
# (which triggers an invalid-escape-sequence warning on current Python versions).
# The trailing '-' sits at the end of the character class, so it is a literal hyphen, not a range.
_TOKEN_SEPARATORS = re.compile(r'[\s!@#$%^&*()"\':;/.,?\xab\xbb\u2018\u2019\u201a\u201b\u201c\u201d\u201e\u201f\u2039\u203a\u2358\u275b\u275c\u275d\u275e\u275f\u2760\u276e\u276f\u2e42\u301d\u301e\u301f\uff02\U0001f676\U0001f677\U0001f678-]+')


def simple_tokenize(text):  # spacy's tokenizer may well be more robust, but for a quick test we can avoid that big dependency
    """ Split a string into words.

        Splits on runs of whitespace, punctuation and quotation marks (see _TOKEN_SEPARATORS),
        and drops the empty strings that splitting leaves at the start/end of the text.

        :param text: the string to split
        :return: a list of the non-empty pieces, in their original order (an empty list if there are none)
    """
    return [piece for piece in _TOKEN_SEPARATORS.split(text) if piece]

In [None]:
import bs4 # BeautifulSoup is a handy way of scraping some text from HTML or XML
import wetsuite.helpers.net

# Fetch Burgerlijk Wetboek 7 from the KOOP repositories, in XML form.   At ~60k words this is somewhat small for this kind of analysis.
bwb7_xml = wetsuite.helpers.net.download('https://repository.officiele-overheidspublicaties.nl/bwb/BWBR0005290/2008-03-26_0/xml/BWBR0005290_2008-03-26_0.xml')
soup = bs4.BeautifulSoup(bwb7_xml, features='xml')

# Collect one string per <al> element that sits somewhere under a <lid> element
# (presumably these are the paragraphs of running text - verify against the BWB schema).
sents = []
for al in soup.select('lid al'):
    # string=True matches every text node under the element; it is the current name of the
    # argument that older BeautifulSoup versions called text= (now a deprecated alias).
    sents.append(  ' '.join( al.find_all(string=True) )  )

In [None]:
import wetsuite.phrases.collocation
# Optional: words we do not allow at the edges of phrases,
# in case you want to focus more on contentful phrases.
connector_words = 'de een het  dat die   van voor met in op bij om   en of   is   aan  ook   je ik we'.split()
coll = wetsuite.phrases.collocation.Collocation( connectors=connector_words )

# Feed every extracted text into the counter as a token list, counting 2- up to 5-grams.
print( "Counting")
for sent_text in sents:
    sent_tokens = simple_tokenize(sent_text)
    coll.consume_text( sent_tokens, gramlens=(2,3,4,5) )

# Drop n-grams seen fewer than 5 times; the counts are printed before and after to show the effect.
# (Unigram cleanup is available too, e.g. coll.cleanup_unigrams(mincount=3), but is not applied here.)
print( "Cleanup")
print( '    before:', coll.counts() )
coll.cleanup_grams(mincount=5)
print( '     after:', coll.counts() )

top = 250
print( "Scoring, showing top %d"%top)
scores = coll.score_grams( )

# Only the last `top` entries are shown - presumably score_grams() returns its results
# sorted by ascending score, so that the tail holds the strongest collocations (TODO confirm).
row_format = ' %9.3f   %50s    %20s %20s=%d'
for gram, gram_score, gram_count, unigram_counts in scores[-top:]:
    phrase = ' '.join(gram)
    unigram_product = wetsuite.phrases.collocation.product(unigram_counts)
    print( row_format%(gram_score, phrase, gram_count, unigram_counts, unigram_product) )


Counting
Cleanup
    before: {'from_tokens': 63861, 'uni': 4028, 'grams': 74489}
     after: {'from_tokens': 63861, 'uni': 4028, 'grams': 1472}
Scoring, showing top 250
    24.868                                          binnen drie                       9            [105, 38]=3990
    25.146                                           dag waarop                      12            [61, 115]=7015
    25.578                                        andere partij                      13            [142, 57]=8094
    25.779                                           Wet arbeid                      10             [54, 88]=4752
    26.042                                       rechter wenden                       5             [168, 7]=1176
    26.143                                        eigen gebruik                       7             [28, 82]=2296
    26.166                            ontbinding overeenkomstig                       7             [37, 62]=2294
    26.469                       