<a href="https://colab.research.google.com/github/liao961120/dcctk/blob/main/docs_source/nb/ngrams_collocation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!gdown https://github.com/liao961120/dcctk/raw/main/test/data.zip
!unzip -q data.zip
!pip install -q https://github.com/liao961120/CompoTree/tarball/main
!pip install -qU dcctk

In [7]:
from dcctk.corpusReader import PlainTextReader
from dcctk.concordancer import Concordancer

# Read the plain-text corpus under "data/" and wrap it in a Concordancer,
# which is the entry point for every query in the rest of this notebook.
corpus = PlainTextReader("data/").corpus
c = Concordancer(corpus)

Indexing corpus for text retrival...


  0%|          | 0/5 [00:00<?, ?it/s]

Indexing corpus for concordance search...


  0%|          | 0/5 [00:00<?, ?it/s]

## Search by phonetic properties in 廣韻

In [None]:
# Show the attribute names registered under 'CharPhonetic' for use in CQL queries.
c.cql_attrs["CharPhonetic"]

{'moe': ['phon', 'tone', 'tp', 'sys="moe"'],
 '廣韻': ['攝', '聲調', '韻母', '聲母', '開合', '等第', '反切', '拼音', 'IPA', 'sys="廣韻"']}

In [None]:
# Match a single token whose 廣韻 attributes are 韻母="東" and 聲調="平".
cql = '''
[韻母="東" & 聲調="平" & sys="廣韻"]
'''
matches = c.cql_search(cql)
# Print only the first five matches.
for idx, concord in enumerate(matches):
    if idx == 5:
        break
    print(concord)

<Concord 朗令終，顯{融}昭明，命姓>
<Concord 東方，得祝{融}而辯於南方>
<Concord 為土師。祝{融}辨乎南方，>
<Concord 自憐。使祝{融}兮先行，令>
<Concord 帝，其神祝{融}。其蟲羽。>


## Ngram Frequency

In [4]:
# Ten most frequent 2-grams in the subcorpus at index 0.
bigram_counts = c.freq_distr_ngrams(n=2, subcorp_idx=0)
bigram_counts.most_common(10)

Counting 2-grams...


  0%|          | 0/5 [00:00<?, ?it/s]

[('而不', 3913),
 ('天下', 3661),
 ('不可', 2985),
 ('之所', 2723),
 ('子曰', 2581),
 ('人之', 2317),
 ('以為', 2231),
 ('所以', 2023),
 ('不能', 1934),
 ('可以', 1667)]

In [5]:
# Ten most frequent 3-grams in the subcorpus at index 0.
trigram_counts = c.freq_distr_ngrams(n=3, subcorp_idx=0)
trigram_counts.most_common(10)

Counting 3-grams...


  0%|          | 0/5 [00:00<?, ?it/s]

[('天下之', 946),
 ('歧伯曰', 766),
 ('之所以', 605),
 ('不可以', 580),
 ('子對曰', 443),
 ('黃帝曰', 403),
 ('此之謂', 350),
 ('子墨子', 343),
 ('孔子曰', 302),
 ('不可不', 298)]

## Collocations

### Bigram Association

In [4]:
# Association measures for every bigram in subcorpus 0, sorted by DeltaP21.
bi_asso = c.bigram_associations(subcorp_idx=0, sort_by="DeltaP21")

# Keep only bigrams seen more than MIN_RAW_COUNT times, then show the first ten.
MIN_RAW_COUNT = 100
frequent_bigrams = [
    pair for pair in bi_asso
    if pair[1].get('RawCount', 0) > MIN_RAW_COUNT
]
frequent_bigrams[:10]

Counting 2-grams...


  0%|          | 0/5 [00:00<?, ?it/s]

[('歧伯',
  {'DeltaP12': 0.515988379165576,
   'DeltaP21': 0.9870480872578454,
   'Dice': 0.6778754298815437,
   'FisherExact': 0.0,
   'Gsq': 12041.994806908666,
   'MI': 9.41167469710956,
   'RawCount': 887,
   'Xsq': 603675.038957376}),
 ('柰何',
  {'DeltaP12': 0.056661809249506735,
   'DeltaP21': 0.9864357417273106,
   'Dice': 0.10718562874251497,
   'FisherExact': 0.0,
   'Gsq': 2110.3645096843,
   'MI': 8.535527535461881,
   'RawCount': 179,
   'Xsq': 66249.74702510444}),
 ('嗚呼',
  {'DeltaP12': 0.5684168332436602,
   'DeltaP21': 0.9699560936315373,
   'Dice': 0.7168141592920354,
   'FisherExact': 0.0,
   'Gsq': 2772.224707107538,
   'MI': 11.978137468386173,
   'RawCount': 162,
   'Xsq': 653497.5945430023}),
 ('奈何',
  {'DeltaP12': 0.12217026433374524,
   'DeltaP21': 0.9391230952265434,
   'Dice': 0.21630708882039787,
   'FisherExact': 0.0,
   'Gsq': 4442.425062898784,
   'MI': 8.464534583043465,
   'RawCount': 386,
   'Xsq': 135991.8936699005}),
 ('阼階',
  {'DeltaP12': 0.2686904721001

### Node-Collocate Association

In [8]:
# Node: the two-token sequence "孔" followed by "子".
cql = """
"孔" "子"
"""
# Collocates within three tokens on either side of the node in subcorpus 0,
# sorted by the Xsq measure.
collo = c.collocates(
    cql,
    left=3,
    right=3,
    subcorp_idx=0,
    sort_by="Xsq",
    alpha=0,
)
# Show the first five collocates.
collo[:5]

[('曰',
  {'DeltaP12': 0.5508984202190699,
   'DeltaP21': 0.017436408382199422,
   'Dice': 0.03412938870076635,
   'FisherExact': 0.0,
   'Gsq': 2486.7011528898893,
   'MI': 5.5855958675486095,
   'RawCount': 383,
   'Xsq': 17849.561805227277}),
 ('愀',
  {'DeltaP12': 0.008807342620906026,
   'DeltaP21': 0.4996367483650986,
   'Dice': 0.017316017316017316,
   'FisherExact': 2.1855507492354285e-18,
   'Gsq': 78.36068664097104,
   'MI': 10.41398510902232,
   'RawCount': 6,
   'Xsq': 8177.080337219988}),
 ('孔',
  {'DeltaP12': 0.06707828763041274,
   'DeltaP21': 0.04976704015525816,
   'Dice': 0.05753595997498437,
   'FisherExact': 1.5730409497803884e-81,
   'Gsq': 366.5766447587887,
   'MI': 7.095196721665525,
   'RawCount': 46,
   'Xsq': 6203.299925288661}),
 ('矙',
  {'DeltaP12': 0.002936857562408223,
   'DeltaP21': 0.999634597729232,
   'Dice': 0.005856515373352855,
   'FisherExact': 1.341091030070595e-07,
   'Gsq': 31.652163710326512,
   'MI': 11.41398510902232,
   'RawCount': 2,
   'Xsq