<a href="https://colab.research.google.com/github/liao961120/hctk/blob/main/docs_source/nb/stats.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!gdown https://github.com/liao961120/hctk/raw/main/test/data.zip
!unzip -q data.zip
!pip install -qU hctk

Downloading...
From: https://github.com/liao961120/hctk/raw/main/test/data.zip
To: /content/data.zip
100% 11.5M/11.5M [00:00<00:00, 70.2MB/s]
[K     |████████████████████████████████| 467 kB 7.1 MB/s 
[K     |████████████████████████████████| 1.3 MB 36.0 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
[K     |████████████████████████████████| 596 kB 32.8 MB/s 
[K     |████████████████████████████████| 256 kB 72.2 MB/s 
[?25h  Building wheel for gdown (PEP 517) ... [?25l[?25hdone


## 1 Dispersion

In [2]:
from pprint import pprint
from hctk import PlainTextReader, Dispersion

dp = Dispersion(PlainTextReader("data").corpus)

Indexing corpus for text retrival...


  0%|          | 0/5 [00:00<?, ?it/s]

Indexing corpus for concordance search...


  0%|          | 0/5 [00:00<?, ?it/s]

### 1.1 Dispersion Measures of Characters

In [3]:
for ch in '之也水火':
    print(ch)
    pprint(dp.char_dispersion(char=ch))

之
{'DP': 0.20221115044151497,
 'DPnorm': 0.2022113683260698,
 'JuillandD': 0.9852586807096999,
 'KLdivergence': 0.19438584134276882,
 'Range': 1845,
 'RosengrenS': 0.9263264076180923}
也
{'DP': 0.35839650852771193,
 'DPnorm': 0.35839689470356595,
 'JuillandD': 0.9762985043690171,
 'KLdivergence': 0.5790626716815067,
 'Range': 1657,
 'RosengrenS': 0.8017462266655788}
水
{'DP': 0.4248753877868117,
 'DPnorm': 0.4248758455943289,
 'JuillandD': 0.9419817581882376,
 'KLdivergence': 1.2093306326342055,
 'Range': 1032,
 'RosengrenS': 0.6599676413503003}
火
{'DP': 0.5258856218700372,
 'DPnorm': 0.5258861885171008,
 'JuillandD': 0.9082169168517871,
 'KLdivergence': 1.5125799798374882,
 'Range': 630,
 'RosengrenS': 0.5227475458444708}


### 1.2 Dispersion Measures of Complex Forms (defined by CQL)

In [4]:
import pandas as pd
from hctk import Concordancer

c = Concordancer(PlainTextReader("data").corpus)
cql = """
[compo="氵" & idc="horz2" & pos="0"] [compo="氵" & idc="horz2" & pos="0"]
""".strip()
results = list(c.cql_search(cql, left=5, right=5))

print('Num of results:', len(results))
for r in results[:5]: print(r)

Indexing corpus for text retrival...


  0%|          | 0/5 [00:00<?, ?it/s]

Indexing corpus for concordance search...


  0%|          | 0/5 [00:00<?, ?it/s]

Num of results: 9782
<Concord 其中軍銜枚{潛涉}，不鼓不譟>
<Concord 也？《春秋{潛潭}巴》曰：「>
<Concord 舟之魚不居{潛澤}，度量之士>
<Concord 色親也。』{潛潭}巴曰：『虹>
<Concord 之厚德也。{潛潭}巴曰：『有>


In [5]:
# Compute separate dispersion measures for each subcorpus (time-sliced)
df = []
for i in range(dp.num_of_subcorp):
    stats, data = dp.pattern_dispersion(data=results, subcorp_idx=i, return_raw=True)
    stats['time'] = i
    stats['freq'] = data['f']
    stats['range (%)'] = stats['Range'] / data['n']
    stats['num_of_texts'] = data['n']
    stats['corpus_size'] = data['corpus_size']
    df.append(stats)

pd.DataFrame(df)

Unnamed: 0,Range,DP,DPnorm,KLdivergence,JuillandD,RosengrenS,time,freq,range (%),num_of_texts,corpus_size
0,346,0.464641,0.464645,1.18593,0.903298,0.60427,0,2116,0.472678,732,1858228
1,621,0.276959,0.27696,0.533721,0.946652,0.793915,1,4483,0.591992,1049,3938310
2,130,0.18177,0.181792,0.167024,0.927338,0.938771,2,2495,0.833333,156,2097273
3,2,0.127389,0.235377,0.04838,0.737937,0.983011,3,688,1.0,2,458738
4,0,0.0,0.0,0.0,,,4,0,0.0,5,50


## 2 Hanzi Component

In [6]:
from hctk import PlainTextReader
from hctk.compoAnalysis import CompoAnalysis

reader = PlainTextReader("data/", auto_load=False)
c2 = CompoAnalysis(reader)

### 2.1 Frequency Distribution

In [7]:
# Hanzi
c2.freq_distr(tp="chr", subcorp_idx=0).most_common(10)

[('，', 178802),
 ('。', 83819),
 ('之', 64665),
 ('不', 37264),
 ('也', 32634),
 ('而', 32035),
 ('以', 27556),
 ('其', 25931),
 ('者', 23304),
 ('曰', 21763)]

In [8]:
# Shape of hanzi (IDC)
c2.freq_distr(tp="idc", subcorp_idx=0)

Counter({'': 256312,
         'noChrData': 351898,
         '⿰': 438079,
         '⿱': 561453,
         '⿲': 6242,
         '⿳': 14381,
         '⿴': 14409,
         '⿵': 25234,
         '⿶': 7275,
         '⿷': 1641,
         '⿸': 91226,
         '⿹': 25384,
         '⿺': 26847,
         '⿻': 37847})

In [9]:
# Radical of hanzi
c2.freq_distr(tp="rad").most_common(10)

[('noChrData', 1517367),
 ('人', 422649),
 ('一', 281707),
 ('丿', 252723),
 ('口', 249345),
 ('火', 165875),
 ('言', 157936),
 ('水', 155632),
 ('八', 151539),
 ('心', 145208)]

In [10]:
# Hanzi with a certain radical
c2.freq_distr(tp=None, radical="广").most_common(10)

[('度', 4757),
 ('廣', 4050),
 ('廟', 3067),
 ('府', 3064),
 ('廢', 2542),
 ('庶', 2281),
 ('廉', 1594),
 ('康', 1570),
 ('序', 1213),
 ('庭', 1155)]

In [11]:
# Hanzi with a certain component
c2.freq_distr(tp=None, compo="水", idc="vert2", pos=-1)

Counter({'氶': 1,
         '汞': 15,
         '沓': 89,
         '泉': 1349,
         '泵': 3,
         '淼': 4,
         '滎': 344,
         '漀': 1,
         '漐': 9,
         '漿': 153,
         '澩': 3,
         '灓': 5})

### 2.2 Productivity

- Realized Productivity: $V(C, N)$
- Expanding Productivity: $\frac{V(1, C, N)}{V(1, N)}$
- Potential Productivity: $\frac{V(1, C, N)}{N(C)}$

In [12]:
# Productivity of a radical
c2.productivity(radical="广", subcorp_idx=0)

{'N': 1858228,
 'NC': 5897,
 'V1': 2083,
 'V1C': 9,
 'productivity': {'expanding': 0.0043206913106096975,
  'potential': 0.001526199762591148,
  'realized': 62}}

In [13]:
# Productivity of a component
c2.productivity(compo="虫", idc="horz2", pos=0, subcorp_idx=0)

{'N': 1858228,
 'NC': 1027,
 'V1': 2083,
 'V1C': 72,
 'productivity': {'expanding': 0.03456553048487758,
  'potential': 0.07010710808179163,
  'realized': 178}}

In [14]:
# Productivity of Hanzi shapes (IDCs)
import pandas as pd
from CompoTree import IDC

df = []
for idc in IDC:   
    p = c2.productivity(idc=idc.name, subcorp_idx=0)
    df.append({
        'name': idc.name, 
        'shape': idc.value, 
        **p['productivity'],
        'V1C': p['V1C'],
        'V1': p['V1'],
        'NC': p['NC'],
        'N': p['N'],
    })

df = pd.DataFrame(df)
df

Unnamed: 0,name,shape,realized,expanding,potential,V1C,V1,NC,N
0,horz2,⿰,5580,0.709073,0.003372,1477,2083,438079,1858228
1,vert2,⿱,2110,0.223236,0.000828,465,2083,561453,1858228
2,horz3,⿲,37,0.0024,0.000801,5,2083,6242,1858228
3,vert3,⿳,85,0.006721,0.000974,14,2083,14381,1858228
4,encl,⿴,27,0.00144,0.000208,3,2083,14409,1858228
5,surN,⿵,87,0.005761,0.000476,12,2083,25234,1858228
6,surU,⿶,6,0.0,0.0,0,2083,7275,1858228
7,curC,⿷,20,0.00192,0.002438,4,2083,1641,1858228
8,surT,⿸,342,0.026884,0.000614,56,2083,91226,1858228
9,sur7,⿹,51,0.00288,0.000236,6,2083,25384,1858228


## 3 Ngram Frequency

In [15]:
from hctk import PlainTextReader, Concordancer

c = Concordancer(PlainTextReader("data/").corpus)

Indexing corpus for text retrival...


  0%|          | 0/5 [00:00<?, ?it/s]

Indexing corpus for concordance search...


  0%|          | 0/5 [00:00<?, ?it/s]

In [16]:
# Bigram frequency
c.freq_distr_ngrams(n=2, subcorp_idx=0).most_common(10)

Counting 2-grams...


  0%|          | 0/5 [00:02<?, ?it/s]

[('而不', 3913),
 ('天下', 3661),
 ('不可', 2985),
 ('之所', 2723),
 ('子曰', 2581),
 ('人之', 2317),
 ('以為', 2231),
 ('所以', 2023),
 ('不能', 1934),
 ('可以', 1667)]

In [17]:
# Trigram frequency
c.freq_distr_ngrams(n=3, subcorp_idx=0).most_common(10)

Counting 3-grams...


  0%|          | 0/5 [00:00<?, ?it/s]

[('天下之', 946),
 ('歧伯曰', 766),
 ('之所以', 605),
 ('不可以', 580),
 ('子對曰', 443),
 ('黃帝曰', 403),
 ('此之謂', 350),
 ('子墨子', 343),
 ('孔子曰', 302),
 ('不可不', 298)]

## 4 Collocation

### 4.1 Bigram Association

In [18]:
bi_asso = c.bigram_associations(subcorp_idx=0, sort_by="DeltaP21")
[x for x in bi_asso if x[1].get('RawCount', 0) > 100][:3]

[('歧伯',
  {'DeltaP12': 0.515988379165576,
   'DeltaP21': 0.9870480872578454,
   'Dice': 0.6778754298815437,
   'FisherExact': 0.0,
   'Gsq': 12041.994806908666,
   'MI': 9.41167469710956,
   'RawCount': 887,
   'Xsq': 603675.038957376}),
 ('柰何',
  {'DeltaP12': 0.056661809249506735,
   'DeltaP21': 0.9864357417273106,
   'Dice': 0.10718562874251497,
   'FisherExact': 0.0,
   'Gsq': 2110.3645096843,
   'MI': 8.535527535461881,
   'RawCount': 179,
   'Xsq': 66249.74702510444}),
 ('嗚呼',
  {'DeltaP12': 0.5684168332436602,
   'DeltaP21': 0.9699560936315373,
   'Dice': 0.7168141592920354,
   'FisherExact': 0.0,
   'Gsq': 2772.224707107538,
   'MI': 11.978137468386173,
   'RawCount': 162,
   'Xsq': 653497.5945430023})]

### 4.2 Node-Collocate Association

In [19]:
cql = """
"孔" "子"
"""
collo = c.collocates(cql, left=3, right=3, subcorp_idx=0, sort_by="Xsq", alpha=0)
collo[:5]

[('曰',
  {'DeltaP12': 0.5508984202190699,
   'DeltaP21': 0.017436408382199422,
   'Dice': 0.03412938870076635,
   'FisherExact': 0.0,
   'Gsq': 2486.7011528898893,
   'MI': 5.5855958675486095,
   'RawCount': 383,
   'Xsq': 17849.561805227277}),
 ('愀',
  {'DeltaP12': 0.008807342620906026,
   'DeltaP21': 0.4996367483650986,
   'Dice': 0.017316017316017316,
   'FisherExact': 2.1855507492354285e-18,
   'Gsq': 78.36068664097104,
   'MI': 10.41398510902232,
   'RawCount': 6,
   'Xsq': 8177.080337219988}),
 ('孔',
  {'DeltaP12': 0.06707828763041274,
   'DeltaP21': 0.04976704015525816,
   'Dice': 0.05753595997498437,
   'FisherExact': 1.5730409497803884e-81,
   'Gsq': 366.5766447587887,
   'MI': 7.095196721665525,
   'RawCount': 46,
   'Xsq': 6203.299925288661}),
 ('矙',
  {'DeltaP12': 0.002936857562408223,
   'DeltaP21': 0.999634597729232,
   'Dice': 0.005856515373352855,
   'FisherExact': 1.341091030070595e-07,
   'Gsq': 31.652163710326512,
   'MI': 11.41398510902232,
   'RawCount': 2,
   'Xsq