In [1]:
import json
from cilin import Cilin
from CompoTree import Radicals

C = Cilin(trad=True)
radicals = Radicals.load()

with open('radical_semantic_tag.json', encoding='utf-8') as f:
    rad_sem = json.load(f)

In [26]:
from itertools import product

def sem_feats(word):
    rads = [ radicals.query(c)[0] for c in word ]
    return [ rad_sem.get(r, ["NULL"]) for r in rads ]

def feat_comb(feats):
    if len(feats) > 2: 
        raise Exception('Accepts 2 features only')
    if len(feats) == 1: return feats[0]
    f1, f2 = feats
    return [f"{x}_{y}" for x, y in product(f1, f2)]

feat_comb(sem_feats('屍體'))

['住宿_人體內部', '性質_人體內部']

In [27]:
documents = { k:list(v) for k, v in C.category_split(level=5).items() }
# documents

In [52]:
from itertools import chain

sem_unigrams = set()
sem_bigrams = set()
for word in chain.from_iterable(documents.values()):
    if len(word) == 2:
        sem = feat_comb(sem_feats(word))
        sem_bigrams.update(set(sem))
    for ch in word:
        sem = sem_feats(ch)[0][0]
        sem_unigrams.add(sem)

In [56]:
import numpy as np
from scipy.sparse import csr_matrix

id2doc = list(documents.keys())
doc2id = {doc:i for i, doc in enumerate(id2doc)}

id2unigram = list(sem_unigrams)
unigram2id = {v:i for i, v in enumerate(id2unigram)}

id2bigram = list(sem_bigrams)
bigram2id = {v:i for i, v in enumerate(id2bigram)}

# csr_bigram

# Unigram document-term matrix
for doc_idx, doc in enumerate(id2doc):
    terms = documents[doc]
csr_unigram = csr_matrix((data, (row, col)), shape=(len(id2doc), len(id2unigram)))

{'顏色_生命性質': 0,
 '廚房器物_無生命': 1,
 '家畜_城鄉': 2,
 '顏色_城鄉': 3,
 '穿著器物_人倫關係': 4,
 '家畜_人體四肢': 5,
 '無生命_人': 6,
 '人_人倫關係': 7,
 '人體頭部_家畜': 8,
 '住宿_住宿': 9,
 '生活器物_人體精神': 10,
 '生活器物_人體頭部': 11,
 'NULL_城鄉': 12,
 'NULL_人體性質': 13,
 '人_城鄉': 14,
 'NULL_人體精神': 15,
 '植物_顏色': 16,
 '穿著器物_人體精神': 17,
 '禮樂_性質': 18,
 '廚房器物_人': 19,
 '人體頭部_人': 20,
 '禮樂_生活器物': 21,
 '人體內部_NULL': 22,
 '廚房器物_城鄉': 23,
 '顏色_人': 24,
 '野獸_人體四肢': 25,
 '生命性質_人倫關係': 26,
 '生活器物_性質': 27,
 '人_穿著器物': 28,
 '人體性質_廚房器物': 29,
 '城鄉_人體性質': 30,
 '人倫關係_人體內部': 31,
 '生命性質_人體四肢': 32,
 '家畜_生命性質': 33,
 '人倫關係_野獸': 34,
 '生命性質_廚房器物': 35,
 '人體性質_武器': 36,
 '顏色_NULL': 37,
 '人體性質_顏色': 38,
 '生命性質_家畜': 39,
 '廚房器物_動物軀體': 40,
 '穿著器物_人體頭部': 41,
 '人體四肢_家畜': 42,
 '野獸_城鄉': 43,
 '住宿_禮樂': 44,
 '城鄉_NULL': 45,
 '人體四肢_植物': 46,
 '人_廚房器物': 47,
 '顏色_武器': 48,
 '禮樂_禮樂': 49,
 '生命性質_人體精神': 50,
 '家畜_人體性質': 51,
 '生命性質_人體頭部': 52,
 '武器_家畜': 53,
 '性質_人體精神': 54,
 '武器_人體性質': 55,
 '人倫關係_顏色': 56,
 '人體頭部_穿著器物': 57,
 '住宿_家畜': 58,
 '人體頭部_性質': 59,
 '武器_植物': 60,
 '家畜_野獸': 61,
 '生命性質_顏色': 62,
 '無生命_

In [13]:
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.adjusted_rand_score.html
from sklearn.metrics.cluster import adjusted_rand_score

adjusted_rand_score(['A', 'A', 'B', 'B'], [1, 1, 0, 0])