In [1]:
import json
from cilin import Cilin
from CompoTree import Radicals

# Project resources shared by the cells below: a Cilin instance
# (trad=True presumably selects traditional characters -- TODO confirm) and
# the Radicals object that sem_feats queries per character.
C = Cilin(trad=True)
radicals = Radicals.load()

# rad_sem is used later as a dict keyed by radical whose values are lists of
# semantic tags; it is read here from a UTF-8 encoded JSON file.
with open('radical_semantic_tag.json', encoding='utf-8') as tag_file:
    rad_sem = json.loads(tag_file.read())

In [62]:
from itertools import product

def sem_feats(word):
    """Return one list of semantic tags per character of ``word``.

    For every character, the first element of ``radicals.query(char)`` is
    taken (presumably the character's radical -- TODO confirm against
    CompoTree) and looked up in ``rad_sem``. Radicals missing from
    ``rad_sem`` yield the placeholder list ``["NULL"]``.

    Args:
        word: iterable of characters, typically a string.

    Returns:
        A list with the same length as ``word``; element ``i`` is the tag
        list for the ``i``-th character.
    """
    tag_lists = []
    for char in word:
        radical = radicals.query(char)[0]
        tag_lists.append(rad_sem.get(radical, ["NULL"]))
    return tag_lists

def feat_comb(feats):
    """Combine the tag lists of one or two characters into joint features.

    Args:
        feats: a sequence holding one or two tag lists (the shape produced
            by ``sem_feats`` for a one- or two-character word).

    Returns:
        For a single tag list, that list itself (not a copy). For two tag
        lists, a new list with one ``"{x}_{y}"`` string for every pair in
        their Cartesian product, ordered with the first list varying
        slowest.

    Raises:
        ValueError: if ``feats`` is empty or holds more than two tag lists.
            ``ValueError`` is a subclass of ``Exception``, so callers that
            catch ``Exception`` keep working.
    """
    n_feats = len(feats)
    # Reject empty input explicitly; otherwise it would fall through to the
    # two-way unpacking below and fail with an opaque "not enough values to
    # unpack" message.
    if n_feats == 0:
        raise ValueError('Expected 1 or 2 features, got none')
    if n_feats > 2:
        raise ValueError('Accepts 2 features only')
    if n_feats == 1:
        return feats[0]
    first, second = feats
    return [f"{x}_{y}" for x, y in product(first, second)]

# Scratch check of the two helpers. A notebook cell only displays its last
# expression, so the feat_comb(...) result on the next line is computed and
# then discarded; the output shown for this cell comes from sem_feats("屍").
feat_comb(sem_feats('屍體'))
sem_feats("屍")

[['住宿', '性質']]

In [27]:
# Treat every category returned by C.category_split(level=5) as one
# "document": map its key to a concrete list of its members so the members
# can be iterated more than once by later cells.
documents = {
    category: list(members)
    for category, members in C.category_split(level=5).items()
}
# documents  # uncomment to inspect

In [68]:
from itertools import chain

# Tag vocabularies and per-item tag lookups built from every word in
# `documents`:
#   sem_unigrams / sem_bigrams : all tags seen for single characters / for
#                                two-character words
#   unigram_sem / bigram_sem   : character -> set of tags,
#                                two-character word -> set of joint tags
sem_unigrams = set()
sem_bigrams = set()
unigram_sem = dict()
bigram_sem = dict()
for word in chain.from_iterable(documents.values()):
    # Joint features are only produced for two-character words; feat_comb
    # pairs up the tag lists of the two characters.
    if len(word) == 2:
        sem = feat_comb(sem_feats(word))
        sem_bigrams.update(sem)
        # Register the word even when it has no joint tags, so that a later
        # bigram_sem[word] lookup cannot raise KeyError.
        bigram_sem.setdefault(word, set()).update(sem)
    for ch in word:
        # sem_feats returns one tag list per character; ch is a single
        # character, so take the only element.
        sem = sem_feats(ch)[0]
        sem_unigrams.update(sem)
        # Same guarantee for characters: every character seen in
        # `documents` gets an entry (possibly an empty set), which the
        # document-term matrix cell relies on when it indexes unigram_sem.
        unigram_sem.setdefault(ch, set()).update(sem)

In [86]:
import numpy as np
from collections import Counter
from scipy.sparse import csc_matrix

# Index <-> label maps for documents (matrix rows) and semantic tags
# (matrix columns).
id2doc = list(documents.keys())
doc2id = {doc:i for i, doc in enumerate(id2doc)}

# Sort the tag vocabularies: iterating a set of strings gives an order that
# can change between kernel restarts (string hash randomization), which
# would silently reshuffle column ids and make the topic-model output
# non-reproducible even with a fixed random_state.
id2unigramSem = sorted(sem_unigrams)
unigramSem2id = {v:i for i, v in enumerate(id2unigramSem)}

id2bigramSem = sorted(sem_bigrams)
bigramSem2id = {v:i for i, v in enumerate(id2bigramSem)}

# TODO: the bigram document-term matrix (csr_bigram) is not built in this cell.

# Unigram document-term matrix: entry (i, j) is how many times tag j occurs
# in document i, counting each character occurrence once per tag it carries.
data, row, col = [], [], []
for row_idx, doc in enumerate(id2doc):
    # Flatten the document's words into characters, then expand each
    # character into its tags.
    semTags = [ tag for ch in chain.from_iterable(documents[doc]) for tag in unigram_sem[ch] ]
    for tag, fq in Counter(semTags).items():
        col_idx = unigramSem2id[tag]
        data.append(fq)
        row.append(row_idx)
        col.append(col_idx)
# Explicit shape keeps all-zero rows/columns in the matrix.
csc_unigram = csc_matrix((data, (row, col)), shape=(len(id2doc), len(id2unigramSem)))

In [98]:
from gensim.models import LdaModel
from gensim.matutils import Sparse2Corpus

# Fit a 10-topic LDA model on the unigram document-term matrix.
# documents_columns=False tells Sparse2Corpus that documents are the matrix
# rows; id2word maps each column index back to its semantic tag.
lda = LdaModel(
    corpus=Sparse2Corpus(csc_unigram, documents_columns=False),
    num_topics=10,
    id2word=dict(enumerate(id2unigramSem)),
    random_state=100,
)

In [99]:
# Display the top terms of topic 0 as (term id, weight) pairs. The ids are
# keys of the id2word mapping passed to LdaModel, i.e. positions in
# id2unigramSem.
lda.get_topic_terms(0)

[(12, 0.42485783),
 (20, 0.29061306),
 (2, 0.040224746),
 (15, 0.039364696),
 (11, 0.030300116),
 (3, 0.02780929),
 (1, 0.02095821),
 (6, 0.020134788),
 (14, 0.019468024),
 (13, 0.019131176)]

In [13]:
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.adjusted_rand_score.html
from sklearn.metrics.cluster import adjusted_rand_score

# Sanity check of the metric: both labelings split the four items into the
# same two groups and differ only in label names, so the score is expected
# to be 1.0.
adjusted_rand_score(['A', 'A', 'B', 'B'], [1, 1, 0, 0])