In [1]:
import json
from cilin import Cilin
from CompoTree import Radicals

C = Cilin(trad=True)
radicals = Radicals.load()

with open('radical_semantic_tag.json', encoding='utf-8') as f:
    rad_sem = json.load(f)

In [62]:
from itertools import product

def sem_feats(word):
    rads = [ radicals.query(c)[0] for c in word ]
    return [ rad_sem.get(r, ["NULL"]) for r in rads ]

def feat_comb(feats):
    if len(feats) > 2: 
        raise Exception('Accepts 2 features only')
    if len(feats) == 1: return feats[0]
    f1, f2 = feats
    return [f"{x}_{y}" for x, y in product(f1, f2)]

feat_comb(sem_feats('屍體'))
sem_feats("屍")

[['住宿', '性質']]

In [27]:
documents = { k:list(v) for k, v in C.category_split(level=5).items() }
# documents

In [68]:
from itertools import chain

sem_unigrams = set()
sem_bigrams = set()
unigram_sem = dict()
bigram_sem = dict()
for word in chain.from_iterable(documents.values()):
    if len(word) == 2:
        sem = feat_comb(sem_feats(word))
        for fc in sem:
            sem_bigrams.add(fc)
            bigram_sem.setdefault(word, set()).add(fc)
    for ch in word:
        sem = sem_feats(ch)[0]
        for fc in sem:
            sem_unigrams.add(fc)
            unigram_sem.setdefault(ch, set()).add(fc)

In [101]:
import numpy as np
from collections import Counter
from scipy.sparse import csc_matrix

id2doc = list(documents.keys())
doc2id = {doc:i for i, doc in enumerate(id2doc)}

id2unigramSem = list(sem_unigrams)
unigramSem2id = {v:i for i, v in enumerate(id2unigramSem)}

id2bigramSem = list(sem_bigrams)
bigramSem2id = {v:i for i, v in enumerate(id2bigramSem)}

# csr_bigram

# Unigram document-term matrix
data, row, col = [], [], []
for row_idx, doc in enumerate(id2doc):
    semTags = [ tag for ch in chain.from_iterable(documents[doc]) for tag in unigram_sem[ch] ]
    for tag, fq in Counter(semTags).items():
        col_idx = unigramSem2id[tag]
        data.append(fq)
        row.append(row_idx)
        col.append(col_idx)
csc_unigram = csc_matrix((data, (row, col)), shape=(len(id2doc), len(id2unigramSem)))

# Bigram document-term matrix
data, row, col = [], [], []
for row_idx, doc in enumerate(id2doc):
    semTags = [ tag for word in documents[doc] if len(word) == 2 for tag in bigram_sem[word] ]
    for tag, fq in Counter(semTags).items():
        col_idx = bigramSem2id[tag]
        data.append(fq)
        row.append(row_idx)
        col.append(col_idx)
csc_bigram = csc_matrix((data, (row, col)), shape=(len(id2doc), len(id2bigramSem)))

In [102]:
from gensim.models import LdaModel
from gensim.matutils import Sparse2Corpus

# lda = LdaModel(Sparse2Corpus(csc_unigram, documents_columns=False), num_topics=12, 
#                id2word={k:v for k, v in enumerate(id2unigramSem)}, random_state=100)
lda = LdaModel(Sparse2Corpus(csc_bigram, documents_columns=False), num_topics=12, 
               id2word={k:v for k, v in enumerate(id2bigramSem)}, random_state=100)

In [114]:
from collections import Counter

def encode_doc(doc, unigram=True):
    data = []
    if unigram:
        semTags = [ tag for ch in chain.from_iterable(documents[doc]) for tag in unigram_sem[ch] ]
        for tag, fq in Counter(semTags).items():
            col_idx = unigramSem2id[tag]
            data.append( (col_idx, fq) )
    else:
        semTags = [ tag for word in documents[doc] if len(word) == 2 for tag in bigram_sem[word] ]
        for tag, fq in Counter(semTags).items():
            col_idx = bigramSem2id[tag]
            data.append( (col_idx, fq) )
    return data


print(Counter(chain.from_iterable(documents['Aa01A01='])))
encode_doc('Aa01A01=', unigram=True)

Counter({'人': 5, '士': 2, '選': 1, '氏': 1, '物': 1})


[(14, 7), (15, 1), (9, 1), (20, 1)]

In [115]:
bow_bigram = encode_doc('Aa01A01=', unigram=False)
lda.get_document_topics(bow_bigram)

[(0, 0.016667048),
 (1, 0.016667055),
 (2, 0.016669767),
 (3, 0.01666705),
 (4, 0.016667048),
 (5, 0.6158041),
 (6, 0.016667046),
 (7, 0.01666772),
 (8, 0.21752171),
 (9, 0.016667338),
 (10, 0.016667059),
 (11, 0.016667048)]

In [117]:
for idx, prob in lda.get_topic_terms(5):
    print(id2bigramSem[idx], prob*100)

NULL_人 6.108827516436577
人_人 6.006703898310661
植物_人 5.035373941063881
NULL_野獸 4.135981202125549
人體頭部_人體精神 4.066850244998932
無生命_人 3.355614095926285
NULL_家畜 3.150147572159767
穿著器物_人 3.046010620892048
人體頭部_生活器物 2.5640178471803665
生命性質_生活器物 2.490675263106823


In [13]:
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.adjusted_rand_score.html
from sklearn.metrics.cluster import adjusted_rand_score

adjusted_rand_score(['A', 'A', 'B', 'B'], [1, 1, 0, 0])