In [1]:
import json
import numpy as np
from cilin import Cilin
from CompoTree import Radicals
from collections import Counter
from itertools import product, chain
from scipy.sparse import csc_matrix
from scipy.spatial.distance import pdist, squareform
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.cluster import adjusted_rand_score
from gensim.models import LdaModel
from gensim.matutils import Sparse2Corpus

# Thesaurus handle (constructed with trad=True) and the radical lookup object.
C = Cilin(trad=True)
radicals = Radicals.load()

# One "document" per level-4 category: category code -> list of its words.
documents = {
    category: list(words)
    for category, words in C.category_split(level=4).items()
}

# Mapping loaded from JSON; it is later queried with a radical as the key
# and yields that radical's list of semantic tags.
with open('radical_semantic_tag.json', encoding='utf-8') as tag_file:
    rad_sem = json.load(tag_file)

In [2]:
def sem_feats(word):
    """Return one semantic-tag list per character of `word`.

    For every character, the first result of `radicals.query(char)` is used
    as the key into `rad_sem` (presumably the character's radical -- confirm
    against CompoTree). Keys missing from `rad_sem` yield `["NULL"]`.

    Parameters
    ----------
    word : str
        A string of one or more characters.

    Returns
    -------
    list
        A list with `len(word)` entries, each being the value stored in
        `rad_sem` for that character's key, or `["NULL"]`.
    """
    tags_per_char = []
    for char in word:
        radical = radicals.query(char)[0]
        tags_per_char.append(rad_sem.get(radical, ["NULL"]))
    return tags_per_char

def feat_comb(feats):
    """Combine the tag lists of one or two characters into flat feature labels.

    Parameters
    ----------
    feats : sequence of list of str
        One or two tag lists, e.g. the return value of `sem_feats`.

    Returns
    -------
    list of str
        For a single tag list, that list itself (not a copy). For two tag
        lists, every pairing formatted as "x_y", iterating the first list in
        the outer position.

    Raises
    ------
    ValueError
        If `feats` is empty or holds more than two tag lists. `ValueError`
        is a subclass of `Exception`, so callers catching `Exception` are
        unaffected.
    """
    n_feats = len(feats)
    if n_feats > 2:
        raise ValueError('Accepts 2 features only')
    if n_feats == 0:
        # Fail with a clear message instead of an opaque tuple-unpacking error.
        raise ValueError('Expected 1 or 2 feature lists, got 0')
    if n_feats == 1:
        return feats[0]
    f1, f2 = feats
    return [f"{x}_{y}" for x, y in product(f1, f2)]

# Smoke test: combined tag labels for a two-character word. This is not the
# last expression of the cell, so its value is computed but not displayed.
feat_comb(sem_feats('屍體'))
# Per-character tag lists for a single character; as the final expression,
# this is the value the notebook renders as the cell output.
sem_feats("屍")

[['住宿', '性質']]

In [4]:
# Tag vocabularies (all distinct unigram / bigram tags seen in the corpus)
sem_unigrams = set()
sem_bigrams = set()
# Lookup tables: character -> its tags, two-character word -> its combined tags
unigram_sem = dict()
bigram_sem = dict()
for word in chain.from_iterable(documents.values()):
    if len(word) == 2:
        combos = feat_comb(sem_feats(word))
        sem_bigrams.update(combos)
        # Register the word even when `combos` is empty, so that the later
        # `bigram_sem[word]` lookups cannot raise KeyError.
        bigram_sem.setdefault(word, set()).update(combos)
    for ch in word:
        # sem_feats returns one tag list per character; `ch` is a single
        # character, so take the only entry.
        tags = sem_feats(ch)[0]
        sem_unigrams.update(tags)
        # Same as above: every character gets an entry, even with no tags.
        unigram_sem.setdefault(ch, set()).update(tags)

In [5]:
# Row index <-> category code
id2doc = list(documents.keys())
doc2id = {doc: i for i, doc in enumerate(id2doc)}

# Column index <-> tag. The vocabularies are sets of strings, whose iteration
# order can change between interpreter sessions (string hash randomization).
# Sorting pins the column ids so the matrices -- and the seeded LDA trained on
# them -- are reproducible after a kernel restart.
id2unigramSem = sorted(sem_unigrams)
unigramSem2id = {v: i for i, v in enumerate(id2unigramSem)}

id2bigramSem = sorted(sem_bigrams)
bigramSem2id = {v: i for i, v in enumerate(id2bigramSem)}


def _doc_term_matrix(doc_tags, tag2id):
    """Build a sparse document-term count matrix.

    Parameters
    ----------
    doc_tags : list of list of str
        For each document (in row order), the tags it contains, with repeats.
    tag2id : dict
        Maps each tag to its column index.

    Returns
    -------
    scipy.sparse.csc_matrix
        Matrix of shape (len(doc_tags), len(tag2id)) whose entry (i, j) is
        the number of times tag j occurs in document i.
    """
    data, row, col = [], [], []
    for row_idx, tags in enumerate(doc_tags):
        for tag, fq in Counter(tags).items():
            data.append(fq)
            row.append(row_idx)
            col.append(tag2id[tag])
    return csc_matrix((data, (row, col)), shape=(len(doc_tags), len(tag2id)))


# Unigram document-term matrix: tags of every character of every word
csc_unigram = _doc_term_matrix(
    [
        [tag for ch in chain.from_iterable(documents[doc]) for tag in unigram_sem[ch]]
        for doc in id2doc
    ],
    unigramSem2id,
)

# Bigram document-term matrix: combined tags of two-character words only
csc_bigram = _doc_term_matrix(
    [
        [tag for word in documents[doc] if len(word) == 2 for tag in bigram_sem[word]]
        for doc in id2doc
    ],
    bigramSem2id,
)

In [6]:
# Alternative kept for reference: train on the unigram matrix instead.
# lda = LdaModel(Sparse2Corpus(csc_unigram, documents_columns=False), num_topics=12,
#                id2word={k:v for k, v in enumerate(id2unigramSem)}, random_state=100)

# csc_bigram stores documents as rows, hence documents_columns=False.
bigram_corpus = Sparse2Corpus(csc_bigram, documents_columns=False)
lda = LdaModel(
    bigram_corpus,
    num_topics=12,
    id2word=dict(enumerate(id2bigramSem)),
    random_state=100,
)

In [7]:
def encode_doc(doc, unigram):
    """Encode one category as a bag of semantic tags.

    Parameters
    ----------
    doc : str
        Key into `documents` (a category code such as 'Aa01A').
    unigram : bool
        If truthy, count the tags of every character of every word, using
        `unigram_sem` / `unigramSem2id`. Otherwise count the combined tags
        of the two-character words only, using `bigram_sem` /
        `bigramSem2id`.

    Returns
    -------
    list of (int, int)
        `(tag id, frequency)` pairs, in order of first occurrence of each tag.
    """
    words = documents[doc]
    if unigram:
        tag2id = unigramSem2id
        tags = (tag for ch in chain.from_iterable(words) for tag in unigram_sem[ch])
    else:
        tag2id = bigramSem2id
        tags = (tag for word in words if len(word) == 2 for tag in bigram_sem[word])
    return [(tag2id[tag], fq) for tag, fq in Counter(tags).items()]

def get_document_topic_vec(doc, lda_model, unigram):
    """Return the dense topic distribution of one category.

    Parameters
    ----------
    doc : str
        Key into `documents`, forwarded to `encode_doc`.
    lda_model
        Model exposing `get_document_topics` and `num_topics` (a gensim
        `LdaModel` in this notebook).
    unigram : bool
        Forwarded to `encode_doc` to choose unigram or bigram tags; it should
        match the vocabulary the model was trained on.

    Returns
    -------
    list of float
        One probability per topic id in `range(lda_model.num_topics)`;
        topics the model did not report are filled with 0.0.
    """
    bag_of_tags = encode_doc(doc, unigram=unigram)
    # A tiny threshold so that effectively every topic is reported.
    reported = lda_model.get_document_topics(bag_of_tags, minimum_probability=1e-16)
    prob_by_topic = {}
    for topic_id, prob in reported:
        prob_by_topic[topic_id] = prob
    return [
        prob_by_topic.get(topic_id, 0.0)
        for topic_id in range(lda_model.num_topics)
    ]


# Character frequencies over all words of category 'Aa01A', printed for
# comparison with the encoded bag of tags below.
print(Counter(chain.from_iterable(documents['Aa01A'])))
# (tag id, frequency) pairs for the same category using unigram tags; as the
# final expression, this is rendered as the cell output.
encode_doc('Aa01A', unigram=True)

Counter({'人': 17, '子': 8, '崽': 4, '員': 2, '口': 2, '夫': 2, '蛋': 2, '貨': 2, '手': 2, '者': 2, '勞': 2, '力': 2, '小': 2, '類': 2, '每': 2, '士': 2, '生': 2, '家': 1, '丁': 1, '主': 1, '各': 1, '鼠': 1, '輩': 1, '雜': 1, '種': 1, '分': 1, '匹': 1, '混': 1, '豎': 1, '匠': 1, '翁': 1, '工': 1, '作': 1, '客': 1, '動': 1, '廝': 1, '氏': 1, '王': 1, '八': 1, '位': 1, '漢': 1, '徒': 1, '色': 1, '個': 1, '狗': 1, '全': 1, '東': 1, '西': 1, '畜': 1, '此': 1, '食': 1, '指': 1, '選': 1, '鬼': 1, '該': 1, '傢': 1, '伙': 1, '兔': 1, '物': 1, '棍': 1})


[(7, 3),
 (3, 26),
 (4, 8),
 (18, 7),
 (14, 12),
 (13, 5),
 (21, 3),
 (10, 6),
 (6, 1),
 (16, 11),
 (5, 7),
 (8, 1),
 (11, 11),
 (19, 1),
 (12, 2),
 (17, 4),
 (9, 1)]

In [20]:
# One LDA topic distribution per category, in id2doc order
# (unigram=False: the bigram tag encoding).
X = np.array(
    [get_document_topic_vec(doc_id, lda, unigram=False) for doc_id in id2doc],
    dtype=float,
)

# Pairwise Jensen-Shannon distances, expanded from condensed form to a
# square (n_docs x n_docs) matrix.
dist_mat = squareform(pdist(X, "jensenshannon"))

# Replace NaN distances with 0.0 in a single vectorized assignment
# (presumably they stem from floating-point round-off between near-identical
# distributions -- confirm if the NaN count is large).
dist_mat[np.isnan(dist_mat)] = 0.0
dist_mat.shape

(4223, 4223)

In [22]:
from sklearn.cluster import AgglomerativeClustering

# Complete-linkage agglomerative clustering on the precomputed distance
# matrix, cut at 12 clusters.
# The `affinity` keyword was deprecated in scikit-learn 1.2 and removed in
# 1.4 in favour of `metric`; passing an unknown keyword raises TypeError, so
# try the current name first and fall back for older releases.
try:
    clustering = AgglomerativeClustering(n_clusters=12, metric='precomputed', linkage='complete')
except TypeError:
    clustering = AgglomerativeClustering(n_clusters=12, affinity='precomputed', linkage='complete')
clustering = clustering.fit(dist_mat)

In [28]:
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.adjusted_rand_score.html
from sklearn.metrics.cluster import adjusted_rand_score

# Reference label of each category = first character of its code.
true_labs = [doc_code[0] for doc_code in id2doc]
# Cluster assignment of each category, in the same (id2doc) order.
cluster_labs = clustering.labels_

# Adjusted Rand index between the reference labels and the clusters.
adjusted_rand_score(labels_true=true_labs, labels_pred=cluster_labs)

0.028566409365600492