In [1]:
import math

import numpy
import pandas

import ipywidgets as widgets
from IPython.display import display, clear_output

import gensim
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

import pythainlp

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans

from data_tokenizer import load_corpus

from model.upgrade_sdc import UpgradeSDC
from model.sdc import SDC

### Load Data

In [2]:
# Base name (without extension) shared by the raw corpus file, the tokenized
# corpus file and the result CSV written later in the notebook.
file_name = '‡∏ú‡∏π‡πâ‡∏ö‡∏£‡∏¥‡πÇ‡∏†‡∏Ñ - TrueCoffee'

corpus, labels = load_corpus('../data/' + file_name + '.txt')

len_corpus = len(corpus)
print('Total documents', len_corpus)

clusters = list(set(labels))
print(len(clusters), 'clusters')

# The tokenized corpus is stored on disk as a Python expression.
# The context manager closes the handle even if reading or parsing raises.
# NOTE(review): eval() executes whatever code the file contains. If the file
# only ever holds plain literals (lists of strings), ast.literal_eval would be
# a safer parser -- confirm the file format before switching.
with open('../data/tokenized/tokenized_' + file_name + '.txt') as f:
    tokenized_corpus = eval(f.read())

Total documents 350
1 clusters


### Preprocess Corpus

#### Remove Words

In [3]:
dictionary = Dictionary(tokenized_corpus)
print('origin:', len(dictionary), 'words')

# Keep only words that occur in at least 2 documents and in at most 70% of
# the documents; keep_n is set to the full size so no extra truncation happens.
dictionary.filter_extremes(no_below=2, no_above=0.7, keep_n=len(dictionary))
print('filter frequent words:', len(dictionary), 'words')

# Drop tokens that are empty or a single character long.
letter_words = [token_id for token, token_id in dictionary.token2id.items() if len(token) <= 1]
dictionary.filter_tokens(bad_ids=letter_words)
print('filter letter words:', len(dictionary), 'words')

stopwords = pythainlp.corpus.stopwords.words('thai')
stopwords.extend(['‡∏ô‡∏µ‡πâ'])
# Only stop words that are actually in the vocabulary need removing. Looking
# them up with a membership test avoids registering the stop-word list as a
# fake document, which would inflate the dictionary's document statistics.
stopword_ids = [dictionary.token2id[word] for word in stopwords if word in dictionary.token2id]
dictionary.filter_tokens(bad_ids=stopword_ids)
print('filter stop words:', len(dictionary), 'words')

origin: 948 words
filter frequent words: 370 words
filter letter words: 368 words
filter stop words: 219 words


In [4]:
# Rebuild every document using only the tokens that survived the vocabulary
# filtering: doc2idx maps each token to its id, ids below zero are discarded,
# and the remaining ids are translated back to their token strings.
idx_corpus = [
    [dictionary[token_id] for token_id in dictionary.doc2idx(doc) if token_id >= 0]
    for doc in tokenized_corpus
]

#### Dimension Reduction

In [5]:
# Mean number of tokens per document, rounded up to a whole number.
total_tokens = sum(len(doc) for doc in idx_corpus)
average_doc_size = math.ceil(total_tokens / len(idx_corpus))

# Truncate every document to its `average_doc_size` words with the highest
# document frequency. sorted() is stable, so words with equal frequency keep
# their original relative order.
df = dictionary.dfs
token_ids = dictionary.token2id
filtered_corpus = []
for doc in idx_corpus:
    ranked = sorted(doc, key=lambda word: df[token_ids[word]], reverse=True)
    filtered_corpus.append(ranked[:average_doc_size])

#### Doc2Vec

In [6]:
# Tag each filtered document with its position in the corpus.
tagged_corpus = []
for doc_id, tokens in enumerate(idx_corpus):
    tagged_corpus.append(TaggedDocument(tokens, [doc_id]))

model = Doc2Vec(
    tagged_corpus,
    vector_size=average_doc_size,
    window=4,
    min_count=2,
    epochs=100,
)
# NOTE(review): this method may not exist in newer gensim releases -- confirm
# against the installed version.
model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

# One inferred vector per document, collected into a float DataFrame.
inferred_vectors = list(map(model.infer_vector, idx_corpus))
paragraph_vectors = pandas.DataFrame(inferred_vectors, dtype=float)

### Clustering

In [7]:
def get_onehot(corpus, weight):
    """Build a dense document-term matrix for a tokenized corpus.

    Parameters
    ----------
    corpus : list of list of str
        Tokenized documents.
    weight : {'normal', 'tfidf'}
        'normal' uses bag-of-words counts, each row divided by the number of
        distinct tokens in that document; 'tfidf' uses gensim's TfidfModel
        with the 'ltc' SMART scheme.

    Returns
    -------
    pandas.DataFrame
        One row per document and one column per vocabulary word.

    Raises
    ------
    ValueError
        If `weight` is not one of the supported schemes.
    """
    # Validate first: an unknown scheme used to surface later as an
    # UnboundLocalError on `weight_corpus`.
    if weight not in ('normal', 'tfidf'):
        raise ValueError("weight must be 'normal' or 'tfidf', got {!r}".format(weight))

    dictionary = Dictionary(corpus)

    bow_corpus = [dictionary.doc2bow(doc) for doc in corpus]
    if weight == 'tfidf':
        tfidf = TfidfModel(bow_corpus, smartirs='ltc')
        weight_corpus = [tfidf[doc] for doc in bow_corpus]
    else:
        weight_corpus = bow_corpus

    unique_words = [dictionary[token_id] for token_id in range(len(dictionary))]
    array = numpy.zeros((len(corpus), len(unique_words)), dtype=float)
    for row, doc in enumerate(weight_corpus):
        for token_id, score in doc:
            array[row, token_id] = score

        # `doc` holds one (id, score) pair per distinct token, so this divides
        # the counts by the number of distinct tokens; empty documents are
        # left as all zeros.
        if weight == 'normal' and len(doc) != 0:
            array[row] = numpy.divide(array[row], len(doc))

    return pandas.DataFrame(array, columns=unique_words, dtype=float)

In [8]:
def generate_result(predicted_labels, marks):
    """Assemble the per-document result table for one clustering run.

    Reads the notebook-level `corpus`, `idx_corpus` and `labels`.

    Parameters
    ----------
    predicted_labels : sequence
        One cluster label per document.
    marks : sequence or None
        One mark per document. When None or empty, every row gets -1.

    Returns
    -------
    pandas.DataFrame
        Columns: 'comment', 'tokenized_comment', 'label', 'predicted_label',
        'marks'.
    """
    result = pandas.DataFrame()
    result['comment'] = corpus
    result['tokenized_comment'] = idx_corpus
    result['label'] = labels
    result['predicted_label'] = predicted_labels
    # A plain truthiness test raises for multi-element numpy arrays and pandas
    # Series, so check for None and length explicitly.
    has_marks = marks is not None and len(marks) > 0
    result['marks'] = marks if has_marks else -1
    return result

In [9]:
def eval_cluster(onehot_corpus, result):
    """Score a clustering by how similar members are to their cluster centroid.

    Parameters
    ----------
    onehot_corpus : pandas.DataFrame
        Document vectors, one row per document; rows are read positionally
        using the index of `result['predicted_label']`.
    result : pandas.DataFrame
        Must contain a 'predicted_label' column of non-negative integer
        labels.

    Returns
    -------
    compactness : float
        Sum, over all non-empty clusters, of the cosine similarity between
        the cluster centroid and each of its members.
    centroids : list of numpy.ndarray
        Mean vector of every non-empty cluster, in ascending label order.
    """
    predicted = result['predicted_label']
    # numpy.unique returns sorted values, so the last entry is the largest label.
    num_cluster = int(numpy.unique(predicted)[-1]) + 1

    clusters = [[] for _ in range(num_cluster)]
    # Series.items() is used because Series.iteritems() no longer exists in
    # pandas 2.x.
    for i, label in predicted.items():
        clusters[int(label)].append(numpy.array(onehot_corpus.iloc[i]))

    compactness = 0
    centroids = []
    for members in clusters:
        if not members:
            # Labels with no documents contribute nothing and get no centroid.
            continue
        centroid = numpy.mean(members, axis=0)
        centroids.append(centroid)
        similarities = cosine_similarity(centroid.reshape(1, -1), members)
        compactness += numpy.sum(similarities)
    return compactness, centroids

In [10]:
# Parameters shared by the clustering calls in the cells below.
min_samples, eps = 7, 0.32

# Document vectors used for clustering: normalized bag-of-words counts.
# Alternative input: get_onehot(filtered_corpus, 'tfidf')
onehot_corpus = get_onehot(corpus=idx_corpus, weight='normal')

In [11]:
# Run the clustering `epoch` times and keep the run with the highest
# compactness score.
# Alternatives tried here previously: SDC() with the same predict() call,
# DBSCAN(metric='cosine', eps=eps, min_samples=min_samples) with labels_ + 1,
# and KMeans(n_clusters=7); the latter two used marks = None.
max_compactness = 0
epoch = 15
for trial in range(epoch):
    model = UpgradeSDC()
    trial_labels, marks = model.predict(onehot_corpus, min_samples, eps)

    trial_result = generate_result(trial_labels, marks)
    compactness, trial_centroids = eval_cluster(onehot_corpus, trial_result)

    if compactness > max_compactness:
        max_compactness = compactness
        predicted_labels, result, centroids = trial_labels, trial_result, trial_centroids

print(max_compactness)

137.566296070018


In [31]:
# NOTE(review): this cell rebinds predicted_labels, result and centroids, so
# whatever the best-of-N search cell produced is discarded when the notebook
# is run top to bottom. With centroids reset to None and a single iteration,
# predict() always receives seeds=None -- confirm this is intended.
centroids = None
for round_number in range(1):
    model = UpgradeSDC()
    # When centroids from an earlier round exist, the first one is dropped
    # before the rest are passed as seeds.
    seeds = centroids[1:] if centroids else centroids
    predicted_labels, marks = model.predict(onehot_corpus, min_samples, eps, seeds=seeds)

    result = generate_result(predicted_labels, marks)
    compactness, centroids = eval_cluster(onehot_corpus, result)

    # Report the score together with the size of the lowest-numbered cluster.
    cluster_sizes = numpy.unique(result['predicted_label'], return_counts=True)[1]
    print(compactness, cluster_sizes[0])

165.43197277018987 73


In [23]:
# (sorted unique labels, number of documents per label)
label_count = numpy.unique(result['predicted_label'], return_counts=True)
unique_labels = label_count[0]
# Labels start at 0, so the largest label plus one is the number of label slots.
num_cluster = unique_labels[-1] + 1
print(label_count, '\n')

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13]), array([82, 46, 37, 44, 18, 21,  8, 15, 12, 21, 11, 14, 10, 11])) 



In [24]:
# Pairwise cosine similarity between cluster centroids.
# NOTE(review): row k of `sims` is treated as label k below, which assumes
# `centroids` has exactly one entry per label in 0..num_cluster-1 -- confirm.
sims = cosine_similarity(centroids)
new_labels = list(range(num_cluster))
print(new_labels)

# Two clusters are merged when their centroids are at least this similar.
merge_threshold = eps - eps / 20
for i in reversed(range(len(sims))):
    for j in reversed(range(i)):
        value = sims[i][j]
        if value >= merge_threshold:
            print(i, j, value)
            label_i, label_j = new_labels[i], new_labels[j]
            base = min(label_i, label_j)
            # Relabel BOTH groups. Relabelling only the group of `i` leaves
            # the other members of `j`'s group behind whenever `j`'s label is
            # the larger of the two.
            new_labels = [base if label in (label_i, label_j) else label for label in new_labels]
print(new_labels)

# Integer dtype keeps the merged labels the same kind of value as the
# original cluster labels instead of turning them into floats.
grouped_labels = numpy.zeros(len_corpus, dtype=int)
for i, label in enumerate(predicted_labels):
    grouped_labels[i] = new_labels[label]
new_result = generate_result(grouped_labels, None)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]


### Result

In [25]:
class Widget:
    """Toggle-button browser that prints the rows of one predicted cluster.

    Parameters
    ----------
    result : pandas.DataFrame
        Must contain 'predicted_label' and 'marks' columns plus the column
        named by `column_name`.
    column_name : str
        Column whose value is printed for every row of the selected cluster.
    """

    def __init__(self, result, column_name):
        self.result = result
        self.column_name = column_name

        # One button per distinct predicted label.
        unique_labels = numpy.unique(result['predicted_label'])
        self.widget = widgets.ToggleButtons(
            options=list(map(int, unique_labels)),
            disabled=False,
            button_style='',
        )

        self.widget.observe(self.on_click, names='index')
        # Render the first cluster immediately.
        self.on_click({'new' : 0})

    def on_click(self, change):
        """Redraw the buttons and list the rows of the newly selected cluster.

        `change['new']` is the position of the selected button in the
        widget's options.
        """
        clear_output()
        display(self.widget)
        selected = self.widget.options[change['new']]
        members = self.result[self.result['predicted_label'] == selected]
        for index, row in members.iterrows():
            mark = row['marks']
            # Rows marked 0 are prefixed with "@", rows marked 1 with "*".
            prefix = "@" if mark == 0 else ("*" if mark == 1 else "")
            print(prefix, end="")
            print(index, row[self.column_name])

In [26]:
# Persist the clustering result next to the data.
result.to_csv('../data/results/em/' + file_name + '.csv')

# To inspect a previously saved run instead:
# result = pandas.read_csv('../data/results/2/' + file_name + '.csv')

# Number of rows whose mark is -1.
count = int((result['marks'] == -1).sum())
print(count)

83


In [27]:
# Browse the merged clustering (new_result), printing the 'comment' column.
w1 = Widget(new_result, 'comment')

ToggleButtons(options=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13), value=0)

1 ‡πÄ‡∏£‡∏≤‡∏á‡∏á ‡∏™‡∏±‡πà‡∏á‡∏Ñ‡∏≤‡∏õ‡∏π‡∏ä‡∏¥‡πÇ‡∏ô‡∏´‡∏ß‡∏≤‡∏ô‡∏ô‡πâ‡∏≠‡∏¢ ‡∏ó‡∏≥‡πÑ‡∏°‡πà‡πÑ‡∏î‡πâ‡∏Ñ‡πà‡∏∞ ‡∏ñ‡πâ‡∏≤‡∏•‡∏π‡∏Å‡∏Ñ‡πâ‡∏≤‡∏≠‡∏¢‡∏≤‡∏Å‡πÑ‡∏î‡πâ‡∏´‡∏ß‡∏≤‡∏ô‡∏ô‡πâ‡∏≠‡∏¢‡∏ï‡πâ‡∏≠‡∏á‡∏™‡∏±‡πà‡∏á‡∏•‡∏≤‡πÄ‡∏ï‡πâ ‡∏Å‡∏π‡∏á‡∏á‡∏á‡∏á‡∏á‡∏á‡∏á
4 ‡∏ã‡∏∑‡πâ‡∏≠‡∏•‡∏¥‡∏Ç‡∏™‡∏¥‡∏ó‡∏ò‡∏¥‡πå UCL ‡∏°‡∏≤‡∏î‡πâ‡∏ß‡∏¢‡∏ô‡∏∞ ‡∏à‡πà‡∏≤‡∏¢‡∏£‡∏≤‡∏¢‡πÄ‡∏î‡∏∑‡∏≠‡∏ô‡πÑ‡∏õ ‡πÅ‡∏ï‡πà‡πÑ‡∏°‡πà‡∏°‡∏µ‡πÉ‡∏´‡πâ‡∏î‡∏π
12 ‡∏ú‡∏°‡∏ß‡πà‡∏≤ ‡∏ú‡∏°‡∏ä‡∏≠‡∏ö‡∏£‡∏™‡∏ä‡∏≤‡∏ï‡∏¥‡∏Ç‡∏≠‡∏á‡∏Å‡∏≤‡πÅ‡∏ü‡∏Ñ‡∏∏‡∏ì ‡∏´‡∏•‡∏≤‡∏¢‡πÜ‡πÅ‡∏Å‡πâ‡∏ß‡∏õ‡∏£‡∏∞‡∏ó‡∏±‡∏ö‡πÉ‡∏à‡∏Å‡∏ß‡πà‡∏≤ ‡πÅ‡∏ö‡∏£‡∏ô‡∏î‡πå‡∏ô‡∏≤‡∏á‡πÄ‡∏á‡∏∑‡∏≠‡∏Å‡πÅ‡∏ï‡πà‡∏ß‡πà‡∏≤ ‡πÅ‡∏û‡∏á‡πÑ‡∏õ‡∏´‡∏ß‡πà‡∏∞ ‡πÑ‡∏°‡πà‡πÄ‡∏´‡∏°‡∏≤‡∏∞‡∏Å‡∏∞‡∏Ñ‡∏ô‡πÑ‡∏ó‡∏¢ (‡∏ñ‡∏∂‡∏á‡πÅ‡∏°‡πâ‡∏à‡∏∞‡πÉ‡∏ä‡πâ‡∏™‡πà‡∏ß‡∏ô‡∏•‡∏î ‡∏•‡∏π‡∏Å‡∏Ñ‡πâ‡∏≤‡∏ó‡∏£‡∏π ‡πÅ‡∏•‡πâ‡∏ß‡∏Å‡πâ‡∏ï‡∏≤‡∏°)‡∏™‡∏±‡∏á‡πÄ‡∏Å‡∏ï‡πÑ‡∏î‡πâ‡∏ß‡πà‡∏≤ ‡∏ô‡πâ‡∏≠‡∏¢‡∏™‡∏≤‡∏Ç‡∏≤ ‡∏ó‡∏µ‡πà‡∏Ñ‡∏ô‡∏à‡∏∞‡πÅ‡∏ô‡πà‡∏ô ‡∏´‡∏£‡∏∑‡∏≠‡∏ï‡πâ‡∏≠‡∏á‡∏ï‡πà‡∏≠‡∏Ñ‡∏¥‡∏ß‡∏™‡∏±‡∏µ‡∏á‡∏Å‡∏≤‡πÅ‡∏ü ‡πÄ‡∏´‡∏°‡∏∑‡∏≠‡∏ô‡πÅ‡∏ö‡∏£‡∏ô‡∏î‡πå‡∏ô

In [28]:
# Browse the unmerged clustering (result), printing the 'comment' column.
w2 = Widget(result, 'comment')

ToggleButtons(index=8, options=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13), value=8)

*18 ‡πÄ‡∏Ñ‡∏£‡∏∑‡πà‡∏≠‡∏á‡∏î‡∏∑‡πà‡∏°‡πÇ‡∏ö‡∏£‡∏≤‡∏ì ‡∏≠‡∏±‡∏î‡∏ô‡πâ‡∏≥‡πÅ‡∏Ç‡πá‡∏á‡πÅ‡∏ô‡πà‡∏ô‡∏°‡∏≤‡∏Å ‡∏î‡∏π‡∏î 3-4 ‡∏ó‡∏µ‡∏´‡∏°‡∏î ‡∏ô‡πâ‡∏≥‡πÅ‡∏Ç‡πá‡∏á‡∏¢‡∏±‡∏á‡πÅ‡∏ô‡πà‡∏ô‡πÅ‡∏Å‡πâ‡∏ß‡∏≠‡∏¢‡∏π‡πà‡πÄ‡∏•‡∏¢
*29 ‡∏ã‡∏∑‡πâ‡∏≠‡πÄ‡∏û‡∏£‡∏≤‡∏∞‡∏´‡∏≤‡∏ó‡∏µ‡πà‡∏ô‡∏±‡πà‡∏á‡∏•‡πâ‡∏ß‡∏ô‡πÜ ‡∏£‡∏™‡∏ä‡∏≤‡∏ï‡∏¥‡πÑ‡∏°‡πà‡∏ï‡πâ‡∏≠‡∏á‡∏û‡∏π‡∏î‡∏ñ‡∏∂‡∏á ‡∏õ‡∏•‡πà‡∏≠‡∏¢‡∏ô‡πâ‡∏≥‡πÅ‡∏Ç‡πá‡∏á‡∏•‡∏∞‡∏•‡∏≤‡∏¢‡∏ó‡∏∏‡∏Å‡∏ó‡∏µ5555
*65 ‡∏Ç‡∏≠‡∏á‡∏õ‡∏±‡πà‡∏ô ‡∏ï‡∏£‡∏∞‡∏Å‡∏π‡∏•‡πÄ‡∏ö‡∏≠‡∏£‡∏µ‡πà ‡πÄ‡∏õ‡∏£‡∏µ‡πâ‡∏¢‡∏ß‡πÅ‡∏•‡∏∞‡∏´‡∏ß‡∏≤‡∏ô‡∏°‡∏≤‡∏Å‡∏Å‡∏Å ‡∏£‡∏≠‡∏ô‡πâ‡∏≥‡πÅ‡∏Ç‡πá‡∏á‡∏•‡∏∞‡∏•‡∏≤‡∏¢‡∏Å‡πá‡πÑ‡∏°‡πà‡∏´‡∏≤‡∏¢‡πÄ‡∏õ‡∏£‡∏µ‡πâ‡∏¢‡∏ß‡∏ß‡∏ß
*77 ‡∏û‡∏ô‡∏±‡∏Å‡∏á‡∏≤‡∏ô‡πÑ‡∏°‡πà‡∏Ñ‡∏ß‡∏£‡∏¢‡∏∑‡πâ‡∏≠‡πÄ‡∏ß‡∏•‡∏≤‡πÉ‡∏ô‡∏Å‡∏≤‡∏£‡∏ó‡∏≥‡∏£‡∏≤‡∏¢‡∏Å‡∏≤‡∏£‡∏ô‡∏≤‡∏ô‡πÜ‡πÄ‡∏û‡∏∑‡πà‡∏≠‡πÉ‡∏´‡πâ‡∏•‡∏π‡∏Å‡∏Ñ‡πâ‡∏≤‡∏ô‡∏±‡πà‡∏á‡∏£‡∏≠‡∏ô‡∏≤‡∏ô‡∏à‡∏ô‡∏´‡∏¥‡∏ß‡∏ô‡πâ‡∏≥‡∏ô‡∏∞‡∏Ñ‡∏∞
*78 ‡πÇ‡∏Å‡πÇ‡∏Å‡πâ‡πÄ‡∏¢‡πá‡∏ô ‡∏ï‡∏≠‡∏ô‡∏ä‡∏á ‡∏ñ‡πâ‡∏≤‡∏°‡∏∂‡∏á‡∏à‡∏∞‡∏•‡∏∞‡∏•‡∏≤‡∏¢‡∏ô‡πâ‡∏≥‡πÅ‡∏•‡πâ‡∏ß‡πÄ‡∏õ‡πá‡∏ô‡∏ï‡∏∞‡∏Å‡∏≠‡∏ô‡∏ô‡∏≠‡∏ô‡πÄ

In [29]:
# Browse the unmerged clustering (result), printing the 'tokenized_comment' column.
w3 = Widget(result, 'tokenized_comment')

ToggleButtons(index=6, options=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13), value=6)

*10 ['‡πÑ‡∏ó‡∏¢', '‡∏ô‡πâ‡∏≥', '‡πÅ‡∏Ç‡πá‡∏á', '‡πÅ‡∏ô‡πà‡∏ô‡πÅ‡∏Å‡πâ‡∏ß', '‡∏Ç‡∏ô‡∏≤‡∏î', '‡∏Å‡∏¥‡∏ô', '‡πÅ‡∏õ‡πä‡∏ö', '‡πÅ‡∏Å‡πâ‡∏ß', '‡∏ô‡πâ‡∏≥', '‡πÅ‡∏Ç‡πá‡∏á', '‡πÅ‡∏Å‡πâ‡∏ß']
*32 ['‡πÅ‡∏ó‡∏ö', '‡∏£‡∏™‡∏ä‡∏≤‡∏ï‡∏¥', '‡∏£‡∏≤‡∏Ñ‡∏≤', '‡∏ã‡∏∑‡πâ‡∏≠', '‡∏Å‡∏¥‡∏ô', '‡πÅ‡∏Å‡πâ‡∏ß', '‡πÄ‡∏≠‡∏¥‡πà‡∏°', '‡πÅ‡∏¢‡πà']
*35 ['‡∏™‡∏¥‡∏ó‡∏ò‡∏¥‡πå', '‡∏ó‡∏£‡∏π', '‡πÅ‡∏ñ‡∏°', '‡∏ä‡∏≤‡πÑ‡∏ó‡∏¢', '‡πÑ‡∏´‡∏°', '‡∏à‡πà‡∏≤‡∏¢', '‡πÅ‡∏Å‡πâ‡∏ß', '‡πÅ‡∏û‡∏á', '‡πÅ‡∏Å‡πâ‡∏ß', '‡∏™‡∏≠‡∏á', '‡∏ä‡∏≤', '‡πÑ‡∏ó‡∏¢', '‡πÑ‡∏°']
*85 ['‡∏£‡∏™‡∏ä‡∏≤‡∏ï‡∏¥', '‡πÅ‡∏¢‡πà', '‡∏à‡∏∑‡∏î', '‡πÅ‡∏Å‡πâ‡∏ß', '‡πÇ‡∏≠‡πÄ‡∏Ñ']
*95 ['‡∏û‡∏ô‡∏±‡∏Å‡∏á‡∏≤‡∏ô', '‡∏´‡∏ô‡∏≤', '‡∏ú‡∏°', '‡∏´‡∏ô‡πâ‡∏≤', '‡πÅ‡∏Å‡πâ‡∏ß', '‡∏ô‡∏°']
*171 ['‡πÅ‡∏Å‡πâ‡∏ß']
*176 ['‡πÅ‡∏Å‡πâ‡∏ß', '‡∏™‡∏ß‡∏¢', '‡∏£‡∏π‡πâ‡∏™‡∏∂‡∏Å', '‡πÅ‡∏õ‡∏•‡∏Å', '‡∏î‡∏µ', '‡∏™‡∏µ', '‡πÄ‡∏Ñ‡∏£‡∏∑‡πà‡∏≠‡∏á', '‡∏î‡∏∑‡πà‡∏°', '‡∏£‡πâ‡∏≤‡∏ô']
*241 ['‡πÄ‡∏´‡∏°‡∏∑‡∏≠‡∏ô', '‡∏ô‡πâ‡∏≥', '‡∏•‡πâ‡∏≤‡∏á', '‡πÅ‡∏Å‡πâ‡∏ß', '‡πÅ‡∏ñ‡∏°']


In [None]:
# Scratch cell: compare two documents by position and show their tokens and text.
seed, compare = 0, 0

# Each document's vector as a single-row 2-D array, as cosine_similarity expects.
a, b = (numpy.array(onehot_corpus.iloc[position]).reshape(1, -1) for position in (seed, compare))
print(cosine_similarity(a, b))

for position in (seed, compare):
    print(idx_corpus[position])
    print(corpus[position])