In [1]:
import re
import math
import sys
import random

import numpy
import pandas

import ipywidgets as widgets
from IPython.display import display, clear_output

import gensim
from gensim.corpora import Dictionary
from gensim.models import TfidfModel

import pythainlp

from sklearn.metrics.pairwise import cosine_similarity

from data_tokenizer import load_corpus

### Load Data

In [2]:
file_name = 'ผู้บริโภค - TrueCoffee.txt'

corpus, labels = load_corpus('../data/facebook/' + file_name)

len_corpus = len(corpus)
print('Total documents', len_corpus)

clusters = list(set(labels))
print(len(clusters), 'clusters')

f = open('../data/facebook/tokenized/tokenized_' + file_name)
tokenized_corpus = eval(f.read())
f.close()

Total documents 353
1 clusters


### Preprocess Corpus

#### Remove Words

In [3]:
dictionary = Dictionary(tokenized_corpus)
print('origin:', len(dictionary), 'words')

dictionary.filter_extremes(no_below=2, no_above=0.7, keep_n=len(dictionary))
print('filter frequent words:', len(dictionary), 'words')

letter_words = [id for id in range(len(dictionary)) if len(dictionary[id]) <= 1] 
dictionary.filter_tokens(bad_ids=letter_words)
print('filter letter words:', len(dictionary), 'words')

stopwords = pythainlp.corpus.stopwords.words('thai')
stopwords.append('นี้')
dictionary.add_documents([stopwords])
stopwords = [dictionary.token2id[word] for word in stopwords]
dictionary.filter_tokens(bad_ids=stopwords)
print('filter stop words:', len(dictionary), 'words')

origin: 953 words
filter frequent words: 374 words
filter letter words: 372 words
filter stop words: 221 words


In [4]:
# bow_corpus = [dictionary.doc2bow(doc) for doc in tokenized_corpus]
idx_corpus = [dictionary.doc2idx(doc) for doc in tokenized_corpus]

temp_corpus = []
for doc in idx_corpus:
    temp_corpus.append([dictionary[id] for id in doc if id >= 0])
idx_corpus = temp_corpus

#### Dimension Reduction

In [5]:
average_doc_size = 0
for doc in idx_corpus:
    average_doc_size += len(doc)
average_doc_size /= len(idx_corpus)
average_doc_size = math.ceil(average_doc_size)
average_doc_size

df = dictionary.dfs
filtered_corpus = []
for doc in idx_corpus:
    new_doc = [(word, df[dictionary.token2id[word]]) for word in doc]
    new_doc.sort(reverse=True, key=lambda x: x[1])
    new_doc = new_doc[:average_doc_size]
    filtered_corpus.append([word for word, df in new_doc])

### SDC

In [6]:
def get_bow(corpus):
    new_dict = Dictionary(corpus)

    # new_dict.filter_extremes(no_below=2, no_above=1, keep_n=len(new_dict))
    # print(len(new_dict))

    unique_words = [new_dict[id] for id in range(len(new_dict))]
    array = numpy.zeros((len_corpus, len(unique_words)), dtype=float)
    
    for i, doc in enumerate(corpus):
        for word in doc:
            array[i, new_dict.token2id[word]] += 1

        ## normalization
        if len(doc) != 0:
            array[i] = numpy.divide(array[i], len(doc))

    return pandas.DataFrame(array, columns=unique_words, dtype=float)

In [7]:
def sdc(bow_corpus, min_samples, eps):
    delta_eps = eps / 10
    labels = [-1 for i in range(len(bow_corpus))]
    sims = cosine_similarity(bow_corpus)
    
    points = [i for i in range(len(bow_corpus))]
    cluster_num = 0
    while len(points) > 0:
        seed = random.choice(points)
        eps_neighbors = [i for i, sim in enumerate(sims[seed]) if sim >= eps]
        if len(eps_neighbors) >= min_samples:
            cluster_num += 1
            for p in eps_neighbors:
                labels[p] = cluster_num
            points = [i for i in points if i not in eps_neighbors]
        else:
            labels[seed] = 0
            points.remove(seed)
    
    while cluster_num != 0:
        cluster = [numpy.array(bow_corpus.iloc[i]) for i, label in enumerate(labels) if label == cluster_num]
        eps_temp = eps
        
        while True:
            centroid = numpy.mean(cluster, axis=0).reshape(1, -1)
            eps_temp -= delta_eps
            
            count = 0
            for i, label in enumerate(labels):
                point = numpy.array(bow_corpus.iloc[i]).reshape(1, -1)
                if label == 0 and cosine_similarity(centroid, point) >= eps_temp:
                    cluster.append(point[0])
                    labels[i] = cluster_num
                    count += 1
            if count == 0:
                break
        
        cluster_num -= 1
    
    return labels

In [8]:
def predict_cluster(bow_corpus, min_samples, eps):
    predicted_labels = sdc(bow_corpus, min_samples, eps)

    result = pandas.DataFrame()
    result['comment'] = corpus
    result['tokenized_comment'] = idx_corpus
    result['label'] = labels
    result['predicted_label'] = predicted_labels
    
    return result

In [9]:
# bow_corpus = get_bow(idx_corpus)
bow_corpus = get_bow(filtered_corpus)
result = predict_cluster(bow_corpus, 5, 0.2)
bow_corpus.head()

Unnamed: 0,กาแฟ,ชง,ร้าน,สวย,ทรูมูฟ,ลูกค้า,สิทธิ์,เต็ม,โปร,รสชาติ,...,แถว,ชัด,บริการ,โปรโมชั่น,ชม,เวลา,ชาไทย,หวัง,เข้มข้น,ดืม
0,0.2,0.2,0.2,0.4,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.2,0.2,0.2,0.2,0.2,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.4,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Result

In [10]:
label_count = numpy.unique(result['predicted_label'], return_counts=True) 
num_clusters = len(label_count[0])
print(label_count)

for cluster in clusters:
    print('\t' + cluster, end='')
print('\tpercent')

for label in range(len(clusters)):
    print(str(label) + '  |', end='')
    
    num_max = 0
    for cluster in clusters:
        loc = result[(result['label'] == cluster) & (result['predicted_label'] == label)]
        if len(loc) > num_max:
            num_max = len(loc)
        print('\t' + str(len(loc)), end='')
    
    print('\t' + str(num_max / label_count[1][label]))

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33]), array([22, 11,  5,  1,  3, 22, 23,  4, 16,  5, 14, 24,  4,  6,  2,  6,  4,
        4,  6,  7,  6,  8, 14, 14,  7,  5,  7,  9,  5, 30,  5, 26, 21,  7]))
		percent
0  |	22	1.0


In [11]:
comment_widget = widgets.ToggleButtons(
    options=[num + 1 for num in range(num_clusters)],
    disabled=False,
    button_style='',
)

def on_comment_widget_click(change):
    clear_output()
    display(comment_widget)
    for index, value in result[result['predicted_label'] == change['new']]['comment'].iteritems():
        print(index, value)

comment_widget.observe(on_comment_widget_click, names='index')
on_comment_widget_click({'new' : 0})

ToggleButtons(options=(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, …

107 ทุกสูตรที่มี
111 เนทกาก ฝนตกปรอยๆทีวีบ้านกุดูไม่ได้ละ  นะเห้ย
124 แพงเกอน แต่รสชาติเอิ่มมมม
125 คุณเมิงจะครอบคลุมทุกอย่างเลยแม่นบ่ 555
128 ความเป็นออริจินอล
133 พนงหน้าตูด สาขาเซ้นทรัลพระราม2
139 เครือข่าย ละกัน 555
151 จางมากกกกก ไม่โอเครเลย
155 ไม่เคยทาน
164 อันนี้ก็ไม่ต่างจากสตาร์บั๊ค
181 ไม่ค่อยมีคลื่น
208 ลาเต้เย็นควรใส่นมเยอะๆนะค่ะ นี่ขมอย่ากะอเมริกาโน่
228 แพงเกิ้นนน
242 เมื่อไหร่จะมี
259 รสชาดห่วยสุดในสามโลก
265 ครั้งเดียวก็เกินจะพอ
266 อันนี้มาแนวเดียวกะแมคคาเฟ่
293 โลโก้
306 กระจอกมาก
319 สาขาน้อย
331 มาอ่านคอมเม้น 5555555555555555
349 หมาไม่ดืมว่ะ


In [12]:
token_widget = widgets.ToggleButtons(
    options=[num + 1 for num in range(num_clusters)],
    disabled=False,
    button_style='',
)

def on_token_widget_click(change):
    clear_output()
    display(token_widget)
    for index, value in result[result['predicted_label'] == change['new']]['tokenized_comment'].iteritems():
        print(index, value)

token_widget.observe(on_token_widget_click, names='index')
on_token_widget_click({'new' : 0})

ToggleButtons(options=(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, …

107 []
111 []
124 ['รส', 'ชาติ']
125 []
128 []
133 ['พนง']
139 []
151 ['จาง']
155 ['ทาน']
164 []
181 []
208 ['เย็น', 'ใส่', 'นม', 'ขม']
228 []
242 ['ไหร่']
259 ['รส']
265 []
266 []
293 []
306 []
319 []
331 []
349 ['ดืม']
