In [1]:
import math

import numpy
import pandas

import ipywidgets as widgets
from IPython.display import display, clear_output

import gensim
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

import pythainlp

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans

from data_tokenizer import load_corpus

from model.upgrade_sdc import UpgradeSDC
from model.sdc import SDC

### Load Data

In [2]:
file_name = '‡∏ú‡∏π‡πâ‡∏ö‡∏£‡∏¥‡πÇ‡∏†‡∏Ñ - TescoLotus.txt'

# load_corpus returns the documents together with one label per document
# (both are used as DataFrame columns further down).
corpus, labels = load_corpus('../data/facebook/' + file_name)

len_corpus = len(corpus)
print('Total documents', len_corpus)

clusters = list(set(labels))
print(len(clusters), 'clusters')

# The tokenized corpus is stored as a Python expression that is evaluated here.
# The context manager closes the handle even if reading or evaluating fails.
# NOTE(review): eval() executes whatever the file contains; if the file only
# ever holds a plain list-of-lists literal, ast.literal_eval is the safer parser.
with open('../data/facebook/tokenized/tokenized_' + file_name) as f:
    tokenized_corpus = eval(f.read())

Total documents 268
1 clusters


### Preprocess Corpus

#### Remove Words

In [3]:
dictionary = Dictionary(tokenized_corpus)
print('origin:', len(dictionary), 'words')

# Keep words that appear in at least 2 documents and in at most 70% of them;
# keep_n is set to the current size so no additional size cap is applied.
dictionary.filter_extremes(no_below=2, no_above=0.7, keep_n=len(dictionary))
print('filter frequent words:', len(dictionary), 'words')

# Drop tokens that are at most one character long.
letter_words = [
    token_id for token, token_id in dictionary.token2id.items() if len(token) <= 1
]
dictionary.filter_tokens(bad_ids=letter_words)
print('filter letter words:', len(dictionary), 'words')

stopwords = pythainlp.corpus.stopwords.words('thai')
stopwords.append('‡∏ô‡∏µ‡πâ')
# Only stop words that are actually in the vocabulary need an id. Looking them
# up directly avoids adding a fake "stop-word document" to the dictionary,
# which would bump num_docs and the document frequencies just to be able to
# resolve ids that are removed again on the next line.
stopword_ids = [
    dictionary.token2id[word] for word in stopwords if word in dictionary.token2id
]
dictionary.filter_tokens(bad_ids=stopword_ids)
print('filter stop words:', len(dictionary), 'words')

origin: 1449 words
filter frequent words: 605 words
filter letter words: 604 words
filter stop words: 403 words


In [4]:
# Keep only the tokens that survived the dictionary filtering.
# doc2idx maps every token to its dictionary id and uses -1 for tokens that are
# not in the dictionary; the surviving ids are translated straight back to
# tokens, so despite its name idx_corpus holds tokens, not ids.
idx_corpus = [
    [dictionary[token_id] for token_id in dictionary.doc2idx(doc) if token_id >= 0]
    for doc in tokenized_corpus
]

#### Dimension Reduction

In [5]:
# Mean number of tokens per preprocessed document, rounded up. The value is
# used below as the maximum number of tokens kept per document.
total_tokens = sum(len(doc) for doc in idx_corpus)
average_doc_size = math.ceil(total_tokens / len(idx_corpus))

# Document frequency of every dictionary id.
df = dictionary.dfs

# For each document keep the average_doc_size tokens with the highest document
# frequency. The sort is stable, so tokens with equal frequency keep their
# original relative order.
filtered_corpus = []
for doc in idx_corpus:
    ranked = sorted(doc, key=lambda word: df[dictionary.token2id[word]], reverse=True)
    filtered_corpus.append(ranked[:average_doc_size])

#### Doc2Vec

In [6]:
# Tag every preprocessed document with its position in the corpus.
tagged_corpus = [
    TaggedDocument(words, [position]) for position, words in enumerate(idx_corpus)
]

# The embedding size follows the average document length computed earlier.
doc2vec_params = dict(vector_size=average_doc_size, window=4, min_count=2, epochs=100)
model = Doc2Vec(tagged_corpus, **doc2vec_params)
# NOTE(review): delete_temporary_training_data may not exist in every gensim
# release - confirm against the gensim version this notebook is pinned to.
model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

# One inferred vector per document, one row per document in the frame.
paragraph_vectors = pandas.DataFrame(
    [model.infer_vector(words) for words in idx_corpus],
    dtype=float,
)

### Clustering

In [7]:
def get_onehot(corpus, weight):
    """Build a dense document-term matrix for a tokenized corpus.

    Parameters
    ----------
    corpus : list of list of str
        Tokenized documents; the vocabulary is built from this corpus only.
    weight : {'normal', 'tfidf'}
        'normal' stores every term count divided by the number of distinct
        terms of the document; 'tfidf' stores the gensim TF-IDF weights
        computed with ``smartirs='ltc'``.

    Returns
    -------
    pandas.DataFrame
        One row per document and one column per vocabulary word (float).

    Raises
    ------
    ValueError
        If ``weight`` is not one of the supported schemes.
    """
    # Fail fast: an unknown scheme used to surface later as an unbound local
    # variable, after the vocabulary had already been built.
    if weight not in ('normal', 'tfidf'):
        raise ValueError(
            "weight must be 'normal' or 'tfidf', got {!r}".format(weight)
        )

    dictionary = Dictionary(corpus)
    bow_corpus = [dictionary.doc2bow(doc) for doc in corpus]
    if weight == 'tfidf':
        tfidf = TfidfModel(bow_corpus, smartirs='ltc')
        weight_corpus = [tfidf[doc] for doc in bow_corpus]
    else:
        weight_corpus = bow_corpus

    unique_words = [dictionary[token_id] for token_id in range(len(dictionary))]
    array = numpy.zeros((len(corpus), len(unique_words)), dtype=float)
    for row, doc in enumerate(weight_corpus):
        for token_id, score in doc:
            array[row, token_id] = score

        # doc is a bag-of-words list, so len(doc) is the number of distinct
        # terms in the document, not its token count.
        if weight == 'normal' and len(doc) != 0:
            array[row] = numpy.divide(array[row], len(doc))

    return pandas.DataFrame(array, columns=unique_words, dtype=float)

In [8]:
def generate_result(predicted_labels):
    """Assemble the per-document result table for one clustering run.

    The raw comments, filtered tokens and reference labels are read from the
    notebook globals ``corpus``, ``filtered_corpus`` and ``labels``;
    ``predicted_labels`` supplies the ``predicted_label`` column.

    Returns a pandas.DataFrame with the columns ``comment``,
    ``tokenized_comment``, ``label`` and ``predicted_label`` (in that order).
    """
    columns = (
        ('comment', corpus),
        ('tokenized_comment', filtered_corpus),
        ('label', labels),
        ('predicted_label', predicted_labels),
    )
    frame = pandas.DataFrame()
    # Columns are attached one at a time, in the fixed order listed above.
    for name, values in columns:
        frame[name] = values
    return frame

In [9]:
def eval_cluster(onehot_corpus, result):
    """Score a clustering by the similarity of documents to their centroid.

    Parameters
    ----------
    onehot_corpus : pandas.DataFrame
        Document vectors, one row per document. Rows are addressed
        positionally with the index values of ``result``.
    result : pandas.DataFrame
        Must contain a ``predicted_label`` column. Labels are used as list
        indices, so they are expected to be non-negative integers.

    Returns
    -------
    compactness : float
        Sum, over all documents, of the cosine similarity between the
        document and the centroid of its cluster (0 for empty input).
    centroids : list of numpy.ndarray
        Mean vector of every non-empty cluster in ascending label order.
        Clusters without members are skipped, so a list position equals the
        label only when every label between 0 and the maximum is populated.
    """
    predicted = result['predicted_label']
    if len(predicted) == 0:
        # Nothing to score; avoids indexing into an empty label array.
        return 0, []

    # numpy.unique returns sorted labels, so the last one is the largest.
    num_cluster = numpy.unique(predicted)[-1] + 1

    clusters = [[] for _ in range(num_cluster)]
    # Series.items() replaces iteritems(), which no longer exists in pandas 2.
    for i, label in predicted.items():
        clusters[label].append(numpy.array(onehot_corpus.iloc[i]))

    compactness = 0
    centroids = []
    for members in clusters:
        if not members:
            continue
        centroid = numpy.mean(members, axis=0)
        centroids.append(centroid)
        similarities = cosine_similarity(centroid.reshape(1, -1), members)
        compactness += numpy.sum(similarities)
    return compactness, centroids

In [10]:
min_samples = 7
eps = 0.32
epoch = 10

# onehot_corpus = get_onehot(idx_corpus, 'normal')
onehot_corpus = get_onehot(filtered_corpus, 'tfidf')

# Repeat the clustering `epoch` times and keep the run with the highest
# compactness. Starting below every reachable score guarantees that the first
# run is always recorded; with a start value of 0, a run scoring exactly 0
# would leave predicted_labels / result undefined and the summary below would
# fail with NameError.
max_compactness = -math.inf
for run in range(epoch):
    model = UpgradeSDC()
    _tpredicted_labels, _tmarks = model.predict(onehot_corpus, min_samples, eps)

    # Alternative clusterers kept for reference:
#     _tmarks = None
#     model = DBSCAN(metric='cosine', eps=eps, min_samples=min_samples).fit(onehot_corpus)
#     _tpredicted_labels = model.labels_ + 1

#     model = KMeans(n_clusters=7).fit(onehot_corpus)
#     _tpredicted_labels = model.labels_

    _tresult = generate_result(_tpredicted_labels)
    compactness, _tcentroids = eval_cluster(onehot_corpus, _tresult)

    if compactness > max_compactness:
        max_compactness = compactness
        predicted_labels = _tpredicted_labels
        marks = _tmarks
        result = _tresult
        centroids = _tcentroids

print(max_compactness)
# (sorted unique labels, members per label) of the best run.
label_count = numpy.unique(result['predicted_label'], return_counts=True)
num_cluster = label_count[0][-1] + 1
print(label_count, '\n')

107.12412250399923
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13]), array([117,   8,  11,   9,  20,  14,  10,   8,   7,  26,   8,  10,  12,
         8])) 



In [11]:
# Pairwise cosine similarity between the centroids of the best run.
sims = cosine_similarity(centroids)
new_labels = list(range(num_cluster))
print(new_labels)

# Clusters are merged when their centroids are at least this similar.
merge_threshold = eps - eps / 20

# Walk the lower triangle from the last pair to the first; whenever a pair is
# similar enough, everything currently carrying the label of cluster i is
# relabelled with the current label of cluster j.
for i in range(len(sims) - 1, -1, -1):
    for j in range(i - 1, -1, -1):
        value = sims[i][j]
        if value >= merge_threshold:
            print(i, j, value)
            source, target = new_labels[i], new_labels[j]
            new_labels = [target if label == source else label for label in new_labels]
print(new_labels)

# Translate every document's predicted label into its merged label.
grouped_labels = numpy.zeros(len_corpus)
for position, label in enumerate(predicted_labels):
    grouped_labels[position] = new_labels[label]
new_result = generate_result(grouped_labels)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
13 8 0.4810923160872279
11 4 0.5034346049453596
11 1 0.3414442782023652
11 0 0.4459732482323
10 2 0.48483122344088264
5 4 0.31819192790351114
4 2 0.3704082154205951
4 1 0.5261339400224656
4 0 0.4531235340053271
1 0 0.3371756929043165
[2, 2, 2, 3, 2, 2, 6, 7, 8, 9, 2, 2, 12, 8]


### Result

In [12]:
class Widget:
    """Toggle-button browser that prints the rows of one predicted cluster.

    One button is created per distinct value of ``result['predicted_label']``.
    Selecting a button clears the cell output, redraws the buttons and prints
    ``index value`` for every row of the selected cluster.
    """

    def __init__(self, result, marks, column_name):
        """Create the buttons and show the first cluster.

        Parameters
        ----------
        result : pandas.DataFrame
            Must contain ``predicted_label`` and ``column_name`` columns.
        marks : sequence or None
            When truthy, ``marks[0]`` and ``marks[1]`` are collections of row
            indices that get a ``@`` or ``*`` prefix respectively.
            NOTE(review): the meaning of the two groups comes from the
            clustering model and is not visible here - confirm.
        column_name : str
            Column of ``result`` whose values are printed.
        """
        self.result = result
        self.column_name = column_name
        self.marks = marks

        label_count = numpy.unique(result['predicted_label'])
        self.widget = widgets.ToggleButtons(
            # Plain ints so float-typed label columns still give clean buttons.
            options=[int(num) for num in label_count],
            disabled=False,
            button_style='',
        )

        self.widget.observe(self.on_click, names='index')
        # Render the first option immediately.
        self.on_click({'new': 0})

    def on_click(self, change):
        """Redraw the output for the option at position ``change['new']``."""
        clear_output()
        display(self.widget)
        selected = self.widget.options[change['new']]
        rows = self.result.loc[
            self.result['predicted_label'] == selected, self.column_name
        ]
        # Series.items() replaces iteritems(), which no longer exists in pandas 2.
        for index, value in rows.items():
            if self.marks:
                if index in self.marks[0]:
                    print("@", end="")
                elif index in self.marks[1]:
                    print("*", end="")
            print(index, value)

In [13]:
# Browse the raw comments grouped by the merged cluster labels (new_result).
w1 = Widget(new_result, marks, 'comment')

ToggleButtons(options=(2, 3, 6, 7, 8, 9, 12), value=2)

*0 ‡πÇ‡∏•‡∏ï‡∏±‡∏™‡πÑ‡∏Æ‡πÄ‡∏õ‡∏≠‡∏£‡πå‡∏°‡∏≤‡πÄ‡∏Å‡πá‡∏ï‚Äã ‡∏ó‡∏∏‡∏Å‡∏™‡∏≤‡∏Ç‡∏≤‚Äã ‡∏Ñ‡∏ß‡∏£‡∏à‡πâ‡∏≤‡∏á‡∏û‡∏ô‡∏±‡∏Å‡∏á‡∏≤‡∏ô‡πÄ‡∏¢‡∏≠‡∏∞‡∏Å‡∏ß‡πà‡∏≤‡∏ô‡∏µ‡πâ‚Äã ‡πÑ‡∏°‡πà‡∏Ñ‡∏ß‡∏£‡πÄ‡∏£‡∏µ‡∏¢‡∏Å‡∏û‡∏ô‡∏±‡∏Å‡∏á‡∏≤‡∏ô‡πÅ‡∏ú‡∏ô‡∏Å‡∏≠‡∏∑‡πà‡∏ô‡πÑ‡∏õ‡∏•‡∏á‡πÄ‡∏Ñ‡∏£‡∏∑‡πà‡∏≠‡∏á‡πÅ‡∏Ñ‡∏ä‡πÄ‡∏ä‡∏µ‡∏¢‡∏£‡πå‚Äã‡∏ö‡πà‡∏≠‡∏¢‡∏à‡∏ô‡πÄ‡∏Å‡∏¥‡∏ô‡πÑ‡∏õ‚Äã ‡∏ó‡∏∏‡∏Å‡∏Ñ‡∏ô‡∏ï‡πâ‡∏≠‡∏á‡∏°‡∏µ‡∏´‡∏ô‡πâ‡∏≤‡∏ó‡∏µ‡πà‡∏ó‡∏µ‡πà‡∏ï‡∏±‡∏ß‡πÄ‡∏≠‡∏á‡∏ï‡πâ‡∏≠‡∏á‡∏£‡∏±‡∏ö‡∏ú‡∏¥‡∏î‡∏ä‡∏≠‡∏ö‚Äã ‡∏û‡∏≠‡∏ä‡πà‡∏ß‡∏¢‡πÅ‡∏Ñ‡∏ä‡πÄ‡∏ä‡∏µ‡∏¢‡∏£‡πå‡πÄ‡∏™‡∏£‡πá‡∏à‚Äã ‡∏á‡∏≤‡∏ô‡πÅ‡∏ú‡∏ô‡∏Å‡∏ï‡∏±‡∏ß‡πÄ‡∏≠‡∏á‡πÄ‡∏•‡∏∞‡∏Å‡πá‡πÇ‡∏î‡∏ô‡∏î‡πà‡∏≤‚Äã ‡∏û‡∏≠‡πÇ‡∏î‡∏ô‡∏î‡πà‡∏≤‡∏Å‡πá‡πÄ‡∏Å‡∏¥‡∏î‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏Ñ‡∏£‡∏µ‡∏¢‡∏î‚Äã ‡∏´‡∏ô‡πâ‡∏≤‡∏ö‡∏∂‡πâ‡∏á‡∏ï‡∏∂‡∏á‡πÉ‡∏™‡πà‡∏•‡∏π‡∏Å‡∏Ñ‡πâ‡∏≤‚Äã ‡∏ó‡∏≥‡∏á‡∏≤‡∏ô‡πÑ‡∏°‡πà‡∏°‡∏µ‡∏Ñ‡∏ß‡∏≤‡∏°‡∏™‡∏∏‡∏Ç‚Äã ‡∏ö‡∏£‡∏¥‡∏Å‡∏≤‡∏£‡∏•‡∏π‡∏Å‡∏Ñ‡πâ‡∏≤‡πÑ‡∏°‡πà‡∏î‡∏µ‚Äã ‡πÅ‡∏•‡πâ‡∏ß‡∏•‡∏π‡∏Å‡∏Ñ‡πâ‡∏≤‡∏Å‡πá‡∏£‡πâ‡∏≠‡∏á‡πÄ‡∏£‡∏µ‡∏¢‡∏ô‚Äã ‡πÄ‡∏û‡∏£‡∏≤‡∏∞‡∏£‡∏±‡∏Å‡∏à‡∏∂‡∏á‡∏≠‡∏¢‡∏≤‡∏Å‡πÅ‡∏ô‡∏∞‡∏ô‡∏≥‚Äã ‡∏à‡∏≤‡∏Å

*65 ‡∏û‡∏ô‡∏±‡∏Å‡∏á‡∏≤‡∏ô‡∏≠‡∏¢‡πà‡∏≤‡∏á‡πÄ‡∏î‡∏µ‡∏¢‡∏ß‡πÄ‡∏•‡∏¢ ‡πÑ‡∏õ‡∏Ç‡∏≠‡πÉ‡∏ö‡∏Å‡∏≥‡∏Å‡∏±‡∏ö‡∏†‡∏≤‡∏©‡∏µ ‡∏ó‡∏≥‡∏´‡∏ô‡πâ‡∏≤‡πÅ‡∏ö‡∏ö‡∏ú‡∏±‡∏ß‡πÑ‡∏°‡πà‡πÄ‡∏¢‡∏°‡∏™‡∏ä‡∏≤‡∏ï‡∏¥‡∏Å‡∏ß‡πà‡∏≤ ‡∏û‡∏π‡∏î‡∏Å‡πá‡πÑ‡∏°‡πà‡∏î‡∏µ ‡∏≠‡∏¢‡∏≤‡∏Å‡πÄ‡∏≠‡∏≤‡∏¢‡∏≤‡∏á‡πÑ‡∏õ‡∏î‡∏µ‡∏î‡∏´‡∏µ‡πÄ‡∏£‡∏µ‡∏¢‡∏á‡∏ï‡∏±‡∏ß ‡∏°‡∏∂‡∏á‡∏ó‡∏≥‡∏á‡∏≤‡∏ô‡∏ö‡∏£‡∏¥‡∏Å‡∏≤‡∏£ ‡∏°‡∏∂‡∏á‡∏°‡∏µ‡∏™‡∏¥‡∏ó‡∏ò‡∏¥‡πå‡∏≠‡∏∞‡πÑ‡∏£‡∏°‡∏≤‡∏á‡∏µ‡πà‡πÄ‡∏á‡πà‡∏≤‡πÉ‡∏™‡πà‡∏•‡∏π‡∏Å‡∏Ñ‡πâ‡∏≤‡∏≠‡πà‡∏∞ ‡∏ö‡πâ‡∏≤‡∏ö‡∏≠‡∏°‡∏≤‡∏Å
66 ‡πÄ‡∏Å‡∏∑‡∏≠‡∏ö‡∏ó‡∏∏‡∏Å‡∏™‡∏≤‡∏Ç‡∏≤ ‡πÄ‡∏ß‡∏•‡∏≤‡∏°‡∏µ‡πÇ‡∏õ‡∏£‡πÇ‡∏°‡∏ä‡∏±‡πà‡∏ô‡∏≠‡∏∞‡πÑ‡∏£ ‡πÑ‡∏°‡πà‡∏£‡∏π‡πâ ‡∏°‡∏≤‡∏ñ‡∏≤‡∏°‡∏•‡∏π‡∏Å‡∏Ñ‡πâ‡∏≤ ‡∏ß‡πà‡∏≤‡∏•‡∏î‡∏°‡∏±‡πâ‡∏¢ ‡∏•‡∏î‡∏î‡πâ‡∏ß‡∏¢‡πÄ‡∏´‡∏£‡∏≠ ‡πÄ‡∏≠‡∏¥‡πà‡∏°... ‡πÅ‡∏•‡πâ‡∏ß‡∏ï‡∏π‡∏à‡∏∞‡∏£‡∏π‡πâ‡∏°‡∏±‡πâ‡∏¢‡∏≠‡πà‡∏∞ ‡∏ö‡∏≤‡∏á‡∏ó‡∏µ‡∏Å‡πá‡∏á‡∏á‡πÜ ‡∏Å‡∏∞‡∏û‡∏ô‡∏±‡∏Å‡∏á‡∏≤‡∏ô
67 ‡πÇ‡∏•‡∏ï‡∏±‡∏™‡πÄ‡∏≠‡πá‡∏Å‡πÄ‡∏û‡∏£‡∏™‡πÅ‡∏¢‡πà‡∏´‡∏ô‡πâ‡∏≤‡∏¢‡∏±‡∏á‡∏Å‡∏∞‡∏ú‡∏±‡∏ß‡∏ó‡∏¥‡πâ‡∏á. ‡∏ô‡πà‡∏≤‡∏à‡∏±‡∏ö‡∏≠‡∏ö‡∏£‡∏°‡πÉ‡∏´‡πâ‡∏î‡∏µ‡∏Å‡∏ß‡πà‡∏≤‡∏ô‡∏µ‡πâ
69 ‡∏´‡∏°‡∏π

In [14]:
# Browse the raw comments grouped by the labels of the best run, before merging.
w2 = Widget(result, marks, 'comment')

ToggleButtons(options=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13), value=0)

2 ‡πÄ‡∏•‡∏¥‡∏Å‡πÄ‡∏õ‡∏¥‡∏î‡πÄ‡∏û‡∏•‡∏á ‡∏Ç‡πâ‡∏≤‡∏ß‡πÅ‡∏™‡∏ô‡∏î‡∏µ ‡∏Å‡∏±‡∏ö‡∏≠‡∏µ‡πÄ‡∏Ñ‡∏£‡∏∑‡πà‡∏≠‡∏á‡∏Å‡∏£‡∏≠‡∏á‡∏ô‡πâ‡∏≥‡πÄ‡∏û‡∏µ‡∏¢‡∏ß ‡πÑ‡∏î‡πâ‡πÅ‡∏•‡πâ‡∏ß!!!!
8 ‡∏≠‡∏¢‡πà‡∏≤‡∏ö‡∏±‡∏á‡∏Ñ‡∏±‡∏ö‡∏ô‡πâ‡∏≠‡∏á‡πÄ‡∏Ç‡πâ‡∏≤‡∏õ‡∏£‡∏∞‡∏ä‡∏∏‡∏°‡πÄ‡∏ä‡∏µ‡∏¢‡∏£‡πå ‡∏≠‡∏¢‡πà‡∏≤‡∏•‡∏á‡πÇ‡∏ó‡∏©‡πÇ‡∏î‡∏¢‡πÄ‡∏´‡∏ï‡∏∏‡∏ú‡∏•‡∏á‡∏µ‡πà‡πÄ‡∏á‡πà‡∏≤‡πÜ ‡πÄ‡∏Å‡∏¥‡∏î‡∏Å‡πà‡∏≠‡∏ô‡πÑ‡∏°‡πà‡∏Å‡∏µ‡πà‡∏õ‡∏µ‡πÄ‡∏≠‡∏á
9 ‡πÇ‡∏•‡∏ï‡∏±‡∏™‡πÉ‡∏´‡∏ç‡πà‡πÑ‡∏°‡πà‡∏°‡∏µ‡∏≠‡∏∞‡πÑ‡∏£‡∏ô‡∏∞ ‡πÇ‡∏•‡∏ï‡∏±‡∏™‡πÄ‡∏≠‡∏Å‡πÄ‡∏û‡∏£‡∏™‡πÅ‡∏°‡πà‡∏á‡πÑ‡∏°‡πà‡πÑ‡∏´‡∏ß‡∏à‡∏£‡∏¥‡∏á‡πÜ ‡πÄ‡∏Å‡∏•‡∏µ‡∏¢‡∏î‡∏Ç‡∏µ‡πâ‡∏´‡∏ô‡πâ‡∏≤‡∏û‡∏ô‡∏±‡∏Å‡∏á‡∏≤‡∏ô
14 ‡πÄ‡∏•‡∏∑‡∏≠‡∏Å‡∏ú‡∏±‡∏Å ‡πÄ‡∏•‡∏∑‡∏≠‡∏Å‡πÅ‡∏•‡πâ‡∏ß ‡πÄ‡∏•‡∏∑‡∏≠‡∏Å‡∏≠‡∏µ‡∏Å ‡∏û‡∏≠‡∏°‡∏≤‡∏Ñ‡∏¥‡∏î‡πÄ‡∏á‡∏¥‡∏ô‡∏û‡∏ô‡∏±‡∏Å‡∏á‡∏≤‡∏ô‡∏¢‡∏±‡∏î‡∏ú‡∏±‡∏Å‡πÉ‡∏™‡πà‡∏ñ‡∏∏‡∏á‡∏à‡∏ô‡∏ú‡∏±‡∏Å‡∏´‡∏±‡∏Å ‡πÇ‡∏≠‡πâ‡∏¢‡πÉ‡∏à ‡πÉ‡∏à‡∏™‡∏•‡∏≤‡∏¢
15 ‡∏°‡∏≤‡∏£‡∏¢‡∏≤‡∏ó‡∏û‡∏ô‡∏±‡∏Å‡∏á‡∏≤‡∏ô ‡πÄ‡∏ã‡πÄ‡∏ß‡πà‡∏ô‡∏¢‡∏±‡∏á‡∏Ñ‡∏∏‡∏ì‡∏†‡∏≤‡∏û‡∏î‡∏µ‡∏Å‡∏ß‡πà‡∏≤‡∏´‡∏ô‡πà‡∏≠‡∏¢‡∏ô‡∏∂‡∏á ‡πÅ‡∏ö‡∏ö‡∏ß‡πà‡∏≤‡πÄ‡∏´‡∏µ‡πâ‡∏¢

In [15]:
# Same grouping as above, but showing the filtered tokens of each comment.
w3 = Widget(result, marks, 'tokenized_comment')

ToggleButtons(options=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13), value=0)

2 ['‡∏î‡∏µ', '‡πÄ‡∏•‡∏¥‡∏Å', '‡πÄ‡∏Ñ‡∏£‡∏∑‡πà‡∏≠‡∏á', '‡∏ô‡πâ‡∏≥', '‡∏Ç‡πâ‡∏≤‡∏ß', '‡∏≠‡∏µ']
8 ['‡∏≠‡∏¢‡πà‡∏≤', '‡∏≠‡∏¢‡πà‡∏≤', '‡∏Å‡∏µ‡πà', '‡∏õ‡∏µ', '‡∏ö‡∏±‡∏á‡∏Ñ‡∏±‡∏ö', '‡∏ô‡πâ‡∏≠‡∏á', '‡πÄ‡∏ä‡∏µ‡∏¢‡∏£‡πå', '‡∏á‡∏µ‡πà', '‡πÄ‡∏á‡πà‡∏≤']
9 ['‡∏û‡∏ô‡∏±‡∏Å‡∏á‡∏≤‡∏ô', '‡πÇ‡∏•‡∏ï‡∏±‡∏™', '‡πÇ‡∏•‡∏ï‡∏±‡∏™', '‡πÅ‡∏°‡πà‡∏á', '‡πÄ‡∏Å‡∏•‡∏µ‡∏¢‡∏î', '‡πÄ‡∏≠‡∏Å‡πÄ‡∏û‡∏£‡∏™']
14 ['‡∏û‡∏ô‡∏±‡∏Å‡∏á‡∏≤‡∏ô', '‡πÄ‡∏á‡∏¥‡∏ô', '‡∏ñ‡∏∏‡∏á', '‡πÉ‡∏™‡πà', '‡∏ú‡∏±‡∏Å', '‡∏ú‡∏±‡∏Å', '‡∏ú‡∏±‡∏Å', '‡πÄ‡∏•‡∏∑‡∏≠‡∏Å', '‡πÄ‡∏•‡∏∑‡∏≠‡∏Å', '‡πÄ‡∏•‡∏∑‡∏≠‡∏Å', '‡∏¢‡∏±‡∏î']
15 ['‡∏û‡∏ô‡∏±‡∏Å‡∏á‡∏≤‡∏ô', '‡∏î‡∏µ', '‡∏°‡∏≤‡∏£‡∏¢‡∏≤‡∏ó', '‡∏ô‡∏∂‡∏á', '‡πÄ‡∏ã‡πÄ‡∏ß‡πà‡∏ô', '‡∏Ñ‡∏∏‡∏ì‡∏†‡∏≤‡∏û', '‡πÄ‡∏´‡∏µ‡πâ‡∏¢', '‡∏Ñ‡∏π‡πà']
19 ['‡∏û‡∏ô‡∏±‡∏Å‡∏á‡∏≤‡∏ô', '‡∏•‡∏π‡∏Å‡∏Ñ‡πâ‡∏≤', '‡∏´‡∏ô‡πâ‡∏≤', '‡πÇ‡∏î‡∏ô', '‡∏ä‡∏≠‡∏ö', '‡∏ö‡∏π‡∏î', '‡πÄ‡∏ä‡πâ‡∏≤', '‡πÄ‡∏´‡πá‡∏ô‡πÉ‡∏à', '‡∏Æ‡∏∞']
23 ['‡∏´‡∏ô‡πâ‡∏≤', '‡∏´‡∏ô‡πâ‡∏≤', '‡∏Å‡∏≤‡∏£‡πå‡∏î', '‡∏Ñ‡∏•‡∏±‡∏ö', '‡πÄ‡∏ä‡πá‡∏Ñ', '‡∏á‡∏≠', '‡∏™‡∏ß‡∏±‡∏™‡∏î‡∏µ', '‡∏ï‡∏≤‡∏¢', '‡πÇ‡∏≠‡∏Å‡∏≤‡∏

In [63]:
# Spot check: cosine similarity between two documents of the weighted corpus,
# followed by the filtered tokens of each document.
seed, compare = 4, 19

# iloc with a one-element list keeps the row two-dimensional (shape (1, n)).
a, b = (
    onehot_corpus.iloc[[position]].to_numpy()
    for position in (seed, compare)
)
print(cosine_similarity(a, b))
for position in (seed, compare):
    print(filtered_corpus[position])

# print(sims[10])

[[0.00923844]]
['‡∏û‡∏ô‡∏±‡∏Å‡∏á‡∏≤‡∏ô', '‡∏™‡∏≤‡∏Ç‡∏≤', '‡∏ó‡∏≥', '‡∏ó‡∏≥', '‡∏ó‡∏≥', '‡∏Ñ‡∏ô', '‡∏á‡∏≤‡∏ô', '‡∏á‡∏≤‡∏ô', '‡∏õ‡∏£‡∏±‡∏ö‡∏õ‡∏£‡∏∏‡∏á', '‡πÄ‡∏ï‡πá‡∏°‡πÉ‡∏à', '‡πÇ‡∏•‡∏ï‡∏±‡∏™‡πÄ‡∏≠‡πá‡∏Å‡∏ã‡πå']
['‡∏û‡∏ô‡∏±‡∏Å‡∏á‡∏≤‡∏ô', '‡∏•‡∏π‡∏Å‡∏Ñ‡πâ‡∏≤', '‡∏´‡∏ô‡πâ‡∏≤', '‡πÇ‡∏î‡∏ô', '‡∏ä‡∏≠‡∏ö', '‡∏ö‡∏π‡∏î', '‡πÄ‡∏ä‡πâ‡∏≤', '‡πÄ‡∏´‡πá‡∏ô‡πÉ‡∏à', '‡∏Æ‡∏∞']
