# 은전한닢 설치 

In [None]:
!sudo apt-get install curl git
!bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)

Reading package lists... Done
Building dependency tree       
Reading state information... Done
curl is already the newest version (7.58.0-2ubuntu3.14).
git is already the newest version (1:2.17.1-1ubuntu0.8).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'sudo apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 57 not upgraded.
mecab-ko is already installed
mecab-ko-dic is already installed
mecab-python is already installed
Done.


In [None]:
!pip install konlpy



In [None]:
import sys, re, argparse
from konlpy.tag import Okt, Komoran, Mecab, Hannanum, Kkma
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [None]:
def get_tokenizer(tokenizer_name):
    """Return a morphological analyzer instance selected by name.

    Args:
        tokenizer_name: One of "komoran", "okt", "mecab", "hannanum",
            "kkma" or "khaiii". Any other value falls back to Mecab.

    Returns:
        A newly constructed tokenizer object.

    Raises:
        ImportError: If "khaiii" is requested but the ``khaiii`` package
            is not installed.
    """
    if tokenizer_name == "khaiii":
        # KhaiiiApi is not among this notebook's imports, so referencing it
        # directly raised NameError. Import it lazily: the dependency is only
        # needed when this tokenizer is actually requested.
        from khaiii import KhaiiiApi
        return KhaiiiApi()

    factories = {
        "komoran": Komoran,
        "okt": Okt,
        "mecab": Mecab,
        "hannanum": Hannanum,
        "kkma": Kkma,
    }
    # Unknown names fall back to Mecab.
    return factories.get(tokenizer_name, Mecab)()

In [None]:
# Input corpus: one document per line, "title<U+241E>document".
corpus_fname = './lsa_blog/processed_blog.txt'
# Destination file for the LSA document vectors.
output_fname = './lsa_tfidf/lsa_tfidf.vecs'
# Name passed to get_tokenizer() to select the morphological analyzer.
tokenizer_name = 'mecab'

In [None]:
!cp -rf '/content/drive/MyDrive/Colab Notebooks/lsa_blog' ./

In [None]:
!mkdir lsa_tfidf

mkdir: cannot create directory ‘lsa_tfidf’: File exists


# 학습데이터 전처리

In [None]:
# Read the corpus and build three parallel lists: document titles, raw
# document text, and the space-joined nouns extracted from each document.
tokenizer = get_tokenizer(tokenizer_name)
titles, raw_corpus, noun_corpus = [], [], []
skipped_lines = 0
with open(corpus_fname, 'r', encoding='utf-8') as f:
    for line in f:
        try:
            # A line must contain exactly one U+241E separator; anything else
            # raises ValueError on unpacking.
            title, document = line.strip().split("\u241E")
            nouns = tokenizer.nouns(document)
        except Exception:
            # Best-effort: skip lines that cannot be parsed or tokenized.
            # `Exception` (not a bare except) lets KeyboardInterrupt through.
            skipped_lines += 1
            continue
        # Append only after every step succeeded so the three lists always
        # stay index-aligned.
        titles.append(title)
        raw_corpus.append(document)
        noun_corpus.append(' '.join(nouns))
if skipped_lines:
    print("skipped {} malformed line(s)".format(skipped_lines))



# TF-IDF 행렬 구축

In [None]:
# Build the TF-IDF document-term matrix from the noun-only corpus.
# Every document is already a space-joined string of nouns, so the
# tokenizer is a plain whitespace split.
_tfidf_kwargs = dict(
    min_df=1,
    ngram_range=(1, 1),
    lowercase=True,
    tokenizer=lambda x: x.split(),
)
vectorizer = TfidfVectorizer(**_tfidf_kwargs)
input_matrix = vectorizer.fit_transform(noun_corpus)


In [None]:
# Inspect the fitted TF-IDF model.
# Invert the vocabulary (token -> column index) into column index -> token.
id2vocab = {col: token for token, col in vectorizer.vocabulary_.items()}
# curr_doc: TF-IDF vector of the first document in the corpus.
curr_doc = input_matrix[0]
# Pair each non-zero TF-IDF entry of curr_doc with its token.
result = [(id2vocab[idx], el) for idx, el in zip(curr_doc.indices, curr_doc.data)]
# Show the (token, weight) pairs in descending order of weight.
sorted(result, key=lambda pair: pair[1], reverse=True)


[('우도', 0.30935433754247393),
 ('최대', 0.2644197269001561),
 ('모델', 0.21509543930315736),
 ('디언', 0.20954601175351925),
 ('엔트로피', 0.20954601175351925),
 ('트', 0.2020801317026838),
 ('메터', 0.18998546457990625),
 ('파라', 0.18998546457990625),
 ('확률분포', 0.17931834019736734),
 ('디센트', 0.1740779030970291),
 ('값', 0.17266342644215812),
 ('랜덤', 0.16557258530957158),
 ('항', 0.16557258530957158),
 ('미지수', 0.15744508172541233),
 ('극값', 0.14287538240602252),
 ('데이터', 0.14259451611726803),
 ('도함수', 0.1350079326925599),
 ('식', 0.12618739748620753),
 ('추정', 0.12572760705211153),
 ('기대', 0.12268758057783158),
 ('자질', 0.11970371497430202),
 ('위', 0.11943917343447626),
 ('최대우도', 0.11605193539801942),
 ('다음', 0.10923480606337597),
 ('편미분', 0.10496338781694153),
 ('함수', 0.10284706007312461),
 ('크로스', 0.10074771767976153),
 ('하한', 0.09886091972122013),
 ('실제', 0.09729361111030081),
 ('번', 0.09251712088060558),
 ('손실', 0.09099347000775752),
 ('벡터', 0.08942193783459797),
 ('터', 0.08838695358356398),
 ('딥', 0.

# TF-IDF 행렬에 100차원 SVD를 수행  
## 204 * 37143 의 희소행렬을 204 * 100 크기의 밀집행렬로 선형변환한다. 

In [None]:
# Echo the output path before writing to it.
output_fname

'./lsa_tfidf/lsa_tfidf.vecs'

In [None]:
# Compute a 100-component truncated SVD of the TF-IDF matrix and save one
# line per document: title, raw text and the space-separated vector,
# delimited by U+241E.
svd = TruncatedSVD(n_components=100)
vecs = svd.fit_transform(input_matrix)
# Write explicitly as UTF-8: the separator and the documents are non-ASCII,
# and the loader reads this file back with encoding='utf-8'.
with open(output_fname, 'w', encoding='utf-8') as f:
    for title, document, vec in zip(titles, raw_corpus, vecs):
        str_vec = ' '.join(str(el) for el in vec)
        f.write(title + "\u241E" + document + "\u241E" + str_vec + "\n")

In [None]:
from sklearn.preprocessing import normalize
import numpy as np
import random

In [None]:
class LSAEvaluator:
    """Similarity lookup and visualization over saved LSA document vectors."""

    def __init__(self, model_fname="./lsa-tfidf/lsa-tfidf.vecs",
                 use_notebook=False):
        """Load the vectors stored in ``model_fname``.

        Args:
            model_fname: Path to a vector file with one
                "title<U+241E>document<U+241E>v1 v2 ..." line per document.
            use_notebook: Forwarded to the plotting helpers; when true they
                render inline instead of exporting a PNG.
        """
        # NOTE(review): the default path uses hyphens while the rest of this
        # notebook writes to ./lsa_tfidf/lsa_tfidf.vecs; pass the path
        # explicitly.
        # titles: list of document titles; vectors: L2-normalized rows, one
        # per title, in the same order.
        self.titles, self.vectors = self.load_model(model_fname)
        self.use_notebook = use_notebook

    def most_similar(self, doc_id, topn=10):
        """Return the ``topn`` documents most similar to document ``doc_id``.

        Args:
            doc_id: Index of the query document (negative indices allowed).
            topn: Maximum number of neighbours to return.

        Returns:
            ``[query_title, [(title, score), ...]]`` with neighbours sorted by
            descending dot-product score. The query document itself is never
            included.

        Raises:
            IndexError: If ``doc_id`` is out of range.
        """
        # Resolve negative indices to a position so the query can be
        # excluded by index below.
        query_idx = range(len(self.titles))[doc_id]
        query_doc_vec = self.vectors[query_idx]
        query_vec_norm = np.linalg.norm(query_doc_vec)
        if query_vec_norm != 0:
            query_unit_vec = query_doc_vec / query_vec_norm
        else:
            # An all-zero vector cannot be normalized; use it as is.
            query_unit_vec = query_doc_vec
        query_sentence = self.titles[query_idx]
        scores = np.dot(self.vectors, query_unit_vec)
        # Exclude the query by index rather than dropping whatever ranks
        # first: with duplicate or zero vectors the query is not guaranteed
        # to be the top hit.
        neighbours = [
            (title, score)
            for idx, (title, score) in enumerate(zip(self.titles, scores))
            if idx != query_idx
        ]
        neighbours.sort(key=lambda pair: pair[1], reverse=True)
        return [query_sentence, neighbours[:topn]]

    def load_model(self, model_fname):
        """Read titles and vectors from ``model_fname``.

        Args:
            model_fname: Path to the UTF-8 vector file.

        Returns:
            ``(titles, vectors)`` where ``vectors`` is the L2 row-normalized
            matrix produced by ``normalize``.

        Raises:
            ValueError: If a non-blank line does not have exactly three
                U+241E-separated fields or a vector component is not a float.
        """
        titles, vectors = [], []
        with open(model_fname, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                title, _, str_vec = line.split("\u241E")
                titles.append(title)
                vectors.append([float(el) for el in str_vec.split()])
        return titles, normalize(vectors, axis=1, norm='l2')

    def visualize(self, mode="between", num_sents=30, palette="Viridis256"):
        """Plot a random sample of documents.

        Args:
            mode: "between" calls ``visualize_between_sentences``; any other
                value calls ``visualize_sentences``.
            num_sents: Number of documents to sample; capped at the number of
                loaded documents.
            palette: Palette name forwarded to the plotting helper.
        """
        # random.sample raises ValueError when asked for more items than
        # exist, so cap the sample size for small corpora.
        sample_size = min(num_sents, len(self.titles))
        doc_idxes = random.sample(range(len(self.titles)), sample_size)
        sentences = [self.titles[idx] for idx in doc_idxes]
        vecs = [self.vectors[idx] for idx in doc_idxes]
        if mode == "between":
            visualize_between_sentences(sentences, vecs, palette, use_notebook=self.use_notebook)
        else:
            visualize_sentences(vecs, sentences, palette, use_notebook=self.use_notebook)

In [None]:
# Load the saved LSA vectors and list the documents most similar to the
# first document (doc_id=0).
model = LSAEvaluator(model_fname='./lsa_tfidf/lsa_tfidf.vecs', use_notebook=True)
model.most_similar(doc_id=0)

['maxparam',
 [('loss', 0.7407780812410347),
  ('MLE', 0.7066002850757213),
  ('CRF', 0.679132753724507),
  ('unsugen', 0.6179739454587813),
  ('logistic', 0.5915017425705915),
  ('gradient', 0.5887206575501243),
  ('VAE', 0.5048666166888756),
  ('softmax', 0.500748761174644),
  ('NNtricks', 0.4904047730771453),
  ('MEMs', 0.4893151465792401)]]

# 시각화

In [None]:
# Plot pairwise similarities for a random sample of documents.
# Requires visualize_between_sentences to be defined before this call.
model.visualize('between')

In [None]:
# Any mode other than 'between' routes to visualize_sentences.
# Requires visualize_sentences to be defined before this call.
model.visualize('tsne')

In [34]:
!cp -rf 'lsa_tfidf' '/content/drive/MyDrive/Colab Notebooks'

# visualize 관련 코드 (우선 실행)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

from bokeh.io import export_png, output_notebook, show
from bokeh.plotting import figure
from bokeh.models import Plot, Range1d, MultiLine, Circle, HoverTool, TapTool, BoxSelectTool, LinearColorMapper, ColumnDataSource, LabelSet, SaveTool, ColorBar, BasicTicker
from bokeh.models.graphs import from_networkx, NodesAndLinkedEdges, EdgesAndLinkedNodes
from bokeh.palettes import Spectral8

In [None]:
def visualize_between_sentences(sentences, vec_list, palette="Viridis256",
                                filename="./lsa_tfidf/between-sentences.png",
                                use_notebook=False):
    """Draw a heatmap of pairwise cosine similarities between sentences.

    Args:
        sentences: Labels, one per vector; used as both axes of the heatmap.
        vec_list: Vectors aligned with ``sentences``.
        palette: Bokeh palette name for the colour mapper.
        filename: PNG path used when ``use_notebook`` is false.
        use_notebook: If true, show the plot inline; otherwise export a PNG.

    Raises:
        ValueError: If there is no sentence with a non-zero vector to plot.
    """
    # Pairs involving an all-zero vector are left out of the plot.
    has_signal = [bool(np.any(vec)) for vec in vec_list]
    if len(sentences) == 0 or not any(has_signal):
        raise ValueError("no non-zero sentence vectors to visualize")
    # One batched call instead of one sklearn call per sentence pair.
    similarity = cosine_similarity(vec_list)
    df_list = [
        {'x': sentence1, 'y': sentence2, 'similarity': similarity[idx1][idx2]}
        for idx1, sentence1 in enumerate(sentences)
        for idx2, sentence2 in enumerate(sentences)
        if has_signal[idx1] and has_signal[idx2]
    ]
    score_list = [row['similarity'] for row in df_list]
    df = pd.DataFrame(df_list)
    # NOTE(review): low/high receive (max, min), i.e. the reverse of what the
    # names suggest -- presumably to invert the palette; confirm.
    color_mapper = LinearColorMapper(palette=palette, low=np.max(score_list), high=np.min(score_list))
    TOOLS = "hover,save,pan,box_zoom,reset,wheel_zoom"
    p = figure(x_range=sentences, y_range=list(reversed(sentences)),
                x_axis_location="above", plot_width=900, plot_height=900,
                toolbar_location='below', tools=TOOLS,
                tooltips=[('sentences', '@x @y'), ('similarity', '@similarity')])
    p.grid.grid_line_color = None
    p.axis.axis_line_color = None
    p.axis.major_tick_line_color = None
    p.axis.major_label_standoff = 0
    p.xaxis.major_label_orientation = 3.14 / 3
    p.rect(x="x", y="y", width=1, height=1,
            source=df,
            fill_color={'field': 'similarity', 'transform': color_mapper},
            line_color=None)
    color_bar = ColorBar(ticker=BasicTicker(desired_num_ticks=5),
                        color_mapper=color_mapper, major_label_text_font_size="7pt",
                        label_standoff=6, border_line_color=None, location=(0, 0))
    p.add_layout(color_bar, 'right')
    if use_notebook:
        output_notebook()
        show(p)
    else:
        # Create the target directory if needed before exporting.
        import os
        out_dir = os.path.dirname(filename)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        export_png(p, filename=filename)
        print("save @ " + filename)

In [None]:
!pip install selenium

Collecting selenium
  Downloading selenium-3.141.0-py2.py3-none-any.whl (904 kB)
[?25l[K     |▍                               | 10 kB 25.0 MB/s eta 0:00:01[K     |▊                               | 20 kB 27.6 MB/s eta 0:00:01[K     |█                               | 30 kB 19.7 MB/s eta 0:00:01[K     |█▌                              | 40 kB 12.9 MB/s eta 0:00:01[K     |█▉                              | 51 kB 5.6 MB/s eta 0:00:01[K     |██▏                             | 61 kB 5.8 MB/s eta 0:00:01[K     |██▌                             | 71 kB 5.5 MB/s eta 0:00:01[K     |███                             | 81 kB 6.2 MB/s eta 0:00:01[K     |███▎                            | 92 kB 5.9 MB/s eta 0:00:01[K     |███▋                            | 102 kB 5.3 MB/s eta 0:00:01[K     |████                            | 112 kB 5.3 MB/s eta 0:00:01[K     |████▍                           | 122 kB 5.3 MB/s eta 0:00:01[K     |████▊                           | 133 kB 5.3 MB/s eta 0:

In [None]:
from sklearn.manifold import TSNE

In [None]:
def visualize_sentences(vecs, sentences, palette="Viridis256", 
                        filename="./lsa_tfidf/sentences.png",
                        use_notebook=False):
    """Scatter-plot sentences after projecting their vectors to 2-D with t-SNE.

    Args:
        vecs: Vectors, one per sentence.
        sentences: Labels drawn next to each point, aligned with ``vecs``.
        palette: Bokeh palette name; points are coloured by their y coordinate.
        filename: PNG path used when ``use_notebook`` is false.
        use_notebook: If true, show the plot inline; otherwise export a PNG.
    """
    # scikit-learn requires perplexity < n_samples; the default of 30 fails
    # for samples of 30 or fewer, so clamp it to the sample size.
    perplexity = min(30.0, max(len(vecs) - 1, 1))
    tsne = TSNE(n_components=2, perplexity=perplexity)
    tsne_results = tsne.fit_transform(vecs)
    df = pd.DataFrame({
        'x': tsne_results[:, 0],
        'y': tsne_results[:, 1],
        'sentence': sentences,
    })
    source = ColumnDataSource(ColumnDataSource.from_df(df))
    labels = LabelSet(x="x", y="y", text="sentence", y_offset=8,
                      text_font_size="12pt", text_color="#555555",
                      source=source, text_align='center')
    y_values = tsne_results[:, 1]
    color_mapper = LinearColorMapper(palette=palette, low=y_values.min(), high=y_values.max())
    plot = figure(plot_width=900, plot_height=900)
    plot.scatter("x", "y", size=12, source=source, color={'field': 'y', 'transform': color_mapper}, line_color=None, fill_alpha=0.8)
    plot.add_layout(labels)
    if use_notebook:
        output_notebook()
        show(plot)
    else:
        # Create the target directory if needed before exporting.
        import os
        out_dir = os.path.dirname(filename)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        # Pass filename by keyword, matching visualize_between_sentences;
        # newer Bokeh releases do not accept it positionally.
        export_png(plot, filename=filename)
        print("save @ " + filename)
