In [1]:
# Mount Google Drive at /content/drive so the cells below can read from and write to it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Copy the review corpus file from Drive into the current working directory.
# NOTE(review): bare `cp` relies on IPython's automagic shell alias; `!cp` would be more explicit.
cp '/content/drive/MyDrive/Colab Notebooks/processed_review_movieid.txt' ./

In [12]:
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

In [4]:
class Doc2VecInput:
    """Re-iterable corpus that streams ``TaggedDocument`` objects from a text file.

    Each usable line of the file has the form ``<sentence>U+241E<movie id>``
    (the two fields are separated by the single character U+241E). The
    sentence is tokenized with the configured tokenizer's ``morphs`` method
    and tagged with ``'MOVIE_<movie id>'``.

    Args:
        fname: Path of the UTF-8 encoded corpus file.
        tokenizer_name: Name passed to ``get_tokenizer`` to build the tokenizer.
    """

    def __init__(self, fname, tokenizer_name="mecab"):
        self.fname = fname
        self.tokenizer = get_tokenizer(tokenizer_name)

    def __iter__(self):
        """Yield one ``TaggedDocument`` per well-formed line of the corpus file.

        Lines that cannot be split into exactly two fields, or that the
        tokenizer fails on, are skipped (best-effort, as before).
        """
        with open(self.fname, encoding='utf-8') as f:
            for line in f:
                try:
                    sentence, movie_id = line.strip().split("\u241E")
                    tokens = self.tokenizer.morphs(sentence)
                except Exception:
                    # Skip malformed lines / tokenizer failures. `Exception`
                    # (not a bare except) lets KeyboardInterrupt and
                    # GeneratorExit propagate.
                    continue
                # The yield lives outside the try block so that closing the
                # generator early is never intercepted by the handler above.
                yield TaggedDocument(words=tokens, tags=['MOVIE_%s' % movie_id])


In [5]:
!mkdir doc2vec

In [13]:
# Input corpus path, output model path, and the streaming corpus object built from the input file.
corpus_fname =  './processed_review_movieid.txt'
output_f_name = './doc2vec/doc2vec.model'
corpus = Doc2VecInput(corpus_fname)

In [None]:
# Build a Doc2Vec model with 100-dimensional vectors from the streamed corpus.
# NOTE(review): no seed/workers are set here, so results may vary between runs — confirm if reproducibility matters.
model = Doc2Vec(corpus, vector_size=100)

In [15]:
# Write the model to the path configured in output_f_name.
model.save(output_f_name)

In [36]:
# Copy the whole doc2vec directory into the Drive "Colab Notebooks" folder.
!cp -rf doc2vec '/content/drive/MyDrive/Colab Notebooks'

# 사전 작업 

In [None]:
# Install MeCab-ko (the "Eunjeon" project) using the install script from the KoNLPy repository.
!sudo apt-get install curl git
!bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)

In [None]:
# Install KoNLPy, which provides the tokenizer classes imported below.
# NOTE(review): the version is not pinned; consider pinning for reproducibility.
!pip install konlpy
import sys, re, argparse
from konlpy.tag import Okt, Komoran, Mecab, Hannanum, Kkma
def get_tokenizer(tokenizer_name):
    """Return a new tokenizer instance selected by ``tokenizer_name``.

    Recognized names are "komoran", "okt", "mecab", "hannanum", "kkma" and
    "khaiii". Any other value falls back to ``Mecab``.
    """
    if tokenizer_name == "komoran":  # Komoran
        return Komoran()
    if tokenizer_name == "okt":  # Okt
        return Okt()
    if tokenizer_name == "hannanum":  # Hannanum
        return Hannanum()
    if tokenizer_name == "kkma":  # Kkma
        return Kkma()
    if tokenizer_name == "khaiii":  # Khaiii
        # NOTE(review): KhaiiiApi is not imported in the visible cells, so
        # this branch raises NameError unless it is imported elsewhere.
        return KhaiiiApi()
    # "mecab" (MeCab-ko / Eunjeon) and every unrecognized name use Mecab.
    return Mecab()

# 평가

In [40]:
from lxml import html
import requests, random
import numpy as np

In [16]:
class Doc2VecEvaluator:
    """Helper for inspecting a saved Doc2Vec model whose document tags are ``MOVIE_<id>``.

    Args:
        model_fname: Path handed to ``Doc2Vec.load``.
        use_notebook: Forwarded to the visualization helpers; when true they
            render inline instead of exporting a PNG.
    """

    def __init__(self, model_fname="data/doc2vec.vecs", use_notebook=False):
        self.model = Doc2Vec.load(model_fname)
        # Map each document tag to its position in the doctags key order.
        self.doc2idx = {el: idx for idx, el in enumerate(self.model.docvecs.doctags.keys())}
        self.use_notebook = use_notebook

    def most_similar(self, movie_id, topn=10):
        """Print the title and similarity score of the ``topn`` documents closest to ``movie_id``.

        Args:
            movie_id: Movie id without the ``MOVIE_`` prefix (converted with ``str``).
            topn: Number of neighbours to request from the model.
        """
        similar_movies = self.model.docvecs.most_similar('MOVIE_' + str(movie_id), topn=topn)
        # Use a distinct loop name so the `movie_id` argument is not shadowed.
        for similar_tag, score in similar_movies:
            print(self.get_movie_title(similar_tag), score)

    def get_titles_in_corpus(self, n_sample=5):
        """Return ``{tag: title}`` for ``n_sample`` randomly chosen document tags.

        Raises:
            ValueError: If ``n_sample`` exceeds the number of tags (from ``random.sample``).
        """
        # random.sample requires a sequence; passing a dict keys view is
        # deprecated since Python 3.9 and a TypeError from 3.11 on.
        doc_tags = list(self.model.docvecs.doctags.keys())
        movie_ids = random.sample(doc_tags, n_sample)
        return {movie_id: self.get_movie_title(movie_id) for movie_id in movie_ids}

    def get_movie_title(self, movie_id):
        """Look up a movie title on movie.naver.com for a ``MOVIE_<id>`` tag.

        Returns:
            The first matching title text, or ``""`` when the page contains no match.
            Network errors raised by ``requests`` are not caught.
        """
        url = 'http://movie.naver.com/movie/point/af/list.nhn?st=mcode&target=after&sword=%s' % movie_id.split("_")[1]
        # A timeout keeps an unresponsive server from hanging the notebook.
        resp = requests.get(url, timeout=10)
        root = html.fromstring(resp.text)
        matches = root.xpath('//div[@class="choice_movie_info"]//h5//a/text()')
        return matches[0] if matches else ""

    def visualize_movies(self, n_sample=30, palette="Viridis256", type="between"):
        """Plot ``n_sample`` randomly chosen movies using their document vectors.

        Args:
            n_sample: Number of movies to sample.
            palette: Palette name forwarded to the plotting helper.
            type: ``"between"`` calls ``visualize_between_words``; any other
                value calls ``visualize_words``.
        """
        movie_ids = self.get_titles_in_corpus(n_sample=n_sample)
        movie_titles = [movie_ids[key] for key in movie_ids.keys()]
        movie_vecs = [self.model.docvecs[self.doc2idx[movie_id]] for movie_id in movie_ids.keys()]
        if type == "between":
            visualize_between_words(movie_titles, movie_vecs, palette, use_notebook=self.use_notebook)
        else:
            visualize_words(movie_titles, movie_vecs, palette, use_notebook=self.use_notebook)

In [32]:
# Load the saved model into an evaluator that renders plots inline.
# NOTE(review): this rebinds `model`, which an earlier cell used for the Doc2Vec instance itself.
model = Doc2VecEvaluator('./doc2vec/doc2vec.model', use_notebook=True)

In [33]:
# Show the tag -> title mapping for three randomly sampled movies.
model.get_titles_in_corpus(n_sample=3)

{'MOVIE_10035': '인디아나 존스 - 최후의 성전',
 'MOVIE_13494': '황비홍 3 - 사왕쟁패',
 'MOVIE_23461': '평양 폭격대'}

In [34]:
# Print the five movies whose document vectors are closest to movie id 104863.
model.most_similar(104863, topn=5)

당신의 여자 0.8291001319885254
TV소설 사랑아 사랑아 0.8244854211807251
별도 달도 따줄게 0.8111724853515625
TV소설 은희 0.8033668994903564
사랑은 노래를 타고 0.8029811382293701


In [41]:
# Pairwise-similarity grid for a random sample of movies (the "between" branch of visualize_movies).
model.visualize_movies(type='between')

In [42]:
# Any type other than "between" takes the visualize_words branch, which projects the vectors with t-SNE.
model.visualize_movies(type='tsne')

# visualize 관련 코드 (우선 실행)

In [17]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

from bokeh.io import export_png, output_notebook, show
from bokeh.plotting import figure
from bokeh.models import Plot, Range1d, MultiLine, Circle, HoverTool, TapTool, BoxSelectTool, LinearColorMapper, ColumnDataSource, LabelSet, SaveTool, ColorBar, BasicTicker
from bokeh.models.graphs import from_networkx, NodesAndLinkedEdges, EdgesAndLinkedNodes
from bokeh.palettes import Spectral8

def visualize_between_sentences(sentences, vec_list, palette="Viridis256",
                                filename="./lsa_tfidf/between-sentences.png",
                                use_notebook=False):
    """Draw a grid of pairwise cosine similarities between sentence vectors.

    Args:
        sentences: Labels used for both axes; ``vec_list[i]`` is the vector of ``sentences[i]``.
        vec_list: Vectors, indexed in parallel with ``sentences``.
        palette: Palette handed to ``LinearColorMapper``.
        filename: PNG path used when ``use_notebook`` is false.
        use_notebook: Show the figure inline when true, otherwise export it as a PNG.

    Pairs in which either vector has no non-zero entry are left out of the grid.
    """
    records = []
    for row, row_label in enumerate(sentences):
        for col, col_label in enumerate(sentences):
            row_vec = vec_list[row]
            col_vec = vec_list[col]
            if not (np.any(row_vec) and np.any(col_vec)):
                continue
            similarity = cosine_similarity(X=[row_vec], Y=[col_vec])[0][0]
            records.append({'x': row_label, 'y': col_label, 'similarity': similarity})
    similarities = [record['similarity'] for record in records]
    df = pd.DataFrame(records)

    # The mapper is built with low = largest score and high = smallest score.
    color_mapper = LinearColorMapper(palette=palette,
                                     low=np.max(similarities),
                                     high=np.min(similarities))
    TOOLS = "hover,save,pan,box_zoom,reset,wheel_zoom"
    hover_fields = [('sentences', '@x @y'), ('similarity', '@similarity')]
    p = figure(x_range=sentences, y_range=list(reversed(sentences)),
               x_axis_location="above", plot_width=900, plot_height=900,
               toolbar_location='below', tools=TOOLS,
               tooltips=hover_fields)

    # Strip grid lines, axis lines and ticks so only the coloured cells remain.
    p.grid.grid_line_color = None
    p.axis.axis_line_color = None
    p.axis.major_tick_line_color = None
    p.axis.major_label_standoff = 0
    p.xaxis.major_label_orientation = 3.14 / 3

    cell_fill = {'field': 'similarity', 'transform': color_mapper}
    p.rect(x="x", y="y", width=1, height=1,
           source=df, fill_color=cell_fill, line_color=None)

    color_bar = ColorBar(ticker=BasicTicker(desired_num_ticks=5),
                         color_mapper=color_mapper, major_label_text_font_size="7pt",
                         label_standoff=6, border_line_color=None, location=(0, 0))
    p.add_layout(color_bar, 'right')

    if not use_notebook:
        export_png(p, filename=filename)
        print("save @ " + filename)
        return
    output_notebook()
    show(p)

# NOTE(review): presumably required by bokeh's export_png for PNG export — confirm; the version is not pinned.
!pip install selenium


from sklearn.manifold import TSNE


def visualize_sentences(vecs, sentences, palette="Viridis256", 
                        filename="./lsa_tfidf/sentences.png",
                        use_notebook=False):
    """Scatter-plot sentence vectors after a 2-component t-SNE projection.

    Args:
        vecs: Vectors passed to ``TSNE.fit_transform``.
        sentences: Text label for each vector, drawn above its point.
        palette: Palette handed to ``LinearColorMapper``; points are coloured by their y coordinate.
        filename: PNG path used when ``use_notebook`` is false.
        use_notebook: Show the figure inline when true, otherwise export it as a PNG.
    """
    projected = TSNE(n_components=2).fit_transform(vecs)
    xs = projected[:, 0]
    ys = projected[:, 1]

    df = pd.DataFrame(columns=['x', 'y', 'sentence'])
    df['x'] = xs
    df['y'] = ys
    df['sentence'] = sentences
    source = ColumnDataSource(ColumnDataSource.from_df(df))

    labels = LabelSet(x="x", y="y", text="sentence", y_offset=8,
                      text_font_size="12pt", text_color="#555555",
                      source=source, text_align='center')
    # Colour scale spans the observed range of the projected y coordinate.
    color_mapper = LinearColorMapper(palette=palette, low=min(ys), high=max(ys))

    plot = figure(plot_width=900, plot_height=900)
    point_color = {'field': 'y', 'transform': color_mapper}
    plot.scatter("x", "y", size=12, source=source, color=point_color,
                 line_color=None, fill_alpha=0.8)
    plot.add_layout(labels)

    if not use_notebook:
        export_png(plot, filename)
        print("save @ " + filename)
        return
    output_notebook()
    show(plot)


Collecting selenium
  Downloading selenium-3.141.0-py2.py3-none-any.whl (904 kB)
[K     |████████████████████████████████| 904 kB 9.4 MB/s 
Installing collected packages: selenium
Successfully installed selenium-3.141.0


In [28]:

def visualize_between_words(words, vecs, palette="Viridis256", filename="/notebooks/embedding/between-words.png",
                            use_notebook=False):
    """Draw a grid of pairwise cosine similarities between word vectors.

    Args:
        words: Labels used for both axes; ``vecs[i]`` is the vector of the i-th word.
        vecs: Vectors, indexed in parallel with ``words``.
        palette: Palette handed to ``LinearColorMapper``.
        filename: PNG path used when ``use_notebook`` is false.
        use_notebook: Show the figure inline when true, otherwise export it as a PNG.

    Pairs in which either vector has no non-zero entry are left out of the grid.
    """
    pair_rows = []
    for i, row_word in enumerate(words):
        for j, col_word in enumerate(words):
            row_vec, col_vec = vecs[i], vecs[j]
            if not (np.any(row_vec) and np.any(col_vec)):
                continue
            similarity = cosine_similarity(X=[row_vec], Y=[col_vec])[0][0]
            pair_rows.append({'x': row_word, 'y': col_word, 'similarity': similarity})
    df = pd.DataFrame(pair_rows)

    # The mapper is built with low=1 and high=0 (fixed bounds, not data-derived).
    color_mapper = LinearColorMapper(palette=palette, low=1, high=0)
    TOOLS = "hover,save,pan,box_zoom,reset,wheel_zoom"
    axis_labels = list(words)
    hover_fields = [('words', '@x @y'), ('similarity', '@similarity')]
    p = figure(x_range=axis_labels, y_range=list(reversed(axis_labels)),
               x_axis_location="above", plot_width=900, plot_height=900,
               toolbar_location='below', tools=TOOLS,
               tooltips=hover_fields)

    # Strip grid lines, axis lines and ticks so only the coloured cells remain.
    p.grid.grid_line_color = None
    p.axis.axis_line_color = None
    p.axis.major_tick_line_color = None
    p.axis.major_label_standoff = 0
    p.xaxis.major_label_orientation = 3.14 / 3

    cell_fill = {'field': 'similarity', 'transform': color_mapper}
    p.rect(x="x", y="y", width=1, height=1,
           source=df, fill_color=cell_fill, line_color=None)

    color_bar = ColorBar(ticker=BasicTicker(desired_num_ticks=5),
                         color_mapper=color_mapper, major_label_text_font_size="7pt",
                         label_standoff=6, border_line_color=None, location=(0, 0))
    p.add_layout(color_bar, 'right')

    if not use_notebook:
        export_png(p, filename)
        print("save @ " + filename)
        return
    output_notebook()
    show(p)

In [30]:
def visualize_words(words, vecs, palette="Viridis256", filename="/notebooks/embedding/words.png",
                    use_notebook=False):
    """Scatter-plot word vectors after a 2-component t-SNE projection.

    Args:
        words: Text label for each vector, drawn above its point.
        vecs: Vectors passed to ``TSNE.fit_transform``.
        palette: Palette handed to ``LinearColorMapper``; points are coloured by their y coordinate.
        filename: PNG path used when ``use_notebook`` is false.
        use_notebook: Show the figure inline when true, otherwise export it as a PNG.
    """
    projected = TSNE(n_components=2).fit_transform(vecs)
    xs = projected[:, 0]
    ys = projected[:, 1]

    df = pd.DataFrame(columns=['x', 'y', 'word'])
    df['x'] = xs
    df['y'] = ys
    df['word'] = list(words)
    source = ColumnDataSource(ColumnDataSource.from_df(df))

    labels = LabelSet(x="x", y="y", text="word", y_offset=8,
                      text_font_size="15pt", text_color="#555555",
                      source=source, text_align='center')
    # Colour scale spans the observed range of the projected y coordinate.
    color_mapper = LinearColorMapper(palette=palette, low=min(ys), high=max(ys))

    plot = figure(plot_width=900, plot_height=900)
    point_color = {'field': 'y', 'transform': color_mapper}
    plot.scatter("x", "y", size=12, source=source, color=point_color, line_color=None,
                 fill_alpha=0.8)
    plot.add_layout(labels)

    if not use_notebook:
        export_png(plot, filename)
        print("save @ " + filename)
        return
    output_notebook()
    show(plot)