# Step 1. TF-IDF

In [None]:
from nltk.tokenize import TreebankWordTokenizer
from collections import Counter
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

In [None]:
# Build an NLTK Treebank tokenizer and keep its bound `tokenize` method
# under a short alias.
tokenizer = TreebankWordTokenizer()
tokenize = tokenizer.tokenize

# Sample text: two Korean sentences about Yi Sun-sin.
sentence = "이순신은 조선 중기의 무신이다. 이순신은 임진왜란에서 조선을 승리로 이끈 명장이다."
token_sequence = tokenizer.tokenize(sentence)

In [None]:
# Bare expression: the notebook renders the token list as the cell output.
token_sequence

In [None]:
# Terms whose occurrences are counted in each sample document.
lexicon = ['이순신은', '조선', '충무공']

first_doc = "이순신은 조선 중기의 무신이다. 충무공 이순신은 임진왜란에서 조선을 승리로 이끈 명장이다."
second_doc = "불멸의 이순신은 대한민국 KBS에서 2004년 9월 4일부터 2005년 8월 28일까지 조선 충무공 이순신의 삶을 소재로 하여 방영한 드라마이다."

# One Counter per document, restricted to tokens that appear in `lexicon`.
vector1, vector2 = (
    Counter([token for token in tokenize(doc) if token in lexicon])
    for doc in (first_doc, second_doc)
)
corpus = [vector1, vector2]

In [None]:
# Bare expression: shows the per-document term counters as the cell output.
corpus

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Load the corpus file: one list entry per line of the file.
path = 'data/corpus_data/sentence_tokenized_wiki_20190620_small.txt'

# The context manager closes the handle as soon as the lines have been read
# instead of leaving the file open for the rest of the session.
with open(path, 'r', encoding='utf-8') as data:
    corpus = data.readlines()

In [None]:
# TF-IDF vectorizer capped at 100 vocabulary terms; terms that occur in more
# than 95% of the documents are dropped.
# An integer min_df is an absolute document count: 1 keeps every term (the
# same selection an integer 0 produced, since each vocabulary term occurs in
# at least one document) and stays inside the documented parameter range.
tfidf = TfidfVectorizer(max_features=100, max_df=0.95, min_df=1)

In [None]:
# Fit the vectorizer on the corpus and vectorize it in the same call.
A_tfidf_sp = tfidf.fit_transform(corpus)  # size D x V (documents x vocabulary terms)

In [None]:
# Vocabulary terms, in the column order of the TF-IDF matrix.
# scikit-learn >= 1.0 provides get_feature_names_out() (get_feature_names()
# was removed in 1.2); fall back to the old accessor on older releases.
# .tolist() keeps `tfidf_dict` a plain list, as the old accessor returned.
if hasattr(tfidf, 'get_feature_names_out'):
    tfidf_dict = tfidf.get_feature_names_out().tolist()
else:
    tfidf_dict = tfidf.get_feature_names()
print(tfidf_dict)

In [None]:
# Densify the sparse TF-IDF matrix and wrap it in a DataFrame with one
# column per vocabulary term.
data_array = A_tfidf_sp.toarray()
data = pd.DataFrame(data_array, columns=tfidf_dict)
# Bare expression: shows (rows, columns) as the cell output.
data.shape

In [None]:
from sklearn.manifold import TSNE


In [None]:
# t-SNE projector onto 2 components, with progress logging (verbose=1).
# NOTE(review): recent scikit-learn releases rename `n_iter` to `max_iter` —
# confirm against the installed version.
tsne = TSNE(n_components=2, n_iter=10000, verbose=1)

In [None]:
# Transpose so that each row handed to t-SNE is a term (a column of
# data_array) rather than a document; Z then holds one 2-D point per term.
Z = tsne.fit_transform(data_array.T)

In [None]:
# Peek at the first five embedded points, then report how many rows Z has.
first_points = Z[:5]
print(first_points)
print('Top words: ', len(Z))

In [None]:
# Scatter the 2-D t-SNE coordinates and label every point with its term.
# `fm` was referenced here without ever being imported, so import it locally.
import matplotlib.font_manager as fm

path = 'data/font/NanumMyeongjo.ttf'
# Font loaded from a local TTF file — presumably so the Hangul labels render.
fontprop = fm.FontProperties(fname=path, size=10)

plt.scatter(Z[:, 0], Z[:, 1])
for i, term in enumerate(tfidf_dict):
    # The label is passed positionally because its keyword name differs across
    # matplotlib versions (`s` vs `text`); the Text property is spelled
    # `fontproperties` (lower case) — the camelCase form is rejected by
    # current matplotlib.
    plt.annotate(term, xy=(Z[i, 0], Z[i, 1]), fontproperties=fontprop)

plt.draw()

# Step 2. Word2Vec

In [None]:
from gensim.models.word2vec import Word2Vec
import gensim

In [None]:
# Stream the wiki corpus file through Text8Corpus and train Word2Vec on it.
path = 'data/corpus_data/sentence_tokenized_wiki_20190620_small.txt'
sentences = gensim.models.word2vec.Text8Corpus(path)

# 100-dimensional vectors, window of 5, words with fewer than 5 occurrences ignored.
# NOTE(review): `size` is the gensim < 4 spelling of the vector dimension
# (`vector_size` afterwards) — confirm against the installed version.
model = Word2Vec(sentences, size=100, window=5, min_count=5)

In [None]:
# Collect and print the words in the trained model's vocabulary.
# NOTE(review): `wv.vocab` exists only in gensim < 4 (later releases expose
# `wv.key_to_index`) — confirm against the installed version.
vocabs = model.wv.vocab.keys()
print(vocabs)

In [None]:
# Persist the trained Word2Vec model, then read it back from the same location.
w2v_model_path = 'vector_model/w2v_model'
model.save(w2v_model_path)
saved_model = Word2Vec.load(w2v_model_path)

In [None]:
# Run every query against the reloaded model's KeyedVectors (`.wv`): calling
# these lookups on the model object itself is deprecated in gensim 3.x and
# removed in gensim 4, while `.wv` works on both.
word_vectors = saved_model.wv

print(word_vectors.most_similar(positive=["이순신"], topn=10))
print(word_vectors.similarity('이순신', '이명박'))
print(word_vectors.similarity('이순신', '원균'))

# Analogy-style query: 대한민국 + 도쿄 - 서울 (bare expression -> cell output).
word_vectors.most_similar(positive=['대한민국', '도쿄'], negative=['서울'])

In [None]:
# Nearest neighbours of a single word, looked up through KeyedVectors (`.wv`)
# because the model-level shortcut is deprecated in gensim 3.x and removed in 4.
print(saved_model.wv.similar_by_word('카카오톡'))

# Step 3. FastText

In [None]:
from gensim.models.fasttext import FastText
import gensim.models.word2vec
import gensim

In [None]:
# Corpus location for FastText training.
# NOTE(review): this path is absolute (leading '/') and names a different file
# than the relative path used for Word2Vec in Step 2 — confirm it is intentional.
path = '/data/corpus_data/wiki_20190620_small'

# Stream the file through Text8Corpus for training.
sentences = gensim.models.word2vec.Text8Corpus(path)

In [None]:
# Train FastText with the same min_count/size/window values used for Word2Vec in Step 2.
# NOTE(review): `size` is the gensim < 4 keyword (`vector_size` afterwards) —
# confirm against the installed version.
model = FastText(sentences, min_count=5, size=100, window=5)

In [None]:
# Persist the trained FastText model, then read it back from the same location.
fasttext_model_path = 'vector_model/fasttext_model'
model.save(fasttext_model_path)
saved_model = FastText.load(fasttext_model_path)

In [None]:
# Fetch the embedding through KeyedVectors (`.wv`); indexing the model object
# directly is deprecated in gensim 3.x and removed in gensim 4.
word_vector = saved_model.wv['이순신']
print(word_vector)

In [None]:
# Cosine similarity between word pairs, queried through KeyedVectors (`.wv`)
# because the model-level shortcut is deprecated in gensim 3.x and removed in 4.
print(saved_model.wv.similarity('이순신', '이명박'))
print(saved_model.wv.similarity('이순신', '원균'))

In [None]:
# Nearest neighbours of each query word, looked up through KeyedVectors (`.wv`)
# because the model-level shortcut is deprecated in gensim 3.x and removed in 4.
print(saved_model.wv.similar_by_word('이순신'))
print(saved_model.wv.similar_by_word('조선'))

In [None]:
# Analogy-style query: 대한민국 + 도쿄 - 서울 (bare expression -> cell output).
# Goes through KeyedVectors (`.wv`); the model-level shortcut is deprecated in
# gensim 3.x and removed in gensim 4.
saved_model.wv.most_similar(positive=['대한민국', '도쿄'], negative=['서울'])