# 단어 문맥 행렬을 활용한 LSA

In [5]:
!pip install soynlp

Collecting soynlp
  Downloading soynlp-0.0.493-py3-none-any.whl (416 kB)
[?25l[K     |▉                               | 10 kB 19.2 MB/s eta 0:00:01[K     |█▋                              | 20 kB 23.1 MB/s eta 0:00:01[K     |██▍                             | 30 kB 28.0 MB/s eta 0:00:01[K     |███▏                            | 40 kB 23.2 MB/s eta 0:00:01[K     |████                            | 51 kB 18.1 MB/s eta 0:00:01[K     |████▊                           | 61 kB 17.6 MB/s eta 0:00:01[K     |█████▌                          | 71 kB 15.6 MB/s eta 0:00:01[K     |██████▎                         | 81 kB 16.6 MB/s eta 0:00:01[K     |███████                         | 92 kB 17.6 MB/s eta 0:00:01[K     |███████▉                        | 102 kB 18.1 MB/s eta 0:00:01[K     |████████▋                       | 112 kB 18.1 MB/s eta 0:00:01[K     |█████████▍                      | 122 kB 18.1 MB/s eta 0:00:01[K     |██████████▏                     | 133 kB 18.1 MB/s eta 0

In [1]:
!cp '/content/drive/MyDrive/Colab Notebooks/for-lsa-mecab.txt' ./

In [2]:
corpus_fname = 'for-lsa-mecab.txt'

In [3]:
corpus = [sent.replace('\n', '').strip() for sent in open(corpus_fname, 'r').readlines()]

In [10]:
from soynlp.vectorizer import sent_to_word_contexts_matrix
from sklearn.decomposition import TruncatedSVD

In [8]:
# construct co-occurrence matrix (=word_context)
# dynamic weight if True. co-occurrence weight = [1, (w-1)/w, (w-2)/w, ... 1/w]
input_matrix, idx2vocab = sent_to_word_contexts_matrix(
    corpus,
    windows=3,
    min_tf=10,
    dynamic_weight=True,
    verbose=True)

Create (word, contexts) matrix
  - counting word frequency from 276937 sents, mem=0.231 Gb
  - scanning (word, context) pairs from 276937 sents, mem=0.871 Gb
  - (word, context) matrix was constructed. shape = (26618, 26618)                    
  - done


In [11]:
# compute truncated SVD
cooc_svd = TruncatedSVD(n_components=100)

In [12]:
cooc_vecs = cooc_svd.fit_transform(input_matrix)

In [14]:
# 파일로 저장
with open('lsa' + "-cooc.vecs", 'w') as f1:
    for word, vec in zip(idx2vocab, cooc_vecs):
        str_vec = [str(el) for el in vec]
        f1.writelines(word + ' ' + ' '.join(str_vec) + "\n")

# PPMI 행렬에 LSA를 적용

In [17]:
from soynlp.word import pmi
import math

In [18]:
# Shift PPMI at k=0, (equal PPMI)
# pmi(word, contexts)
# px: Probability of rows(items)
# py: Probability of columns(features)
pmi_matrix, _, _ = pmi(input_matrix, min_pmi=math.log(5))

In [19]:
# compute truncated SVD
pmi_svd = TruncatedSVD(n_components=100)

In [20]:
pmi_vecs = pmi_svd.fit_transform(input_matrix)

In [21]:
with open('lsa' + "-pmi.vecs", 'w') as f2:
    for word, vec in zip(idx2vocab, pmi_vecs):
        str_vec = [str(el) for el in vec]
        f2.writelines(word + ' ' + ' '.join(str_vec) + "\n")

# 유사도 비교

In [24]:
vecs_fname = 'lsa-cooc.vecs'

In [25]:
words, vecs = [], []
with open(vecs_fname, 'r', encoding='utf-8') as f:
  for line in f:
      splited_line = line.strip().split(" ")
      words.append(splited_line[0])
      vec = [float(el) for el in splited_line[1:]]
      vecs.append(vec)

In [26]:
from sklearn.preprocessing import normalize
unit_vecs = normalize(vecs, norm='l2', axis=1)

In [27]:
dictionary = {}
for word, vec in zip(words, unit_vecs):
    dictionary[word] = vec

In [45]:
import numpy as np

def most_similar(query_vec, topn=10):
    query_vec_norm = np.linalg.norm(query_vec)
    query_unit_vec = query_vec
    # if query_vec_norm != 0:
    #     query_unit_vec = query_vec / query_vec_norm
    # else:
    #     query_unit_vec = query_vec
    scores = np.dot(vecs, query_unit_vec)
    topn_candidates = sorted(zip(words, scores), key=lambda x: x[1], reverse=True)[1:topn+1]
    return topn_candidates

In [46]:
query = dictionary['희망']
most_similar(query, topn=5)

[('다', 50246.77408923838),
 ('는', 40341.73320706481),
 ('하', 36011.66956180306),
 ('이', 34064.59752400215),
 ('의', 27495.90930679098)]