In [1]:
%matplotlib inline

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
import time

# Use the GPU when PyTorch can see one; otherwise run on the CPU.
dev = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
dev

device(type='cuda')

In [2]:
from preprocess import get_tokenizer

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
# Path of the preprocessed blog corpus that is read in the next cell.
corpus_fname = 'data/processed/processed_blog.txt'
# Tokenizer from the project's preprocess module.
# NOTE(review): 'mecab' presumably selects the MeCab analyzer — confirm in preprocess.get_tokenizer.
tokenizer = get_tokenizer('mecab')

In [4]:
# Load the corpus into three index-aligned lists:
#   titles[i]      - title of document i
#   raw_corpus[i]  - raw text of document i
#   noun_corpus[i] - nouns of document i joined by single spaces
titles, raw_corpus, noun_corpus = [], [], []
n_skipped = 0  # number of lines that could not be parsed or tokenized
with open(corpus_fname, 'r', encoding='utf-8') as f:
    for line in f:
        try:
            # Each record is "<title>\u241E<document>"; the unpacking raises
            # ValueError when a line does not split into exactly two fields.
            title, document = line.strip().split('\u241E')
            noun_text = ' '.join(tokenizer.nouns(document))
        except Exception:
            # Best-effort load: skip records that fail, but (unlike a bare
            # `except:`) let KeyboardInterrupt / SystemExit propagate.
            n_skipped += 1
            continue
        # Append only after every step succeeded, so a tokenizer failure can
        # no longer leave titles/raw_corpus one entry ahead of noun_corpus.
        titles.append(title)
        raw_corpus.append(document)
        noun_corpus.append(noun_text)
        

In [5]:
# Preview the first 300 characters of the first raw document.
raw_corpus[0][:300]

' 이번 글에서는 최대엔트로피모델(Maximum Entropy model)의 파라메터 추정을 살펴보도록 하겠습니다. 이 글은 기본적으로 [이곳]()을 참고하였습니다. 그럼 시작하겠습니다.   ## 모델 정의 최대엔트로피 모델은 다음과 같이 정의됩니다.  $$ { P }_{ \\Lambda }(y|x)=\\frac { { exp( }\\sum _{ i }^{ }{ { \\lambda }_{ i }{ f }_{ i }\\left( x,y \\right) } ) }{ \\sum _{ y }^{ }{ { exp( }\\sum _{ i }^{ }{ { \\l'

In [6]:
# Preview the first 100 characters of the first document's space-joined nouns.
noun_corpus[0][:100]

'이번 글 최대 엔트로피 모델 파라 메터 추정 글 기본 이곳 참고 시작 모델 정의 최대 엔트로피 모델 다음 정의 위 식 때 값 반환 함수 자질 벡터 번 값 중요 가중치 요소 가중치 '

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit TF-IDF features on the space-joined noun strings; each document was
# tokenized upstream, so tokenization here is a plain whitespace split.
vectorizer = TfidfVectorizer(
    tokenizer=str.split,
    lowercase=True,
    ngram_range=(1, 1),  # unigrams only
    min_df=1,  # keep terms whose document frequency is at least 1
)
input_matrix = vectorizer.fit_transform(noun_corpus)

In [26]:
# Show the repr of the fitted TF-IDF matrix.
input_matrix

<204x37143 sparse matrix of type '<class 'numpy.float64'>'
	with 76870 stored elements in Compressed Sparse Row format>

In [8]:
input_matrix.shape # rows correspond to documents, columns to words

(204, 37143)

In [9]:
# Number of distinct tokens in the fitted vocabulary.
len(vectorizer.vocabulary_)

37143

In [10]:
# Invert the fitted vocabulary (token -> column index) into column index -> token.
id2vocab = {column: token for token, column in vectorizer.vocabulary_.items()}
# curr_doc: TF-IDF row of the first document in the corpus.
curr_doc = input_matrix[0]
# result: filled in a later cell with (token, TF-IDF value) pairs.
result = []

In [21]:
# First ten stored column indices of curr_doc and their TF-IDF values.
curr_doc.indices[:10], curr_doc.data[:10]

(array([30054, 21719, 17148, 26014, 33661, 20188, 19879, 23540, 22470,
        27861], dtype=int32),
 array([0.02321873, 0.06195969, 0.05037386, 0.06943096, 0.08838695,
        0.03324375, 0.03535159, 0.09099347, 0.02587359, 0.03601874]))

In [11]:
# Pair every stored TF-IDF entry of curr_doc with its vocabulary token.
# The list is rebuilt from scratch instead of appended to, so re-running this
# cell no longer duplicates entries. No sorting happens here; the pairs are
# sorted by score in a later cell.
result = [(id2vocab[idx], el)
          for idx, el in zip(curr_doc.indices, curr_doc.data)]

In [30]:
# First ten (token, TF-IDF value) pairs, in stored order.
result[:10]

[('점', 0.02321872527560292),
 ('뺄셈', 0.06195969255574101),
 ('덧셈', 0.05037385883988076),
 ('업데이트', 0.06943095628902349),
 ('터', 0.08838695358356398),
 ('방향', 0.033243751333481585),
 ('반대', 0.03535158640207159),
 ('손실', 0.09099347000775752),
 ('생각', 0.02587358775480135),
 ('유사', 0.036018736356608516)]

In [31]:
# Ten (token, score) pairs from result with the highest TF-IDF scores.
sorted(result, key=lambda pair: pair[1], reverse=True)[:10]

[('우도', 0.30935433754247393),
 ('최대', 0.2644197269001561),
 ('모델', 0.21509543930315736),
 ('디언', 0.20954601175351925),
 ('엔트로피', 0.20954601175351925),
 ('트', 0.2020801317026838),
 ('메터', 0.18998546457990625),
 ('파라', 0.18998546457990625),
 ('확률분포', 0.17931834019736734),
 ('디센트', 0.1740779030970291)]

In [34]:
# Title of the third document (index 2).
titles[2]

'word2vec'

In [36]:
# Sparse TF-IDF row of the first document.
input_matrix[0]

<1x37143 sparse matrix of type '<class 'numpy.float64'>'
	with 106 stored elements in Compressed Sparse Row format>

In [37]:
from sklearn.decomposition import TruncatedSVD
# Project the TF-IDF matrix onto 100 latent dimensions.
# TruncatedSVD's default solver is randomized, so the seed is pinned to keep
# the document vectors reproducible across Restart & Run All.
svd = TruncatedSVD(n_components=100, random_state=42)
vecs = svd.fit_transform(input_matrix)

In [39]:
# Shape of the SVD-reduced document vectors.
vecs.shape

(204, 100)