In [1]:
import pandas as pd
# Load the NeurIPS paper table (year, title, abstract) from a path relative to the notebook.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which looks like a saved
# index; passing index_col=0 would likely remove it -- confirm against the CSV.
df = pd.read_csv('../dataset/neurips.csv')

In [2]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,year,title,abstract
0,2007,Competition Adds Complexity,It is known that determinining whether a DEC-P...
1,2007,Efficient Principled Learning of Thin Junction...,We present the first truly polynomial algorith...
2,2007,Regularized Boost for Semi-Supervised Learning,Semi-supervised inductive learning concerns ho...
3,2007,Simplified Rules and Theoretical Analysis for ...,We show that under suitable assumptions (prima...
4,2007,Predicting human gaze using low-level saliency...,"Under natural viewing conditions, human observ..."


In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [4]:
# Build a TF-IDF document-term matrix from the abstracts, dropping English stop words
# and capping the vocabulary at 2000 features.
# NOTE: despite its name, `cv` is a TfidfVectorizer, not a CountVectorizer.
cv = TfidfVectorizer(stop_words='english', max_features=2000)
x = cv.fit_transform(df.abstract)

In [5]:
# Vocabulary terms ordered by feature index (position i is the term for column i of `x`).
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2, so prefer
# `get_feature_names_out()` and fall back only on older releases. The result is converted
# to a plain list so `words` keeps the type the old API returned.
if hasattr(cv, 'get_feature_names_out'):
    words = list(cv.get_feature_names_out())
else:
    words = cv.get_feature_names()

In [6]:
#!pip install gensim

In [7]:
from gensim.matutils import Sparse2Corpus

In [8]:
# Wrap the sparse TF-IDF matrix as a gensim corpus. The matrix is transposed because
# Sparse2Corpus treats columns as documents by default, while `x` has one row per document.
corpus = Sparse2Corpus(x.T)

In [9]:
# Inspect the first document: a list of (feature id, TF-IDF weight) pairs.
corpus[0]

[(1209, 0.19719051171057086),
 (1250, 0.1878641742740461),
 (265, 0.11655716303382495),
 (495, 0.21779310927686704),
 (316, 0.16096846113531893),
 (321, 0.25275451313424946),
 (317, 0.3778227141887415),
 (1573, 0.35429796717697476),
 (650, 0.32556474378711336),
 (1346, 0.3395230603787923),
 (1759, 0.32398854042493375),
 (767, 0.18302469761238765),
 (1756, 0.1205808084872785),
 (1219, 0.2021297737105109),
 (1282, 0.1848866146903614),
 (984, 0.2396176930939823)]

In [10]:
# Map each feature index to its vocabulary term.
id2token = {idx: term for idx, term in enumerate(words)}

In [11]:
# Spot-check the mapping: term stored for feature id 9.
id2token[9]

'abstract'

## Corpus 형식으로 바로 변환

In [13]:
import re
from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS

In [18]:
# Word pattern: runs of at least two word characters delimited by word boundaries.
token_re = re.compile(r'\b\w{2,}\b', re.UNICODE)


def tokenizer(text):
    """Split ``text`` into lowercase tokens, dropping English stop words.

    Args:
        text: The raw string to tokenize.

    Returns:
        A list of lowercase tokens matched by ``token_re`` (two or more word
        characters each), in order of appearance, excluding any token found in
        ``ENGLISH_STOP_WORDS``.
    """
    lowered = text.lower()
    return [
        token
        for token in token_re.findall(lowered)
        if token not in ENGLISH_STOP_WORDS
    ]

In [21]:
# Tokenize every abstract; `docs` is a list of token lists, one per paper.
docs = [tokenizer(abstract) for abstract in df.abstract]

In [22]:
from gensim.corpora.dictionary import Dictionary

In [23]:
# Build the token <-> integer id mapping from the tokenized documents.
dic = Dictionary(docs)

In [24]:
# Prune the vocabulary in place: drop tokens appearing in fewer than 10 documents
# or in more than 90% of the documents.
dic.filter_extremes(no_below=10, no_above=0.9)

In [25]:
# Spot-check the filtered dictionary: token stored under id 100.
dic[100]

'existing'

In [27]:
# Convert every tokenized document into its bag-of-words form,
# i.e. a list of (token id, count) pairs. This rebinds `corpus`.
corpus = [dic.doc2bow(tokens) for tokens in docs]

In [28]:
# Inspect the first document as (token id, count) pairs.
corpus[0]

[(0, 1),
 (1, 1),
 (2, 1),
 (3, 2),
 (4, 2),
 (5, 2),
 (6, 1),
 (7, 2),
 (8, 1),
 (9, 2),
 (10, 1),
 (11, 1),
 (12, 1),
 (13, 1),
 (14, 1),
 (15, 2),
 (16, 2),
 (17, 1),
 (18, 2)]

## LDA

In [29]:
from gensim.models.ldamodel import LdaModel

In [30]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the bag-of-words documents for validation; the fixed seed keeps
# the split the same across runs.
train_corpus, valid_corpus = train_test_split(corpus, test_size=0.1, random_state=5432)

In [32]:
import warnings
# Silence DeprecationWarning messages for the rest of the session.
warnings.filterwarnings('ignore', category=DeprecationWarning)

In [33]:
# Fit an initial 100-topic LDA model on the training split; `random_state` fixes the
# seed so the fit is repeatable.
model = LdaModel(corpus=train_corpus, id2word=dic, num_topics=100, random_state=1234)

In [34]:
# Score the model on the held-out split. Despite the name `loss`, gensim's
# `log_perplexity` returns a per-word likelihood bound, so values closer to zero are
# better; the training loop below relies on this by looking for increases.
loss = model.log_perplexity(valid_corpus)
loss

-20.28821274823097

In [35]:
import numpy

# Keep making passes over the training split until the held-out score stops
# improving by more than 0.1 between consecutive passes. Starting from -inf
# guarantees that at least one pass is made.
old_loss = -numpy.inf
while True:
    if not loss > old_loss + 0.1:
        break
    model.update(train_corpus)
    # Remember the previous score and re-evaluate on the validation split.
    old_loss, loss = loss, model.log_perplexity(valid_corpus)
    print(loss)

-18.415288648947133
-17.327420555230482
-16.73191115961773
-16.387196635540032
-16.17544340264779
-16.031293237287528
-15.928734210093378
-15.85115408772109


In [36]:
# Persist the trained model under the 'lda-model' path in the working directory.
model.save('lda-model')

In [44]:
#!zip mylda.zip lda-model*

In [42]:
# Reload the saved model from disk, rebinding `model` to the loaded instance.
model = LdaModel.load('lda-model')

## LDA 결과 보기

In [47]:
# Show the highest-probability words of topic 2 as (word, probability) pairs.
model.show_topic(2)

[('deep', 0.08321178),
 ('networks', 0.06272127),
 ('neural', 0.050840236),
 ('convolutional', 0.03355563),
 ('network', 0.0319625),
 ('training', 0.02923743),
 ('art', 0.020741098),
 ('state', 0.019630505),
 ('performance', 0.018863246),
 ('layers', 0.017408974)]

In [48]:
# Look up the dictionary id assigned to the word 'topic'.
dic.token2id['topic']

307

In [49]:
# List the topics in which the word 'topic' has probability of at least 0.01.
# The word id is looked up from the dictionary instead of being hard-coded, because
# ids change whenever the dictionary is rebuilt or filtered differently.
model.get_term_topics(dic.token2id['topic'], 0.01)

[(8, 0.07108493)]

In [50]:
# Show the top words of topic 8, the topic reported for the word 'topic' above.
model.show_topic(8)

[('topic', 0.07110646),
 ('model', 0.04465457),
 ('latent', 0.03127821),
 ('dirichlet', 0.03098323),
 ('topics', 0.02813611),
 ('word', 0.025699541),
 ('models', 0.025632521),
 ('document', 0.025331365),
 ('lda', 0.025201213),
 ('words', 0.024289062)]

In [51]:
# Abstract-style text about latent Dirichlet allocation, used below as a new
# document for topic inference.
new_text = '''We describe latent Dirichlet allocation (LDA), a generative probabilistic model for collections of
discrete data such as text corpora. LDA is a three-level hierarchical Bayesian model, in which each
item of a collection is modeled as a finite mixture over an underlying set of topics. Each topic is, in
turn, modeled as an infinite mixture over an underlying set of topic probabilities. In the context of
text modeling, the topic probabilities provide an explicit representation of a document. We present
efficient approximate inference techniques based on variational methods and an EM algorithm for
empirical Bayes parameter estimation. We report results in document modeling, text classification,
and collaborative filtering, comparing to a mixture of unigrams model and the probabilistic LSI
model.'''

In [52]:
# Preprocess the new text the same way as the training data: tokenize, then map to
# (token id, count) pairs using the existing dictionary.
doc = tokenizer(new_text)
bow = dic.doc2bow(doc)

In [53]:
# Infer the topic mixture of the new document as (topic id, probability) pairs.
model.get_document_topics(bow)

[(1, 0.09346718),
 (8, 0.44146907),
 (12, 0.06770127),
 (14, 0.08519129),
 (22, 0.06444722),
 (44, 0.017260868),
 (54, 0.06176714),
 (78, 0.05043004),
 (96, 0.10544732)]

## LDAvis를 통한 결과 시각화

In [55]:
#!pip install pyLDAvis #==2.1.2

In [56]:
import pyLDAvis.gensim

In [57]:
# Enable inline rendering of pyLDAvis visualizations in the notebook.
pyLDAvis.enable_notebook()

  and should_run_async(code)


In [58]:
# Prepare the LDAvis data from the model, the full bag-of-words corpus (training and
# validation documents) and the dictionary. sort_topics=False is intended to keep the
# model's own topic order rather than re-sorting the topics.
# NOTE(review): newer pyLDAvis releases expose this module as `pyLDAvis.gensim_models`;
# confirm which version is installed.
p = pyLDAvis.gensim.prepare(model, corpus, dic, sort_topics=False)

  and should_run_async(code)


In [59]:
# Render the prepared visualization in the cell output.
pyLDAvis.display(p)

  and should_run_async(code)


In [None]:
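# A value of about 0.6 is recommended for this hyperparameter (presumably the relevance slider λ in the LDAvis view above -- confirm).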

## 응집도와 다양도 계산

In [60]:
# 응집도
from gensim.models import CoherenceModel

  and should_run_async(code)


In [61]:
# Set up a 'c_v' coherence evaluation of the trained model, giving it the tokenized
# documents, the bag-of-words corpus and the dictionary.
coh = CoherenceModel(model=model, corpus=corpus, texts=docs, dictionary=dic, coherence='c_v')

  and should_run_async(code)


In [62]:
# Compute the overall coherence score of the model.
coh.get_coherence()

  and should_run_async(code)


0.3552873729435981

In [63]:
# Diversity: gather the distinct words that appear among the top `topn` words
# of any topic.
topn = 25
top_words = {
    term
    for topic_id in range(model.num_topics)
    for term, _ in model.show_topic(topic_id, topn=topn)
}

  and should_run_async(code)


In [66]:
# Number of distinct words among the top-`topn` words of all topics.
len(top_words)

  and should_run_async(code)


1067

In [67]:
# Topic diversity: share of unique words among all top-word slots
# (`topn` words for each of the model's topics). Computed from the live values
# instead of hand-copied numbers so it stays correct when the model or `topn` changes.
len(top_words) / (topn * model.num_topics)

  and should_run_async(code)


0.4268