In [1]:
from pathlib import Path
from pprint import pprint

import numpy as np
import pandas as pd

import gensim
import pyLDAvis
import pyLDAvis.gensim
from tqdm.auto import tqdm

tqdm.pandas()
pyLDAvis.enable_notebook()

  from pandas import Panel


In [2]:
DATA_TYPE = 'full-morphs-jamo'
DATA_NOSTOPS_TYPE = 'full-nouns-jamo'

TOPIC = 40

if DATA_TYPE.startswith('outer'):
    TEXT_COLUMNS = ['headline', 'lede']
elif DATA_TYPE.startswith('inner'):
    TEXT_COLUMNS = ['title', 'contents']
else:
    TEXT_COLUMNS = ['headline', 'lede', 'contents']

In [3]:
TOKENIZING_DIR = '3-tokenizing'
EMBEDDING_DIR = '4-embedding'
DATA_DIR = 'lol'

tokenizing_dir = Path(TOKENIZING_DIR) / DATA_DIR
embedding_dir = Path(EMBEDDING_DIR) / DATA_DIR
embedding_dir.mkdir(parents=True, exist_ok=True)

### tokenize

In [4]:
def get_sentences(df):
    data_words = []
    for i, *rows in tqdm(df[TEXT_COLUMNS].itertuples(), total=len(df)):
        rows = [x for x in rows if isinstance(x, str)]
        data_words.append(' '.join(rows).split(' '))
    return data_words

### bigram

In [5]:
df = pd.read_csv(tokenizing_dir / 'news-{}.csv'.format(DATA_TYPE)).dropna()
contents = [x.split(' ') for x in df.contents]

In [6]:
bigram = gensim.models.Phrases(contents)
bigram_mod = gensim.models.phrases.Phraser(bigram)

trigram = gensim.models.Phrases(bigram[contents])
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [7]:
def make_bigrams(texts):
    return [bigram_mod[doc] if doc is not np.nan else doc for doc in tqdm(texts)]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] if doc is not np.nan else doc for doc in tqdm(texts)]

In [8]:
bigram_mod_path = embedding_dir / 'news-{}-bigram.bin'.format(DATA_TYPE)
bigram_mod.save(str(bigram_mod_path))

trigram_mod_path = embedding_dir / 'news-{}-trigram.bin'.format(DATA_TYPE)
trigram_mod.save(str(trigram_mod_path))

In [9]:
nostops_df = pd.read_csv(tokenizing_dir / 'news-{}.csv'.format(DATA_NOSTOPS_TYPE))
title_nostops = [x.split(' ') if x is not np.nan else x for x in nostops_df.title]
lede_nostops = [x.split(' ') if x is not np.nan else x for x in nostops_df.lede]
contents_nostops = [x.split(' ') if x is not np.nan else x for x in nostops_df.title]

title_bigrams = make_bigrams(title_nostops)
lede_bigrams = make_bigrams(lede_nostops)
contents_bigrams = make_bigrams(contents_nostops)

#data_words_trigrams = make_trigrams(data_words_nostops)

HBox(children=(FloatProgress(value=0.0, max=51816.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=51816.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=51816.0), HTML(value='')))




### corpus

In [10]:
id2word = gensim.corpora.Dictionary(data_words_bigrams)
texts = data_words_bigrams
corpus = [id2word.doc2bow(text) for text in texts]
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 2), (10, 1), (11, 1), (12, 2), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1)]]


### topic

In [11]:
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=TOPIC,
    random_state=119,
    update_every=1,
    alpha='auto',
    per_word_topics=True
)

In [12]:
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(1,
  '0.231*"원거리_딜러" + 0.075*"뱅" + 0.063*"배준식" + 0.058*"중계" + 0.042*"상황" + '
  '0.035*"오전" + 0.034*"일" + 0.022*"동시" + 0.018*"달" + 0.017*"유리"'),
 (32,
  '0.104*"말" + 0.075*"이후" + 0.054*"박종익" + 0.054*"투신" + 0.053*"프레이" + '
  '0.050*"선승" + 0.043*"김종인" + 0.029*"그" + 0.024*"시드" + 0.023*"오"'),
 (27,
  '0.073*"연속" + 0.040*"주도" + 0.036*"터키" + 0.034*"관심" + 0.033*"박" + 0.032*"출신" '
  '+ 0.031*"의미" + 0.030*"코" + 0.028*"흐름" + 0.026*"희망"'),
 (18,
  '0.138*"서포터" + 0.096*"자신" + 0.089*"오후_서울" + 0.053*"엑_스포츠" + 0.050*"션" + '
  '0.049*"일" + 0.034*"차이" + 0.033*"강_찬용" + 0.031*"소감" + 0.029*"마타_조세"'),
 (4,
  '0.206*"온라인" + 0.076*"압도" + 0.058*"스틸" + 0.055*"최근" + 0.052*"위치" + '
  '0.048*"맹활약" + 0.046*"진행" + 0.044*"챔피언십" + 0.040*"이유" + 0.038*"속"'),
 (8,
  '0.065*"후" + 0.061*"연승_질주" + 0.040*"윤" + 0.038*"스피릿" + 0.038*"다음" + '
  '0.026*"영상" + 0.026*"날" + 0.024*"아쉬움" + 0.022*"일" + 0.021*"더블_리프트"'),
 (0,
  '0.160*"경기력" + 0.100*"플리커" + 0.049*"오후" + 0.045*"실수" + 0.041*"시" + '
  '0.041*"결정" + 0.040*"젠" + 0.038*"팀" +

### score

In [13]:
print('Perplexity: ', lda_model.log_perplexity(corpus)) # a measure of how good the model is. lower the better.

coherence_model_lda = gensim.models.CoherenceModel(model=lda_model, texts=data_words_bigrams, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

Perplexity:  -15.71150994487184
Coherence Score:  0.3558041773977927


### visualize

In [15]:
vis_path = embedding_dir / 'news-{}-{}-lda-topic{}-visualization.html'.format(DATA_NOSTOPS_TYPE, '-'.join(TEXT_COLUMNS), TOPIC)
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
pyLDAvis.save_html(vis, open(vis_path, 'w'))
#vis

In [16]:
lda_model_path = embedding_dir / 'news-{}-{}-lda-topic{}.bin'.format(DATA_NOSTOPS_TYPE, '-'.join(TEXT_COLUMNS), TOPIC)
lda_model.save(str(lda_model_path))