In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
import konlpy
import re

# Characters other than , . ? ! word characters and whitespace are stripped
# before tagging.
_NON_TEXT_PATTERN = re.compile(r'[^,.?!\w\s]')

# Nouns that are dropped from the token stream. A frozenset gives O(1)
# membership tests and is built once, not on every call.
_EXCLUDED_NOUNS = frozenset([
    '거래', '편입', '특징', '고수', '주식', '코스피', '전일', '대비',
    '이평', '검색', '랭킹', '이상', '매수', '시각', '시작', '로부터',
])

# Lazily created Okt tagger shared by all calls (see _get_okt_tagger).
_OKT_TAGGER = None


def _get_okt_tagger():
    """Return the shared ``konlpy.tag.Okt`` instance, creating it on first use."""
    global _OKT_TAGGER
    if _OKT_TAGGER is None:
        _OKT_TAGGER = konlpy.tag.Okt()
    return _OKT_TAGGER


def tokenize_korean_text(text, tagger=None):
    """Reduce a Korean text to a space-separated string of its nouns.

    The text is first stripped of every character that is not a comma,
    period, question mark, exclamation mark, word character or whitespace.
    It is then part-of-speech tagged, and only tokens tagged ``'Noun'`` that
    are not in the exclusion list are kept, in their original order.

    Parameters
    ----------
    text : str
        Raw document. Any non-string value (e.g. ``None`` or a NaN coming
        from a missing DataFrame cell) is treated as an empty document.
    tagger : object, optional
        Object exposing ``pos(text)`` that returns ``(word, pos)`` pairs.
        Defaults to a single shared ``konlpy.tag.Okt`` instance, so the
        tagger is not re-created for every document.

    Returns
    -------
    str
        The retained nouns joined by single spaces; ``''`` when none remain.
    """
    # Missing values would make the regex substitution raise TypeError.
    if not isinstance(text, str):
        return ''

    cleaned = _NON_TEXT_PATTERN.sub('', text)

    if tagger is None:
        tagger = _get_okt_tagger()

    nouns = [
        word
        for word, pos in tagger.pos(cleaned)
        if pos == 'Noun' and word not in _EXCLUDED_NOUNS
    ]
    return ' '.join(nouns)


# Tokenize every document in the 'info' column, keeping the original row order.
tokenized_list = [tokenize_korean_text(document) for document in df['info']]


In [None]:
# Exclude documents that contain too few distinct words.
min_unique_tokens = 3

# Decide once, per list position, whether the document is kept.
keep_flags = [
    len(set(corpus.split())) >= min_unique_tokens
    for corpus in tokenized_list
]

# The dropped documents are kept around for inspection.
drop_corpus = [
    corpus for corpus, keep in zip(tokenized_list, keep_flags) if not keep
]

# tokenized_list is positional, so translate positions into index labels
# before dropping; this stays correct even when df does not carry a default
# 0..n-1 index.
# NOTE(review): assumes df's index labels are unique — confirm.
drop_positions = [pos for pos, keep in enumerate(keep_flags) if not keep]
if drop_positions:
    df.drop(index=df.index[drop_positions], inplace=True)

# Filter the list in a single pass (slice assignment keeps the same list
# object) instead of repeated remove-by-value calls.
tokenized_list[:] = [
    corpus for corpus, keep in zip(tokenized_list, keep_flags) if keep
]

# Re-align df's index with the positions in tokenized_list.
df.reset_index(drop=True, inplace=True)

In [None]:
# Bag-of-words features over unigrams and bigrams, with the vocabulary capped
# at 1000 terms.
count_vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    max_features=1000,
)

# Learn the vocabulary and build the document-term count matrix in one step.
feat_vect = count_vectorizer.fit_transform(tokenized_list)

In [None]:
# Set the number of topics, then fit LDA on the document-term matrix.
# random_state is fixed so that the extracted topics are reproducible across
# kernel restarts; without it every run yields different topics.
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(feat_vect)

In [None]:
# 토픽 모델링 결과 보여줌
def display_topics(model, feature_names, num_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    For each row of ``model.components_`` a header line with the zero-based
    topic index is printed, followed by one line holding the
    ``num_top_words`` terms with the largest weights, in descending weight
    order and separated by single spaces.

    Parameters
    ----------
    model : object
        Fitted topic model exposing ``components_``, an iterable of
        per-topic weight arrays that support ``argsort()``.
    feature_names : sequence of str
        Term for each column of ``components_``; indexed by column position.
    num_top_words : int
        Number of terms to print per topic.

    Returns
    -------
    None
    """
    for topic_index, topic in enumerate(model.components_):
        print('%d 번째 토픽 추출' % (topic_index))

        # argsort() is ascending, so reverse it to get the heaviest terms
        # first, then keep only the requested number of positions.
        top_indexes = topic.argsort()[::-1][:num_top_words]

        feature_result = ' '.join(feature_names[i] for i in top_indexes)
        print(feature_result)

# Vocabulary terms learned by the vectorizer.
feature_names = count_vectorizer.get_feature_names_out()

# Show the top 10 keywords for each topic.
display_topics(model=lda, feature_names=feature_names, num_top_words=10)