# 5-1. **Topic Modeling**

In [1]:
!pip install -q nltk
!pip install -q konlpy
!pip install -q gensim

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m36.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m488.6/488.6 kB[0m [31m37.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import pandas as pd

import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
!wget https://raw.githubusercontent.com/kimtwan/NLP_lecture/master/data/MPB_minutes.tsv

--2024-05-21 02:25:23--  https://raw.githubusercontent.com/kimtwan/NLP_lecture/master/data/MPB_minutes.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15195720 (14M) [text/plain]
Saving to: ‘MPB_minutes.tsv’


2024-05-21 02:25:25 (37.3 MB/s) - ‘MPB_minutes.tsv’ saved [15195720/15195720]



In [4]:
# load Monetary Policy Board minutes text
data = pd.read_csv('MPB_minutes.tsv', encoding='utf-8', sep='\t', parse_dates=['날짜'], index_col='날짜')
# minutes after 2020
data = data.sort_index().loc['2020-01-01':]
data.head()

Unnamed: 0_level_0,회차,의사록
날짜,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-17,2020년도 제1차,2020년도 제1차\n금융통화위원회 의사록\n한 국 은 행\n1. 일 자 2020년...
2020-02-27,2020년도 제4차,2020년도 제4차\n금융통화위원회 의사록\n한 국 은 행\n1. 일 자 2020년...
2020-03-16,2020년도 제6차,2020년도 제6차\n금융통화위원회 의사록\n한 국 은 행\n1. 일 자 2020년...
2020-04-09,2020년도 제8차,2020년도 제8차\n금융통화위원회 의사록\n한 국 은 행\n1. 일 자 2020년...
2020-05-28,2020년도 제12차,2020년도 제12차\n금융통화위원회 의사록\n한 국 은 행\n1. 일 자 2020...


In [5]:
from konlpy.tag import Kkma
from nltk import sent_tokenize
from tqdm import notebook

In [6]:
kkma = Kkma()
corpus = data['의사록'].tolist()

corpus_nouns = []
# takes aboute 9 mins
for i in notebook.tqdm(range(len(corpus))):
    # separate the minutes into sentences
    sentences = sent_tokenize(corpus[i])
    for sentence in sentences:
        # extract nouns
        nouns = kkma.nouns(sentence)
        # includes only words longer than 2 characters
        nouns = [noun for noun in nouns if len(noun) > 1]
        corpus_nouns.append(nouns)

  0%|          | 0/23 [00:00<?, ?it/s]

In [7]:
import pprint
pprint.pprint(corpus_nouns[:5])

[['2020', '2020년', '1차', '금융', '금융통화위원회', '통화', '위원회', '의사록'],
 ['2020', '2020년', '1월', '17', '17일'],
 ['금융', '금융통화위원회', '통화', '위원회', '회의실'],
 ['출석', '출석위원', '위원'],
 ['결석', '결석위원', '위원']]


In [8]:
# topic analysis using LDA
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel

In [9]:
# convert sentences into a form that can be used with gensim LDA
dictionary = Dictionary(corpus_nouns)
corpus = [dictionary.doc2bow(text) for text in corpus_nouns]

In [10]:
pprint.pprint(corpus[:5])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
 [(1, 1), (2, 1), (8, 1), (9, 1), (10, 1)],
 [(3, 1), (4, 1), (5, 1), (7, 1), (11, 1)],
 [(12, 1), (13, 1), (14, 1)],
 [(12, 1), (15, 1), (16, 1)]]


In [11]:
# set number of topics
num_topics = 5

# run the model
lda = LdaModel(corpus, num_topics=num_topics, id2word=dictionary)



In [12]:
# print topics
topics = lda.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.030*"금리" + 0.023*"통화" + 0.022*"정책" + 0.020*"금융" + 0.015*"통화정책" + 0.013*"기준" + 0.013*"기준금리" + 0.013*"상승" + 0.011*"시장" + 0.009*"위원"')
(1, '0.019*"경제" + 0.017*"소비" + 0.015*"흐름" + 0.015*"회복" + 0.012*"지속" + 0.012*"수출" + 0.011*"전망" + 0.010*"둔화" + 0.010*"경기" + 0.010*"개선"')
(2, '0.027*"관련" + 0.023*"관련부서" + 0.023*"부서" + 0.015*"답변" + 0.014*"필요" + 0.013*"고용" + 0.010*"위원" + 0.009*"취업자" + 0.007*"언급" + 0.007*"우리"')
(3, '0.025*"물가" + 0.014*"상승" + 0.013*"인플레이션" + 0.012*"경제" + 0.010*"전망" + 0.010*"영향" + 0.010*"코로나" + 0.009*"압력" + 0.009*"수준" + 0.008*"예상"')
(4, '0.016*"관련" + 0.014*"자금" + 0.013*"인상" + 0.013*"대출" + 0.013*"위원" + 0.012*"가계" + 0.012*"시장" + 0.011*"금융" + 0.010*"부서" + 0.010*"관련부서"')
