# 5-1. **Topic Modeling**

In [None]:
!pip install -q nltk
!pip install -q konlpy
!pip install -q gensim

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m36.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m465.3/465.3 kB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import pandas as pd

import nltk
# Recent NLTK releases load the sentence tokenizer used by sent_tokenize from
# the 'punkt_tab' resource rather than the pickled 'punkt' models, so fetch
# both. The 'punkt' call stays last so the cell's displayed result is unchanged.
nltk.download('punkt_tab', quiet=True)
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
!wget https://raw.githubusercontent.com/kimtwan/NLP_lecture/master/data/MPB_minutes.tsv

--2023-10-17 21:30:39--  https://raw.githubusercontent.com/kimtwan/NLP_lecture/master/data/MPB_minutes.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15195720 (14M) [text/plain]
Saving to: ‘MPB_minutes.tsv’


2023-10-17 21:30:40 (168 MB/s) - ‘MPB_minutes.tsv’ saved [15195720/15195720]



In [None]:
# Load the Monetary Policy Board minutes, indexed by the parsed '날짜' (date)
# column, sort by that index, and keep only rows from 2020-01-01 onward.
data = (
    pd.read_csv(
        'MPB_minutes.tsv',
        sep='\t',
        encoding='utf-8',
        index_col='날짜',
        parse_dates=['날짜'],
    )
    .sort_index()
    .loc['2020-01-01':]
)
data.head()

  data = data.loc['2020-01-01':]


Unnamed: 0_level_0,회차,의사록
날짜,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-05-28,2020년도 제12차,2020년도 제12차\n금융통화위원회 의사록\n한 국 은 행\n1. 일 자 2020...
2020-07-16,2020년도 제15차,2020년도 제15차\n금융통화위원회 의사록\n한 국 은 행\n1. 일 자 2020...
2020-08-27,2020년도 제19차,2020년도 제19차\n금융통화위원회 의사록\n한 국 은 행\n1. 일 자 2020...
2020-01-17,2020년도 제1차,2020년도 제1차\n금융통화위원회 의사록\n한 국 은 행\n1. 일 자 2020년...
2020-10-14,2020년도 제22차,2020년도 제22차\n금융통화위원회 의사록\n한 국 은 행\n1. 일 자 2020...


In [None]:
from konlpy.tag import Kkma
from nltk import sent_tokenize
from tqdm import notebook

In [None]:
kkma = Kkma()
corpus = data['의사록'].tolist()

# Build one list of nouns per sentence across all minutes.
corpus_nouns = []
for minutes_text in notebook.tqdm(corpus):
    # split the minutes into sentences
    for sentence in sent_tokenize(minutes_text):
        # extract nouns, keeping only those with at least 2 characters
        corpus_nouns.append(
            [noun for noun in kkma.nouns(sentence) if len(noun) > 1]
        )

  0%|          | 0/23 [00:00<?, ?it/s]

In [None]:
import pprint
# preview the noun lists extracted from the first five sentences
pprint.pprint(corpus_nouns[:5])

[['2020', '2020년', '12', '12차', '금융', '금융통화위원회', '통화', '위원회', '의사록'],
 ['2020', '2020년', '5월', '28', '28일'],
 ['금융', '금융통화위원회', '통화', '위원회', '회의실'],
 ['출석', '출석위원', '위원'],
 ['결석', '결석위원', '위원']]


In [None]:
# topic analysis using LDA
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel

In [None]:
# Build the gensim vocabulary from the per-sentence noun lists, then encode
# every sentence with doc2bow so it can be fed to gensim's LDA.
# NOTE: this rebinds `corpus` from the raw minutes texts to bag-of-words vectors.
dictionary = Dictionary(corpus_nouns)
corpus = list(map(dictionary.doc2bow, corpus_nouns))

In [None]:
# preview the first five entries of the doc2bow-encoded corpus
pprint.pprint(corpus[:5])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)],
 [(2, 1), (3, 1), (9, 1), (10, 1), (11, 1)],
 [(4, 1), (5, 1), (6, 1), (8, 1), (12, 1)],
 [(13, 1), (14, 1), (15, 1)],
 [(13, 1), (16, 1), (17, 1)]]


In [None]:
# set number of topics
num_topics = 10
# LDA training is stochastic; a fixed seed makes the fitted topics
# reproducible across Restart & Run All.
LDA_SEED = 42

# run the model
lda = LdaModel(
    corpus,
    num_topics=num_topics,
    id2word=dictionary,
    random_state=LDA_SEED,
)



In [None]:
# Show the 10 highest-weighted words of every topic, one (id, terms) tuple per line.
topics = lda.print_topics(num_words=10)
for topic_id, weighted_terms in topics:
    print((topic_id, weighted_terms))

(0, '0.023*"코로나" + 0.022*"경제" + 0.020*"19" + 0.019*"코로나19" + 0.017*"확산" + 0.016*"국내" + 0.016*"소비" + 0.014*"국내경제" + 0.012*"완화" + 0.011*"회복"')
(1, '0.033*"관련" + 0.029*"위원" + 0.025*"관련부서" + 0.025*"부서" + 0.018*"가계" + 0.017*"언급" + 0.015*"필요" + 0.015*"대출" + 0.012*"답변" + 0.011*"견해"')
(2, '0.020*"금리" + 0.016*"인상" + 0.016*"영향" + 0.014*"시장" + 0.013*"상승" + 0.013*"경제" + 0.011*"경기" + 0.010*"주요국" + 0.010*"미국" + 0.009*"우리"')
(3, '0.018*"기업" + 0.016*"관련" + 0.015*"대출" + 0.014*"금리" + 0.013*"부서" + 0.013*"관련부서" + 0.013*"자금" + 0.012*"은행" + 0.012*"외환" + 0.011*"금융"')
(4, '0.025*"인플레이션" + 0.019*"기대인플레이션" + 0.018*"지속" + 0.014*"기대" + 0.013*"증가" + 0.012*"서비스" + 0.011*"중심" + 0.011*"취업자수" + 0.010*"경제" + 0.010*"흐름"')
(5, '0.020*"전망" + 0.020*"대출" + 0.018*"금년" + 0.017*"증가" + 0.017*"은행" + 0.015*"금리" + 0.014*"수준" + 0.013*"지속" + 0.013*"확대" + 0.012*"예상"')
(6, '0.022*"물가" + 0.015*"상승" + 0.014*"금융" + 0.014*"가격" + 0.013*"필요" + 0.011*"시장" + 0.011*"상황" + 0.010*"경제" + 0.009*"있음" + 0.009*"요인"')
(7, '0.046*"통화" + 0.044*"정책" + 0.