In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import numpy as np

In [2]:
# Colab-only setup, kept as an inert string literal so nothing is installed when
# the notebook runs locally. The shell lines below are NOT executed from inside
# the string; to use them on Colab, paste them into their own cell.
"""!pip install konlpy
!git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git
!cd Mecab-ko-for-Google-Colab
!bash /content/Mecab-ko-for-Google-Colab/install_mecab-ko_on_colab_light_220429.sh
"""

'!pip install konlpy\n!git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git\n!cd Mecab-ko-for-Google-Colab\n!bash /content/Mecab-ko-for-Google-Colab/install_mecab-ko_on_colab_light_220429.sh\n'

In [3]:
import os

from konlpy.tag import Mecab

# Location of the mecab-ko dictionary. The original Windows path stays the
# default, but it can be overridden per machine (e.g. on Colab/Linux) by setting
# the MECAB_DICPATH environment variable instead of editing the notebook.
MECAB_DICPATH = os.environ.get("MECAB_DICPATH", r"C:/mecab/mecab-ko-dic")

# Morphological analyser used further down to extract nouns from each summary.
tagger = Mecab(dicpath=MECAB_DICPATH)

In [4]:
# Suffix of the metadata workbook to analyse
# (presumably the year-month of the extract, YYMM - confirm).
yymm = "2401"

# Directory that holds the workbook, and the workbook's file name.
file_path = '../data/'
fn = f'epic_metadata_{yymm}.xlsx'

# Load the whole workbook into a DataFrame.
df = pd.read_excel(f'{file_path}{fn}')

In [5]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0.1,Unnamed: 0,자료명,발간일,발간처,요약
0,246696,2024년 2월 물가연동국고채 종목별 연동계수,2024.01.02,기획재정부 국고국 국채과,기획재정부는 1.2.(화) 2024년 2월 물가연동국고채 종목별 연동계수를 발표하였...
1,246697,「2024년부터 이렇게 달라집니다」 책자 발간,2023.12.28,기획재정부 기획조정실 혁신정책담당관,기획재정부는 12.31.(일) 2024년부터 달라지는 제도와 법규사항 등을 알기 쉽...
2,246698,국세물납증권 56개 종목 공개매각 실시,2024.01.02,기획재정부 국고국 출자관리과,정부는 국유재산정책심의위원회에서 의결한 「2023년도 제3차 국세물납증권 매각 예정...
3,246699,"조달청, 1월 대형사업 총 163건, 1조 8,901억 원 상당 입찰 예정",2024.01.02,조달청,"조달청은 1.2.(화) ’24년 1월 한 달 동안 총 163건 1조 8,901억 원..."
4,246700,인구감소지역 『생활인구』 시범산정 결과 공표,2024.01.01,통계청,통계청과 행정안전부은 1.1.(월) 7개의 인구감소지역에 대해 「생활인구」를 시범 ...


In [6]:
# Nouns discarded after morphological analysis (particles, units, weekdays,
# boilerplate words). Written as one space-separated string and split on single
# spaces into a list.
stop_words = (
    "안 간 붙임 참고 참조 첨부 총리 장관 겸 청 실 는 은 가 등 원 조 개 이 저 "
    "다만 하지만 그러나 중 억 년 월 화 수 목 금 토 일 것 줄 만 건 "
    "또한 아울러 그리고 전년 금년 이번 올해 내년 "
    "기획 재정부 첨부 파일 내용 발표 위원회 감독원 부"
).split(" ")

In [7]:
# Substrings stripped from every summary before noun extraction: ministry and
# agency names (full and abbreviated forms) plus boilerplate sentence endings.
# They are removed one after another with str.replace, so order matters: a longer
# name must come before any shorter entry it contains (e.g. "기획재정부" before
# "정부", "금융위원회" before "금융위").
excluson = [
    "기획재정부", "정부",
    "과학기술정보통신부", "과기정통부",
    "농림축산식품부", "농식품부",
    "금융위원회", "금융위",
    "금융감독원", "금감원",
    "산업통상자원부", "산업부",
    "환경부",
    "해양수산부", "해수부",
    "공정거래위원회", "공정위",
    "식약처", "식품의약품안전처",
    "고용노동부", "고용부",
    "국토교통부", "국토부",
    "중소벤처기업부", "중기부",
    "통계청", "국세청", "관세청", "조달청", "특허청",
    "통일부",
    "보건복지부", "복지부",
    "교육부",
    "한국은행",
    # Boilerplate sentence endings.
    "계획임.", "예정임.", "밝혔다.", "발표하였다.",
]

In [8]:
# Turn every summary into a list of nouns:
#   1. delete the organisation names / boilerplate endings listed in `excluson`,
#   2. extract nouns with the Mecab tagger,
#   3. drop nouns that appear in `stop_words`.
# Missing summaries (NaN) are treated as empty text so that str.replace does not
# fail on a float and `documents_list` stays aligned row-for-row with `df`.
summary_list = df['요약'].fillna('').astype(str).tolist()

# Set membership is O(1); the stop-word list itself is left untouched.
stop_word_set = set(stop_words)

documents_list = []
for summary in summary_list:
    # Sequential removal: the order of `excluson` decides which name wins when
    # one entry is a substring of another.
    for ex in excluson:
        summary = summary.replace(ex, '')
    documents_list.append(
        [noun for noun in tagger.nouns(summary) if noun not in stop_word_set]
    )

In [9]:
# Inspect the noun lists produced for the first two summaries.
documents_list[:2]

[['물가', '연동', '국고', '종목별', '연동', '계수'],
 ['제도',
  '법규',
  '사항',
  '정리',
  '책자',
  '발간',
  '책자',
  '기관',
  '정책',
  '분야',
  '시기',
  '기관',
  '구성',
  '주요',
  '이해',
  '삽화',
  '제시',
  '청년',
  '여성',
  '부모',
  '다문화가정',
  '신혼',
  '부부',
  '계층',
  '특화',
  '정책',
  '분야',
  '제시',
  '한편',
  '사회',
  '안전',
  '탄소',
  '중립',
  '교육',
  '지원',
  '관련',
  '제도',
  '변경',
  '다수',
  '포함',
  '책자',
  '초',
  '지방',
  '자치',
  '단체',
  '공공',
  '도서관',
  '점자',
  '도서관',
  '권',
  '배포',
  '비치',
  '온라인',
  '공개',
  '분야',
  '주요',
  '삽화']]

In [10]:
# Re-join each noun list into one space-separated string, the input format the
# vectorizer and the sentence-embedding step consume.
documents = list(map(' '.join, documents_list))

In [11]:
# Inspect the space-joined text of the first two documents.
documents[:2]

['물가 연동 국고 종목별 연동 계수',
 '제도 법규 사항 정리 책자 발간 책자 기관 정책 분야 시기 기관 구성 주요 이해 삽화 제시 청년 여성 부모 다문화가정 신혼 부부 계층 특화 정책 분야 제시 한편 사회 안전 탄소 중립 교육 지원 관련 제도 변경 다수 포함 책자 초 지방 자치 단체 공공 도서관 점자 도서관 권 배포 비치 온라인 공개 분야 주요 삽화']

In [12]:
# TF-IDF vectorizer whose vocabulary is capped at 1000 terms.
# NOTE(review): with scikit-learn's default token_pattern only tokens of two or
# more characters are kept, so one-character nouns produced by the tagger would
# be dropped here - confirm that this is intended.
vectorizer = TfidfVectorizer(max_features=1000)

# Learn the vocabulary from `documents` and build the sparse document-term matrix.
X = vectorizer.fit_transform(documents)

In [13]:
# (n_documents, n_terms) of the TF-IDF matrix. Read it from the sparse matrix
# directly: .toarray() would allocate a full dense copy only to throw it away.
X.shape

(1017, 1000)

1017개의 문서,
1000개의 단어

## LSA

In [14]:
# Number of topics to extract; used as TruncatedSVD's n_components (LSA) and as
# num_topics for the LDA model.
n_topics = 20

In [15]:
# LSA: truncated SVD of the TF-IDF matrix with a fixed seed, so repeated runs
# give the same components.
svd_model = TruncatedSVD(
    n_components=n_topics,
    algorithm='randomized',
    n_iter=100,
    random_state=123,
).fit(X)

# One row per topic, one column per vocabulary term.
svd_model.components_.shape

(20, 1000)

`svd_model.components_`는 $V^T$ 행렬이며, 크기는 $t \times \text{단어 수}$ 이다 ($t$는 토픽 수, 여기서는 $20 \times 1000$).

In [16]:
# Vocabulary of the fitted vectorizer (1,000 terms are stored).
terms = vectorizer.get_feature_names_out()

def get_topics(components, feature_names, n=5):
    """Print the ``n`` highest-weighted terms of every component.

    Args:
        components: Iterable of 1-D NumPy arrays (e.g. the rows of a 2-D
            array), one array of term weights per topic.
        feature_names: Sequence indexed by term position, giving the term
            that belongs to each weight.
        n: Number of terms to show per topic (default 5).

    Returns:
        None. One line per topic is written to stdout in the form
        ``Topic <k>: [(term, weight), ...]``, where ``k`` starts at 1 and
        each weight is rounded to 5 decimals.
    """
    for topic_no, weights in enumerate(components, start=1):
        # argsort is ascending: reverse it and keep the first n positions.
        strongest = weights.argsort()[::-1][:n]
        ranked = [(feature_names[col], weights[col].round(5)) for col in strongest]
        print("Topic %d:" % topic_no, ranked)
# Print the top terms (default n=5) of each LSA component.
get_topics(svd_model.components_,terms)

Topic 1: [('지원', 0.26487), ('사업', 0.24845), ('기업', 0.18016), ('기술', 0.1589), ('산업', 0.12194)]
Topic 2: [('사업', 0.29715), ('기술', 0.27568), ('기업', 0.206), ('연구', 0.2006), ('개발', 0.19139)]
Topic 3: [('금융', 0.29189), ('대비', 0.27157), ('증가', 0.26608), ('대출', 0.21259), ('감소', 0.19167)]
Topic 4: [('대비', 0.20586), ('증가', 0.18783), ('기술', 0.18045), ('감소', 0.16739), ('제품', 0.16043)]
Topic 5: [('지역', 0.2892), ('사업', 0.2349), ('대비', 0.16429), ('인구', 0.15615), ('지원', 0.15557)]
Topic 6: [('제품', 0.29716), ('기업', 0.26778), ('수출', 0.22979), ('지원', 0.2009), ('식품', 0.18841)]
Topic 7: [('서비스', 0.22834), ('증가', 0.19247), ('대비', 0.17882), ('정보', 0.17875), ('제품', 0.14022)]
Topic 8: [('의료', 0.46571), ('교육', 0.27973), ('마약', 0.15343), ('창업', 0.14768), ('정책', 0.13304)]
Topic 9: [('기업', 0.25974), ('개정', 0.24603), ('창업', 0.17741), ('개정안', 0.1755), ('시행령', 0.16618)]
Topic 10: [('교육', 0.39035), ('안전', 0.23003), ('창업', 0.2251), ('지식', 0.12578), ('점검', 0.12563)]
Topic 11: [('사업', 0.26381), ('의료', 0.24639), ('창업', 0.2

## LDA

In [17]:
from gensim.models.ldamodel import LdaModel
from gensim.models.callbacks import CoherenceMetric
from gensim import corpora
from gensim.models.callbacks import PerplexityMetric

  "class": algorithms.Blowfish,


In [18]:
# Tokenised documents (one list of nouns per summary) fed to gensim.
texts = documents_list

# Map each distinct token to an integer id.
dictionary = corpora.Dictionary(texts)

# Keep only tokens that occur in at least 5 documents. filter_extremes also
# applies its other defaults (no_above, keep_n) because they are not overridden.
dictionary.filter_extremes(no_below=5)

# Bag-of-words representation of every document.
corpus = list(map(dictionary.doc2bow, texts))

In [19]:
# LDA training settings; each one is passed by keyword to LdaModel below.
num_topics = n_topics  # same topic count as the LSA model
chunksize = 2000       # documents handed to the trainer per chunk
passes = 20            # full passes over the corpus
iterations = 400       # per-document inference iteration cap
eval_every = None      # None: skip the periodic perplexity evaluation

In [20]:
# gensim builds `dictionary.id2token` lazily; a single lookup forces it to be
# populated. The value of `temp` itself is not used.
temp = dictionary[0]
id2word = dictionary.id2token

# Train LDA with learned asymmetric priors (alpha/eta = 'auto').
# random_state pins the otherwise random initialisation so topics, the coherence
# score and the exported visualisation are reproducible across runs; 123 is the
# same seed the TruncatedSVD (LSA) model uses.
model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every,
    random_state=123,
)

In [21]:
# Per-topic results from the trained model; element [1] of each entry is that
# topic's coherence score.
top_topics = model.top_topics(corpus)

# Average topic coherence = (sum of the per-topic coherences) / number of topics.
coherence_total = 0
for topic_entry in top_topics:
    coherence_total += topic_entry[1]
avg_topic_coherence = coherence_total / num_topics

print('Average topic coherence: %.4f.' % avg_topic_coherence)

Average topic coherence: -3.5492.


In [22]:
import pickle
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt

In [23]:
# Build the pyLDAvis view of the trained LDA model. sort_topics=False keeps the
# model's own topic order instead of pyLDAvis re-ranking the topics.
lda_visualization = gensimvis.prepare(model, corpus, dictionary, sort_topics=False)
# Write the interactive visualisation to an HTML file in the working directory.
pyLDAvis.save_html(lda_visualization, 'lda_epic.html')

## KoBERTopic

In [25]:
from tqdm import tqdm
from contextualized_topic_models.models.ctm import CombinedTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation, bert_embeddings_from_list
from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessing



In [26]:
# Sentence-embedding model used to embed the documents; the captured output shows
# its files (about 1.1 GB of weights) being downloaded on first use.
SBERT_MODEL_NAME = "sentence-transformers/xlm-r-100langs-bert-base-nli-stsb-mean-tokens"

# NOTE(review): `documents` holds the noun-only, stop-word-filtered text; confirm
# that embedding the pre-processed text rather than the raw summaries is intended.
train_contextualized_embeddings = bert_embeddings_from_list(documents, SBERT_MODEL_NAME)

  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)


config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)


README.md:   0%|          | 0.00/4.09k [00:00<?, ?B/s]

  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)


sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)


config.json:   0%|          | 0.00/731 [00:00<?, ?B/s]

  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)


pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)


tokenizer_config.json:   0%|          | 0.00/527 [00:00<?, ?B/s]

  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)


sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)


tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)


special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)


1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)
  self.comm = Comm(**args)


Batches:   0%|          | 0/6 [00:00<?, ?it/s]

KeyboardInterrupt: 