### 필요한 라이브러리 설치

In [1]:
!pip install pdfminer.six

In [2]:
!pip install konlpy
!curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh | bash -x

## 1. 데이터 불러오기

In [32]:
import pdfminer
from pdfminer.high_level import extract_text
from glob import glob

# Naming rule embedded in the file name of every artifact this notebook saves.
dataname = input("저장할 파일들의 네이밍 규칙을 적어주세요 : ")
# Every PDF found in the input directory is analysed.
pdfList = glob("../input/kdidata/*.pdf")
print(pdfList)

# One translation table deletes form feeds, NUL characters, newlines and middle dots in a single pass.
_NOISE_TABLE = str.maketrans("", "", "\x0c\x00\n·")

# Extract the raw text of each PDF and strip the noise characters.
documents = [extract_text(pdf_path).translate(_NOISE_TABLE) for pdf_path in pdfList]

documents

In [33]:
len(documents)

## 2. 전처리 함수 정의

In [34]:
import re
from konlpy.tag import Mecab; mecab = Mecab()

# POS tags kept by the "predefined" tokenizer mode; add further tags here when needed.
KOR_POS = ["NNP"] # Korean
#####

def text_cleaning(doc):
    """Remove every character that is not Hangul or a plain space.

    Characters in the ranges ㄱ-ㅎ, ㅏ-ㅣ and 가-힣 and the space character are kept;
    everything else (Latin letters, digits, punctuation, tabs, newlines) is deleted.

    Args:
        doc: Text to clean.

    Returns:
        The cleaned text.
    """
    non_korean = re.compile("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]")
    return non_korean.sub("", doc)

def define_stopwords(path):
    """Build the stopword set from built-in entries plus a stopword file.

    Stopwords can be added either by extending the literal set below or by
    appending lines to the file at ``path``.

    Args:
        path: UTF-8 text file with one stopword per line.

    Returns:
        Set of stopwords; every line of the file is added after stripping
        surrounding whitespace.
    """
    stopwords = {'NA', "영어", "한국어"}

    with open(path, encoding="utf-8") as handle:
        stopwords.update(line.strip() for line in handle)

    return stopwords

def text_tokenizing(doc, tokenizer):
    """Tokenize ``doc`` and drop stopwords and one-character tokens.

    Relies on the module-level ``mecab`` tagger, the stopword set ``SW`` and the
    tag list ``KOR_POS``; ``SW`` must be defined before this function is called.

    Args:
        doc: Text to tokenize.
        tokenizer: Tokenization unit, one of
            "noun" (``mecab.nouns``),
            "morph" (``mecab.morphs``),
            "predefined" (``mecab.pos`` output restricted to the tags in ``KOR_POS``),
            "word" (whitespace split).

    Returns:
        List of tokens longer than one character that are not in ``SW``.

    Raises:
        ValueError: If ``tokenizer`` is not one of the supported modes. Previously
            an unknown mode silently returned None.
    """
    if tokenizer == "noun":
        candidates = mecab.nouns(doc)
    elif tokenizer == "morph":
        candidates = mecab.morphs(doc)
    elif tokenizer == "predefined":
        # Keep only tokens whose POS tag is listed in KOR_POS.
        candidates = [token for token, pos in mecab.pos(doc) if pos in KOR_POS]
    elif tokenizer == "word":
        candidates = doc.split()
    else:
        raise ValueError(
            "tokenizer must be one of 'noun', 'morph', 'predefined' or 'word', "
            "got {!r}".format(tokenizer)
        )

    return [word for word in candidates if word not in SW and len(word) > 1]

### 불러온 데이터를 품사 태그를 붙여서 토크나이징합니다.

In [35]:
import pickle
import os
from pprint import pprint
from tqdm.notebook import tqdm


SW = define_stopwords("../input/korean-text-analysis/stopwords-ko.txt")

# Cache of the tokenized corpus, keyed by the user-supplied naming rule.
tokenized_cache_path = f'tokenized_set({dataname}).pk'

if os.path.exists(tokenized_cache_path):
    # Reuse the cached tokens instead of running the tokenizer again.
    # NOTE(review): pickle.load can execute arbitrary code; only load caches this notebook wrote.
    with open(tokenized_cache_path, "rb") as f:
        tokenized_text = pickle.load(f)

else:
    cleaned_text = [text_cleaning(doc) for doc in documents]
    # text_tokenizing takes (text, tokenizer mode).
    # Pass "predefined" to keep the POS tags listed in KOR_POS instead of nouns only.
    tokenized_text = [text_tokenizing(doc, "noun") for doc in cleaned_text]

    print("Cleaned Corpus : ", cleaned_text[0])

    with open(tokenized_cache_path, "wb") as f:
        pickle.dump(tokenized_text, f)

    # Human-readable dump, one document per line. The encoding is explicit so the
    # Korean tokens are written correctly regardless of the platform default.
    with open(f"tokenized_text({dataname}).txt", 'w', encoding="utf-8") as f:
        for doc in tokenized_text:
            print(" ".join(doc), file=f)

print("\n\nTokenized Corpus : ", tokenized_text[0])

In [36]:
len(tokenized_text)

## 3. 빈도 분석하기.

In [37]:
from collections import Counter

# Flatten the per-document token lists into one list.
total_tokens = []
for doc in tokenized_text:
    total_tokens.extend(doc)
print("Number of Total tokens : ", len(total_tokens))

# Frequency of every token in the corpus.
token_counter = Counter(total_tokens)

print("\n--Token : Freq--")
# Keep the 50 most frequent tokens; the dict preserves the descending order.
wordInfo = dict(token_counter.most_common(50))
for tags, counts in wordInfo.items():
    print ("%6s : %d" % (tags, counts))

### Histogram 그리기.

In [38]:
# 그래프를 이쁘게 그리기 위한 코드입니다. 한글 글꼴을 추가합니다.

import matplotlib as mpl  # 기본 설정 만지는 용도
import matplotlib.pyplot as plt  # 그래프 그리는 용도
import matplotlib.font_manager as fm  # 폰트 관련 용도
import seaborn as sns
mpl.rcParams['axes.unicode_minus'] = False

sys_font=fm.findSystemFonts()
print(f"sys_font number: {len(sys_font)}")
print(sys_font)

nanum_font = [f for f in sys_font if 'Nanum' in f]
print(f"nanum_font number: {len(nanum_font)}")

!apt-get update -qq
!apt-get install fonts-nanum* -qq

path = '/usr/share/fonts/truetype/nanum/NanumBarunGothicBold.ttf'  # 설치된 나눔글꼴중 원하는 녀석의 전체 경로를 가져옵니다.
font_name = fm.FontProperties(fname=path, size=10).get_name()
print(font_name)
plt.rc('font', family=font_name)

# 현재 설정되어 있는 폰트 사이즈와 글꼴을 알아보자
!python --version
def current_font():
  print(f"설정 폰트 글꼴: {plt.rcParams['font.family']}, 설정 폰트 사이즈: {plt.rcParams['font.size']}")  # 파이썬 3.6 이상 사용가능하다
        
print(current_font())

# 여전히 글꼴이 보이지 않는 분들은, 런타임 -> "다시 시작 및 모두 실행" 을 눌러주세요!
!rm -rf ~/.cache/matplotlib/*
!fc-cache -fv

In [39]:
import matplotlib.pyplot as plt
import platform
from matplotlib import font_manager, rc

# Bar chart of the most frequent tokens collected in wordInfo.
plt.figure(figsize=(16, 12))
plt.xlabel('주요 단어')
plt.ylabel('빈도수')
plt.grid(True)

# Values and keys are sorted separately but by the same criterion (frequency, descending).
Sorted_Dict_Values = sorted(wordInfo.values(), reverse=True)
Sorted_Dict_Keys = sorted(wordInfo, key=wordInfo.get, reverse=True)

plt.bar(range(len(wordInfo)), Sorted_Dict_Values, align='center')
# rotation is passed as a number: the documented type is a float or 'vertical'/'horizontal',
# and the string '70' is rejected by matplotlib versions that validate the argument.
plt.xticks(range(len(wordInfo)), list(Sorted_Dict_Keys), rotation=70)
plt.savefig("freq_dist.png")
plt.show()

### WordCloud 그리기.

In [40]:
from wordcloud import WordCloud

from PIL import Image
import numpy as np

# Load the mask image as an array; it is handed to WordCloud as the mask.
mask = np.array(Image.open("../input/korean-text-analysis/cloud.png"))

# Build the cloud from the token frequencies in wordInfo, rendered with the font file at `path`.
wordcloud = WordCloud(font_path=path,
                      relative_scaling = 0.2,
                      mask=mask,
                      background_color='white',
                      ).generate_from_frequencies(wordInfo)
plt.figure(figsize=(16,16))
plt.imshow(wordcloud)
plt.axis("off")
# The figure is written to disk before it is shown.
plt.savefig("wordcloud.png")
plt.show()

## 4. TF-IDF를 통한 주요 단어 분석하기.

In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# TfidfVectorizer expects one string per document, so join each token list with spaces.
tfidf_docs = [" ".join(doc) for doc in tokenized_text]

tfidf = TfidfVectorizer()

# Convert the corpus into a document-term TF-IDF matrix.
X_tfidf = tfidf.fit_transform(tfidf_docs)

# get_feature_names() was removed in scikit-learn 1.2 in favour of get_feature_names_out();
# fall back to the old name so older environments keep working.
if hasattr(tfidf, "get_feature_names_out"):
    terms = tfidf.get_feature_names_out().tolist()
else:
    terms = tfidf.get_feature_names()

# Sum the TF-IDF weight of each term over all documents (a 1 x n_terms matrix).
sums = X_tfidf.sum(axis=0)

# Pair each term with its summed weight.
df = [(term, sums[0, col]) for col, term in enumerate(terms)]

ranking = pd.DataFrame(df, columns=['Term','TF-IDF'])
# Keep the 50 terms with the highest summed TF-IDF.
rankInfo = ranking.sort_values('TF-IDF', ascending=False)[:50]
print(rankInfo)

In [42]:
len(tfidf_docs)

### TF-IDF Histogram 

In [43]:
# Map each top-ranked term to its summed TF-IDF score, in rankInfo's row order.
# Built without a loop variable named `tfidf`, which used to overwrite the
# TfidfVectorizer object of the same name.
tfidfInfo = dict(zip(rankInfo["Term"], rankInfo["TF-IDF"]))

plt.figure(figsize=(16, 12))
plt.xlabel('주요 단어')
plt.ylabel('TF-IDF')
plt.grid(True)

# Values and keys are sorted separately but by the same criterion (score, descending).
Sorted_Dict_Values = sorted(tfidfInfo.values(), reverse=True)
Sorted_Dict_Keys = sorted(tfidfInfo, key=tfidfInfo.get, reverse=True)

plt.bar(range(len(tfidfInfo)), Sorted_Dict_Values, align='center')
# rotation is passed as a number: the documented type is a float or 'vertical'/'horizontal',
# and the string '70' is rejected by matplotlib versions that validate the argument.
plt.xticks(range(len(tfidfInfo)), list(Sorted_Dict_Keys), rotation=70)
plt.savefig("tfidf_dist.png")
plt.show()

### WordCloud 

In [44]:
from wordcloud import WordCloud

# Load the mask image as an array; it is handed to WordCloud as the mask.
mask = np.array(Image.open("../input/korean-text-analysis/cloud.png"))
# Build the cloud from the TF-IDF scores in tfidfInfo, rendered with the font file at `path`.
wordcloud = WordCloud(font_path=path,
                      relative_scaling = 0.2,
                      mask=mask,
                      background_color='white',
                      ).generate_from_frequencies(tfidfInfo)
plt.figure(figsize=(16,16))
plt.imshow(wordcloud)
plt.axis("off")
# The figure is written to disk before it is shown.
plt.savefig("tf-idf_wordcloud.png")
plt.show()

## 5. Topic Modeling을 위한 parameter setting 및 데이터 전처리.

In [13]:
!pip install pyldavis==3.2.2

In [14]:
# Read three comma-separated integers: first topic count, last topic count, and increment.
start, end, step = [int(x) for x in input("토픽 개수를 입력하세요.(e.g. 2,10,1) : ").split(",")]

In [15]:
import sys
from operator import itemgetter
from itertools import combinations
from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer
from scipy.spatial.distance import pdist
from scipy.spatial.distance import squareform
from tqdm import tqdm_notebook
from konlpy.tag import Mecab #Komoran #Mecab #Okt
import numpy as np
import string
import re
import warnings
import networkx as nx
from gensim import corpora
from gensim import models
from gensim.models import TfidfModel
import pyLDAvis
import pyLDAvis.gensim


import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

# Silence all warnings for the rest of the notebook.
warnings.simplefilter(action='ignore')

# True: run topic modeling on the TF-IDF-weighted corpus instead of raw counts.
tfidf_mode = True
# Number of top words taken from each topic.
NUM_TOPIC_WORDS = 30
# Number of top co-occurrence pairs used for the semantic network.
NUM_WORD_COOCS = 100

# True: train the models and save them; False: load previously saved results.
write_flag = True

# Candidate topic counts: from `start` up to and including `end`, in increments of `step`.
x = range(start, end+1, step)

def build_doc_term_mat(documents):
    """Build a gensim dictionary and corpus from tokenized documents.

    Args:
        documents: Iterable of token lists, one per document.

    Returns:
        Tuple ``(corpus, dictionary)``. ``corpus`` is the bag-of-words corpus, or
        its TF-IDF transformation when the module-level ``tfidf_mode`` is true.
    """
    print_log_msg("Building document-term matrix.")

    dictionary = corpora.Dictionary(documents)
    bow_corpus = [dictionary.doc2bow(tokens) for tokens in documents]

    # Re-weight the counts with TF-IDF only when requested.
    corpus = TfidfModel(bow_corpus)[bow_corpus] if tfidf_mode else bow_corpus

    print_log_msg("Done.")
    return corpus, dictionary
    


def print_log_msg(msg):
    """Print a log message and flush the output stream immediately."""
    print(msg, end="\n", flush=True)
        
        
# Build the gensim corpus and dictionary from the tokenized documents.
corpus, dictionary = build_doc_term_mat(tokenized_text)

## 6. Topic Modeling을 Coherence Score로 평가하기.

In [16]:
from gensim.models import CoherenceModel # topic coherence score 계산하는 함수.
from gensim.models.ldamodel import LdaModel # LDA

def compute_coherence(dictionary, corpus, texts, start=2, end=41, step=4):
    """Train one LDA model per candidate topic count and score it with C_V coherence.

    Args:
        dictionary: gensim dictionary.
        corpus: gensim corpus.
        texts: Tokenized documents, used by the coherence model.
        start: First topic count to try.
        end: Last topic count to try (inclusive).
        step: Increment between topic counts. With the defaults the counts
            2, 6, ..., 38 are tried.

    Returns:
        Tuple ``(model_list, coherence_score_list)`` of parallel lists, one entry
        per topic count in ascending order.
    """
    trained_models = []
    cv_scores = []

    for topic_count in tqdm(range(start, end+1, step)):
        # The original author's note says raising `passes` (typically 100-500) improves
        # quality; only `iterations` is set here.
        lda = LdaModel(corpus=corpus, num_topics=topic_count, id2word=dictionary,
                       iterations=300,
                       random_state=42, alpha='auto')
        trained_models.append(lda)

        scorer = CoherenceModel(model=lda, texts=texts, dictionary=dictionary, coherence='c_v')
        cv_scores.append(scorer.get_coherence())

    return trained_models, cv_scores


def print_topic_words(model):
    """Print the top ``NUM_TOPIC_WORDS`` words of every topic with their probabilities.

    Args:
        model: Topic model exposing ``num_topics`` and ``show_topic(topic_id, topn)``.
    """
    print_log_msg("Printing topic words.")

    for topic_id in range(model.num_topics):
        word_probs = model.show_topic(topic_id, NUM_TOPIC_WORDS)
        print("Topic ID: {}".format(topic_id))

        for word, probability in word_probs:
            print("\t{}\t{}".format(word, probability))

        print("\n")
        

# Output paths embed the naming rule and the range of topic counts that were tried.
model_path = "LDA_model_" + dataname + "_(" + str(min(x))+'-'+str(max(x)) +  ")topics.pk"
csScore_path = "coherence-Scores_" + dataname + "_(" + str(min(x))+'-'+str(max(x)) +  ")topics.pk"


if write_flag:
    # Train all candidate models, then persist the models and their scores.
    model_list, coherence_scores = compute_coherence(dictionary=dictionary, corpus=corpus,
                                                     texts=tokenized_text, start=start, end=end, step=step)

    with open(model_path, 'wb') as f:
        pickle.dump(model_list, f)

    with open(csScore_path, 'wb') as f:
        pickle.dump(coherence_scores, f)

    # Human-readable copy of the scores. splitext removes the whole ".pk" suffix;
    # slicing off only two characters kept the dot and produced "...topics..txt".
    with open(os.path.splitext(csScore_path)[0] + ".txt", 'w', encoding="utf-8") as f:
        for idx, cs in zip(range(start, end+1, step), coherence_scores):
            print("# of Topics : ", idx, file=f)
            print("Coherence Score with C_V : %.3f" % cs, file=f)

else:
    # Load previously saved results instead of training.
    # NOTE(review): a missing file is skipped silently, which leaves model_list /
    # coherence_scores undefined (or stale from an earlier run) for the cells below.
    # NOTE(review): pickle.load can execute arbitrary code; only load files this notebook wrote.
    if os.path.exists(model_path):
        with open(model_path, 'rb') as f:
            model_list = pickle.load(f)

    if os.path.exists(csScore_path):
        with open(csScore_path, 'rb') as f:
            coherence_scores = pickle.load(f)

### Coherence Score 계산 후 시각화. 

In [17]:
# Line plot (with markers) of the coherence score for each candidate topic count.
plt.figure(figsize=(16, 16))
label = "Coherence Score(C_V)"
plt.plot(x, coherence_scores, label=label)
plt.scatter(x, coherence_scores)
plt.title("LDA_({}-{})topics".format(str(min(x)), str(max(x))))
# One tick per candidate topic count.
plt.xticks(x)
plt.xlabel("Num Topics")
plt.ylabel("Coherence Score")
plt.legend(loc='best')
# The figure is written to disk before it is shown.
plt.savefig("LDA_({}-{})topics.png".format(str(min(x)), str(max(x))))
plt.show()

In [18]:
# Select the model whose coherence score is highest and continue the analysis with it.
coherence_list = np.array(coherence_scores)
best_index = int(coherence_list.argmax())
model = model_list[best_index]
NUM_TOPICS = model.num_topics  # topic count of the best-scoring model
print("Number of Topics : ", NUM_TOPICS)
print_topic_words(model)

In [19]:
# Switch pyLDAvis to notebook display mode.
pyLDAvis.enable_notebook()

# Prepare the visualisation for the selected model; the bare `vis` expression displays it as the cell output.
vis = pyLDAvis.gensim.prepare(model, corpus, dictionary)
vis

## Semantic Network 그리기.

### 전체 단어에 대해서 토픽 모델링 상위 결과로 의미 연결망 만들기. 

In [20]:
def get_topic_documents(model):
    """Turn each topic into a "document" made of its top words.

    Args:
        model: Topic model exposing ``num_topics`` and ``show_topic(topic_id, topn)``.

    Returns:
        List with one entry per topic; each entry is the list of that topic's top
        ``NUM_TOPIC_WORDS`` words in the order ``show_topic`` returns them.
    """
    print_log_msg("Generating topic word documents.")

    return [
        [word for word, _prob in model.show_topic(topic_id, NUM_TOPIC_WORDS)]
        for topic_id in range(model.num_topics)
    ]

def build_word_cooc_mat(model):
    """Count how often two words appear together among the top words of a topic.

    Args:
        model: Topic model passed on to ``get_topic_documents``.

    Returns:
        ``defaultdict(Counter)`` where ``result[word1][word2]`` is the number of
        topics whose top words contain both words. Each unordered pair is stored
        once, under the key order ``word1 < word2``.
    """
    print_log_msg("Building topic word co-occurrence matrix.")
    word_cooc_mat = defaultdict(Counter)
    topic_documents = get_topic_documents(model)

    for topic_document in topic_documents:
        # combinations() emits pairs in input order, so without sorting the same
        # unordered pair could be counted as (a, b) in one topic and (b, a) in
        # another, splitting its count across two entries.
        for word1, word2 in combinations(sorted(topic_document), 2):
            word_cooc_mat[word1][word2] += 1

    return word_cooc_mat

def get_sorted_word_coocs(word_cooc_mat):
    """Flatten a co-occurrence mapping into triples sorted by count, descending.

    Args:
        word_cooc_mat: Mapping ``word1 -> {word2: count}``.

    Returns:
        List of ``(word1, word2, count)`` tuples ordered from the highest count to
        the lowest; ties keep their iteration order.
    """
    print_log_msg("Sorting topic word occurrence.")

    triples = [
        (word1, word2, count)
        for word1, neighbours in word_cooc_mat.items()
        for word2, count in neighbours.items()
    ]
    triples.sort(key=itemgetter(2), reverse=True)

    return triples


def build_word_cooc_network(sorted_word_coocs):
    """Build an undirected graph from the strongest co-occurrence pairs.

    Args:
        sorted_word_coocs: ``(word1, word2, count)`` triples, strongest first.

    Returns:
        ``networkx.Graph`` with one edge per triple among the first
        ``NUM_WORD_COOCS`` entries; the count is stored as the edge ``weight``.
    """
    print_log_msg("Generating topic word co-occurrence network.")

    network = nx.Graph()
    strongest_pairs = sorted_word_coocs[:NUM_WORD_COOCS]
    for source, target, cooc_count in strongest_pairs:
        network.add_edge(source, target, weight=cooc_count)

    return network


def return_log_scaled_lst(input_lst):
    """Min-max scale the natural logarithms of ``input_lst`` into [0, 1].

    Args:
        input_lst: Iterable of positive numbers.

    Returns:
        List of floats, one per input value, in input order. An empty input gives
        an empty list. When all values are equal the scaling is undefined, so
        every value maps to 1.0 (this keeps equally weighted nodes at full size
        when the result is used as a node-size factor).
    """
    # Take the logs once, eagerly, so problems surface here rather than later
    # during lazy iteration of a map object.
    log_values = np.log(np.asarray(list(input_lst), dtype=float))

    if log_values.size == 0:
        return []

    min_v = log_values.min()
    span = log_values.max() - min_v

    if span == 0:
        # Avoid 0/0 (NaN) for constant input.
        return [1.0] * log_values.size

    return ((log_values - min_v) / span).tolist()
        
def print_log_msg(msg):
    """Print a log message, flushing the output stream right away."""
    print(msg, end="\n", flush=True)


def plot_weighted_graph(sorted_word_coocs):
    """Draw the topic-word co-occurrence network, save it as network_plot.png and return the graph.

    Args:
        sorted_word_coocs: ``(word1, word2, count)`` triples, strongest first; only
            the first ``NUM_WORD_COOCS`` entries are used.

    Returns:
        The ``networkx.Graph`` that was drawn.
    """
    
    print_log_msg("Generating topic word co-occurrence network.")
    
    plt.figure(figsize=(16, 12))
    plt.axis('off')
    plt.title('Semantic Network from LDA')
    plt.margins(x=0.05, y=0.05)
    
    G = nx.Graph()

    # One edge per co-occurrence pair, weighted by its count.
    for word1, word2, count in sorted_word_coocs[:NUM_WORD_COOCS]:
        G.add_edge(word1, word2, weight=count)
        
    pos = nx.spring_layout(G, scale=3.)
        
    # Alternative centrality measures that were tried:
    #measures = nx.in_degree_centrality(G)
    #measures = nx.betweenness_centrality(G)
    measures = nx.closeness_centrality(G)
    # Node sizes are derived from the log-scaled closeness centrality.
    node_weight_lst = return_log_scaled_lst([n_weight for n_weight in measures.values()])
    # NOTE(review): edge_weight_lst is computed but not used below.
    edge_weight_lst = return_log_scaled_lst([e[2]['weight'] for e in G.edges(data=True)])
    
    
    all_weights = []

    for (node1,node2,data) in G.edges(data=True):
        all_weights.append(data['weight']) # we'll use this when determining edge thickness
 
    unique_weights = list(set(all_weights))
    #print(sum(all_weights)) = 280
    # Each scaled centrality value is multiplied by 2000 to get the node size.
    nodes = nx.draw_networkx_nodes(G, pos, node_size=list(map(lambda x: x*2000, node_weight_lst)),
                                   node_color="green",
                                   alpha=0.65,
                                   nodelist=list(measures.keys())
                                  )
    #nodes.set_norm(mcolors.SymLogNorm(linthresh=0.01, linscale=1))
    
    # Label every node with its own name, using the module-level font_name.
    labels = {}
    for node_name in measures.keys():
        labels[str(node_name)] = str(node_name)
    nx.draw_networkx_labels(G, pos, labels, font_size=16, font_family=font_name)


    # Draw the edges one weight group at a time so the line width equals the weight.
    for weight in unique_weights:
        weighted_edges = [(node1,node2) for (node1,node2,edge_attr) in G.edges(data=True) if edge_attr['weight']==weight]
        width = weight #sum(all_weights) * 300.0
        nx.draw_networkx_edges(G,pos,edgelist=weighted_edges,width=width, alpha=0.5)
 
    print_log_msg("Drawing topic word network.")
    plt.savefig("network_plot.png") 
    plt.show()
    
    return G


# Topic model -> co-occurrence counts -> sorted triples -> plotted network.
word_cooc_mat = build_word_cooc_mat(model)
sorted_word_coocs = get_sorted_word_coocs(word_cooc_mat)
G = plot_weighted_graph(sorted_word_coocs)

### 주어진 단어에 대해서 동시 등장(연어) 횟수가 높은 의미연결망 만들기. 

In [21]:
from nltk import bigrams

"""
전체 corpus의 bigram을 구한 뒤, 분석 키워드를 중심으로 Semantic Network를 생성한다.
"""
# When True, token and bigram frequency tables are printed in addition to the network.
freq_analysis = False

def bigram_function(documents):
    """Collect the bigrams of every document into one flat list.

    Args:
        documents: Iterable of token lists.

    Returns:
        List of bigrams as produced by ``bigrams`` for each document, in document order.
    """
    return [pair for doc in documents for pair in bigrams(doc)]


def frequency_checking(documents, N):
    """Print the total token count, the unique token count and the N most frequent tokens.

    Args:
        documents: Iterable of token lists.
        N: Number of most frequent tokens to print.
    """
    # Local imports keep the function self-contained. Counter replaces nltk.Text,
    # which raised NameError because only `bigrams` is imported from nltk.
    from collections import Counter
    from pprint import pprint

    total_tokens = [token for doc in documents for token in doc]
    print("Total tokens : ", len(total_tokens))

    token_freqs = Counter(total_tokens)
    print("Unique tokens : ", len(token_freqs))
    pprint(token_freqs.most_common(N))
    

def bigram_frequency_checking(bigrams, N):
    """Print the total bigram count, the unique bigram count and the N most frequent bigrams.

    Args:
        bigrams: List of bigram tuples.
        N: Number of most frequent bigrams to print.
    """
    print("\n\nTotal bigrams : ", len(bigrams))

    counts = Counter(bigrams)
    # Stable sort: bigrams with equal counts stay in first-seen order.
    ranked = sorted(counts.items(), key=itemgetter(1), reverse=True)

    print("Unique bigrams : ", len(counts))

    for pair, freq in ranked[:N]:
        print(pair, freq)

        
def build_sorted_keywords(bigrams, keyword, N):
    """Return the N most frequent bigrams that contain ``keyword``.

    Args:
        bigrams: Iterable of bigram tuples.
        keyword: Word that must be one of the two elements of a bigram.
        N: Maximum number of bigrams to return.

    Returns:
        List of ``(bigram, count)`` pairs sorted by count, descending. When the
        module-level ``freq_analysis`` flag is true the list is also printed.
    """
    keyword_counts = Counter(pair for pair in bigrams if keyword in pair)
    ranked = sorted(keyword_counts.items(), key=itemgetter(1), reverse=True)
    top_pairs = ranked[:N]

    if freq_analysis:
        print("\nNumber of bigrams : ", len(ranked))

        for pair, freq in top_pairs:
            print(pair, freq)

    return top_pairs
    
    
def build_word_sim_network(sorted_word_sims, keyword):
    """Build a keyword-centred network from ``(bigram, count)`` pairs.

    Args:
        sorted_word_sims: Iterable of ``((word1, word2), count)`` pairs.
        keyword: Centre word; bigrams that do not contain it are ignored.

    Returns:
        Minimum spanning tree of the graph whose edges join ``keyword`` to the
        other word of each bigram, with the count as edge ``weight``.
    """
    network = nx.Graph()

    for bigram, count in sorted_word_sims:
        word1, word2 = bigram[0], bigram[1]

        if keyword not in (word1, word2):
            continue

        # Always add the edge starting from the keyword side.
        hub, spoke = (word1, word2) if word1 == keyword else (word2, word1)
        network.add_edge(hub, spoke, weight=count)

    return nx.minimum_spanning_tree(network)


def draw_network(G):
    """Draw the collocation network, save it as a PDF named after ``dataname`` and show it.

    Args:
        G: ``networkx`` graph to draw.
    """
    plt.figure(figsize=(16, 12))

    layout = nx.spring_layout(G, k=0.8)
    draw_options = dict(
        node_size=1000,
        node_color="green",
        alpha=0.65,
        font_family=font_name,
        with_labels=True,
        font_size=13,
    )
    nx.draw_networkx(G, pos=layout, **draw_options)

    plt.axis("off")
    plt.savefig(f"semantic_network({dataname})_collocation.pdf")
    plt.show()

"""
어휘 유사도 행렬을 구성한 뒤 이를 네트워크로 시각화한다.
"""

# NOTE(review): this rebinds the notebook-level `documents` (the raw PDF texts loaded earlier) to the token lists.
documents = tokenized_text
bigram_corpus = bigram_function(documents)
# Number of top keyword bigrams to keep.
N = int(input("\nTop N? : "))
if freq_analysis:
    frequency_checking(documents, N)
    bigram_frequency_checking(bigram_corpus, N)

# Keyword at the centre of the network.
keyword = input("\n키워드를 입력해주세요 : ")
sorted_bigrams = build_sorted_keywords(bigram_corpus, keyword, N)
G = build_word_sim_network(sorted_bigrams, keyword)
draw_network(G)

### 주어진 단어에 대해서 동시 등장(co-occurrence) 빈도가 높은 의미 연결망 만들기

In [22]:
"""
전체 corpus의 co-occurence를 구한 뒤, 분석 키워드를 중심으로 Semantic Network를 생성한다.
"""

def build_doc_term_mat(documents):
    """Build a binary document-term matrix from whitespace-joined documents.

    Args:
        documents: Iterable of strings, each a document whose tokens are
            separated by whitespace.

    Returns:
        Tuple ``(doc_term_mat, words)``: the sparse matrix produced by
        ``CountVectorizer(binary=True)`` and the list of vocabulary words in
        column order.
    """
    print("Building a Document-Term Matrix.")
    vectorizer = CountVectorizer(tokenizer=str.split, binary=True)
    doc_term_mat = vectorizer.fit_transform(documents)

    # get_feature_names() was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out(); fall back to the old name on older versions.
    if hasattr(vectorizer, "get_feature_names_out"):
        words = vectorizer.get_feature_names_out().tolist()
    else:
        words = vectorizer.get_feature_names()

    return doc_term_mat, words
        
    
def build_word_cooc_mat(doc_term_mat):
    """Compute the word-by-word co-occurrence matrix of a document-term matrix.

    Args:
        doc_term_mat: Sparse document-term matrix (documents x words).

    Returns:
        Sparse words x words matrix equal to the transpose of ``doc_term_mat``
        multiplied by ``doc_term_mat``, with the diagonal set to zero.
    """
    print("Building a co-occurrence matrix.")

    transposed = doc_term_mat.transpose()
    cooc = transposed * doc_term_mat
    # Zero the diagonal (a word's product with itself).
    cooc.setdiag(0)

    return cooc


def get_word_sim_mat(word_cooc_mat):
    """Compute a word-by-word similarity matrix from co-occurrence profiles.

    Args:
        word_cooc_mat: Sparse words x words co-occurrence matrix.

    Returns:
        Dense square array whose entry ``[i, j]`` is the correlation between the
        co-occurrence rows of word ``i`` and word ``j`` (1 on the diagonal).
        Pairs whose correlation is undefined are given similarity 0.
    """
    print("Calculating Similarity matrix.")

    # pdist's "correlation" metric is a distance (1 - correlation), so it has to
    # be converted back before callers rank pairs from most to least similar.
    distances = squareform(pdist(word_cooc_mat.toarray(), metric="correlation"))
    word_sim_mat = 1.0 - distances

    # A row with zero variance has no defined correlation (NaN); treat it as "no similarity".
    word_sim_mat[np.isnan(word_sim_mat)] = 0.0

    return word_sim_mat


def get_sorted_word_sims(word_sim_mat, words):
    """List all word pairs with a non-zero score, sorted by score, descending.

    Args:
        word_sim_mat: Square array indexed as ``word_sim_mat[i, j]``.
        words: Words in the same order as the rows/columns of ``word_sim_mat``.

    Returns:
        List of ``(word_i, word_j, score)`` tuples for every pair ``i < j`` whose
        score is not equal to 0, highest score first.
    """
    print("Now Sorting..")

    pairs = [
        (words[i], words[j], word_sim_mat[i, j])
        for i, j in combinations(range(len(words)), 2)
        if word_sim_mat[i, j] != 0
    ]
    pairs.sort(key=itemgetter(2), reverse=True)

    return pairs

def build_word_sim_network(sorted_word_sims, keyword, N):
    """Build a keyword-centred network from the first N pairs that contain ``keyword``.

    Args:
        sorted_word_sims: Iterable of ``(word1, word2, score)`` triples in ranking order.
        keyword: Centre word; triples that do not contain it are skipped.
        N: Maximum number of edges to add.

    Returns:
        Minimum spanning tree of the graph whose edges join ``keyword`` to the
        other word of each selected triple, with the score as edge ``weight``.
    """
    network = nx.Graph()
    remaining = N

    for word1, word2, sim in sorted_word_sims:
        if keyword not in (word1, word2):
            continue

        # Stop at the first matching pair once N edges have been added.
        if remaining == 0:
            break
        remaining -= 1

        # Always add the edge starting from the keyword side.
        hub, spoke = (word1, word2) if word1 == keyword else (word2, word1)
        network.add_edge(hub, spoke, weight=sim)

    return nx.minimum_spanning_tree(network)


def draw_network(G):
    """Draw the co-occurrence network, save it as co-occurence_semantic_network.pdf and show it.

    Args:
        G: ``networkx`` graph to draw.
    """
    print("Done.")
    plt.figure(figsize=(16, 12))

    layout = nx.spring_layout(G, k=0.8)
    draw_options = dict(
        node_size=1000,
        node_color="green",
        font_family=font_name,
        with_labels=True,
        alpha=0.65,
        font_size=13,
    )
    nx.draw_networkx(G, pos=layout, **draw_options)

    plt.axis("off")
    plt.savefig("co-occurence_semantic_network.pdf")
    plt.show()
    
    

# Join each token list into one whitespace-separated string for the vectorizer.
documents = [" ".join(tokens) for tokens in tokenized_text]
doc_term_mat, words  = build_doc_term_mat(documents)
word_cooc_mat = build_word_cooc_mat(doc_term_mat)
# Maximum number of edges around the keyword.
N = int(input("\nTop N? : "))
# Keyword at the centre of the network.
keyword = input("키워드를 입력해주세요 : ")
word_sim_mat = get_word_sim_mat(word_cooc_mat)
sorted_word_sims = get_sorted_word_sims(word_sim_mat, words)
G = build_word_sim_network(sorted_word_sims, keyword, N)
draw_network(G)