# 한국어 워드클라우드 시각화

---
- 2021-02-23 최초 작성
- 2021-03-05 불용어 입력 추가

In [None]:
# @title Step1: 모듈 설치 및 폰트 설정 (⌘ + Enter)
%%capture
# Install the third-party packages imported further down in this cell.
!pip install konlpy
!pip install wordcloud

# Download the TrueType font that make_wordcloud uses by default (font_path).
font_address = 'https://github.com/jaekookang/Korean-WordCloud/blob/master/font/SeoulNamsanvert.ttf?raw=true'
!wget {font_address}
# wget keeps the "?raw=true" query string in the saved file name, so rename it.
!mv SeoulNamsanvert.ttf?raw=true SeoulNamsanvert.ttf

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

from wordcloud import WordCloud
from konlpy.tag import Okt
from collections import Counter
# NOTE(review): this cell runs under %%capture, which captures stdout, so this
# completion message is presumably never displayed -- confirm.
print('모듈 설치 완료')

In [None]:
# @title Step2: 텍스트 파일 업로드 (⌘ + Enter)
import ipywidgets as widgets
import chardet
from tqdm.auto import tqdm
from google.colab import files
uploaded = files.upload()

def detect_encoding(txt_file, convert_utf8=True):
    """Detect the encoding of a text file and optionally rewrite it as UTF-8.

    The encoding is guessed with ``chardet``. Files reported as UTF-8 or
    ASCII (and empty files) are left untouched. Any other file is decoded
    and rewritten in place as UTF-8. The conversion is done in Python rather
    than through shell commands so that file names containing spaces or
    shell metacharacters are handled safely, and the file is only
    overwritten after decoding succeeded.

    Args:
        txt_file: Path of the text file to inspect; rewritten in place when
            a conversion happens.
        convert_utf8: When True (default), convert non-UTF-8 files to UTF-8.
            When False, only detect the encoding and leave the file as is.

    Returns:
        The encoding name reported by chardet (``None`` when chardet could
        not determine one).
    """
    with open(txt_file, 'rb') as f:
        raw = f.read()
    encoding = chardet.detect(raw)['encoding']

    # chardet's casing is not guaranteed and it reports None when undecided.
    normalized = (encoding or '').lower()
    if not convert_utf8 or not raw or normalized in ('utf-8', 'ascii'):
        return encoding

    if normalized == 'utf-8-sig':
        # UTF-8 with a BOM: decoding with utf-8-sig strips the BOM.
        candidates = ['utf-8-sig']
    else:
        # Keep the original assumption (Korean cp949 text) as first choice
        # and fall back to chardet's own guess if cp949 cannot decode it.
        candidates = ['cp949']
        if encoding:
            candidates.append(encoding)

    text = None
    for candidate in candidates:
        try:
            text = raw.decode(candidate)
            break
        except (UnicodeDecodeError, LookupError):
            # LookupError: chardet returned a name Python has no codec for.
            continue

    if text is None:
        # Leave the original file untouched instead of clobbering it.
        print(f'  {txt_file} utf8 conversion failed (detected: {encoding})')
        return encoding

    # Write bytes so that the original line endings are preserved.
    with open(txt_file, 'wb') as f:
        f.write(text.encode('utf-8'))
    print(f'  {txt_file} utf8 converted')
    return encoding

# Fail early with a clear message when the upload was cancelled or empty,
# instead of an IndexError from indexing an empty key list.
if not uploaded:
    raise ValueError('업로드된 파일이 없습니다. Step2를 다시 실행해주세요.')

if len(uploaded) > 1:
    # Two or more files were uploaded: convert each one to UTF-8 and
    # concatenate them into a single combined text file.
    txt_file = '_combined_.txt'
    txt_files = list(uploaded.keys())
    # Explicit UTF-8 on both sides: the files were just converted to UTF-8
    # and the word cloud step reads txt_file back as UTF-8.
    with open(txt_file, 'w', encoding='utf-8') as target_f:
        for file_name in tqdm(txt_files, total=len(txt_files), desc='텍스트병합'):
            detect_encoding(file_name)
            with open(file_name, 'r', encoding='utf-8') as source_f:
                content = source_f.read()
            target_f.write(content)
            # Separate files with a newline so that the last word of one file
            # is not fused with the first word of the next one.
            if content and not content.endswith('\n'):
                target_f.write('\n')
else:
    # Exactly one file was uploaded: use it directly.
    txt_file = list(uploaded.keys())[0]
    detect_encoding(txt_file)

In [None]:
# @title Step3: Stopwords 제거 (단어 사이에 콤마(,)를 반드시 넣고, 실행해주세요) { run: "auto", vertical-output: true }
stopwords = "\uAE09\uC218,\uACBD\uC6B0,\uC694\uAE08" #@param {type:"string"}

# Split the comma-separated form input into individual stopwords.
# Each entry is stripped so that "a, b" matches the same words as "a,b", and
# empty entries (blank input, doubled or trailing commas) are dropped.
stopwords = [word.strip() for word in stopwords.split(',')]
stopwords = [word for word in stopwords if word]

if stopwords:
    print(f'{len(stopwords)}개의 stopwords가 성공적으로 입력되었습니다.')
else:
    print('stopwords가 입력되지 않았습니다.')

In [None]:
# @title Step4: 워드 클라우드 생성하기 (⌘ + Enter)
def filter_stopwords(nouns, stopwords):
    """Return a new list with every noun found in ``stopwords`` removed.

    The order of the remaining nouns is preserved and duplicates are kept.
    Membership is tested with ``in`` against ``stopwords`` as given.
    """
    kept = []
    for word in nouns:
        if word in stopwords:
            continue
        kept.append(word)
    return kept

def extract_nouns(txt, stopwords=None, n_most_common=100):
    """Extract nouns from ``txt`` and return the most frequent ones.

    Args:
        txt: Text to analyse; passed to ``Okt().nouns``.
        stopwords: Iterable of nouns to exclude. ``None``, an empty list and
            ``['']`` all mean "no stopwords"; empty entries are ignored.
        n_most_common: Maximum number of (noun, count) pairs to return.

    Returns:
        A list of ``(noun, count)`` tuples as produced by
        ``Counter.most_common(n_most_common)``.
    """
    okt = Okt()
    nouns = okt.nouns(txt)

    # Remove stopwords (blank entries are placeholders, not real stopwords).
    active_stopwords = [word for word in (stopwords or []) if word]
    if active_stopwords:
        nouns = filter_stopwords(nouns, active_stopwords)

    # Drop nouns shorter than two characters. A new list is built because
    # popping from the list while iterating over it skips the element that
    # follows each removal, leaving some short nouns behind.
    nouns = [noun for noun in nouns if len(noun) >= 2]

    # Count occurrences and keep only the most frequent nouns.
    return Counter(nouns).most_common(n_most_common)

def make_wordcloud(nouns,
                   max_words=100,
                   width=1000,
                   height=1000,
                   max_font_size=300,
                   background_color='white',
                   save_file='wordcloud.png',
                   font_path='SeoulNamsanvert.ttf',
                   show_plot=True):
    """Render a word cloud from (word, count) pairs and save it as an image.

    Args:
        nouns: Data accepted by ``dict()`` mapping each word to its frequency,
            e.g. the list of ``(noun, count)`` tuples from ``extract_nouns``.
        max_words, width, height, max_font_size, background_color, font_path:
            Forwarded unchanged to the ``WordCloud`` constructor.
        save_file: Path the rendered image is written to.
        show_plot: When True, load the saved image back and draw it on a
            10x10 matplotlib figure without axis ticks; when False, only
            print a confirmation message.
    """
    cloud_options = {
        'font_path': font_path,
        'background_color': background_color,
        'width': width,
        'height': height,
        'max_words': max_words,
        'max_font_size': max_font_size,
    }
    wc = WordCloud(**cloud_options)

    frequencies = dict(nouns)
    wc.generate_from_frequencies(frequencies)
    wc.to_file(save_file)

    if not show_plot:
        print('Saved')
        return

    # Display the image that was just written to disk.
    img = mpimg.imread(save_file)
    fig, ax = plt.subplots(1, facecolor='white', figsize=(10, 10))
    ax.imshow(img)
    for set_ticks in (ax.set_xticks, ax.set_yticks):
        set_ticks([])

# Fall back to "no stopwords" when the stopwords cell has not been run,
# i.e. the name is not defined yet.
try:
    stopwords
except NameError:
    stopwords = ['']

# Read the uploaded (or merged) text file as UTF-8.
with open(txt_file, 'rt', encoding='utf-8') as f:
    txt_bytes = f.read()

# Extract the 100 most frequent nouns, excluding the stopwords.
noun_list = extract_nouns(txt_bytes, stopwords=stopwords, n_most_common=100)

# Build, save and display the word cloud.
make_wordcloud(noun_list, show_plot=True)

---
참고
- https://liveyourit.tistory.com/58
- https://imworld.tistory.com/59