# 한글 형태소 분석기 - SoyNlp

In [1]:
import soynlp
soynlp.__version__

'0.0.493'

In [2]:
import urllib.request
urllib.request.urlretrieve("https://raw.githubusercontent.com/lovit/soynlp/master/tutorials/2016-10-20.txt", filename="data/2016-10-20.txt")  # 이 데이터가 말뭉치
# 원하는 단어들을 학습해서 사용하는 형태임. 큰 말뭉치를 집어넣어주면 스스로 학습. 분석할 데이터가 많지않으면 soynlp가 제대로 작동X

('data/2016-10-20.txt', <http.client.HTTPMessage at 0x22141ab6f10>)

- SOYNLP에서 사용될 단어를 만들기 (훈련 데이터를 다수의 문서로 분리)

In [4]:
from soynlp import DoublespaceLineCorpus
corpus = DoublespaceLineCorpus("data/2016-10-20.txt")
len(corpus)


30091

- Wordextractor를 통해서 단어를 도출할 수 있게 함

In [5]:
# 전체 말뭉치에서 단어 점수표를 계산 (학습)
from soynlp.word import WordExtractor
word_extractor = WordExtractor()
word_extractor.train(corpus)
word_score_table = word_extractor.extract()

training was done. used memory 0.752 Gb
all cohesion probabilities was computed. # words = 223348
all branching entropies was computed # words = 361598
all accessor variety was computed # words = 361598


- 원하는 상태로 Tokenizer을 할 수 있다.

In [7]:
from soynlp.tokenizer import LTokenizer

scores = {word:score.cohesion_forward for word, score in word_score_table.items()}
l_tokenizer = LTokenizer(scores=scores)
l_tokenizer.tokenize("국제사회와 우리의 노력들로 범죄를 척결하자", flatten=False)

[('국제사회', '와'), ('우리', '의'), ('노력', '들로'), ('범죄', '를'), ('척결', '하자')]

- 파일로 저장

In [9]:
import joblib

joblib.dump(scores, 'data/scores.pkl')

['data/scores.pkl']

- 명사 추출

In [13]:
from soynlp.noun import LRNounExtractor_v2

noun_extractor = LRNounExtractor_v2(verbose = True)
nouns = noun_extractor.train_extract(corpus) # list of str like

[Noun Extractor] use default predictors
[Noun Extractor] num features: pos=3929, neg=2321, common=107
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 403896 from 30091 sents. mem=0.922 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=4434442, mem=1.594 Gb
[Noun Extractor] batch prediction was completed for 119705 words
[Noun Extractor] checked compounds. discovered 70639 compounds
[Noun Extractor] postprocessing detaching_features : 109312 -> 92205
[Noun Extractor] postprocessing ignore_features : 92205 -> 91999
[Noun Extractor] postprocessing ignore_NJ : 91999 -> 90643
[Noun Extractor] 90643 nouns (70639 compounds) with min frequency=1
[Noun Extractor] flushing was done. mem=1.711 Gb                    
[Noun Extractor] 76.63 % eojeols are covered


In [16]:
noun_scores = {noun:score[0] for noun, score in nouns.items() if len(noun) > 1}
joblib.dump(noun_scores, 'data/noun_scores.pkl')



['data/noun_scores.pkl']

- 한글로 된 워드클라우드 만들기