# customized KoNLPy

https://github.com/lovit/customized_konlpy

In [1]:
import itertools
import math
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
%matplotlib inline

from konlpy.tag import Okt
okt = Okt()
from ckonlpy.tag import Twitter
twitter = Twitter()
from collections import Counter
from pandas import read_excel
from lxml import etree
from tqdm import tqdm
from tqdm import trange

### Gensim
import gensim
from gensim import corpora
# from gensim.models.word2vec import Word2Vec
from gensim.models import FastText
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim.models import CoherenceModel

from hanpre import funcs as hp
from hanpre import stopwords as sw

  warn('"Twitter" has changed to "Okt" since KoNLPy v0.4.5.')


In [2]:
def interested_words():
    """Load the keyword table from the desk-research workbook.

    Reads the sheet '소비키워드' of 'deskresearch_.xlsx' (``header=1``, so the
    second row of the sheet supplies the column names).

    Returns:
        A 3-tuple of columns, in this order: '핵심단어' (core keyword),
        '대체어' (substitute terms) and '키워드' (related keywords).
    """
    table = read_excel('deskresearch_.xlsx', sheet_name='소비키워드', header=1)
    return table['핵심단어'], table['대체어'], table['키워드']

# (feature name, weight) pairs. The scorer pairs them with its features by
# position, so the order here must match the order the features are built in.
my_weights = [
    ('num_nouns', -0.1),
    ('num_words', -0.2),
    ('no_noun', -1),
    ('len_sum_of_nouns', 0.2),
]


def my_evaluate_function(candidate):
    """Score a tokenization candidate as a weighted sum of four features.

    Args:
        candidate: sequence of 4-tuples; the first item is the word and the
            second its POS tag (the last two items are ignored).

    Returns:
        The sum over ``my_weights`` of feature * weight, where the features
        are, in order: number of 'Noun' tokens, total number of tokens,
        whether there is no noun at all, and the total length of the nouns.
    """
    noun_lengths = [len(token) for token, tag, _, _ in candidate if tag == 'Noun']
    noun_count = len(noun_lengths)

    features = (
        noun_count,
        len(candidate),
        noun_count == 0,
        sum(noun_lengths),
    )
    return sum(value * weight for value, (_, weight) in zip(features, my_weights))

def oneDArray(x):
    """Flatten one level of nesting: return a list with the items of every iterable in ``x``."""
    flat = []
    for group in x:
        flat.extend(group)
    return flat

def getTopics(model):
    """Return the words of each topic of ``model`` as a list of word lists.

    Each entry of ``model.print_topics(num_words=500)`` is turned into text
    and split on double quotes; the pieces at odd positions (the text found
    between a pair of quotes) are kept as that topic's words.
    """
    return [
        str(topic).split('"')[1::2]
        for topic in model.print_topics(num_words=500)
    ]

def save_to_csv(output, keyword, type, tf_df):
    """Write ``tf_df`` to ``./output/<output>_<type>_<keyword>.csv``.

    Spaces are stripped from ``keyword`` before it goes into the file name.
    The file is written as 'utf-8-sig' with dates formatted as '%Y%m%d'.
    """
    stem = "_".join((output, type, keyword.replace(" ", "")))
    tf_df.to_csv("./output/" + stem + ".csv", date_format='%Y%m%d', encoding='utf-8-sig')


# Load the three keyword columns once for the whole run.
# NOTE: this rebinds the name `interested_words` from the loader function to
# its third return value, so the function can no longer be called afterwards
# (re-running this statement without re-running the `def` will fail).
keywords, subkeywords, interested_words = interested_words()

In [3]:
# Run the full pipeline once per spreadsheet row (35 keywords in total).
# To try a single keyword, iterate over zip(keywords[:1], subkeywords[:1],
# interested_words[:1]) instead.
from soynlp.tokenizer import LTokenizer
from soynlp.word import WordExtractor

# Sentence delimiters: '?', '.' and '!'.
SENTENCE_END = re.compile(r'\?|\.|!')
# Values that stand for an empty / missing sentence and are dropped.
BLANK_MARKERS = ("", "NaN")
# POS tags that are kept; use ('Noun', 'Adjective') to keep adjectives too.
KEPT_TAGS = ('Noun',)
# Number of topics of the model whose topics are exported.
# NOTE(review): fixed at 3 regardless of the coherence scores printed per
# keyword -- confirm whether the best-scoring topic count should be used.
FINAL_NUM_TOPICS = 3


def word_score(score):
    """Rank a soynlp word score: forward cohesion times exp(right branching entropy)."""
    return (score.cohesion_forward * math.exp(score.right_branching_entropy))


for keyword, subkeyword, interested_word in zip(keywords, subkeywords, interested_words):
    # The keyword without spaces is what hp.readall receives and what is
    # looked up among the extracted words at the end.
    compact_keyword = keyword.replace(" ", "")

    # --- 1. Register the keyword and its related terms as nouns -------------
    # Both cells are comma-separated lists; normalise them into one
    # '|'-separated string with the substitute terms first.
    subkeyword = subkeyword.replace(" ", "").replace(",", "|")
    interested_word = subkeyword + "|" + interested_word.replace(" ", "").replace(",", "|")

    new_nouns = list(set(
        keyword.split(' ') + [compact_keyword] + interested_word.split("|")
    ))
    for noun in new_nouns:
        twitter.add_dictionary(noun, 'Noun')

    # --- 2. Load this keyword's documents for the analysis period -----------
    df = hp.readall(compact_keyword)
    df = df[(df['date'] >= '2019-07-01') & (df['date'] < '2020-07-01')]
    df = df.drop_duplicates()

    # A missing title or content would turn the whole text into NaN and make
    # the sentence split below fail, so treat missing parts as empty text.
    # NOTE(review): title and content are joined without a separator, so the
    # last word of the title is fused with the first word of the content
    # unless the data already ends titles with whitespace -- confirm.
    df['text'] = df['title'].fillna("") + df['content'].fillna("")

    # --- 3. Split into sentences and clean them -----------------------------
    sentences = [
        hp.preprocessing(fragment)
        for text in df['text']
        for fragment in SENTENCE_END.split(text)
    ]
    # Drop blank lines and NaN markers.
    sentences = [sentence for sentence in sentences if sentence not in BLANK_MARKERS]
    if not sentences:
        print(f'{keyword}: no sentences in the selected period, skipped')
        continue

    # --- 4. Unsupervised tokenization (soynlp) + stopword removal -----------
    word_extractor = WordExtractor()
    word_extractor.train(sentences)
    word_scores = word_extractor.extract()

    cohesion_scores = {word: score.cohesion_forward for word, score in word_scores.items()}
    ltokenizer = LTokenizer(scores=cohesion_scores)

    tokened_sentences = [ltokenizer.tokenize(sentence) for sentence in sentences]

    ts = [hp.remove_stopwords(tokens) for tokens in tokened_sentences]
    ts = [tokens for tokens in ts if tokens not in BLANK_MARKERS]

    # Re-join the tokens of each sentence; every token is prefixed by a space.
    tsentences = ["".join(" " + token for token in tokens) for tokens in ts]
    # Sentences whose tokens were all removed become "" and are dropped here.
    tsentences = [line for line in tsentences if line not in BLANK_MARKERS]

    # --- 5. POS-tag every sentence and keep only the wanted tags ------------
    # NOTE(review): tagging goes through `okt`, so the nouns registered on
    # `twitter` above are not consulted here -- confirm whether `twitter.pos`
    # was intended.
    sentences_tag = [okt.pos(sentence) for sentence in tsentences]

    # One string per sentence: the kept words, each followed by a space.
    posts = [
        "".join(word + " " for word, tag in tagged if tag in KEPT_TAGS)
        for tagged in sentences_tag
    ]
    # Drop blank lines and NaN markers.
    posts = [post for post in posts if post not in BLANK_MARKERS]
    if not posts:
        print(f'{keyword}: no nouns left after tagging, skipped')
        continue

    # --- 6. Most frequent words (top 500) -----------------------------------
    tf_list = [post.split() for post in posts]
    lst = oneDArray(tf_list)
    counts = Counter(lst)
    toplist = counts.most_common(500)

    tf_df = pd.DataFrame(toplist, columns=['단어', '빈도수'])
    save_to_csv("TM", keyword, "최빈어", tf_df)

    # --- 7. Topic modeling (gensim) -----------------------------------------
    # Token lists, one per '.'-separated piece of each post: [[w1, w2, ...], ...]
    nouns = [line.split() for post in posts for line in post.split('.')]

    bigram = gensim.models.Phrases(nouns)
    bigram_model = gensim.models.phrases.Phraser(bigram)
    bigram_document = [bigram_model[doc] for doc in nouns]

    id2word = corpora.Dictionary(bigram_document)
    # Bag-of-words corpus: per document, (word id, occurrence count) pairs.
    corpus = [id2word.doc2bow(doc) for doc in bigram_document]

    # Topic coherence: build one LDA model per topic count from 2 to 9 and
    # print its c_v coherence so a suitable number of topics can be chosen.
    # (coherence_score can be plotted against range(2, 10) to inspect it.)
    coherence_score = []
    for num_topics in range(2, 10):
        model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word, num_topics=num_topics)
        coherence_model = CoherenceModel(model, texts=bigram_document, dictionary=id2word, coherence='c_v')
        coherence_lda = coherence_model.get_coherence()
        print('n=', num_topics, '\nCoherence Score: ', coherence_lda)
        coherence_score.append(coherence_lda)

    model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word, num_topics=FINAL_NUM_TOPICS)

    _topics = getTopics(model)
    co_top_df = pd.DataFrame(_topics)
    save_to_csv("TM", keyword, "토픽", co_top_df)

    # --- 8. Word statistics on the stopword-filtered sentences --------------
    word_extractor = WordExtractor(
        min_frequency=1,
        min_cohesion_forward=0.05,
        min_right_branching_entropy=0.0,
    )
    word_extractor.train(tsentences)  # list of str or like
    words = word_extractor.extract()

    # The summary is only exported when the keyword itself was extracted as a
    # word; any other failure below is a real error and is allowed to surface.
    if compact_keyword not in words:
        continue

    # Top 500 words by word_score, then one column per statistic.
    ranked = sorted(words.items(), key=lambda item: word_score(item[1]), reverse=True)[:500]
    words_df = pd.DataFrame({
        "단어": [word for word, _ in ranked],
        "연관성": [score.cohesion_forward for _, score in ranked],
        "빈도수": [score.leftside_frequency for _, score in ranked],
        "엔트로피": [score.right_branching_entropy for _, score in ranked],
    })
    words_df = words_df.sort_values(['빈도수', '연관성'])

    save_to_csv("ckonlpy", keyword, "종합", words_df)



training was done. used memory 0.444 Gbory 0.294 Gb
all cohesion probabilities was computed. # words = 29547
all branching entropies was computed # words = 44784
all accessor variety was computed # words = 44784
n= 2 
Coherence Score:  0.4668892156309924
n= 3 
Coherence Score:  0.4580620425150417
n= 4 
Coherence Score:  0.36433866554354855
n= 5 
Coherence Score:  0.43476294133711074
n= 6 
Coherence Score:  0.4090889166736602
n= 7 
Coherence Score:  0.40509714563241006
n= 8 
Coherence Score:  0.40480768498308706
n= 9 
Coherence Score:  0.4071223487605424
training was done. used memory 1.015 Gbory 0.974 Gb
all cohesion probabilities was computed. # words = 206808
all branching entropies was computed # words = 134032
all accessor variety was computed # words = 134032
training was done. used memory 1.079 Gbory 1.042 Gb
all cohesion probabilities was computed. # words = 36621
all branching entropies was computed # words = 62711
all accessor variety was computed # words = 62711
n= 2 
Coheren

n= 8 
Coherence Score:  0.3757780426643603
n= 9 
Coherence Score:  0.40661758750080823
training was done. used memory 1.121 Gbory 1.108 Gb
all cohesion probabilities was computed. # words = 187207
all branching entropies was computed # words = 123923
all accessor variety was computed # words = 123923
training was done. used memory 1.129 Gbory 1.129 Gb
all cohesion probabilities was computed. # words = 23910
all branching entropies was computed # words = 34865
all accessor variety was computed # words = 34865
n= 2 
Coherence Score:  0.35995522437341376
n= 3 
Coherence Score:  0.35826789436246437
n= 4 
Coherence Score:  0.33570137207721024
n= 5 
Coherence Score:  0.38461253502994036
n= 6 
Coherence Score:  0.3296779338626427
n= 7 
Coherence Score:  0.37225537621721766
n= 8 
Coherence Score:  0.36475232515809974
n= 9 
Coherence Score:  0.384594320120199
training was done. used memory 1.109 Gbory 1.117 Gb
all cohesion probabilities was computed. # words = 175280
all branching entropies was

n= 2 
Coherence Score:  0.3651445811823606
n= 3 
Coherence Score:  0.30021456514250827
n= 4 
Coherence Score:  0.34355067047970433
n= 5 
Coherence Score:  0.3903826674237691
n= 6 
Coherence Score:  0.4019806544235698
n= 7 
Coherence Score:  0.4350707797578293
n= 8 
Coherence Score:  0.41942529055495736
n= 9 
Coherence Score:  0.35288167198788156
training was done. used memory 1.286 Gbory 1.290 Gb
all cohesion probabilities was computed. # words = 282885
all branching entropies was computed # words = 180200
all accessor variety was computed # words = 180200
training was done. used memory 1.321 Gbory 1.340 Gb
all cohesion probabilities was computed. # words = 37320
all branching entropies was computed # words = 63123
all accessor variety was computed # words = 63123
n= 2 
Coherence Score:  0.3664403352522194
n= 3 
Coherence Score:  0.3537279055327664
n= 4 
Coherence Score:  0.4138831364519793
n= 5 
Coherence Score:  0.33006537050516216
n= 6 
Coherence Score:  0.3598504489708389
n= 7 
Coh

training was done. used memory 1.275 Gbory 1.307 Gb
all cohesion probabilities was computed. # words = 22040
all branching entropies was computed # words = 37482
all accessor variety was computed # words = 37482
n= 2 
Coherence Score:  0.4083573276962529
n= 3 
Coherence Score:  0.34067655745959785
n= 4 
Coherence Score:  0.3882033393292096
n= 5 
Coherence Score:  0.42912025156628075
n= 6 
Coherence Score:  0.409349014218263
n= 7 
Coherence Score:  0.4064136812192884
n= 8 
Coherence Score:  0.3777773802038065
n= 9 
Coherence Score:  0.4053625703413465
training was done. used memory 1.251 Gbory 1.251 Gb
all cohesion probabilities was computed. # words = 143961
all branching entropies was computed # words = 95721
all accessor variety was computed # words = 95721
training was done. used memory 1.250 Gbory 1.250 Gb
all cohesion probabilities was computed. # words = 16834
all branching entropies was computed # words = 28132
all accessor variety was computed # words = 28132
n= 2 
Coherence Sc