In [4]:
# List the shell's exported variables and keep only the lines matching the value of $LANG.
!export | grep $LANG

export LANG='ko_KR.UTF-8'


In [5]:
# -*- coding: utf-8 -*-
# Copyright (C) 2018.02.09 kyung seok jeong <humanist96@koscom.co.kr>
from __future__ import absolute_import, unicode_literals
from natto import MeCab
import pandas as pd
import collections
import re
import datrie
import string

In [6]:
def load_stopword(fpath):
    """
    Return the trie object of the stopword dictionary.

    The file is read as UTF-8 text with one stopword per line. Trailing
    whitespace (including the line break) is stripped, a leading UTF-8
    byte-order mark is dropped, and blank lines are ignored so that the
    empty string never becomes a stopword.

    - input : stopword file path
    - output : trie instance whose keys are the stopwords. On any failure
               (missing file, decoding error, ...) the error is printed and
               '' (empty string) is returned instead, which keeps the
               historical contract for existing callers.
    """
    try:
        # Accept keys made of any character in the range U+0000..U+FFFF.
        trie = datrie.Trie(ranges=[(u'\u0000', u'\uFFFF')])

        with open(fpath, "rb") as f:
            for raw_line in f:
                # "utf-8-sig" decodes exactly like "utf-8" but removes a BOM
                # at the start of the data, which would otherwise be glued to
                # the first stopword of a file saved with a BOM.
                word = raw_line.decode("utf-8-sig").rstrip()
                if word:
                    trie[word] = True
    except Exception as e:
        print("[load_stopword] messages of error :", e)
        return ''

    return trie

In [7]:
def is_stopword(morpheme, trie):
    """
    Tell whether a morpheme is registered in the stopword dictionary.
    - input : morpheme string, trie instance (any container supporting `in`)
    - output : boolean (True, False)
    """
    # A membership test always evaluates to a bool, so it can be returned as is.
    return morpheme in trie

In [8]:
def run_ma(text, stop_path='', nBest=1):
    """
    Returns the dataframe of all information of the morpheme analyzer.

    Each output line of the tagger is formatted as nine comma-separated
    fields (surface followed by eight feature fields) and becomes one row.
    Lines that do not carry exactly nine fields (for example the "EOS"
    terminator or empty lines) are skipped. Morphemes whose surface is found
    in the stopword dictionary are dropped when a stopword file is given.

    - input : string, {stopword file path}, {nbest number}
    - output : tuple (tagger, dataframe). If the tagger cannot be created or
               the analysis fails, the error is printed and the function
               still returns: tagger is None when construction failed, and
               the dataframe holds whatever rows were collected before the
               failure (possibly none).
    """
    columns = ['surface', 'tag', 'meaning_class', 'final_consonant',
               'reading', 'type', 'first_tag', 'final_tag', 'expression']

    options = r'-F%m,%f[0],%f[1],%f[2],%f[3],%f[4],%f[5],%f[6],%f[7]\n'
    options += " -N" + str(nBest)

    use_stopwords = bool(stop_path)

    # Bind both results up front so the final return can never hit an
    # unbound local after an exception has been reported.
    _me = None
    rows = []

    try:
        _me = MeCab(options)

        trie = load_stopword(stop_path) if use_stopwords else None

        for term_str in str(_me.parse(text)).split('\n'):
            # Split from the right: the eight feature fields are peeled off
            # and everything left of them is the surface, so a surface that
            # itself contains ',' stays in one piece.
            term_list = term_str.rsplit(',', len(columns) - 1)

            if use_stopwords and is_stopword(term_list[0], trie):
                continue
            if len(term_list) != len(columns):
                continue

            rows.append(term_list)
    except Exception as e:
        print("[run_ma] messages of error : ", e)

    # Build the frame once instead of growing it row by row.
    _df = pd.DataFrame(rows, columns=columns)

    return _me, _df

# Analyse a sample sentence, passing ./stopword.txt as the stopword file.
me, df = run_ma("빅데이터 커뮤니티는 너무 어려운것 같다", stop_path="./stopword.txt")

# Show the tagger object, the resulting morpheme table and the function's help text.
print(me)
print(df)
help(run_ma)

<natto.mecab.MeCab model=<cdata 'mecab_model_t *' 0x3ac9340>, tagger=<cdata 'mecab_t *' 0x3aeca60>, lattice=<cdata 'mecab_lattice_t *' 0x3ae9750>, libpath="/usr/local/lib/libmecab.so", options={'nbest': 1, 'node_format': '%m,%f[0],%f[1],%f[2],%f[3],%f[4],%f[5],%f[6],%f[7]\\n'}, dicts=[<natto.dictionary.DictionaryInfo dictionary=<cdata 'mecab_dictionary_info_t *' 0x3aea8d0>, filepath="/usr/local/lib/mecab/dic/mecab-ko-dic/sys.dic", charset=UTF-8, type=0>], version=0.996/ko-0.9.2>
  surface     tag meaning_class final_consonant reading     type first_tag  \
0       빅     NNG                             T       빅                      
1     데이터     NNG                             F     데이터                      
2    커뮤니티     NNG                             F    커뮤니티                      
3       는      JX                             T       는                      
4      너무     MAG     성분부사/정도부사               F      너무                      
5     어려운  VA+ETM                             T 

In [19]:
# Print the morpheme table that get_all_morph is applied to at the end of this cell.
print(df)

def get_all_morph(df):
    """
    Render every row of the analysis result as morpheme/Part-of-Speech tokens.

    Rows whose 'type' is 'Inflect' or 'Compound' are written through their
    'expression' column ('+' turned into a space, every "/*" removed); all
    other rows are written as surface/tag.
    - input : dataframe
    - output : string of space-separated tokens terminated by a newline
    """
    expanded_types = ('Inflect', 'Compound')
    pieces = []

    for _, entry in df.iterrows():
        if entry['type'] in expanded_types:
            expression = entry['expression']
            token = expression.replace('+', ' ').replace("/*", '')
        else:
            pos = entry['tag']
            token = entry['surface'] + "/" + pos
        pieces.append(token)

    joined = " ".join(pieces)
    return joined.rstrip() + "\n"

# Last expression of the cell: displays the string built from every row of df.
get_all_morph(df)


  surface     tag meaning_class final_consonant reading     type first_tag  \
0       빅     NNG                             T       빅                      
1     데이터     NNG                             F     데이터                      
2    커뮤니티     NNG                             F    커뮤니티                      
3       는      JX                             T       는                      
4      너무     MAG     성분부사/정도부사               F      너무                      
5     어려운  VA+ETM                             T     어려운  Inflect        VA   
6       것     NNB                             T       것                      
7       같      VA                             T       같                      
8       다      EC                             F       다                      

  final_tag       expression  
0                             
1                             
2                             
3                             
4                             
5       ETM  어렵/VA/*+ᆫ/ETM/*  
6 

'빅/NNG 데이터/NNG 커뮤니티/NNG 는/JX 너무/MAG 어렵/VA ᆫ/ETM 것/NNB 같/VA 다/EC\n'

In [20]:
# Print the morpheme table that get_noun_morph is applied to at the end of this cell.
print(df)

def get_noun_morph(df, option='N'):
    """
    Render the noun rows (tag NNG or NNP) as morpheme/Part-of-Speech tokens.

    A noun row of type 'Compound' is written through its 'expression' column
    ('+' turned into a space, every "/*" removed) when option is anything
    other than 'N'; otherwise the row is written as surface/tag.
    - input : dataframe, {option : compound noun decomposition flag, default : N}
    - output : string of space-separated tokens terminated by a newline
    """
    noun_tags = ('NNG', 'NNP')
    tokens = []

    for _, entry in df.iterrows():
        # Only common and proper nouns are reported.
        if entry['tag'] not in noun_tags:
            continue

        if entry['type'] == 'Compound' and option != 'N':
            expression = entry['expression']
            tokens.append(expression.replace('+', ' ').replace("/*", ''))
        else:
            tokens.append(entry['surface'] + "/" + entry['tag'])

    joined = " ".join(tokens)
    return joined.rstrip() + "\n"

# Last expression of the cell: displays the noun-only string with the default option ('N').
get_noun_morph(df)

  surface     tag meaning_class final_consonant reading     type first_tag  \
0       빅     NNG                             T       빅                      
1     데이터     NNG                             F     데이터                      
2    커뮤니티     NNG                             F    커뮤니티                      
3       는      JX                             T       는                      
4      너무     MAG     성분부사/정도부사               F      너무                      
5     어려운  VA+ETM                             T     어려운  Inflect        VA   
6       것     NNB                             T       것                      
7       같      VA                             T       같                      
8       다      EC                             F       다                      

  final_tag       expression  
0                             
1                             
2                             
3                             
4                             
5       ETM  어렵/VA/*+ᆫ/ETM/*  
6 

'빅/NNG 데이터/NNG 커뮤니티/NNG\n'

In [21]:
def get_noun_term_freq(df, option='N'):
    """
    Returns noun morphemes and their frequency.

    Only rows tagged NNG or NNP are counted. A row of type 'Compound' is
    counted by its surface unless option is anything other than 'N', in
    which case every component listed in its 'expression' column is counted
    separately. Components are counted by their bare surface, i.e. under the
    same key a stand-alone occurrence of that noun uses, so both kinds of
    occurrence add up in a single entry.

    - input : dataframe, {option : compound noun decomposition flag, default : N}
    - output : list of tuples (morpheme, frequency), highest frequency first;
               equal frequencies keep the order in which the morphemes were
               first counted
    """
    _noun_type = ('NNG', 'NNP')
    _terms = []

    for _, row in df.iterrows():
        if row['tag'] not in _noun_type:
            continue

        if row['type'] == 'Compound' and option != 'N':
            # Components are joined by '+', each written as
            # "surface/tag/class" (e.g. "big/NNG/*"); dropping the last two
            # '/'-separated fields leaves the surface.
            # NOTE(review): assumes every component carries all three fields.
            for component in row['expression'].split('+'):
                surface = component.rsplit('/', 2)[0]
                if surface:
                    _terms.append(surface)
        else:
            _terms.append(row['surface'])

    return collections.Counter(_terms).most_common()

# Last expression of the cell: displays the (noun, count) pairs with the default option ('N').
get_noun_term_freq(df)

[('데이터', 1), ('빅', 1), ('커뮤니티', 1)]