In [1]:
import sys
sys.path.append('../py/')
sys.path.append('../../soynlp/')
import configuration as config

In [2]:
# Paths of the normalized corpus files, as provided by the project's
# `configuration` module; later cells iterate over these paths.
corpus_fnames = config.normalized_corpus_fnames()
# Presumably an index aligned with `corpus_fnames` -- it is not used in the
# cells visible here; TODO confirm its structure against `configuration`.
corpus_indexs = config.normalized_corpus_index()

In [61]:
from collections import Counter
import math
import pickle
import sys
from soynlp.word import WordExtractor
from soynlp.utils import DoublespaceLineCorpus
from soynlp.tokenizer import LTokenizer

def tokenize_corpus(scores, corpus, tokenized_corpus_fname):
    """Tokenize every document of ``corpus`` and write term counts to a file.

    Each document becomes one output line of space-separated ``token//count``
    entries, ordered by descending count (ties keep first-seen order).

    Args:
        scores: word-score mapping handed to ``LTokenizer(scores=...)``.
        corpus: iterable of documents; it must also support ``len()``,
            which is used for the progress message.
        tokenized_corpus_fname: path of the UTF-8 text file to (over)write.
    """
    tokenizer = LTokenizer(scores=scores)
    with open(tokenized_corpus_fname, 'w', encoding='utf-8') as f:
        for doc_number, doc in enumerate(corpus, start=1):
            term_counts = Counter(tokenizer.tokenize(doc, remove_r=True))
            # most_common() with no argument is a stable sort by count, descending.
            ranked_terms = term_counts.most_common()
            line = ' '.join('%s//%d' % (term, count) for term, count in ranked_terms)
            f.write('%s\n' % line)
            # Report progress once every 100 documents, rewriting the same console line.
            if doc_number % 100 == 0:
                sys.stdout.write('\rtokenizing ... (%d in %d)' % (doc_number, len(corpus)))

for corpus_fname in corpus_fnames:
    # Corpus name = last path component with its final four characters removed
    # (presumably a '.txt' extension -- confirm against the configured paths).
    corpus_name = corpus_fname.split('/')[-1][:-4]

    # Train the word extractor on the corpus opened with iter_sent=True.
    corpus = DoublespaceLineCorpus(corpus_fname, iter_sent=True)
    word_extractor = WordExtractor(min_count=10)
    word_extractor.train(corpus)

    # Save the trained extractor, then pickle its per-word scores next to it.
    model_fname = '%s/%s.wordextraction' % (config.model_folder, corpus_name)
    word_extractor.save(model_fname)

    score_fname = '%s/%s.scores' % (config.model_folder, corpus_name)
    scores = word_extractor.word_scores()
    with open(score_fname, 'wb') as f:
        pickle.dump(scores, f)

    print('%s trained' % corpus_name)

    # Reuse the same corpus object for tokenization, now with iter_sent=False.
    corpus.iter_sent = False

    # Three alternative word-score tables derived from the extractor's scores.
    cohesion = {
        word: score.cohesion_forward
        for word, score in scores.items()
    }
    branching_entropy = {
        word: score.right_branching_entropy
        for word, score in scores.items()
    }
    csbe = {
        word: (score.right_branching_entropy * score.cohesion_forward)
        for word, score in scores.items()
    }

    # Tokenize the corpus once per score table, each into its own output file.
    for fname_template, tokenizer_scores in (
        ('%s/%s_cohesion_tokenized.txt', cohesion),
        ('%s/%s_rbe.txt', branching_entropy),
        ('%s/%s_csbe.txt', csbe),
    ):
        tokenized_corpus_fname = fname_template % (config.model_folder, corpus_name)
        tokenize_corpus(tokenizer_scores, corpus, tokenized_corpus_fname)

    print('\r%s tokenized\n' % corpus_name)

training was done. used memory 1.414 Gb
all cohesion probabilities was computed. # words = 125927
all branching entropies was computed # words = 351956
all accessor variety was computed # words = 351956
2016-10-27_article_all_normed trained
2016-10-27_article_all_normed tokenized

training was done. used memory 1.138 Gb
all cohesion probabilities was computed. # words = 127687
all branching entropies was computed # words = 356498
all accessor variety was computed # words = 356498
2016-10-20_article_all_normed trained
2016-10-20_article_all_normed tokenized

training was done. used memory 1.130 Gb
all cohesion probabilities was computed. # words = 124616
all branching entropies was computed # words = 349041
all accessor variety was computed # words = 349041
2016-10-26_article_all_normed trained
2016-10-26_article_all_normed tokenized

training was done. used memory 1.146 Gb
all cohesion probabilities was computed. # words = 106132
all branching entropies was computed # words = 297133
al

In [None]:
from collections import Counter
from konlpy.tag import Kkma, Twitter, Hannanum

class KkmaWrapping:
    """Term-frequency helper built on KoNLPy's ``Kkma`` tagger.

    Only (morpheme, tag) pairs whose tag starts with 'N' or 'V' are kept
    (presumably the nominal and verbal tag families -- confirm against the
    Kkma tag set).
    """

    def __init__(self):
        self.kkma = Kkma()

    def tf(self, doc):
        """Return ``[('morpheme/tag', count), ...]`` for the kept pairs of ``doc``.

        Pairs appear in the order in which they were first produced by the tagger.
        """
        pair_counts = Counter(self.kkma.pos(doc))
        kept = []
        for (morpheme, pos_tag), count in pair_counts.items():
            if pos_tag[0] in ('N', 'V'):
                kept.append(('%s/%s' % (morpheme, pos_tag), count))
        return kept
    
class TwitterWrapping:
    """Term-frequency helper built on KoNLPy's ``Twitter`` tagger.

    Only pairs tagged exactly 'Noun', 'Verb' or 'Adjective' are kept.
    """

    def __init__(self):
        self.twitter = Twitter()

    def tf(self, doc):
        """Return ``[('word/tag', count), ...]`` for the kept pairs of ``doc``.

        Pairs appear in the order in which they were first produced by the tagger.
        """
        pair_counts = Counter(self.twitter.pos(doc))
        kept = []
        for (word, pos_tag), count in pair_counts.items():
            if pos_tag in ('Noun', 'Verb', 'Adjective'):
                kept.append(('%s/%s' % (word, pos_tag), count))
        return kept
    
class HannanumWrapping:
    """Term-frequency helper built on KoNLPy's ``Hannanum`` tagger.

    Only (morpheme, tag) pairs whose tag starts with 'N' or 'P' are kept
    (presumably the nominal and predicate tag families -- confirm against
    the Hannanum tag set).
    """

    def __init__(self):
        self.hannanum = Hannanum()

    def tf(self, doc):
        """Return ``[('morpheme/tag', count), ...]`` for the kept pairs of ``doc``.

        Pairs appear in the order in which they were first produced by the tagger.
        """
        pair_counts = Counter(self.hannanum.pos(doc))
        kept = []
        for (morpheme, pos_tag), count in pair_counts.items():
            if pos_tag[0] in ('N', 'P'):
                kept.append(('%s/%s' % (morpheme, pos_tag), count))
        return kept


In [None]:
# The guard is hard-coded to False, so this cell's body never runs as written.
# When enabled, it tokenizes every normalized corpus with each KoNLPy-based
# wrapper and writes one '<corpus>_<tagger>.txt' file per (tagger, corpus) pair.
if False:
    import sys
    sys.path.append('../py/')
    sys.path.append('../../soynlp/')
    import configuration as config
    from soynlp.utils import DoublespaceLineCorpus

    corpus_fnames = config.normalized_corpus_fnames()

    from konlpy.tag import Kkma, Twitter, Hannanum
    taggers = [('twitter', TwitterWrapping()), ('hannanum', HannanumWrapping()), ('kkma', KkmaWrapping())]
    for name, tagger in taggers:
        for corpus_fname in corpus_fnames:
            # Corpus name = last path component with its final four characters removed.
            corpus_name = corpus_fname.split('/')[-1][:-4]
            corpus = DoublespaceLineCorpus(corpus_fname, iter_sent=False)

            tokenized_corpus_fname = '%s/%s_%s.txt' % (config.model_folder, corpus_name, name)
            with open(tokenized_corpus_fname, 'w', encoding='utf-8') as f:
                for n_doc, doc in enumerate(corpus):
                    tf = tagger.tf(doc)
                    # tagger.tf() yields ('term/tag', int_count) pairs; str.join
                    # rejects the int, so format each pair explicitly. This gives
                    # the same 'term//count' layout that tokenize_corpus writes.
                    tf = ' '.join(['%s//%d' % t for t in tf])
                    f.write('%s\n' % tf)
                    # Report progress once every 100 documents on a single console line.
                    if (n_doc + 1) % 100 == 0:
                        sys.stdout.write('\rtokenizing ... (%d in %d)' % (n_doc+1, len(corpus)))
