In [1]:
#출처 : https://github.com/haven-jeon/TrainKoSpacing
import argparse
import bz2
import logging
import re
import time
from functools import lru_cache
from timeit import default_timer as timer

import gluonnlp as nlp
import mxnet as mx
import mxnet.autograd as autograd
import numpy as np
from mxnet import gluon
from mxnet.gluon import nn, rnn
from tqdm import tqdm

from utils.embedding_maker import (encoding_and_padding, load_embedding, load_vocab)

In [2]:
# Model class
class korean_autospacing_base(gluon.HybridBlock):
    """Character-level spacing model.

    Pipeline built in ``__init__`` and applied in ``hybrid_forward``:
    embedding -> five parallel Conv2D "n-gram" filters (window heights 1..5)
    -> bidirectional GRU -> Dense(100, relu) -> Dense(1, sigmoid).

    NOTE(review): a parameter file is loaded into this block elsewhere in the
    notebook; renaming or reordering the child layers may break loading of
    existing checkpoints -- confirm before refactoring.
    """
    def __init__(self, n_hidden, vocab_size, embed_dim, max_seq_length,
                 **kwargs):
        """Create the layers.

        Args:
            n_hidden: ``hidden_size`` of the GRU.
            vocab_size: ``input_dim`` of the embedding layer.
            embed_dim: ``output_dim`` of the embedding layer; also the width
                of every convolution kernel.
            max_seq_length: sequence length used in ``hybrid_forward`` to trim
                the convolution outputs and to reshape before the GRU.
            **kwargs: forwarded unchanged to ``gluon.HybridBlock.__init__``.
        """
        super(korean_autospacing_base, self).__init__(**kwargs)
        
        # in_seq_len / out_seq_len are stored but not read anywhere in this class.
        self.in_seq_len = max_seq_length  # input sequence length
        self.out_seq_len = max_seq_length  # output sequence length
        self.n_hidden = n_hidden  # number of GRU hidden units
        self.vocab_size = vocab_size  # number of unique characters
        self.max_seq_length = max_seq_length  # max_seq_length
        self.embed_dim = embed_dim  # embedding dimensionality

        with self.name_scope():
            self.embedding = nn.Embedding(input_dim=self.vocab_size, output_dim=self.embed_dim)
            # Every kernel spans the full embedding width (embed_dim) with no
            # padding on that axis, so each convolution collapses the last
            # axis to size 1. The kernel height is the n-gram window size.
            self.conv_unigram = nn.Conv2D(channels=128, kernel_size=(1, self.embed_dim))
            self.conv_bigram = nn.Conv2D(channels=256,
                                         kernel_size=(2, self.embed_dim),
                                         padding=(1, 0))
            self.conv_trigram = nn.Conv2D(channels=128,
                                          kernel_size=(3, self.embed_dim),
                                          padding=(1, 0))
            self.conv_forthgram = nn.Conv2D(channels=64,
                                            kernel_size=(4, self.embed_dim),
                                            padding=(2, 0))
            self.conv_fifthgram = nn.Conv2D(channels=32,
                                            kernel_size=(5, self.embed_dim),
                                            padding=(2, 0))

            self.bi_gru = rnn.GRU(hidden_size=self.n_hidden, layout='NTC', bidirectional=True)
            # flatten=False keeps the leading axes, so both dense layers are
            # applied along the last axis only.
            self.dense_sh = nn.Dense(100, activation='relu', flatten=False)
            self.dense = nn.Dense(1, activation='sigmoid', flatten=False)

    def hybrid_forward(self, F, inputs):
        """Run the network on a batch of encoded sequences.

        Args:
            F: the ``mx.nd`` or ``mx.sym`` namespace supplied by Gluon.
            inputs: indices fed to the embedding layer -- presumably shaped
                (batch, max_seq_length); TODO confirm against the caller.

        Returns:
            Output of the final sigmoid Dense(1) layer -- presumably one value
            per sequence position, shaped (batch, max_seq_length, 1).
        """
        embed = self.embedding(inputs)
        # Insert an axis at position 1 so Conv2D sees a single input channel
        # (assumes Conv2D's default NCHW layout).
        embed = F.expand_dims(embed, axis=1)
        unigram = self.conv_unigram(embed)
        bigram = self.conv_bigram(embed)
        trigram = self.conv_trigram(embed)
        forthgram = self.conv_forthgram(embed)
        fifthgram = self.conv_fifthgram(embed)

        # With stride 1, output length along axis 2 is T + 2*pad - kernel + 1.
        # For T input positions the bigram and forthgram convolutions yield
        # T + 1, so they are trimmed back to max_seq_length; the others already
        # yield T (the slice on fifthgram is then a no-op). All five feature
        # maps are concatenated along the channel axis (dim=1).
        grams = F.concat(unigram,
                         F.slice_axis(bigram, axis=2, begin=0, end=self.max_seq_length),
                         trigram,
                         F.slice_axis(forthgram, axis=2, begin=0, end=self.max_seq_length),
                         F.slice_axis(fifthgram, axis=2, begin=0, end=self.max_seq_length),
                         dim=1)

        # Move the channel axis last, then merge the trailing two axes (the
        # -3 code in MXNet's reshape) so the GRU receives 'NTC' input.
        grams = F.transpose(grams, (0, 2, 3, 1))
        grams = F.reshape(grams, (-1, self.max_seq_length, -3))
        grams = self.bi_gru(grams)
        fc1 = self.dense_sh(grams)
        return (self.dense(fc1))

In [3]:
class pred_spacing:
    """Insert spaces into a sentence using a trained spacing model.

    Args:
        model: callable applied to the encoded input; its output is indexed
            as ``results[0, ]`` and each entry is compared against 0.5.
        w2idx: vocabulary mapping passed to ``encoding_and_padding``.
        max_seq_len: ``maxlen`` passed to ``encoding_and_padding``.

    Notes:
        * The characters ``^``, ``«`` and ``»`` are used internally as markers;
          any occurrence in the input is turned into a space / removed.
        * NOTE(review): input is encoded with ``truncating='post'`` and the
          result is built by zipping characters with predictions, so text
          beyond the model's sequence length appears to be dropped silently --
          confirm against ``encoding_and_padding``.
    """

    def __init__(self, model, w2idx, max_seq_len):
        self.model = model
        self.w2idx = w2idx
        self.max_seq_len = max_seq_len
        self.pattern = re.compile(r'\s+')
        # Per-instance memo of raw sentence -> spaced sentence. A class-level
        # functools.lru_cache on the method would key on `self` and keep every
        # predictor (and its model) alive for the life of the process.
        self._spaced_sent_cache = {}

    def get_spaced_sent(self, raw_sent):
        """Return ``raw_sent`` with model-predicted spacing (memoized per instance).

        Args:
            raw_sent: sentence to space; must be hashable (a ``str``).

        Returns:
            The spaced sentence as produced by ``make_pred_sents``.
        """
        cache = self._spaced_sent_cache
        if raw_sent in cache:
            return cache[raw_sent]

        # Wrap in begin/end markers and encode existing spaces as '^'.
        raw_sent_ = "«" + raw_sent + "»"
        raw_sent_ = raw_sent_.replace(' ', '^')
        sents_in = [
            raw_sent_,
        ]
        mat_in = encoding_and_padding(word2idx_dic=self.w2idx,
                                      sequences=sents_in,
                                      maxlen=self.max_seq_len,
                                      padding='post',
                                      truncating='post')
        mat_in = mx.nd.array(mat_in, ctx=mx.cpu(0))
        results = self.model(mat_in)
        mat_set = results[0, ]
        # One '1'/'0' flag per input character: '1' means "space after this".
        preds = np.array(
            ['1' if i > 0.5 else '0' for i in mat_set[:len(raw_sent_)]])

        spaced = self.make_pred_sents(raw_sent_, preds)
        cache[raw_sent] = spaced
        return spaced

    def make_pred_sents(self, x_sents, y_pred):
        """Rebuild a sentence from characters and per-character space flags.

        Args:
            x_sents: marked-up sentence (iterated character by character).
            y_pred: iterable of ``'1'``/``'0'`` flags aligned with ``x_sents``;
                ``'1'`` appends a space after the character. Iteration stops
                at the shorter of the two.

        Returns:
            The sentence with ``^`` turned into spaces, whitespace runs
            collapsed to a single space, and the ``«``/``»`` markers removed.
        """
        res_sent = []
        for char, flag in zip(x_sents, y_pred):
            res_sent.append(char)
            if flag == '1':
                res_sent.append(' ')
        joined = ''.join(res_sent).replace('^', ' ')
        subs = self.pattern.sub(' ', joined)
        return subs.replace('«', '').replace('»', '')

In [4]:
# Files read below: vocabulary, embedding matrix, and model parameters.
vocab_file = 'model/w2idx.dic'
embedding_file = 'model/kospacing_wv.np'
model_params = 'model/kospacing.params'
model_type = 'kospacing'

# Network hyper-parameters.
n_hidden = 200
max_seq_len = 200

# The embedding matrix is only consulted for its shape here; the layer
# weights themselves come from `model_params` via load_parameters.
w2idx, idx2w = load_vocab(vocab_file)
weights = load_embedding(embedding_file)
vocab_size, embed_dim = weights.shape[0], weights.shape[1]

# Build the network on CPU and wrap it in the sentence-level predictor.
model = korean_autospacing_base(
    n_hidden=n_hidden,
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    max_seq_length=max_seq_len,
)
model.load_parameters(model_params, ctx=mx.cpu(0))
predictor = pred_spacing(model=model, w2idx=w2idx, max_seq_len=max_seq_len)

In [5]:
# Read one sentence, drop its existing spaces, and let the model re-space it.
sent = input()
# str.replace returns a new string (str is immutable), so the result must be
# re-bound; calling it as a bare statement leaves `sent` unchanged.
sent = sent.replace(" ", "")
print(predictor.get_spaced_sent(sent))

문자열양쪽끝밑단어사이의모든공백을제거하고싶습니다.
문자열 양쪽 끝 밑 단어 사이의 모든 공백을 제거하고 싶습니다. 
