In [32]:
import numpy as np

def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

import torch
from allennlp.data.vocabulary import Vocabulary
from allennlp.data.token_indexers.elmo_indexer import ELMoTokenCharactersIndexer

from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import ElmoTokenEmbedder

from allennlp.modules.seq2seq_encoders import Seq2SeqEncoder, PytorchSeq2SeqWrapper
from allennlp.predictors import SentenceTaggerPredictor

from postagger_elmo import PosDatasetReader, LstmTagger

torch.manual_seed(1)

token_indexer = ELMoTokenCharactersIndexer()

reader = PosDatasetReader(token_indexers={"tokens": token_indexer})

HIDDEN_DIM = 100
weight_file = 'weights.hdf5'  # 'https://elmoja.blob.core.windows.net/elmoweights/weights.hdf5'
options_file = 'options.json'  # 'https://elmoja.blob.core.windows.net/elmoweights/options.json'

elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})

lstm: Seq2SeqEncoder = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(word_embeddings.get_output_dim(), HIDDEN_DIM, bidirectional=True, batch_first=True))

# And here's how to reload the model.
vocab2 = Vocabulary.from_files("postagger-elmo_result/vocabulary")
model2 = LstmTagger(word_embeddings, lstm, vocab2)
with open("postagger-elmo_result/model.th", 'rb') as f:
    model2.load_state_dict(torch.load(f))
    
if torch.cuda.is_available():
    cuda_device = 0
else:
    cuda_device = -1
if cuda_device > -1:
    model2.cuda(cuda_device)

predictor2 = SentenceTaggerPredictor(model2, dataset_reader=reader)
sent = "全て の 現実 を 自分 の 方 に 捻じ 曲げる 性格 に 育っ て しまっ た 誠に 遺憾 で ある"
tag_logits2 = predictor2.predict(sent)['tag_logits']

assert(len(sent.split(' ')) == len(tag_logits2))
[f'{token}###{str(np.argmax(logit))}' for token, logit in zip(sent.split(' '), tag_logits2)]

['全て###0',
 'の###0',
 '現実###0',
 'を###0',
 '自分###0',
 'の###0',
 '方###0',
 'に###0',
 '捻じ###0',
 '曲げる###0',
 '性格###0',
 'に###0',
 '育っ###0',
 'て###0',
 'しまっ###0',
 'た###0',
 '誠に###0',
 '遺憾###0',
 'で###0',
 'ある###0']

In [33]:
[f'{token}###{str(softmax(logit))}' for token, logit in zip(sent.split(' '), tag_logits2)]

['全て###[0.86934222 0.07292952 0.05772826]',
 'の###[0.8887794  0.05801573 0.05320488]',
 '現実###[0.92986737 0.03687602 0.03325661]',
 'を###[0.93516603 0.03664256 0.02819141]',
 '自分###[0.9263598  0.03904786 0.03459234]',
 'の###[0.91478141 0.04648098 0.0387376 ]',
 '方###[0.89343189 0.06799893 0.03856917]',
 'に###[0.9203937  0.04862519 0.03098111]',
 '捻じ###[0.90269846 0.06443285 0.03286869]',
 '曲げる###[0.88541345 0.06538274 0.0492038 ]',
 '性格###[0.91606951 0.05133766 0.03259283]',
 'に###[0.89106047 0.06614343 0.0427961 ]',
 '育っ###[0.91435369 0.05150008 0.03414623]',
 'て###[0.89343151 0.06506117 0.04150732]',
 'しまっ###[0.93601506 0.036106   0.02787894]',
 'た###[0.85053127 0.08446402 0.06500471]',
 '誠に###[0.89127032 0.0598754  0.04885428]',
 '遺憾###[0.84652487 0.08621997 0.06725517]',
 'で###[0.83627946 0.09924991 0.06447063]',
 'ある###[0.88468529 0.06223108 0.05308363]']

In [35]:
def to_punct(prob, touten_th=0.06, kuten_th=0.05):
    touten_p, kuten_p = prob[1], prob[2]
    touten = touten_p > touten_th
    kuten = kuten_p > kuten_th
    if touten and kuten:
        return 1 if touten_p - 0.01 > kuten_p else 2
    elif touten and not kuten:
        return 1
    elif not touten and kuten:
        return 2
    else:
        return 0
' '.join([f'{token}###{to_punct(softmax(logit))}' for token, logit in zip(sent.split(' '), tag_logits2)])

'全て###1 の###2 現実###0 を###0 自分###0 の###0 方###1 に###0 捻じ###1 曲げる###1 性格###0 に###1 育っ###0 て###1 しまっ###0 た###1 誠に###0 遺憾###1 で###1 ある###2'

In [36]:
import numpy as np

def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

import torch
from allennlp.data.vocabulary import Vocabulary

from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding

from allennlp.modules.seq2seq_encoders import Seq2SeqEncoder, PytorchSeq2SeqWrapper
from allennlp.predictors import SentenceTaggerPredictor

from postagger_elmo import PosDatasetReader, LstmTagger

torch.manual_seed(1)

token_indexer = ELMoTokenCharactersIndexer()

reader = PosDatasetReader()

EMBEDDING_DIM = 300
HIDDEN_DIM = 100

# And here's how to reload the model.
vocab2 = Vocabulary.from_files("postagger-bilstm_result10k/vocabulary")

token_embedding = Embedding(num_embeddings=vocab2.get_vocab_size('tokens'),
                            embedding_dim=EMBEDDING_DIM,
                            pretrained_file="wikipedia_mecab_word2vec/word2vec_glovefmt.txt")
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

lstm: Seq2SeqEncoder = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(word_embeddings.get_output_dim(), HIDDEN_DIM, bidirectional=True, batch_first=True))


model2 = LstmTagger(word_embeddings, lstm, vocab2)
with open("postagger-bilstm_result10k/model.th", 'rb') as f:
    model2.load_state_dict(torch.load(f))
    
if torch.cuda.is_available():
    cuda_device = 0
else:
    cuda_device = -1
if cuda_device > -1:
    model2.cuda(cuda_device)
predictor2 = SentenceTaggerPredictor(model2, dataset_reader=reader)

sent = "全て の 現実 を 自分 の 方 に 捻じ 曲げる 性格 に 育っ て しまっ た 誠に 遺憾 で ある"
tag_logits2 = predictor2.predict(sent)['tag_logits']

assert(len(sent.split(' ')) == len(tag_logits2))
[f'{token}###{str(np.argmax(logit))}' for token, logit in zip(sent.split(' '), tag_logits2)]

['全て###0',
 'の###0',
 '現実###0',
 'を###0',
 '自分###0',
 'の###0',
 '方###0',
 'に###0',
 '捻じ###0',
 '曲げる###0',
 '性格###0',
 'に###0',
 '育っ###0',
 'て###0',
 'しまっ###0',
 'た###0',
 '誠に###0',
 '遺憾###0',
 'で###0',
 'ある###0']

In [37]:
[f'{token}###{str(softmax(logit))}' for token, logit in zip(sent.split(' '), tag_logits2)]

['全て###[0.849698   0.08201079 0.06829121]',
 'の###[0.87388894 0.06848654 0.05762452]',
 '現実###[0.88512345 0.06222602 0.05265054]',
 'を###[0.89231312 0.0582539  0.04943298]',
 '自分###[0.896236  0.0560623 0.0477017]',
 'の###[0.89930979 0.05437592 0.04631429]',
 '方###[0.89976951 0.05413136 0.04609912]',
 'に###[0.90053401 0.05371032 0.04575567]',
 '捻じ###[0.90067513 0.05358766 0.04573721]',
 '曲げる###[0.90062654 0.05360678 0.04576668]',
 '性格###[0.90044531 0.05371284 0.04584185]',
 'に###[0.90062746 0.05359266 0.04577988]',
 '育っ###[0.89970542 0.0540344  0.04626019]',
 'て###[0.8988044  0.05450626 0.04668934]',
 'しまっ###[0.89733564 0.05515796 0.0475064 ]',
 'た###[0.89482086 0.05633137 0.04884778]',
 '誠に###[0.89066792 0.05836048 0.0509716 ]',
 '遺憾###[0.88415709 0.06152737 0.05431555]',
 'で###[0.87273862 0.06702554 0.06023584]',
 'ある###[0.85159883 0.07714889 0.07125228]']

In [41]:
def to_punct(prob, touten_th=0.06, kuten_th=0.055):
    touten_p, kuten_p = prob[1], prob[2]
    touten = touten_p > touten_th
    kuten = kuten_p > kuten_th
    if touten and kuten:
        return 1 if touten_p - 0.01 > kuten_p else 2
    elif touten and not kuten:
        return 1
    elif not touten and kuten:
        return 2
    else:
        return 0
' '.join([f'{token}###{to_punct(softmax(logit))}' for token, logit in zip(sent.split(' '), tag_logits2)])

'全て###1 の###1 現実###1 を###0 自分###0 の###0 方###0 に###0 捻じ###0 曲げる###0 性格###0 に###0 育っ###0 て###0 しまっ###0 た###0 誠に###0 遺憾###1 で###2 ある###2'