# Chainer_lstm_blstm

- 環境:Ubuntu18.04
- GPU:GeForce GTX 1080 Ti
- ドライバー:NVIDIA-SMI 460.32.03, Driver Version: 460.32.03, CUDA Version: 11.2
- 形態素解析:mecab-ipadic-neologd
- 単語のベクトル化 (nwjc2vec):https://www.gsk.or.jp/catalog/gsk2020-d
    - ./:2class_train.txt
    - ./:2class_test.txt

## 事前準備

In [None]:
!pip install chainer
!pip install cupy-cuda101
!pip install lime

In [None]:
# -*- coding: utf-8 -*-

# 必要なライブラリをインポート

from pylab import *
import os
import argparse
import codecs
import random
import numpy as np
import copy
from sklearn.metrics import precision_recall_curve, auc, roc_curve
import matplotlib.pyplot as plt
import re

from gensim.models import KeyedVectors

import chainer
import chainer.optimizers
import chainer.functions as F
import chainer.links as L

from chainer import Variable
from chainer import cuda
from chainer import Chain
from chainer import training
from chainer import reporter
from chainer.training import extensions
from chainer.training import triggers

## 形態素解析（mecab-ipadic-neologd）

In [None]:
!mecab -Owakati -d /usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd/ < 2class_train.txt > neologd_2class_train.txt

In [None]:
!mecab -Owakati -d /usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd/ < 2class_test.txt > neologd_2class_test.txt

## NWJC2vecの読み込み

In [None]:
# word2vecの読み込み(word embedding用) 

w2v = KeyedVectors.load_word2vec_format('/home/share/NWJC2vec/nwjc_word_skip_300_8_25_0_1e4_6_1_0_15.txt.vec', binary=False)
#w2v = KeyedVectors.load_word2vec_format('../../Documents/nwjc_word_skip_300_8_25_0_1e4_6_1_0_15.txt.vec', binary=False)

## 引数

In [None]:
# 引数

def argument():
    """Build the experiment configuration as an ``argparse.Namespace``.

    The parser is always given an empty argument list, so every option
    takes the default listed below regardless of the process command line.

    Returns:
        argparse.Namespace: one attribute per option in ``option_specs``.
    """
    # (flag, default, type); a type of None keeps argparse's plain-string handling.
    option_specs = (
        ('--train_file', "2class_useful_train.txt", None),  # training file name
        ('--test_file', "2class_useful_test.txt", None),    # test file name
        ('--mode', "BLSTM", None),           # "LSTM" or "BLSTM"
        ('--n_layers', 1, int),              # number of layers (a BLSTM layer holds forward + backward)
        ('--batchsize', 2, int),             # mini-batch size used for training
        ('--embed', 300, int),               # dimensionality of the word2vec vectors
        ('--hidden', 300, int),              # number of hidden units
        ('--epoch', 20, int),                # number of training epochs
        ('--alpha', 0.001, float),           # learning rate
        ('--drop', 0.5, float),              # dropout ratio
        ('--clipping', 5.0, float),          # upper bound used for gradient clipping
        ('--save_dir', './result_blstm', None),  # output directory
        ('--model', "ty15", None),           # prefix (topic) of the saved result files
        ('--classes', 2, int),               # number of target classes
        ('--use_gpu', 1, int),               # GPU id (0-3)
    )

    parser = argparse.ArgumentParser()
    for flag, default_value, value_type in option_specs:
        if value_type is None:
            parser.add_argument(flag, default=default_value)
        else:
            parser.add_argument(flag, default=default_value, type=value_type)

    # Parse an explicit empty list instead of sys.argv.
    return parser.parse_args(args=[])

In [None]:
args = argument() # 引数

## 結果のディレクトリ作成

In [None]:
# Create the results directory; exist_ok avoids the check-then-create race
# and makedirs also creates any missing parent directories.
os.makedirs(args.save_dir, exist_ok=True)

## 対象ファイル名メモ

In [None]:
### load_data train   ***************************************
#fname_tr="../LSTM/LSTM_same_wakati.txt" #all
#fname_tr="../LSTM/LSTM_same_train_wakati.txt"
#fname_tr="../LSTM/lstm_ooame_train_mecab.csv"
#fname_tr="../LSTM/lstm_taifu_train_mecab.csv"
#fname_tr="../LSTM/Netyu_LSTM_same_train_wakati.txt"
#fname_tr="../LSTM/Netyu_LSTM_same_wakati.txt" #all
#fname_tr="../LSTM/lstm_mecab_train_typhoon15_300.txt"

### load_data test  *************************************
#fname="../LSTM/lstm_earthquake_test_mecab.csv"
#fname="../LSTM/lstm_ooame_test_mecab.csv"
#fname="../LSTM/lstm_taifu_test_mecab.csv"
#fname="../LSTM/lstm_mecab_test_typhoon15_30.txt"

## GPU or CPUの指定

In [None]:
# Select the array backend: a GPU id of 0-3 uses CuPy on that device,
# any other value of args.use_gpu falls back to NumPy (CPU).
if 0 <= args.use_gpu <= 3:
    xp = cuda.cupy
    # get_device_from_id replaces the deprecated cuda.get_device.
    cuda.get_device_from_id(args.use_gpu).use()
else:
    xp = np

## ランダムのシード値の固定

In [None]:
# 同じ結果を出力するために乱数シードを固定

def reset_seed(seed=0):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random``, NumPy's global RNG and, when CUDA is
    available, CuPy's RNG.

    NumPy is seeded unconditionally: previously ``xp.random.seed`` was
    used, so when ``xp`` was CuPy the NumPy generator was never seeded
    and CPU-side randomness was not reproducible.

    Args:
        seed (int): seed value applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    if cuda.available:
        cuda.cupy.random.seed(seed)

In [None]:
reset_seed(0) # 乱数シード固定

## LSTMモデルの構築

In [None]:
# LSTMのモデル構築

class LSTMBase(Chain):
    """Sequence classifier: stacked unidirectional LSTM followed by a linear layer.

    Args:
        n_layers (int): number of stacked LSTM layers.
        embed_size (int): dimensionality of each input word vector.
        hidden_size (int): number of hidden units per layer.
        n_labels (int): number of output classes.
        dropout (float): dropout ratio passed to ``L.NStepLSTM``.

    Calling the model with ``xs`` (a list of per-sentence arrays, one row
    per word) returns the un-normalised class scores, one row per sentence.
    """
    def __init__(self, n_layers, embed_size, hidden_size, n_labels=2, dropout=0.5):
        super(LSTMBase, self).__init__(
            lstm=L.NStepLSTM(n_layers, embed_size, hidden_size, dropout),
            linear=L.Linear(hidden_size, n_labels),
        )

    def reset_state(self):
        """Clear the initial hidden/cell states so the LSTM starts from zeros."""
        self.cx = self.hx = None

    def __call__(self, xs):
        """Return class scores computed from the final hidden state of the top layer.

        Args:
            xs: list of input sequences.

        Returns:
            Output of the linear layer applied to the last layer's final
            hidden state.
        """
        self.reset_state()
        # hy: final hidden states stacked per layer, cy: final cell states,
        # ys: per-time-step outputs of the last layer (unused here).
        hy, cy, ys = self.lstm(self.hx, self.cx, xs)
        # Use the LAST layer's state. hy[0] is the first layer, which is only
        # the same thing when n_layers == 1; with deeper stacks it would
        # bypass the upper layers entirely.
        y = self.linear(hy[-1])

        return y

## BLSTMモデルの構築

In [None]:
# BLSTMのモデル構築

class BLSTMBase(Chain):
    """Sequence classifier: stacked bidirectional LSTM followed by a linear layer.

    Args:
        n_layers (int): number of stacked bidirectional layers (each layer
            holds a forward and a backward LSTM).
        embed_size (int): dimensionality of each input word vector.
        hidden_size (int): number of hidden units per direction.
        n_labels (int): number of output classes.
        dropout (float): dropout ratio passed to ``L.NStepBiLSTM``.

    Calling the model with ``xs`` (a list of per-sentence arrays, one row
    per word) returns the un-normalised class scores, one row per sentence.
    """
    def __init__(self, n_layers, embed_size, hidden_size, n_labels=2, dropout=0.0):
        super(BLSTMBase, self).__init__(
            bi_lstm=L.NStepBiLSTM(n_layers, embed_size, hidden_size, dropout),
            # The classifier sees forward and backward states concatenated.
            linear=L.Linear(hidden_size * 2, n_labels),
        )

    def reset_state(self):
        """Clear the initial hidden/cell states so the LSTM starts from zeros."""
        self.cx = self.hx = None

    def __call__(self, xs):
        """Return class scores from the top layer's final forward/backward states.

        Args:
            xs: list of input sequences.

        Returns:
            Output of the linear layer applied to the concatenation of the
            last layer's final forward and backward hidden states.
        """
        self.reset_state()
        # hy: final hidden states stacked per layer and direction,
        # cy: final cell states, ys: per-time-step outputs (unused here).
        hy, cy, ys = self.bi_lstm(self.hx, self.cx, xs)
        # The last two entries of hy belong to the top layer (forward, then
        # backward). hy[0]/hy[1] are the first layer, which is only the same
        # thing when n_layers == 1.
        y = self.linear(F.concat([hy[-2], hy[-1]])) # concat

        return y

## updaterの設定

In [None]:
# NOTE (original author): by the time `device` arrives here it has already
# become a CPU device object shown as "@numpy".
class original_updater(training.StandardUpdater):
    """StandardUpdater that uses ``convert`` and hands the model a list of sequences."""

    def __init__(self, train_iter, optimizer, device):
        super(original_updater, self).__init__(
            train_iter,
            optimizer,
            converter=convert,
            device=device,
        )

    def update_core(self):
        """Run one optimisation step on the next mini-batch."""
        # A single iterator/optimizer passed to StandardUpdater is registered as 'main'.
        batch_iterator = self.get_iterator('main')
        optimizer = self.get_optimizer('main')
        model = optimizer.target

        # Pull the next mini-batch and split it into inputs and labels.
        xs, ts = self.converter(next(batch_iterator))

        # Forward pass; the target returns the loss for this batch.
        loss = model(list(xs), ts)

        model.cleargrads()   # drop gradients left from the previous step
        loss.backward()      # back-propagate
        optimizer.update()   # apply the parameter update

## 損失関数(softmax_cross_entropy)の設定

In [None]:
def sum_softmax_cross_entropy(ys, ts):
    """Loss function handed to ``L.Classifier``.

    Args:
        ys: class scores produced by the predictor.
        ts: ground-truth labels.

    Returns:
        The value of ``F.softmax_cross_entropy(ys, ts)``.
    """
    return F.softmax_cross_entropy(ys, ts)

## 学習するためのデータに変換

In [None]:
def convert(batch):
    """Split a mini-batch of ``(x, t)`` pairs into inputs and a label array.

    [(x, t), (x, t), ...] -> ([x, x, ...], xp.array([t, t, ...]))
    """
    inputs = [sample for sample, _ in batch]
    labels = xp.array([label for _, label in batch])
    return (inputs, labels)

## Evaluatorの設定(既存ではTypeErrorのため対応)

In [None]:
class original_Evaluator(training.extensions.Evaluator):
    """Evaluator that uses ``convert`` and hands the model a list of sequences."""

    def __init__(self, iterator, target, device):
        super(original_Evaluator, self).__init__(
            iterator,
            target,
            converter=convert,
            device=device,
        )

    def evaluate(self):
        """Run the model over the 'main' iterator and return the averaged observations."""
        source_iterator = self._iterators['main']
        model = self._targets['main']
        forward = self.eval_func or model

        if self.eval_hook:
            self.eval_hook(self)

        summary = reporter.DictSummary()

        # Iterate over a shallow copy (copy.copy) of the registered iterator.
        for batch in copy.copy(source_iterator):
            observation = {}
            with reporter.report_scope(observation):
                xs, ts = self.converter(batch)
                # Evaluation mode: train flag off, no backprop graph.
                with chainer.using_config('train', False):
                    with chainer.using_config('enable_backprop', False):
                        forward([xp.array(x) for x in xs], ts)
            summary.add(observation)

        return summary.compute_mean()

## ファイルからデータの読み込み

In [None]:
# train,testデータの読み込み

def load_data(fname):
    """Read a labelled corpus file into parallel document/label lists.

    Each line is expected to look like ``"<label> <word> <word> ..."``: the
    text before the first space is the label, the rest is the
    space-separated document.  Lines without a space (blank lines, or a
    label with no text) are reported with their 1-based line number and
    skipped.

    Args:
        fname (str): path of a UTF-8 text file; undecodable bytes are ignored.

    Returns:
        tuple: ``(document_list, target)`` where ``document_list[i]`` is the
        document string of the i-th accepted line and ``target[i]`` is its
        label, still as a string.
    """
    print ('input file name:', fname)

    document_list = []  # one space-separated document string per accepted line
    target = []         # label of each accepted line

    # The context manager closes the file even if reading fails.
    with codecs.open(fname, "r", "utf-8", "ignore") as fr:
        doc = fr.readlines()

    for line_no, line in enumerate(doc, start=1):
        sample = line.strip().split(' ', 1)
        if len(sample) < 2:
            # Previously this path referenced an undefined counter and
            # raised NameError instead of reporting the bad line.
            print("load_data is error::  {} Line".format(line_no))
            continue
        document_list.append(sample[1])
        target.append(sample[0])
    return document_list, target

## 文章を単語単位に分割

In [None]:
# 文章を単語に区切る

def sentence2words(sentence):
    """Split a space-delimited sentence into a list of tokens.

    Line-break characters are removed first; the text is then split on
    single spaces, so consecutive spaces yield empty-string tokens.  No
    symbol or digit filtering is applied.

    Args:
        sentence (str): e.g. ``"word word ... word"``.

    Returns:
        list: the tokens in their original order.
    """
    without_breaks = sentence.replace("\n", "").replace("\r", "")
    return without_breaks.split(" ")

## 単語のベクトル化

In [None]:
# 単語のベクトル化 (["地震","は"...] → tensor[tensor[],tensor[],...])

def To_vec(args, line, xp, vectors=None):
    """Map each token of ``line`` to an embedding vector.

    Tokens found in the embedding table get their stored vector.  Tokens
    that cannot be looked up get a constant vector of length
    ``args.embed``: ``-1.0`` for the token ``-1``, ``-100.0`` for the
    token ``"NuLLL"`` and ``0.0`` for everything else.

    Args:
        args: object with an integer ``embed`` attribute (vector length).
        line: iterable of tokens.
        xp: array module (``numpy`` or ``cupy``) used to build the vectors.
        vectors: optional mapping ``token -> vector``; defaults to the
            module-level ``w2v`` model.

    Returns:
        list: one float32 ``xp`` array per token, in input order.
    """
    table = w2v if vectors is None else vectors
    x_batch = []
    for x in line:
        try:
            x_batch.append(xp.array(table[x], dtype=xp.float32))
        except (KeyError, TypeError):
            # Only lookup failures select a fallback vector; a bare except
            # here used to hide unrelated errors and KeyboardInterrupt too.
            if x == -1:
                fill_value = -1.0
            elif x == u"NuLLL":
                fill_value = -100.0
            else:
                fill_value = 0.0
            x_batch.append(xp.array([fill_value] * args.embed, dtype=xp.float32))

    return x_batch

## 学習

In [None]:
# 学習

def train():
    """Train the model selected by ``args.mode`` and save it under ``args.save_dir``.

    Steps: load ``args.train_file``, vectorise every sentence, split the
    data 75/25 into training and validation parts (in file order; nothing
    is shuffled before the split), build the LSTM/BLSTM classifier, run a
    Chainer ``Trainer`` for ``args.epoch`` epochs with logging, snapshot,
    evaluation and plotting extensions, and finally save the model as
    ``<save_dir>/<model>_mymodel.npz``.

    Reads the module-level ``args`` and ``xp``; returns nothing.
    """
    print("-------Run-Training-------")
    print("\n")
    print(args)
    print("\n")
    print("---load_data---")
    # Read the training file
    fname_train=args.train_file
    train_x, train_t= load_data(fname_train)

    print("---make_dataset---")
    # Turn every sentence into a word list and then into word vectors
    train_x_vec=[] # one array per document
    for line_sentence in train_x:
        words_list_train = sentence2words(line_sentence) # "word word ... word" -> [word, word, ..., word]
        sentence_vector_list_train = To_vec(args, words_list_train, xp) # [word, ...] -> list of per-word vectors
        train_x_vec.append(xp.array(sentence_vector_list_train)) # stack the word vectors into one array for this sentence
    train_t = xp.array(train_t, dtype="int32") # labels

    # Build a dataset of (data, label) tuples
    dataset = chainer.datasets.TupleDataset(train_x_vec,train_t)
    
    # Split into training and validation data
    split_at = int(len(dataset) * 0.75) # 75% training / 25% validation
    train_data = dataset[0:split_at]
    vali_data = dataset[split_at:len(dataset)]

    # Iterators over the two subsets
    train_iter = chainer.iterators.SerialIterator(train_data, args.batchsize, repeat=True, shuffle=True)
    vali_iter = chainer.iterators.SerialIterator(vali_data, args.batchsize, repeat=False, shuffle=False)
    
    print("---load_model---")
    print("      {}".format(args.mode))
    # Build the predictor and wrap it in a Classifier, which adds the loss
    # computation; the loss function is passed explicitly via lossfun.
    if args.mode == "LSTM":
        lstm = LSTMBase(args.n_layers, args.embed, args.hidden, args.classes, args.drop)
    elif args.mode == "BLSTM":
        lstm = BLSTMBase(args.n_layers, args.embed, args.hidden, args.classes, args.drop)
    else:
        raise Exception("Input mode \"LSTM\" or \"BLSTM\".")
    m = L.Classifier(lstm, lossfun=sum_softmax_cross_entropy)

    # Move the model to the GPU when a GPU id (0-3) is configured
    if 0 <= args.use_gpu <= 3:
        m.to_gpu()
        print("---use_gpu_{}---".format(args.use_gpu))
    else:
        print("---use_cpu---")
    
    print("---optimizer_Adam---")
    # Optimizer
    opt = chainer.optimizers.Adam(args.alpha) # Adam with the configured learning rate
    opt.setup(m) # attach the model
    opt.add_hook(chainer.optimizer.GradientClipping(args.clipping))# clip gradients at the configured threshold
    
    print("---updater_standardupdater---")
    # Use the custom updater defined in this notebook
    updater = original_updater(train_iter, opt, device=args.use_gpu)
    
    print("---training---")
    snapshot_name = '{}_snapshot_epoch-'.format(args.model)
    model_name = '{}_model_epoch-'.format(args.model)
    loss_png = '{}_loss.png'.format(args.model)
    acc_png = '{}_accuracy.png'.format(args.model)
    # Trainer
    # Create the trainer; its output goes to args.save_dir
    trainer = training.Trainer(updater,(args.epoch,'epoch'),out=args.save_dir)
    # Write the training log into the trainer's output directory once per epoch
    trainer.extend(extensions.LogReport(trigger=(1, 'epoch')))
    # Save the model object whenever validation accuracy reaches a new maximum,
    # and serialize the trainer state periodically
    trigger = triggers.MaxValueTrigger('validation/main/accuracy', trigger=(1, 'epoch'))
    trainer.extend(extensions.snapshot(filename=snapshot_name+'{.updater.epoch}'))
    trainer.extend(extensions.snapshot_object(m, filename=model_name+'{.updater.epoch}'), trigger=trigger)
    # Evaluate the model on the validation data
    trainer.extend(original_Evaluator(vali_iter, m, device=args.use_gpu))
    # Plot the loss curves
    trainer.extend(extensions.PlotReport(['main/loss', 'validation/main/loss'], x_key='epoch', file_name=loss_png))
    # Plot the accuracy curves
    trainer.extend(extensions.PlotReport(['main/accuracy', 'validation/main/accuracy'], x_key='epoch', file_name=acc_png))
    # Print epoch, training/validation loss and accuracy, and elapsed time to stdout
    trainer.extend(extensions.PrintReport(['epoch', 'main/loss', 'main/accuracy', 'validation/main/loss', 'validation/main/accuracy', 'elapsed_time']))
    # Progress bar
    trainer.extend(extensions.ProgressBar())
    # Dump the computational graph of the loss in DOT language
    trainer.extend(extensions.dump_graph('main/loss'))
    
    # Run training
    trainer.run()
    
    # Save the final trained model
    chainer.serializers.save_npz("{}/{}_mymodel.npz".format(args.save_dir, args.model), m)

    print("--------Finish Training!----------")
    print("\n")

## 検証

In [None]:
predict_array = [] # per-sample softmax outputs appended by predict() (kept for LIME)

# Evaluation
def predict():
    """Evaluate the saved model on ``args.test_file`` and write the result files.

    Loads ``<save_dir>/<model>_mymodel.npz``, classifies every test
    sentence one at a time and produces, inside ``args.save_dir``:

    * ``res_all_<test file>``: label, prediction, both class
      probabilities and the sentence (Shift_JIS CSV),
    * ``res_auc_<test file>``: label and both class probabilities,
    * ``res_confusion_<test file>``: summary metrics and the confusion matrix,
    * ``<AUC value>.png``: the ROC curve.

    Precision, recall, F-measure and accuracy are reported as 0 when their
    denominator is 0 instead of raising ``ZeroDivisionError``.

    Side effect: each sample's softmax output is appended to the
    module-level ``predict_array``.  Reads the module-level ``args`` and
    ``xp``; returns nothing.
    """
    def _ratio(numerator, denominator):
        # Guarded division: undefined ratios are reported as 0.
        return numerator / denominator if denominator != 0 else 0

    print("-------Run-Predicting-------")
    print("\n")
    print(args)
    print("\n")
    print("---load_data---")
    # Read the test file
    fname_test = args.test_file
    test_x, test_t = load_data(fname_test)

    print("---make_dataset---")
    # Turn every sentence into a word list and then into word vectors
    test_x_vec = []
    for line_sentence in test_x:
        words_list_test = sentence2words(line_sentence)
        sentence_vector_list_test = To_vec(args, words_list_test, xp)
        test_x_vec.append(xp.array(sentence_vector_list_test))
    test_t = xp.array(test_t, dtype="int32") # labels

    print("---load_model---")
    print("      {}".format(args.mode))
    # Build the same architecture that was trained, wrapped in a Classifier
    if args.mode == "LSTM":
        lstm = LSTMBase(args.n_layers, args.embed, args.hidden, args.classes, args.drop)
    elif args.mode == "BLSTM":
        lstm = BLSTMBase(args.n_layers, args.embed, args.hidden, args.classes, args.drop)
    else:
        raise Exception("Input mode \"LSTM\" or \"BLSTM\".")
    m = L.Classifier(lstm, lossfun=sum_softmax_cross_entropy)

    # Move the model to the GPU when a GPU id (0-3) is configured
    if 0 <= args.use_gpu <= 3:
        m.to_gpu()
        print("---use_gpu_{}---".format(args.use_gpu))
    else:
        print("---use_cpu---")

    print("---load_trained_model---")
    # Load the trained weights (loading a trainer snapshot instead would
    # need the `path` argument, because a snapshot stores more than the model)
    load_model = "{}/{}_mymodel.npz".format(args.save_dir, args.model)
    chainer.serializers.load_npz(load_model, m)

    # Counters
    pos_num = 0
    neg_num = 0
    TP = 0
    FN = 0
    FP = 0
    TN = 0
    cnt = 0
    # Inputs for the ROC curve
    y_test = [] # true labels
    prob = []   # probability of the positive class

    res_all_path = "{}/res_all_{}".format(args.save_dir, fname_test).replace(".txt",".csv")
    res_auc_path = "{}/res_auc_{}".format(args.save_dir, fname_test).replace(".txt",".csv")

    print("---predict---\n")
    # Both output files are closed even if prediction fails midway.
    with codecs.open(res_all_path, 'w', 'sjis', 'ignore') as tweet_data, \
            codecs.open(res_auc_path, 'w', 'sjis', 'ignore') as tweet:
        # Classify one sentence at a time
        for x, t in zip(test_x_vec, test_t):
            # Inference mode: train flag off, no backprop graph
            with chainer.using_config('train', False), chainer.using_config('enable_backprop', False):
                y = m.predictor([x])

            output = F.softmax(y, axis=1)

            tmp = list(output.data)
            judged = xp.argmax(tmp[0])

            tweet_data.write("{},{},{},{},{}\n".format(str(t), str(judged), str(tmp[0][1]), str(tmp[0][0]), test_x[cnt].replace(' ', '')))
            tweet.write("{},{},{}\n".format(str(t), str(tmp[0][1]), str(tmp[0][0])))
            predict_array.append(tmp[0]) # for LIME

            y_test.append(int(str(t)))    # true label
            prob.append(float(tmp[0][1])) # positive-class probability

            # Confusion-matrix bookkeeping (pos_num/neg_num count true labels)
            if judged == 1 and t == 1:
                pos_num += 1
                TP += 1
            elif judged == 1 and t == 0:
                neg_num += 1
                FP += 1
            elif judged == 0 and t == 1:
                pos_num += 1
                FN += 1
            elif judged == 0 and t == 0:
                neg_num += 1
                TN += 1
            cnt += 1
            if cnt % 100 == 0:
                print("test_data:  {} / {}".format(cnt, len(test_x)))

    print("ok")

    # Final metrics
    # Precision: its denominator is TP+FP (the old guard tested TP+FN).
    precision = _ratio(TP, TP + FP)
    print(u"適合率：" + str(precision))

    # Recall
    recall = _ratio(TP, TP + FN)
    print(u"再現率："+str(recall))

    # F-measure: undefined when precision + recall is 0 (e.g. TP == 0).
    f_measure = _ratio(2 * precision * recall, precision + recall)
    print(u"F値："+str(f_measure))

    # Accuracy
    accuracy = _ratio(TP + TN, TP + FP + TN + FN)
    print(u"精度"+str(accuracy))

    # ROC curve
    y_test = np.array(y_test)
    prob = np.array(prob)
    fpr, tpr, thresholds = roc_curve(y_test, prob, pos_label=1)

    plt.figure(figsize=(6,6))
    plt.plot(fpr, tpr)
    plt.title("ROC curve : "+args.model)
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.grid(which='major',color='black',linestyle='-')
    plt.grid(which='minor',color='black',linestyle='-')

    # Area under the ROC curve; the figure file is named after its value
    area = auc(fpr, tpr)
    print("Area Under Curve: {0:.5f}".format(area))
    plt.savefig("{}/{}.png".format(args.save_dir, area))

    print("Eval data.")
    print("        判定結果")
    print("         1|    0")
    print("正解  1|"+str(TP)+"  "+str(FN))
    print("      0|"+str(FP)+"  "+str(TN))

    confusion_path = "{}/res_confusion_{}".format(args.save_dir, fname_test).replace(".txt",".csv")
    with codecs.open(confusion_path, "w", "sjis", "ignore") as fw:
        fw.write("データ数,正例,負例,適合率,再現率,F値,精度\n")
        fw.write("{0},{1},{2},{3},{4},{5},{6}\n".format(TP+FN+FP+TN, pos_num, neg_num, precision, recall, f_measure, accuracy))
        fw.write("\n")
        fw.write(",LSTM_pos,LSTM_neg\n")
        fw.write("Correct_pos,{0},{1}\n".format(TP, FN))
        fw.write("Correct_neg,{0},{1}\n".format(FP,TN))

    print("--------Finish predict!----------")    
    print("\n")

In [None]:
# main

def main():
    """Run ``train()`` and then ``predict()``."""
    train()
    predict()

## 実行

In [None]:
if __name__ == '__main__':
    main()

## LIME
- chainer, w2vに対応させるために変更

In [None]:
from __future__ import unicode_literals
import seaborn as sns
%matplotlib inline
from collections import OrderedDict
from lime.lime_text import LimeTextExplainer, IndexedString, IndexedCharacters, TextDomainMapper
from lime import explanation, lime_base
from functools import partial
import itertools
import json
import re
import numpy as np
import scipy as sp
import sklearn
from sklearn.utils import check_random_state
import pandas as pd

class w2v_Lime(LimeTextExplainer):
    """LimeTextExplainer variant that scores perturbed texts with the Chainer model.

    Instead of calling ``classifier_fn`` on raw strings, the perturbed
    sentences are vectorised with ``sentence2words``/``To_vec`` and passed
    to ``classifier_fn.predictor``; the softmax of its output is used as
    the class probabilities.
    """
    def __init__(self, class_names=None, split_expression=None, bow=False, random_state=0):
        # Only the arguments below are forwarded; the commented-out ones keep
        # the parent class defaults.
        # NOTE(review): split_expression defaults to None here and is passed
        # through as-is -- confirm callers always supply a pattern.
        super(w2v_Lime, self).__init__(
#            kernel_width=25,
#             kernel=None,
#             verbose=False,
             class_names=class_names,
#             feature_selection='auto',
             split_expression=split_expression,
             bow=bow,
#             mask_string=None,
             random_state=random_state,
#             char_level=False)
        )
    
    def explain_instance(self,
                     text_instance,
                     classifier_fn,
                     labels=(1,),
                     top_labels=None,
                     num_features=10,
                     num_samples=5000,
                     distance_metric='cosine',
                     model_regressor=None):
        """Build a LIME explanation for one text.

        Args:
            text_instance: the text to explain.
            classifier_fn: object whose ``predictor`` attribute is called on
                the list of vectorised perturbed sentences.
            labels: labels to explain (replaced by the top labels when
                ``top_labels`` is given).
            top_labels: if truthy, explain the labels with the highest
                predicted probability for the unperturbed text.
            num_features: passed to ``explain_instance_with_data``.
            num_samples: number of rows in the perturbation matrix
                (including the unperturbed text).
            distance_metric: metric used for the sample distances.
            model_regressor: passed to ``explain_instance_with_data``.

        Returns:
            The populated ``explanation.Explanation`` object.
        """

        # Build the index LIME uses to address the words/characters of the text
        indexed_string = (IndexedCharacters(
            text_instance, bow=self.bow, mask_string=self.mask_string)
                          if self.char_level else
                          IndexedString(text_instance, bow=self.bow,
                                        split_expression=self.split_expression,
                                        mask_string=self.mask_string))
        domain_mapper = TextDomainMapper(indexed_string)
        
        # Get the perturbation matrix `data`, the softmax outputs `yss`
        # and the distances of every sample to the original text
        data, yss, distances = self.__data_labels_distances(
            indexed_string, classifier_fn, num_samples,
            distance_metric=distance_metric)
        
        # Fall back to "0", "1", ... when no class names were given
        if self.class_names is None:
            self.class_names = [str(x) for x in range(yss[0].shape[0])]
            
        # Create the Explanation object (it carries the visualisation functions)
        ret_exp = explanation.Explanation(domain_mapper=domain_mapper,
                                          class_names=self.class_names,
                                          random_state=self.random_state)
        # Row 0 of yss belongs to the unperturbed text
        ret_exp.predict_proba = yss[0]
        if top_labels:
            labels = np.argsort(yss[0])[-top_labels:]
            ret_exp.top_labels = list(labels)
            ret_exp.top_labels.reverse()
        for label in labels:
            (ret_exp.intercept[label],
             ret_exp.local_exp[label],
             ret_exp.score, ret_exp.local_pred) = self.base.explain_instance_with_data(
                data, yss, distances, label, num_features,
                model_regressor=model_regressor,
                feature_selection=self.feature_selection)
        return ret_exp
    
    # Returns the perturbation matrix, the softmax outputs for every
    # perturbed text, and the distances to the original text.
    def __data_labels_distances(self,
                            indexed_string,
                            classifier_fn,
                            num_samples,
                            distance_metric='cosine'):
        """Generate perturbed texts and score them with the model.

        Returns:
            tuple: ``(data, labels, distances)`` -- ``data`` is a
            ``(num_samples, doc_size)`` 0/1 matrix (0 = word removed, row 0
            is the original text), ``labels`` is the softmax output for each
            row as a NumPy array, ``distances`` is each row's distance to
            row 0 multiplied by 100.
        """
        def distance_fn(x):
            # Distance of every row to row 0 (the original text), scaled by 100
            return sklearn.metrics.pairwise.pairwise_distances(
                x, x[0], metric=distance_metric).ravel() * 100

        doc_size = indexed_string.num_words()
        # Number of words to remove in each perturbed sample
        sample = self.random_state.randint(1, doc_size + 1, num_samples - 1)
        data = np.ones((num_samples, doc_size))
        data[0] = np.ones(doc_size)
        features_range = range(doc_size)
        inverse_data = [indexed_string.raw_string()]
        for i, size in enumerate(sample, start=1):
            inactive = self.random_state.choice(features_range, size,
                                                replace=False)
            data[i, inactive] = 0
            inverse_data.append(indexed_string.inverse_removing(inactive))
            
        # Change from the stock LIME implementation:
        # turn every perturbed sentence into a word list and then into word vectors
        inverse_data_2 = [] # one array per perturbed sentence
        #print(inverse_data)
        for line_sentence in inverse_data:
            words_list_test = sentence2words(line_sentence) # "word word ... word" -> [word, word, ..., word]
            sentence_vector_list_test = To_vec(args, words_list_test, xp) # [word, ...] -> list of per-word vectors
            inverse_data_2.append(xp.array(sentence_vector_list_test)) # stack the word vectors into one array for this sentence
        with chainer.using_config('train', False), chainer.using_config('enable_backprop', False): # inference mode: skip training-only computation
            output = F.softmax(classifier_fn.predictor(inverse_data_2), axis=1)
        labels = chainer.cuda.to_cpu(output.data) # Variable -> NumPy array on the CPU
#        labels[0][0], labels[0][1]= labels[0][1], labels[0][0] # reversed order
#        print("labels:\n{}".format(labels))
        distances = distance_fn(sp.sparse.csr_matrix(data))

        return data, labels, distances

## 実行

In [None]:
def Lime():
    """Explain every test sentence with LIME and save one HTML report each.

    Loads ``<save_dir>/<model>_mymodel.npz``, then for the i-th line of
    ``args.test_file`` writes
    ``./lime_html_lstm_shuffle/save_lstm_10000_<i+1>.html``.

    Reads the module-level ``args``; returns nothing.
    """
    sav_dir = "lime_html_lstm_shuffle"
    # Create the output directory (no error if it already exists)
    os.makedirs(sav_dir, exist_ok=True)

    print("-------Run-LIME-------")
    print("\n")
    print("---load_data---")
    # Read the test file (labels are not needed here)
    fname_test = args.test_file
    test_x, test_t = load_data(fname_test)

    print("---load_model---")
    print("      {}".format(args.mode))
    # Build the same architecture that was trained, wrapped in a Classifier
    if args.mode == "LSTM":
        lstm = LSTMBase(args.n_layers, args.embed, args.hidden, args.classes, args.drop)
    elif args.mode == "BLSTM":
        lstm = BLSTMBase(args.n_layers, args.embed, args.hidden, args.classes, args.drop)
    else:
        raise Exception("Input mode \"LSTM\" or \"BLSTM\".")
    m = L.Classifier(lstm, lossfun=sum_softmax_cross_entropy)

    # Move the model to the GPU when a GPU id (0-3) is configured
    if 0 <= args.use_gpu <= 3:
        m.to_gpu()
        print("---use_gpu_{}---".format(args.use_gpu))
    else:
        print("---use_cpu---")

    print("---load_trained_model---")
    # Load the trained weights
    load_model = "{}/{}_mymodel.npz".format(args.save_dir, args.model)
    chainer.serializers.load_npz(load_model, m)

    class_names = ['0', '1']

    # One explanation per test sentence (one tweet per line)
    for idx, text_sample in enumerate(test_x):
        # The explainer is deliberately re-created for every sample:
        # presumably each instance starts from random_state=0, so hoisting
        # it out of the loop would change the sampled perturbations.
        explainer = w2v_Lime(class_names=class_names, split_expression=r'\W+', bow=False, random_state=0)
        # Named `exp` so the imported `explanation` module is not shadowed.
        exp = explainer.explain_instance(text_sample, m, num_features=1000, num_samples=10000)

        exp.save_to_file(file_path="./{}/save_lstm_10000_{}.html".format(sav_dir, idx+1))

In [None]:
Lime()

---

## ＊結果

shuffle data (文脈考慮できているかどうか)

In [None]:
if __name__ == '__main__':
    main()

replace mark

In [None]:
if __name__ == '__main__':
    main()

neologd

In [None]:
if __name__ == '__main__':
    main()

epoch 20 Dropout 0.5

In [None]:
if __name__ == '__main__':
    main()

The best

In [None]:
if __name__ == '__main__':
    main()