Yoon kimのモデル(CNN for sentence classification) chainerで実装

In [26]:
from chainer import ChainList
import chainer.functions as F
import chainer.links as L
import numpy as np
import chainer
import chainer.links as L
from chainer import optimizers, cuda, serializers
import chainer.functions as F
from sklearn.utils import shuffle
import math
from gensim.models import word2vec
import MeCab
import re
word2vecModel = word2vec.Word2Vec.load('/mnt/sdc/wikipedia_data/jawiki_wakati.model')

In [27]:
addDict={}
seq_len=13
categories=6

In [28]:
# リンク数を可変にしたいのでChainListを使用する
class CNNSC(ChainList):
    def __init__(self,input_channel,output_channel,filter_height,
                 filter_width,n_label,max_sentence_len):
        # フィルター数、使用されたフィルター高さ、最大文長は後から使う
        self.cnv_num = len(filter_height)#フィルター数
        self.filter_height = filter_height
        self.max_sentence_len = max_sentence_len
        
        # Convolution層用のLinkをフィルター毎に追加
        # Convolution2D(　入力チャンネル数, 出力チャンネル数（形毎のフィルターの数）, フィルターの形（タプル形式で）, パディングサイズ )
        link_list = [L.Convolution2D(input_channel, output_channel, (i, filter_width), pad=0) for i in filter_height]
        # Dropoff用のLinkを追加
        link_list += [L.Linear(output_channel * self.cnv_num, output_channel * self.cnv_num)]
        # 出力層へのLinkを追加
        link_list += [L.Linear(output_channel * self.cnv_num, n_label)]

        # ここまで定義したLinkのリストを用いてクラスを初期化する
        super(CNNSC, self).__init__(*link_list)
    
    def __call__(self, x):
        # フィルタを通した中間層を準備
        h_conv = [None for _ in self.filter_height]
        h_pool = [None for _ in self.filter_height]
        
        # フィルタ形毎にループを回す
        for i, filter_size in enumerate(self.filter_height):
            # Convolition層を通す
            h_conv[i] = F.relu(self[i](x))
            # Pooling層を通す
            h_pool[i] = F.max_pooling_2d(h_conv[i], (self.max_sentence_len+1-filter_size))
        # Convolution+Poolingを行った結果を結合する
        concat = F.concat(h_pool, axis=2)
        # 結合した結果に対してDropoutをかける
        h_l1 = F.dropout(F.tanh(self[self.cnv_num+0](concat)), ratio=0.5)
        # Dropoutの結果を出力層まで圧縮する
        y = self[self.cnv_num+1](h_l1)

        return y

In [29]:
def train(X_train,Y_train,X_test,Y_test,batch_size,epochs):
    global seq_len
    global categories
    batchsize   = batch_size    # minibatch size
    n_epoch     = epochs        # エポック数
    height=seq_len                  # length of sentences
    width=200                  #size of wordembedding vecteor
    in_units = seq_len
    input_channel = 1
    n_label =  categories# ラベル数
    filter_height = [3,4,5] # フィルタの高さ
    filter_width  = 200 # フィルタの幅 (embeddingの次元数)
    output_channel = 100
    decay = 0.0001 # 重み減衰
    grad_clip = 3  # gradient norm threshold to clip
    max_sentence_len = seq_len # max length of sentences
    N=len(X_train)
    N_test=len(X_test)

    # モデルの定義
    model = CNNSC(input_channel,output_channel,filter_height,
                  filter_width,n_label,max_sentence_len)
    # Setup optimizer
    optimizer = optimizers.AdaDelta()
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(grad_clip))
    optimizer.add_hook(chainer.optimizer.WeightDecay(decay))
    #gpuつかう
    gpu_device = 0
    cuda.check_cuda_available()
    cuda.get_device(gpu_device).use()
    model.to_gpu(gpu_device)
    xp = cuda.cupy 

    # Learning loop
    for epoch in range(1, n_epoch + 1):

        print('epoch', epoch, '/', n_epoch)

        # training
        perm = np.random.permutation(len(X_train)) #ランダムな整数列リストを取得
        sum_train_loss     = 0.0
        sum_train_accuracy = 0.0
        for i in range(0, N, batchsize):
            
            #perm を使い x_train, y_trainからデータセットを選択 (毎回対象となるデータは異なる)
            x = chainer.Variable(xp.asarray(X_train[perm[i:i + batchsize]])) #source
            t = chainer.Variable(xp.asarray(Y_train[perm[i:i + batchsize]])) #target

            model.zerograds()
            
            y = model(x)
            
            loss = F.softmax_cross_entropy(y, t) # 損失の計算
            accuracy = F.accuracy(y, t) # 正解率の計算

            sum_train_loss += loss.data * len(t)
            sum_train_accuracy += accuracy.data * len(t)

            # 最適化を実行
            loss.backward()
            optimizer.update()

        print('train mean loss={}, accuracy={}'.format(sum_train_loss / N, sum_train_accuracy / N)) #平均誤差

        # evaluation
        sum_test_loss     = 0.0
        sum_test_accuracy = 0.0
        for i in range(0, N_test, batchsize):

            # all test data
            x = chainer.Variable(xp.asarray(X_test[i:i + batchsize]))
            t = chainer.Variable(xp.asarray(Y_test[i:i + batchsize]))
            y = model(x)
        
            loss = F.softmax_cross_entropy(y, t) # 損失の計算
            accuracy = F.accuracy(y, t) # 正解率の計算

            sum_test_loss += loss.data * len(t)
            sum_test_accuracy += accuracy.data * len(t)

        print(' test mean loss={}, accuracy={}'.format(sum_test_loss / N_test, sum_test_accuracy / N_test)) #平均誤差

    return model

In [32]:
def predictVector(word, around_words_list):
    global addDict
    if word in addDict:
        return addDict[word]
    else:
        return addUnknownWord(word,around_words_list)

def addUnknownWord(word , around_words_list):
    global addDict
    rand_vector=np.random.rand(200)/np.linalg.norm(np.random.rand(200))*(10+ 3*np.random.rand(1))
    vector=np.array(word2vecModel[word2vecModel.predict_output_word(around_words_list)[0][0]])+rand_vector
    addDict[word]=vector
    return vector
    
def Wakati(text):
    m = MeCab.Tagger ("-Ochasen -d /usr/lib/mecab/dic/mecab-ipadic-neologd -Owakati")
    result=m.parse(text)
    ws = re.compile(" ")
    words = [word for word in ws.split(result)]
    if words[-1] == u"\n":
        words = words[:-1]
    return [word for word in words if word!="「" and word!="」" and word!="、"and word!="。"
            and word!="!" and word!="?"]

def seq2vecs(words,predict):
    global addDict
    vectors=[]
    for i in range(len(words)):
            try:
                vectors.append(word2vecModel[words[i]])
            except:
                if predict:
                    try:
                        vectors.append(predictVector(words[i],[words[i-1]]))
                    except:
                        if i==0:
                            continue
                        else:
                            similar_word=word2vecModel.similar_by_vector(addDict[words[i-1]], topn=10, restrict_vocab=None)[0][0]
                            vectors.append(predictVector(words[i],[similar_word]))
                else:
                    return []
    return vectors

def train_test_divide(X,Y,test_rate):
    datanum=len(X)
    n=math.floor(datanum*test_rate)
    X_train=np.array(X[:datanum-n])
    Y_train=np.array(Y[:datanum-n])
    X_test=np.array(X[datanum-n:])
    Y_test=np.array(Y[datanum-n:])
    print(X_train.shape)
    print(Y_train.shape)
    print(X_test.shape)
    print(Y_test.shape)
    return (X_train,Y_train),(X_test,Y_test)

def onehot_vector(number):
    global categories
    onehot=np.zeros(categories)
    onehot[number]=1
    return onehot

def load_file(filename):
    ttl=[]
    cat=[]
    with open(filename, "r") as f:
        lines = [line for line in f]
        for line in lines:
            title,category=line.split(" ")
            ttl.append(title)
            cat.append(int(category))
    ttl,cat=shuffle(ttl,cat)
    ttl=np.array(ttl)
    cat=np.array(cat)
    print(ttl.shape)
    print(cat.shape)
    return ttl,cat
def create_data(ttl,cat,predict,sfl):
    X=[]
    Y=[]
    T=[]
    sum=0
    max_len=0
    min_len=1000000
    for i in range(len(ttl)):
        title=ttl[i]
        category=cat[i]
        words=Wakati(title)
        input_vectors=seq2vecs(words,predict)
        sum+=len(input_vectors)
        max_len=max(max_len,len(input_vectors))
        if len(input_vectors) > seq_len:
            input_vectors=input_vectors[:seq_len]
        elif len(input_vectors)==0:
            continue
        min_len=min(min_len,len(input_vectors))
        if sfl:
            random.shuffle(input_vectors)
        x = [ [0.]*200 for _ in range(seq_len) ]
        x[0:len(input_vectors)]=input_vectors
        X.append(np.array([x],dtype="float32"))
        Y.append(int(category))
        T.append(title)
    X=np.array(X,dtype="float32")
    Y=np.array(Y)
    print("-------DataShape------")
    print(X.shape)
    print(Y.shape)
    print(len(T))
    print("-------DataProperties------")
    print("max:"+str(max_len))
    print("min:"+str(min_len))
    print("mean:"+str(sum/len(T)))
    return X,Y,T
def load_dataset(filename,sfl):
    global seq_len
    ttl,cat=load_file(filename)
    if filename=="./data/livedoor_data.txt":
        print("file: livedoor")
        X_test,Y_test,T_test=create_data(ttl,cat,predict=True,sfl=False)
        return X_test,Y_test,T_test
    else:
        print("file: Yahoo")
        (train_ttl,train_cat),(test_ttl,test_cat)=train_test_divide(ttl,cat,0.1)
        X_train,Y_train,T_train=create_data(train_ttl,train_cat,predict=False,sfl=False)
        X_test,Y_test,T_test=create_data(test_ttl,test_cat,predict=True,sfl=False)
        return (X_train,Y_train,T_train),(X_test,Y_test,T_test)
    
(X_train,Y_train,T_train),(X_test,Y_test,T_test)=load_dataset("./data/yahoo_data.txt"
                                                              ,sfl=False)    

(83999,)
(83999,)
file: Yahoo
(75600,)
(75600,)
(8399,)
(8399,)




-------DataShape------
(61464, 1, 13, 200)
(61464,)
61464
-------DataProperties------
max:12
min:2
mean:6.161021085513472


  # This is added back by InteractiveShellApp.init_path()


-------DataShape------
(8399, 1, 13, 200)
(8399,)
8399
-------DataProperties------
max:11
min:2
mean:6.0369091558518875


In [34]:
train(X_train,Y_train,X_test,Y_test,batch_size=30,epochs=5)

epoch 1 / 5
train mean loss=0.7230937, accuracy=0.7646102
 test mean loss=0.69633824, accuracy=0.7834266
epoch 2 / 5
train mean loss=0.53722525, accuracy=0.8279481
 test mean loss=0.6742136, accuracy=0.793785
epoch 3 / 5
train mean loss=0.48018828, accuracy=0.84727645
 test mean loss=0.71178114, accuracy=0.7904512
epoch 4 / 5
train mean loss=0.44130707, accuracy=0.8612196
 test mean loss=0.712445, accuracy=0.79330873
epoch 5 / 5
train mean loss=0.40860924, accuracy=0.8708024
 test mean loss=0.693857, accuracy=0.7992618


<__main__.CNNSC at 0x7f1ed750af98>