In [1]:
from keras.preprocessing import sequence
from keras.models import Model
from keras.layers import Dense, Embedding,Input,Reshape,Subtract,Conv2D,MaxPool2D,Concatenate,Flatten
from keras.layers import LSTM
from keras.datasets import imdb
import numpy as np


Using TensorFlow backend.


In [5]:
pretrained_model = '../data/glove.6B.50d.txt'
print('loading embeddings vectors')


def get_coefs(word, *arr):
    """Turn the fields of one embedding-file line into a ``(word, vector)`` pair.

    Args:
        word: first field of the line (the token).
        *arr: remaining fields, the vector components as strings.

    Returns:
        Tuple of the token and a float32 NumPy array of its components.
    """
    return word, np.asarray(arr, dtype='float32')


# Read inside a context manager so the file handle is closed deterministically,
# and decode explicitly as UTF-8 instead of relying on the platform default.
with open(pretrained_model, encoding='utf-8') as glove_file:
    embeddings_index = dict(
        get_coefs(*line.rstrip().split(' ')) for line in glove_file
    )

loading embeddings vectors


In [21]:
# Hyperparameters
max_features = 20000   # number of distinct words (vocabulary size) used for training
maxlen = 80  # maximum sequence length used for training (anything longer is cut off)
batch_size = 32
embed_size = 50  # word-embedding dimensionality (50, because the pretrained vectors are 50-d)

## データの読み込み

In [17]:
print('Loading data...')
# Restrict the vocabulary to the `max_features` word ids while loading.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
# Report how many sequences each split contains.
for n_sequences, split_name in ((len(x_train), 'train'), (len(x_test), 'test')):
    print(n_sequences, split_name + ' sequences')

Loading data...
25000 train sequences
25000 test sequences


### ZeroPadding

In [18]:
print('Pad sequences (samples x time)')
# Bring every review in both splits to exactly `maxlen` token ids.
x_train, x_test = (
    sequence.pad_sequences(split, maxlen=maxlen) for split in (x_train, x_test)
)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

Pad sequences (samples x time)
x_train shape: (25000, 80)
x_test shape: (25000, 80)


## Embedding Matrixの作成

In [19]:
# Offset added to every raw index from `imdb.get_word_index()`, leaving the
# low ids free for the special tokens registered below.
INDEX_FROM = 3

# word -> id, shifted by INDEX_FROM, then extended with the special tokens.
word_to_id = {
    word: raw_index + INDEX_FROM
    for word, raw_index in imdb.get_word_index().items()
}
word_to_id.update({"<PAD>": 0, "<START>": 1, "<UNK>": 2})

# Reverse lookup (id -> word), used to decode sequences back into text.
id_to_word = dict(zip(word_to_id.values(), word_to_id.keys()))

In [20]:
# np.stack requires a real sequence of arrays; a dict view is not one and is
# rejected by current NumPy, so materialise the vectors into a list first.
all_embs = np.stack(list(embeddings_index.values()))
# Global mean / standard deviation over every component of every pretrained vector.
emb_mean, emb_std = all_embs.mean(), all_embs.std()

In [25]:
print('create embedding matrix')
# Start from random rows drawn with the pretrained vectors' overall mean/std,
# so ids without a pretrained vector keep a random row on the same scale.
embedding_matrix = np.random.normal(emb_mean, emb_std, (max_features, embed_size))

# Overwrite the row of every in-vocabulary word that has a pretrained vector.
for token, token_id in word_to_id.items():
    if token_id < max_features:
        pretrained_vector = embeddings_index.get(token)
        if pretrained_vector is not None:
            embedding_matrix[token_id] = pretrained_vector

embedding_matrix.shape

create embedding matrix


(20000, 50)

## Siamese networkの構造

<img src='https://cdn-images-1.medium.com/max/1200/1*XzVUiq-3lYFtZEW3XfmKqg.jpeg' width=400>

## 文書のEncoder部分

In [26]:
out_size=16  # width of the encoder's final Dense layer (size of the document vector)
filter_sizes = [3,4,5]  # convolution window heights, in tokens
num_filters = 32  # number of filters per convolution

In [75]:
def make_encoder():
    """Build the text-CNN document encoder as a standalone Keras ``Model``.

    Pipeline: frozen embedding lookup (initialised from ``embedding_matrix``)
    -> one full-width Conv2D per entry of ``filter_sizes`` -> max-pooling over
    every window position of each convolution -> concatenate -> flatten ->
    linear Dense projection to ``out_size`` units.

    One conv/pool branch is created for *every* entry of ``filter_sizes``, so
    the list may have any non-zero length (it was previously hard-wired to
    exactly three entries).

    Reads the module-level settings ``maxlen``, ``max_features``,
    ``embed_size``, ``embedding_matrix``, ``filter_sizes``, ``num_filters``
    and ``out_size``.

    Returns:
        A Model mapping a ``(maxlen,)`` sequence of word ids to an
        ``(out_size,)`` vector.
    """
    inputs = Input(shape=(maxlen,))
    # Frozen embedding layer initialised from the prepared matrix.
    emb = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=False)(inputs)
    # Add a trailing channel axis so the sequence can be fed to Conv2D.
    reshape = Reshape((maxlen, embed_size, 1))(emb)

    # Each kernel spans the whole embedding width, so it slides over tokens only.
    # All convolutions are created before the pooling layers, matching the
    # original creation order.
    convs = [
        Conv2D(num_filters, kernel_size=(size, embed_size), padding='valid',
               kernel_initializer='normal', activation='relu')(reshape)
        for size in filter_sizes
    ]
    # The pool window covers all (maxlen - size + 1) positions of its convolution.
    pooled = [
        MaxPool2D(pool_size=(maxlen - size + 1, 1), strides=(1, 1), padding='valid')(conv)
        for size, conv in zip(filter_sizes, convs)
    ]

    # A single branch has nothing to concatenate with, so use it directly.
    if len(pooled) == 1:
        concatenated_tensor = pooled[0]
    else:
        concatenated_tensor = Concatenate(axis=1)(pooled)
    flatten = Flatten()(concatenated_tensor)

    out = Dense(out_size, activation='linear')(flatten)

    encoder = Model(inputs, out)

    return encoder

# 手法１ 通常のtextCNN

In [90]:
# Baseline classifier: one text-CNN encoder followed by a single sigmoid unit.
encoder = make_encoder()

inp = Input((maxlen,))
enc = encoder(inp)  # encoder output vector for the input sequence
out = Dense(1,activation='sigmoid')(enc)  # single sigmoid output unit
model = Model(inp,out)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_29 (InputLayer)        (None, 80)                0         
_________________________________________________________________
model_19 (Model)             (None, 16)                1020848   
_________________________________________________________________
dense_18 (Dense)             (None, 1)                 17        
Total params: 1,020,865
Trainable params: 20,865
Non-trainable params: 1,000,000
_________________________________________________________________


In [91]:
# Single sigmoid output, so train with binary cross-entropy and track accuracy.
model.compile(optimizer='adam', loss='binary_crossentropy',metrics=['accuracy'])

In [95]:
# NOTE(review): the test split doubles as validation data here, so the reported
# validation metrics are measured on the test set, not on a separate hold-out.
model.fit(x_train,y_train,validation_data=(x_test,y_test),epochs=10, batch_size=32)

Train on 25000 samples, validate on 25000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f19706b39b0>

## Siameseネットワークの構築

In [97]:
# Two-input model: each document goes through an encoder, the two encoded
# vectors are concatenated, and a sigmoid unit scores the pair.
inp1 = Input(shape=(maxlen,))
inp2 = Input(shape=(maxlen,))

# NOTE(review): these are two independent make_encoder() instances, so the two
# branches do NOT share weights. A classic Siamese network reuses one encoder
# for both inputs - confirm that separate weights are intended.
encoder1 = make_encoder()
encoder2 = make_encoder()

out1 = encoder1(inp1)
out2 = encoder2(inp2)

# Join the two encoded vectors into one feature vector for the pair.
x =Concatenate()([out1,out2])

# Single sigmoid output for the pair.
pred = Dense(1,activation='sigmoid')(x)

# Rebinds `model`, replacing the baseline classifier built in the earlier cell.
model = Model([inp1,inp2],pred)

In [98]:
# The pair model ends in one sigmoid unit, so use binary cross-entropy and accuracy.
model.compile(optimizer='adam', loss='binary_crossentropy',metrics=['accuracy'])

In [99]:
# Print the layer-by-layer structure and parameter counts of the pair model.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_34 (InputLayer)           (None, 80)           0                                            
__________________________________________________________________________________________________
input_35 (InputLayer)           (None, 80)           0                                            
__________________________________________________________________________________________________
model_24 (Model)                (None, 16)           1020848     input_34[0][0]                   
__________________________________________________________________________________________________
model_25 (Model)                (None, 16)           1020848     input_35[0][0]                   
__________________________________________________________________________________________________
concatenat

## 学習データの準備（ラベルごとのグループ化とペアの生成）

In [100]:
# reorganize by groups
train_groups = [x_train[np.where(y_train==i)[0]] for i in np.unique(y_train)]
test_groups = [x_test[np.where(y_test==i)[0]] for i in np.unique(y_train)]
print('train groups:', [x.shape[0] for x in train_groups])
print('test groups:', [x.shape[0] for x in test_groups])

train groups: [12500, 12500]
test groups: [12500, 12500]


In [101]:
def gen_random_batch(in_groups, batch_halfsize = 8):
    """Sample a batch of sample pairs: half same-group, half different-group.

    Args:
        in_groups: sequence of arrays, one per group; rows are samples.
        batch_halfsize: number of matching pairs, and also the number of
            non-matching pairs, to draw.

    Returns:
        Tuple ``(a, b, score)`` of stacked arrays with ``2 * batch_halfsize``
        entries each. The first half pairs two samples of the same group
        (score 1); the second half pairs samples of different groups (score 0).
    """
    group_ids = list(range(len(in_groups)))

    def _draw_rows(ids):
        # One uniformly random row from each requested group, in order.
        return [
            in_groups[gid][np.random.choice(range(in_groups[gid].shape[0]))]
            for gid in ids
        ]

    left, right, scores = [], [], []
    for is_match in (True, False):
        picked = np.random.choice(group_ids, size=batch_halfsize)
        left.extend(_draw_rows(picked))
        if is_match:
            partner_ids, label = picked, 1
        else:
            # For every picked group choose any group except that one.
            partner_ids = [
                np.random.choice([gid for gid in group_ids if gid != own])
                for own in picked
            ]
            label = 0
        scores.extend([label] * batch_halfsize)
        right.extend(_draw_rows(partner_ids))

    return np.stack(left, 0), np.stack(right, 0), np.stack(scores, 0)

In [102]:
# Draw a small preview batch (3 matching + 3 non-matching pairs) for inspection.
pv_a, pv_b, pv_sim = gen_random_batch(train_groups, 3)

### バッチの確認

In [103]:
# Decode each preview pair back to text and show it next to its pair label.
for tokens_a, tokens_b, label in zip(pv_a, pv_b, pv_sim):
    text_a = ' '.join(id_to_word[token_id] for token_id in tokens_a)
    print('元文章1', text_a)
    text_b = ' '.join(id_to_word[token_id] for token_id in tokens_b)
    print('元文章2', text_b)
    print('類似度', label)
    print('-----------------------------')

元文章1 <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <START> it is so rare that i get to rate a movie without having some reservation as to whether i should have gone up one or down one but this one did the explosion rate a notch higher or one down because my brain hurt trying to create a plot no this one yeah a solid no brainer one ten
元文章2 the acting is below par it features a lot of really annoying rap music and poorly edited fight scenes on the plus side it's got that hispanic bloke in it who stars in every prison action thriller ever made and he shuts a door in this br br it's not very <UNK> but at least it's harmless br br if you were a massive fan of the original it's okay ish stuff br br if not you have been warned
類似度 1
-----------------------------
元文章1 revenge on the <UNK> kids who wronged him plot will of course be familiar to those who've watched it before and those who've seen it before will probably 

## 学習

In [104]:
# make a generator out of the data
def siam_gen(in_groups, batch_size = 32):
    while True:
        pv_a, pv_b, pv_sim = gen_random_batch(train_groups, batch_size//2)
        yield [pv_a, pv_b], pv_sim
        

# Fixed validation batch drawn once from the test groups:
# 1024 matching pairs plus 1024 non-matching pairs.
valid_a, valid_b, valid_sim = gen_random_batch(test_groups, 1024)

# Train on an endless stream of freshly sampled pairs; one "epoch" is 500 batches.
loss_history = model.fit_generator(
    siam_gen(train_groups),
    steps_per_epoch=500,
    validation_data=([valid_a, valid_b], valid_sim),
    epochs=10,
    verbose=True,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
