In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf
import numpy as np
import os
import math

In [2]:
# Model Hyperparameters
flags=tf.flags

flags.DEFINE_string('word2vec_norm','embeddings/word2vec_norm.txt','Word2vec file with pre-trained embeddings')
flags.DEFINE_string('data_path','SICK','SICK data set path')
flags.DEFINE_string('save_path','SICK/STS_log/','STS model output directory')
flags.DEFINE_integer('embedding_dim',300,'Dimensionality of word embedding')
flags.DEFINE_integer('max_length',26,'one sentence max length words which is in dictionary')
flags.DEFINE_bool('use_fp64',False,'Train using 64-bit floats instead of 32bit floats')

FLAGS=flags.FLAGS
FLAGS._parse_flags()
print('Parameters:')
for attr,value in sorted(FLAGS.__flags.items()):
    print('{}={}'.format(attr,value))

Parameters:
data_path=SICK
embedding_dim=300
max_length=26
save_path=SICK/STS_log/
use_fp64=False
word2vec_norm=embeddings/word2vec_norm.txt


In [3]:
def data_type():
    return tf.float64 if FLAGS.use_fp64 else tf.float32

In [4]:
def build_vocab(word2vec_path=None):
    if word2vec_path:
        print('Load word2vec_norm file {}'.format(word2vec_path))
        with open(word2vec_path,'r') as f:
            header=f.readline()
            vocab_size,layer2_size=map(int,header.split())
            # initial matrix with random uniform
            init_W=np.random.uniform(-0.25,0.25,(vocab_size,FLAGS.embedding_dim))

            print('vocab_size={}'.format(vocab_size))
            dictionary=dict()
            while True:
                line=f.readline()
                if not line:break
                word=line.split()[0]
                dictionary[word]=len(dictionary)
                init_W[dictionary[word]]=np.array(line.split()[1:], dtype=np.float32)

        return dictionary,init_W

In [5]:
def file_to_word2vec_word_ids(filename,word_to_id,is_test=False):
    with open(filename,'r') as f:
        f.readline() # remove header
        sentences_A=[]
        sentencesA_length=[]
        sentences_B=[]
        sentencesB_length=[]
        relatedness_scores=[]
        pairIDs=[]
        while True:
            line=f.readline()
            if not line: break
            ID=line.split('\t')[0] # for test
            pairIDs.append(ID)
            sentence_A=line.split('\t')[1]
            sentence_B=line.split('\t')[2]
            relatedness_score=line.split('\t')[3]    
            
            _=[word_to_id[word] for word in sentence_A.split() if word in word_to_id]
            sentencesA_length.append(len(_)) # must be before [0]*(FLAGS.max_length-len(_))
            _+=[0]*(FLAGS.max_length-len(_))
            sentences_A.append(_)
            
            _=[word_to_id[word] for word in sentence_B.split() if word in word_to_id]
            sentencesB_length.append(len(_))
            _+=[0]*(FLAGS.max_length-len(_))
            sentences_B.append(_)
            
            relatedness_scores.append((float(relatedness_score)-1)/4)
    assert len(sentences_A)==len(sentencesA_length)==len(sentences_B)==len(sentencesB_length)==len(relatedness_scores)
    if not is_test: return STSInput(sentences_A,sentencesA_length,sentences_B,sentencesB_length,relatedness_scores)
    else:
        stsinput=STSInput(sentences_A,sentencesA_length,sentences_B,sentencesB_length,relatedness_scores)
        stsinput.pairIDs=pairIDs
        return stsinput

In [6]:
class STSInput(object):
    def __init__(self,sentences_A,sentencesA_length,sentences_B,sentencesB_length,relatedness_scores):
        self.sentences_A=sentences_A
        self.sentencesA_length=sentencesA_length
        self.sentences_B=sentences_B
        self.sentencesB_length=sentencesB_length
        self.relatedness_scores=relatedness_scores
    
    def sentences_A(self):
        return self.sentences_A
    
    def sentencesA_length(self):
        return self.sentencesA_length
    
    def sentences_B(self):
        return self.sentences_B
    
    def sentencesA_length(self):
        return self.sentencesB_length
    
    def relatedness_scores(self):
        return self.relatedness_scores

In [7]:
train_path=os.path.join(FLAGS.data_path,'SICK_new_train.txt')
valid_path=os.path.join(FLAGS.data_path,'SICK_new_trial.txt')
test_path=os.path.join(FLAGS.data_path,'SICK_test_annotated.txt')

dictionary,init_W=build_vocab(FLAGS.word2vec_norm)
train_data=file_to_word2vec_word_ids(train_path,dictionary)
valid_data=file_to_word2vec_word_ids(valid_path,dictionary,is_test=True)
test_data=file_to_word2vec_word_ids(test_path,dictionary,is_test=True)

Load word2vec_norm file embeddings/word2vec_norm.txt
vocab_size=2378


In [8]:
def next_batch(start,end,input):
    inputs_A=input.sentences_A[start:end]
    inputsA_length=input.sentencesA_length[start:end]
    inputs_B=input.sentences_B[start:end]
    inputsB_length=input.sentencesB_length[start:end]
    labels=np.reshape(input.relatedness_scores[start:end],(-1))
    return STSInput(inputs_A,inputsA_length,inputs_B,inputsB_length,labels)

In [18]:
class Config(object):
    init_scale=0.2
    learning_rate=.01
    max_grad_norm=1.
    keep_prob=1.
    lr_decay=0.98
    batch_size=30
    max_epoch=8
    max_max_epoch=370
    num_layer=2
    
config=Config()
config_gpu = tf.ConfigProto()
config_gpu.gpu_options.allow_growth = True

In [19]:
def build_model(input_,input_length,dropout_):
    rnn_cell=tf.nn.rnn_cell.LSTMCell(num_units=50)
    rnn_cell=tf.nn.rnn_cell.DropoutWrapper(rnn_cell,output_keep_prob=dropout_)
    rnn_cell=tf.nn.rnn_cell.MultiRNNCell([rnn_cell]*config.num_layer)
        
    outputs,last_states=tf.nn.dynamic_rnn(
        cell=rnn_cell,
        dtype=data_type(),
        sequence_length=input_length,
        inputs=input_
    )
    return outputs,last_states

In [20]:
with tf.Graph().as_default():
    initializer=tf.contrib.layers.xavier_initializer()
    
    with tf.variable_scope('Model',initializer=initializer):
        sentences_A=tf.placeholder(tf.int32,shape=([None,FLAGS.max_length]),name='sentences_A')
        sentencesA_length=tf.placeholder(tf.int32,shape=([None]),name='sentencesA_length')
        sentences_B=tf.placeholder(tf.int32,shape=([None,FLAGS.max_length]),name='sentences_B')
        sentencesB_length=tf.placeholder(tf.int32,shape=([None]),name='sentencesB_length')
        labels=tf.placeholder(tf.float32,shape=([None]),name='relatedness_score_label')
        dropout_f=tf.placeholder(tf.float32)
        W=tf.Variable(tf.constant(0.0,shape=[len(dictionary),FLAGS.embedding_dim]),trainable=False,name='W')
        embedding_placeholder=tf.placeholder(data_type(),[len(dictionary),FLAGS.embedding_dim])
        embedding_init=W.assign(embedding_placeholder)

        sentences_A_emb=tf.nn.embedding_lookup(params=embedding_init,ids=sentences_A)
        sentences_B_emb=tf.nn.embedding_lookup(params=embedding_init,ids=sentences_B)

        # model
        with tf.variable_scope('siamese') as scope:
            outputs_A,last_states_A=build_model(sentences_A_emb,sentencesA_length,dropout_f)
            scope.reuse_variables()
            outputs_B,last_states_B=build_model(sentences_B_emb,sentencesB_length,dropout_f)

        # last_states[last_layer][0] cell states, last_states[last_layer][1] hidden states
        prediction=tf.exp(tf.mul(-1.0,tf.reduce_mean(tf.abs(tf.sub(last_states_A[config.num_layer-1][1],last_states_B[config.num_layer-1][1])),1)))
        
        # cost
        cost=tf.reduce_mean(tf.square(tf.sub(prediction, labels)))

        lr=tf.Variable(0.0,trainable=False)
        tvars=tf.trainable_variables()
        grads,_=tf.clip_by_global_norm(tf.gradients(cost,tvars),config.max_grad_norm)
        optimizer=tf.train.AdamOptimizer(learning_rate=lr)
        train_op=optimizer.apply_gradients(zip(grads,tvars),global_step=tf.contrib.framework.get_or_create_global_step())
        new_lr=tf.placeholder(tf.float32,shape=[],name='new_learning_rate')
        lr_update=tf.assign(lr,new_lr)
        
        for v in tf.trainable_variables():
            print(v.name)
        saver = tf.train.Saver()
        
        with tf.Session(config=config_gpu) as sess:
            sess.run(tf.global_variables_initializer())

            total_batch=int(len(train_data.sentences_A)/config.batch_size)
            print('Total batch size: {}, data size: {}, batch size: {}'.format(total_batch,len(train_data.sentences_A),config.batch_size))
            print(config.max_grad_norm,config.keep_prob,config.lr_decay,config.max_epoch,config.max_max_epoch,config.num_layer)
            # train
            prev_train_cost=1
            prev_valid_cost=1
            for epoch in range(config.max_max_epoch):
                lr_decay=config.lr_decay**max(epoch+1-config.max_epoch,0.0)
                sess.run([lr,lr_update],feed_dict={new_lr:config.learning_rate*lr_decay})
                print('Epoch {} Learning rate: {}'.format(epoch,sess.run(lr)))
                
                avg_cost=0.
                for i in range(total_batch):
                    start=i*config.batch_size
                    end=(i+1)*config.batch_size

                    next_batch_input=next_batch(start,end,train_data)
                    _,train_cost,train_predict=sess.run([train_op,cost,prediction],feed_dict={
                            sentences_A:next_batch_input.sentences_A,
                            sentencesA_length:next_batch_input.sentencesA_length,
                            sentences_B:next_batch_input.sentences_B,
                            sentencesB_length:next_batch_input.sentencesB_length,
                            labels:next_batch_input.relatedness_scores,
                            dropout_f:config.keep_prob,
                            embedding_placeholder:init_W
                        })
                    avg_cost+=train_cost
                    
                start=total_batch*config.batch_size
                end=len(train_data.sentences_A)
                if not start==end:
                    next_batch_input=next_batch(start,end,train_data)
                    _,train_cost,train_predict=sess.run([train_op,cost,prediction],feed_dict={
                            sentences_A:next_batch_input.sentences_A,
                            sentencesA_length:next_batch_input.sentencesA_length,
                            sentences_B:next_batch_input.sentences_B,
                            sentencesB_length:next_batch_input.sentencesB_length,
                            labels:next_batch_input.relatedness_scores,
                            dropout_f:config.keep_prob,
                            embedding_placeholder:init_W
                        })
                    avg_cost+=train_cost
                
                if prev_train_cost>avg_cost/total_batch: print('Average cost:\t{} ↓'.format(avg_cost/total_batch))
                else: print('Average cost:\t{} ↑'.format(avg_cost/total_batch))
                prev_train_cost=avg_cost/total_batch
                
                # validation
                valid_cost,valid_predict=sess.run([cost,prediction],feed_dict={
                    sentences_A:valid_data.sentences_A,
                    sentencesA_length:valid_data.sentencesA_length,
                    sentences_B:valid_data.sentences_B,
                    sentencesB_length:valid_data.sentencesB_length,
                    labels:np.reshape(valid_data.relatedness_scores,(-1)),
                    embedding_placeholder:init_W,
                    dropout_f:1.0
                })
                if prev_valid_cost>valid_cost: print('Valid cost:\t{} ↓'.format(valid_cost))
                else: print('Valid cost:\t{} ↑'.format(valid_cost))
                prev_valid_cost=valid_cost
                
            saver.save(sess, FLAGS.save_path+'stslstm-model',global_step=config.max_max_epoch)

            # test
            test_cost,test_predict=sess.run([cost,prediction],feed_dict={
                sentences_A:test_data.sentences_A,
                sentencesA_length:test_data.sentencesA_length,
                sentences_B:test_data.sentences_B,
                sentencesB_length:test_data.sentencesB_length,
                labels:np.reshape(test_data.relatedness_scores,(-1)),
                embedding_placeholder:init_W,
                dropout_f:1.0
            })
            print(test_cost)

Model/siamese/RNN/MultiRNNCell/Cell0/LSTMCell/W_0:0
Model/siamese/RNN/MultiRNNCell/Cell0/LSTMCell/B:0
Model/siamese/RNN/MultiRNNCell/Cell1/LSTMCell/W_0:0
Model/siamese/RNN/MultiRNNCell/Cell1/LSTMCell/B:0
Total batch size: 116, data size: 3500, batch size: 30
1.0 1.0 0.98 8 370 2
Epoch 0 Learning rate: 0.00999999977648
Average cost:	0.0779254449339 ↓
Valid cost:	0.063171826303 ↓
Epoch 1 Learning rate: 0.00999999977648
Average cost:	0.0525201980249 ↓
Valid cost:	0.0538241825998 ↓
Epoch 2 Learning rate: 0.00999999977648
Average cost:	0.0434088677585 ↓
Valid cost:	0.0463258773088 ↓
Epoch 3 Learning rate: 0.00999999977648
Average cost:	0.0365301754229 ↓
Valid cost:	0.0426495932043 ↓
Epoch 4 Learning rate: 0.00999999977648
Average cost:	0.0309100259288 ↓
Valid cost:	0.0398388467729 ↓
Epoch 5 Learning rate: 0.00999999977648
Average cost:	0.0263426312936 ↓
Valid cost:	0.0390106625855 ↓
Epoch 6 Learning rate: 0.00999999977648
Average cost:	0.0231448331575 ↓
Valid cost:	0.0374482870102 ↓
Epoch 7

KeyboardInterrupt: 

In [21]:
with open('SICK/stslstm_trial_result00.txt','w') as fw:
    fw.write('pair_ID	relatedness_score	entailment_judgment\n')
    for _ in range(len(valid_predict)):
        fw.write(valid_data.pairIDs[_]+'\t'+str(valid_predict[_]*4+1)+'\tNA\n')

In [17]:
with tf.Graph().as_default():
    initializer=tf.contrib.layers.xavier_initializer()
    
    with tf.variable_scope('Model',initializer=initializer):
        sentences_A=tf.placeholder(tf.int32,shape=([None,FLAGS.max_length]),name='sentences_A')
        sentencesA_length=tf.placeholder(tf.int32,shape=([None]),name='sentencesA_length')
        sentences_B=tf.placeholder(tf.int32,shape=([None,FLAGS.max_length]),name='sentences_B')
        sentencesB_length=tf.placeholder(tf.int32,shape=([None]),name='sentencesB_length')
        labels=tf.placeholder(tf.float32,shape=([None]),name='relatedness_score_label')
        dropout_f=tf.placeholder(tf.float32)
        W=tf.Variable(tf.constant(0.0,shape=[len(dictionary),FLAGS.embedding_dim]),trainable=False,name='W')
        embedding_placeholder=tf.placeholder(data_type(),[len(dictionary),FLAGS.embedding_dim])
        embedding_init=W.assign(embedding_placeholder)

        sentences_A_emb=tf.nn.embedding_lookup(params=embedding_init,ids=sentences_A)
        sentences_B_emb=tf.nn.embedding_lookup(params=embedding_init,ids=sentences_B)

        with tf.variable_scope('siamese') as scope:
            outputs_A,last_states_A=build_model(sentences_A_emb,sentencesA_length,dropout_f)
            scope.reuse_variables()
            outputs_B,last_states_B=build_model(sentences_B_emb,sentencesB_length,dropout_f)
        
        with tf.Session(config=config_gpu) as sess:
            sess.run(tf.global_variables_initializer())

            
            lsA,outA=sess.run([last_states_A,outputs_A],feed_dict={
                    sentences_A:train_data.sentences_A,
                            sentencesA_length:train_data.sentencesA_length,
                            sentences_B:train_data.sentences_B,
                            sentencesB_length:train_data.sentencesB_length,
                            labels:np.reshape(train_data.relatedness_scores,(-1)),
                            dropout_f:config.keep_prob,
                            embedding_placeholder:init_W
                })

In [23]:
outA[0][6]

array([-0.01372068,  0.01711448,  0.01461041,  0.02663308,  0.0031211 ,
       -0.00600406, -0.02008145,  0.0032562 , -0.01355583, -0.01654267,
        0.03115565, -0.00059169,  0.00396373, -0.00046026, -0.00104474,
       -0.00896356,  0.01196077,  0.01214969, -0.01731708, -0.01633412,
       -0.00594734,  0.00208264,  0.01149419,  0.01027064, -0.00018369,
       -0.00524586,  0.00214519, -0.00606015,  0.01345103, -0.00207044,
       -0.0055078 ,  0.00619595, -0.00406977, -0.01263859,  0.00768601,
       -0.01633211, -0.00712064, -0.02657938,  0.00283206,  0.00101777,
        0.0089191 ,  0.00934759, -0.00278149,  0.0023976 , -0.00082225,
       -0.01662336, -0.00080221,  0.01087493, -0.00255261,  0.00276097], dtype=float32)

In [41]:
lsA[0][1][0]

array([ 0.04136919,  0.08294687,  0.07351208, -0.08565987,  0.02185304,
        0.04141625, -0.03371716, -0.00233871,  0.02308731, -0.11431049,
       -0.05468803, -0.07905278,  0.0547664 , -0.01679092, -0.08914296,
       -0.02394541, -0.02647273,  0.07266507, -0.02347465,  0.04597995,
        0.00866743, -0.00934314,  0.03958961,  0.09414437,  0.02952048,
       -0.05218804, -0.07963059, -0.13497102,  0.06214318,  0.02631334,
       -0.00748384,  0.05903521, -0.00919633,  0.0658776 , -0.03087899,
        0.06256396,  0.0063663 ,  0.03754139,  0.02649744,  0.06941104,
       -0.0424638 ,  0.01122561,  0.06144113, -0.0309658 ,  0.02103541,
        0.04799698, -0.01151128, -0.13741256,  0.02592164, -0.05872881], dtype=float32)