In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf
import numpy as np
import time
import os
import math

In [2]:
# Model Hyperparameters
flags=tf.flags

flags.DEFINE_string('word2vec_norm','embeddings/word2vec_norm.txt','Word2vec file with pre-trained embeddings')
flags.DEFINE_string('data_path','SICK','SICK data set path')
flags.DEFINE_string('save_path','SICK/STS_log','STS model output directory')
flags.DEFINE_integer('embedding_dim',300,'Dimensionality of word embedding')
flags.DEFINE_integer('max_length',26,'one sentence max length words which is in dictionary')
flags.DEFINE_bool('use_fp64',False,'Train using 64-bit floats instead of 32bit floats')

FLAGS=flags.FLAGS
FLAGS._parse_flags()
print('Parameters:')
for attr,value in sorted(FLAGS.__flags.items()):
    print('{}={}'.format(attr,value))

Parameters:
data_path=SICK
embedding_dim=300
max_length=26
save_path=SICK/STS_log
use_fp64=False
word2vec_norm=embeddings/word2vec_norm.txt


In [3]:
def data_type():
    return tf.float64 if FLAGS.use_fp64 else tf.float32

In [4]:
def build_vocab(word2vec_path=None):
    if word2vec_path:
        print('Load word2vec_norm file {}'.format(word2vec_path))
        with open(word2vec_path,'r') as f:
            header=f.readline()
            vocab_size,layer2_size=map(int,header.split())
            # initial matrix with random uniform
            init_W=np.random.uniform(-0.25,0.25,(vocab_size,FLAGS.embedding_dim))

            print('vocab_size={}'.format(vocab_size))
            dictionary=dict()
            while True:
                line=f.readline()
                if not line:break
                word=line.split()[0]
                dictionary[word]=len(dictionary)
                init_W[dictionary[word]]=np.array(line.split()[1:], dtype=np.float32)

        return dictionary,init_W

In [5]:
def file_to_word_ids(filename,word_to_id):
    with open(filename,'r') as f:
        f.readline() # remove header
        sentences_A=[]
        sentencesA_length=[]
        sentences_B=[]
        sentencesB_length=[]
        relatedness_scores=[]
        while True:
            line=f.readline()
            if not line: break
            pair_ID=line.split('\t')[0] # for trial & test
            sentence_A=line.split('\t')[1]
            sentence_B=line.split('\t')[2]
            relatedness_score=line.split('\t')[3]    
            _=[word_to_id[word] for word in sentence_A.split() if word in word_to_id]
            _+=[0]*(FLAGS.max_length-len(_))
            sentences_A.append(_)
            sentencesA_length.append(len(_))
            _=[word_to_id[word] for word in sentence_B.split() if word in word_to_id]
            _+=[0]*(FLAGS.max_length-len(_))
            sentences_B.append(_)
            sentencesB_length.append(len(_))
            relatedness_scores.append((float(relatedness_score)-1)/4)
    assert len(sentences_A)==len(sentencesA_length)==len(sentences_B)==len(sentencesB_length)==len(relatedness_scores)
    return STSInput(sentences_A,sentencesA_length,sentences_B,sentencesB_length,relatedness_scores)

In [6]:
class STSInput(object):
    def __init__(self,sentences_A,sentencesA_length,sentences_B,sentencesB_length,relatedness_scores):
        self.sentences_A=sentences_A
        self.sentencesA_length=sentencesA_length
        self.sentences_B=sentences_B
        self.sentencesB_length=sentencesB_length
        self.relatedness_scores=relatedness_scores
    
    def sentences_A(self):
        return self.sentences_A
    
    def sentencesA_length(self):
        return self.sentencesA_length
    
    def sentences_B(self):
        return self.sentences_B
    
    def sentencesA_length(self):
        return self.sentencesB_length
    
    def relatedness_scores(self):
        return self.relatedness_scores

In [7]:
train_path=os.path.join(FLAGS.data_path,'SICK_train.txt')
valid_path=os.path.join(FLAGS.data_path,'SICK_trial.txt')
test_path=os.path.join(FLAGS.data_path,'SICK_test_annotated.txt')

dictionary,init_W=build_vocab(FLAGS.word2vec_norm)
train_data=file_to_word_ids(train_path,dictionary)
valid_data=file_to_word_ids(valid_path,dictionary)
test_data=file_to_word_ids(test_path,dictionary)

Load word2vec_norm file embeddings/word2vec_norm.txt
vocab_size=2378


In [8]:
def next_batch(start,end,input):
    inputs_A=input.sentences_A[start:end]
    inputsA_length=input.sentencesA_length[start:end]
    inputs_B=input.sentences_B[start:end]
    inputsB_length=input.sentencesB_length[start:end]
    labels=np.reshape(input.relatedness_scores[start:end],(len(range(start,end)),1))
    return inputs_A,inputsA_length,inputs_B,inputsB_length,labels

In [24]:
class Config(object):
    learning_rate=1
    max_grad_norm=5
    #num_layers=2
    #hidden_size=100
    keep_prob=1.0
    #lr_decay=0.5
    batch_size=20
    max_epoch=30
    
config=Config()
test_config=Config()
test_config.batch_size=1

In [10]:
def build_model(input_,input_length,dropout_):
    rnn_cell=tf.nn.rnn_cell.LSTMCell(num_units=50,state_is_tuple=True)
    rnn_cell=tf.nn.rnn_cell.DropoutWrapper(rnn_cell,output_keep_prob=dropout_)
    #rnn_cell=tf.nn.rnn_cell.MultiRNNCell([rnn_cell]*50,state_is_tuple=True)
        
    outputs,last_states=tf.nn.dynamic_rnn(
        cell=rnn_cell,
        dtype=data_type(),
        sequence_length=input_length,
        inputs=input_
    )
    return outputs,last_states

In [11]:
sentences_A=tf.placeholder(tf.int32,shape=([None,FLAGS.max_length]),name='sentences_A')
sentencesA_length=tf.placeholder(tf.int32,shape=([None]),name='sentencesA_length')
sentences_B=tf.placeholder(tf.int32,shape=([None,FLAGS.max_length]),name='sentences_B')
sentencesB_length=tf.placeholder(tf.int32,shape=([None]),name='sentencesB_length')
labels=tf.placeholder(tf.float32,shape=([None,1]),name='relatedness_score_label')
dropout_f=tf.placeholder(tf.float32)
W=tf.Variable(tf.constant(0.0,shape=[len(dictionary),FLAGS.embedding_dim]),trainable=False,name='W')
embedding_placeholder=tf.placeholder(data_type(),[len(dictionary),FLAGS.embedding_dim])
embedding_init=W.assign(embedding_placeholder)

sentences_A_emb=tf.nn.embedding_lookup(params=embedding_init,ids=sentences_A)
sentences_B_emb=tf.nn.embedding_lookup(params=embedding_init,ids=sentences_B)

## lstm codes
#lstm_cell=tf.nn.rnn_cell.BasicLSTMCell(num_units=config.hidden_size,forget_bias=0.0,state_is_tuple=True)
#if is_training and config.keep_prob<1:
#    lstm_cell=tf.nn.rnn_cell.DropoutWrapper(lstm_cell,output_keep_prob=config.keep_prob)

with tf.variable_scope('siamese') as scope:
    outputs_A,last_states_A=build_model(sentences_A_emb,sentencesA_length,dropout_f)
    scope.reuse_variables()
    outputs_B,last_states_B=build_model(sentences_B_emb,sentencesB_length,dropout_f)
    
#outputs_A=tf.transpose(outputs_A,[1,0,2])
#last_A=tf.gather(outputs_A,int(outputs_A.get_shape()[0])-1)
last_A=tf.transpose(outputs_A,[1,0,2])[-1]
#outputs_B=tf.transpose(outputs_B,[1,0,2])
#last_B=tf.gather(outputs_B,int(outputs_B.get_shape()[0])-1)
last_B=tf.transpose(outputs_B,[1,0,2])[-1]
#concat_outputs=tf.concat(1,[last_A,last_B])
#fully_connected = tf.contrib.layers.fully_connected(concat_outputs,num_outputs=1,activation_fn=tf.tanh)
#prediction=4*tf.exp(-tf.abs(last_A-last_B))+1
prediction=tf.exp(tf.mul(-1.0,tf.reduce_mean(tf.abs(tf.sub(last_A,last_B)),1)))

cost=tf.reduce_mean(tf.square(tf.sub(prediction, labels)))
optimizer=tf.train.AdadeltaOptimizer(learning_rate=config.learning_rate).minimize(cost)

In [25]:
optimizer=tf.train.AdadeltaOptimizer(learning_rate=config.learning_rate).minimize(cost)

In [26]:
with tf.Session() as sess:
    tf.initialize_all_variables().run()
    #total_batch=int(len(train_data.sentences_A)/config.batch_size)
    #print('Total batch size: {}'.format(total_batch))
    for epoch in range(3001):
        _,train_cost,train_predict=sess.run([optimizer,cost,prediction],feed_dict={
                sentences_A:train_data.sentences_A,
                sentencesA_length:train_data.sentencesA_length,
                sentences_B:train_data.sentences_B,
                sentencesB_length:train_data.sentencesB_length,
                labels:np.reshape(train_data.relatedness_scores,(len(train_data.relatedness_scores),1)),
                dropout_f:config.keep_prob,
                embedding_placeholder:init_W
            })
        if (epoch+1)%100==0:
            print('Epoch {} cost: {}'.format(epoch,train_cost))
            
    valid_cost,valid_predict=sess.run([cost,prediction],feed_dict={
        sentences_A:valid_data.sentences_A,
        sentencesA_length:valid_data.sentencesA_length,
        sentences_B:valid_data.sentences_B,
        sentencesB_length:valid_data.sentencesB_length,
        labels:np.reshape(valid_data.relatedness_scores,(len(valid_data.relatedness_scores),1)),
        embedding_placeholder:init_W,
        dropout_f:1.0
    })

Epoch 99 cost: 0.196330890059
Epoch 199 cost: 0.128718793392
Epoch 299 cost: 0.112614035606
Epoch 399 cost: 0.106197945774
Epoch 499 cost: 0.102252840996
Epoch 599 cost: 0.100890211761
Epoch 699 cost: 0.0991185232997
Epoch 799 cost: 0.0978290811181
Epoch 899 cost: 0.0967794507742
Epoch 999 cost: 0.0985529944301
Epoch 1099 cost: 0.0910010486841
Epoch 1199 cost: 0.0885689780116
Epoch 1299 cost: 0.0855084434152
Epoch 1399 cost: 0.0832052379847
Epoch 1499 cost: 0.080598577857
Epoch 1599 cost: 0.0792081356049
Epoch 1699 cost: 0.0769901350141
Epoch 1799 cost: 0.0749821662903
Epoch 1899 cost: 0.0750486701727
Epoch 1999 cost: 0.0742984414101
Epoch 2099 cost: 0.0732833296061
Epoch 2199 cost: 0.0721606165171
Epoch 2299 cost: 0.0733586326241
Epoch 2399 cost: 0.0712243914604
Epoch 2499 cost: 0.070658646524
Epoch 2599 cost: 0.0703594908118
Epoch 2699 cost: 0.0703352838755
Epoch 2799 cost: 0.0704281404614
Epoch 2899 cost: 0.0729647502303
Epoch 2999 cost: 0.0715019479394


In [27]:
print(valid_cost)

0.0744059


In [30]:
valid_predict

array([ 0.65472889,  0.62773985,  0.63553274,  0.61985922,  0.65749645,
        0.59526312,  0.60881442,  0.64743781,  0.60089278,  0.60560602,
        0.58317351,  0.60160404,  0.94305998,  0.62173247,  0.62364352,
        0.53023529,  0.72813851,  0.61526239,  0.62668103,  0.61317545,
        0.62274402,  0.97923708,  0.58127093,  0.76340103,  0.81697023,
        0.54462194,  0.88787103,  0.63298553,  0.55738062,  0.78021127,
        0.61033678,  0.67247957,  0.572411  ,  0.66396266,  0.88944554,
        0.62403977,  0.66209328,  0.57218558,  0.57088244,  0.55826885,
        0.6160937 ,  0.64982164,  0.54442036,  0.65664655,  0.48274091,
        0.64345247,  0.67189705,  0.64539301,  0.57586831,  0.67434341,
        0.71495676,  0.69430733,  0.8388167 ,  0.64249885,  0.61200064,
        0.61254108,  0.76130354,  0.63075459,  0.94995099,  0.80424619,
        0.71876734,  0.65588135,  0.83130759,  0.65730298,  0.88016921,
        0.63044465,  0.61255813,  0.93140352,  0.67743707,  0.62

In [20]:
valid_data.relatedness_scores

[0.65,
 0.6,
 0.7,
 0.475,
 0.8,
 0.4,
 0.54125,
 0.8999999999999999,
 0.9750000000000001,
 0.8,
 0.42500000000000004,
 0.575,
 0.22499999999999998,
 0.925,
 0.53375,
 0.725,
 0.7,
 0.07500000000000001,
 0.8500000000000001,
 0.8374999999999999,
 0.65,
 0.7749999999999999,
 0.75,
 0.575,
 0.575,
 0.35,
 0.525,
 0.625,
 0.625,
 0.625,
 0.575,
 0.8999999999999999,
 0.6,
 0.44625000000000004,
 0.725,
 0.725,
 0.75,
 0.525,
 1.0,
 0.8,
 0.625,
 0.525,
 0.8,
 0.7,
 0.95,
 0.275,
 0.825,
 0.525,
 0.575,
 0.0,
 0.625,
 0.5,
 0.7749999999999999,
 0.65,
 0.825,
 0.8999999999999999,
 0.65,
 0.57125,
 0.7,
 0.7749999999999999,
 1.0,
 0.125,
 0.9750000000000001,
 0.04999999999999999,
 0.65,
 0.475,
 0.09999999999999998,
 0.875,
 0.175,
 0.7,
 0.375,
 0.75,
 0.9750000000000001,
 0.9750000000000001,
 0.575,
 0.7,
 0.04999999999999999,
 0.875,
 0.925,
 0.8999999999999999,
 0.025000000000000022,
 1.0,
 0.925,
 0.025000000000000022,
 0.8999999999999999,
 0.35,
 0.35,
 0.8999999999999999,
 0.0,
 0.75,
 0