In [2]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf
import numpy as np
import time
import os

In [3]:
# Hyperparameters, registered as TensorFlow flags
flags = tf.flags

flags.DEFINE_string(
    'word2vec',
    'embeddings/GoogleNews-vectors-negative300.bin',
    'Word2vec file with pre-trained embeddings')
flags.DEFINE_string(
    'data_path',
    'SICK',
    'SICK data set path')
flags.DEFINE_integer(
    'embedding_dim',
    300,
    'Dimensionality of word embedding')

FLAGS = flags.FLAGS
FLAGS._parse_flags()

# Echo every registered flag as name=value, ordered by flag name.
print('Parameters:')
for flag_name in sorted(FLAGS.__flags):
    print('{}={}'.format(flag_name, FLAGS.__flags[flag_name]))

Parameters:
data_path=SICK
embedding_dim=300
word2vec=embeddings/GoogleNews-vectors-negative300.bin


In [4]:
def _read_word2vec_word(f):
    """Read one space-terminated word from a binary word2vec stream.

    Newline bytes are skipped wherever they occur before the terminating
    space. Raises ValueError if the stream ends before a space is found.
    """
    chars = []
    while True:
        ch = f.read(1)
        if not ch:
            # read() returns an empty value at end of file; without this check
            # the loop would never terminate on a truncated file.
            raise ValueError('Unexpected end of word2vec file while reading a word')
        if ch == b' ':
            break
        if ch != b'\n':
            chars.append(ch)
    return b''.join(chars).decode('utf-8')


def build_vocab(word2vec_path=None):
    """Load a binary word2vec file into a vocabulary and an embedding matrix.

    The file is read as: one header line ``"<vocab_size> <vector_size>"``,
    then for every entry the word bytes, a single space, and ``vector_size``
    float32 values. Both sizes are taken from the header, so the reads always
    match the file being loaded.

    Args:
        word2vec_path: Path of the binary word2vec file. When falsy, nothing
            is loaded.

    Returns:
        ``(dictionary, init_W)`` where ``dictionary`` maps each word to its
        row index (its position in the file) and ``init_W`` is a
        ``(vocab_size, vector_size)`` array holding the vectors.
        ``None`` when ``word2vec_path`` is falsy.

    Raises:
        ValueError: If the header is malformed or the file ends before
            ``vocab_size`` entries have been read.
    """
    if not word2vec_path:
        return None

    print('Load word2vec file {}'.format(word2vec_path))
    with open(word2vec_path, 'rb') as f:
        header = f.readline()
        vocab_size, vector_size = map(int, header.split())
        # initial matrix with random uniform; rows are overwritten below
        init_W = np.random.uniform(-0.25, 0.25, (vocab_size, vector_size))

        binary_len = np.dtype('float32').itemsize * vector_size
        print('vocab_size={}'.format(vocab_size))
        dictionary = dict()
        for row in range(vocab_size):
            word = _read_word2vec_word(f)
            raw = f.read(binary_len)
            if len(raw) != binary_len:
                raise ValueError(
                    'Unexpected end of word2vec file while reading the vector of entry {}'.format(row))
            # The file position is the row index, so a repeated word can never
            # make two different words share one row.
            dictionary[word] = row
            init_W[row] = np.frombuffer(raw, dtype='float32')

    return dictionary, init_W

In [46]:
def file_to_word_ids(filename,word_to_id):
    """Read a tab-separated sentence-pair file and convert sentences to word ids.

    The first line is treated as a header and skipped. For every other
    non-blank line, column 1 is sentence A, column 2 is sentence B and
    column 3 is the relatedness score (column 0, the pair id, is not used).
    Sentences are split on whitespace; words missing from ``word_to_id``
    are dropped.

    Args:
        filename: Path of the tab-separated data file.
        word_to_id: Mapping from word to integer id.

    Returns:
        An ``STSInput`` holding, in order: the id lists for sentence A, their
        lengths, the id lists for sentence B, their lengths, and the
        relatedness scores (kept as the strings read from the file).
        Each length is the number of ids actually kept, so it always matches
        the corresponding id list.

    Raises:
        IndexError: If a non-blank line has fewer than four columns.
    """
    sentences_A=[]
    sentencesA_length=[]
    sentences_B=[]
    sentencesB_length=[]
    relatedness_scores=[]
    with open(filename,'r') as f:
        f.readline() # remove header
        for line in f:
            if not line.strip():
                # tolerate blank lines (e.g. a trailing empty line)
                continue
            fields=line.split('\t')
            ids_A=[word_to_id[word] for word in fields[1].split() if word in word_to_id]
            ids_B=[word_to_id[word] for word in fields[2].split() if word in word_to_id]
            sentences_A.append(ids_A)
            # length of the kept ids, not of the raw token list, so that
            # out-of-vocabulary words do not inflate it
            sentencesA_length.append(len(ids_A))
            sentences_B.append(ids_B)
            sentencesB_length.append(len(ids_B))
            relatedness_scores.append(fields[3])
    assert len(sentences_A)==len(sentencesA_length)==len(sentences_B)==len(sentencesB_length)==len(relatedness_scores)
    return STSInput(sentences_A,sentencesA_length,sentences_B,sentencesB_length,relatedness_scores)

In [45]:
class STSInput(object):
    """Container bundling the five parallel sequences of a sentence-pair data set.

    All values are stored unchanged as plain instance attributes and are read
    directly, e.g. ``data.sentences_A``:

        sentences_A         -- entries for sentence A
        sentencesA_length   -- lengths belonging to ``sentences_A``
        sentences_B         -- entries for sentence B
        sentencesB_length   -- lengths belonging to ``sentences_B``
        relatedness_scores  -- relatedness scores of the pairs

    No getter methods are defined on purpose: a method that shares its name
    with an instance attribute is shadowed by that attribute and can never be
    reached through an instance.
    """

    def __init__(self,sentences_A,sentencesA_length,sentences_B,sentencesB_length,relatedness_scores):
        self.sentences_A=sentences_A
        self.sentencesA_length=sentencesA_length
        self.sentences_B=sentences_B
        self.sentencesB_length=sentencesB_length
        self.relatedness_scores=relatedness_scores

In [13]:
# NOTE(review): this cell uses `train_path` and `dictionary`, which are assigned
# in the cell below; it only works once that cell has been run.
# file_to_word_ids returns an STSInput, which cannot be tuple-unpacked, so the
# five fields are read by attribute.
_train_input=file_to_word_ids(train_path,dictionary)
sentences_A=_train_input.sentences_A
sentencesA_length=_train_input.sentencesA_length
sentences_B=_train_input.sentences_B
sentencesB_length=_train_input.sentencesB_length
relatedness_scores=_train_input.relatedness_scores

In [47]:
# Locations of the three SICK splits inside the configured data directory.
train_path, valid_path, test_path = (
    os.path.join(FLAGS.data_path, split_file)
    for split_file in ('SICK_train.txt', 'SICK_trial.txt', 'SICK_test.txt')
)

# Vocabulary and initial embedding matrix from the pre-trained word2vec file.
dictionary, init_W = build_vocab(FLAGS.word2vec)

# Convert each split to word ids, in train / trial / test order.
train_data, valid_data, test_data = (
    file_to_word_ids(split_path, dictionary)
    for split_path in (train_path, valid_path, test_path)
)

In [49]:
def next_batch(start,end,input):
    """Slice one batch ``[start:end]`` out of a data set.

    Args:
        start: Index of the first example of the batch.
        end: Index one past the last example; may exceed the data size, in
            which case the batch is simply shorter.
        input: Object exposing ``sentences_A``, ``sentencesA_length``,
            ``sentences_B``, ``sentencesB_length`` and ``relatedness_scores``
            as sliceable sequences.

    Returns:
        ``(inputs_A, inputsA_length, inputs_B, inputsB_length, labels)`` where
        the first four are the slices of the corresponding sequences and
        ``labels`` is the score slice reshaped to a column of shape
        ``(batch_size, 1)``.
    """
    inputs_A=input.sentences_A[start:end]
    inputsA_length=input.sentencesA_length[start:end]
    inputs_B=input.sentences_B[start:end]
    inputsB_length=input.sentencesB_length[start:end]
    scores=input.relatedness_scores[start:end]
    # Size the column from the slice itself: a slice running past the end of
    # the data is shorter than end-start.
    labels=np.reshape(scores,(len(scores),1))
    return inputs_A,inputsA_length,inputs_B,inputsB_length,labels

In [30]:
class TrainConfig(object):
    """Hyperparameter values for the training configuration.

    Plain class attributes, read as ``TrainConfig.<name>`` or through an
    instance.
    """

    init_scale = 0.01
    learning_rate = 0.01
    max_grad_norm = 5
    num_layers = 2
    keep_prob = 1.0
    lr_decay = 0.5
    batch_size = 20
    
class TestConfig(object):
    """Hyperparameter values for the test configuration.

    Plain class attributes, read as ``TestConfig.<name>`` or through an
    instance. Exposes the same attribute names as ``TrainConfig``.
    """
    init_scale=0.1
    # Original misspelling of ``init_scale``; kept as an alias so any existing
    # lookup of the old name keeps working.
    init_sclae=init_scale
    learning_rate=0.01
    max_grad_norm=5
    num_layers=2
    keep_prob=1.0
    lr_decay=0.5
    batch_size=20

In [None]:
class STSModel(object):
    def __init__(self,is_training,config):