In [1]:
import logging

# Configure the root logger: timestamped, level-tagged lines at DEBUG verbosity.
logging.basicConfig(
    level=logging.DEBUG,
    datefmt='%m/%d/%Y %I:%M:%S %p',
    format='%(asctime)s %(levelname)s %(message)s',
)
# Logger used for this notebook's own messages.
logger = logging.getLogger(__name__)

import pandas as pd

In [58]:
import pickle

In [2]:
from gensim.models import Word2Vec

In [3]:
# Read the three tab-separated SNLI split files (train, test, dev - in that order)
# into DataFrames.
print('loading snli data ...')
train_df, test_df, dev_df = [
    pd.read_csv(split_path, delimiter='\t')
    for split_path in (
        './snli/snli_1.0/snli_1.0_train.txt',
        './snli/snli_1.0/snli_1.0_test.txt',
        './snli/snli_1.0/snli_1.0_dev.txt',
    )
]

loading snli data ...


In [4]:
# Load the pretrained GoogleNews vectors from their binary word2vec file.
# NOTE(review): this is an absolute, machine-specific path - adjust it for your environment.
word2vec_filename = '/home/junfeng/word2vec/GoogleNews-vectors-negative300.bin'
word2vec = Word2Vec.load_word2vec_format(word2vec_filename, binary=True)

In [10]:
len(word2vec.vocab)

3000000

In [14]:
word2vec.vocab['hello']

<gensim.models.word2vec.Vocab at 0x7ff105f0e908>

In [5]:
from nltk.tokenize import word_tokenize, wordpunct_tokenize, RegexpTokenizer

In [6]:
hw = 'Hello, World!'

In [7]:
word_tokenize(hw)

['Hello', ',', 'World', '!']

In [8]:
# Tokenizer that keeps only runs of word characters (punctuation is dropped).
# The pattern is a raw string: '\w' is not a valid Python string escape, so the
# non-raw spelling only works by accident and triggers an invalid-escape warning
# on newer Python versions.
re_tokenize = RegexpTokenizer(r'\w+')

In [9]:
re_tokenize.tokenize(hw)

['Hello', 'World']

#### First tokenize the premise and hypothesis with a real tokenizer rather than a plain split, so that punctuation becomes separate tokens

In [19]:
from collections import OrderedDict

In [23]:
# Words found in word2vec, mapped to their index there (insertion order is kept).
inv_words = OrderedDict()
# Tokens from the training split that word2vec does not contain.
oov_words_in_train = set()

In [24]:
def check_sent(s):
    """Scan the sentences in `s` and sort every new token into a vocabulary bucket.

    Each string entry is tokenized with `word_tokenize`. A token that has not
    been seen before is either recorded in the module-level `inv_words` (with
    its word2vec vocabulary index) or added to the module-level
    `oov_words_in_train` when word2vec does not contain it.

    Entries that are not `str` are printed and skipped.

    Returns the number of non-string entries plus the number of tokens newly
    added to `oov_words_in_train` during this call.
    """
    flagged = 0
    for sentence in s:
        if type(sentence) is not str:
            # Not a sentence (e.g. a missing value): show it and count it.
            print(sentence)
            flagged += 1
            continue
        for token in word_tokenize(sentence):
            already_seen = token in inv_words or token in oov_words_in_train
            if already_seen:
                continue
            if token in word2vec:
                inv_words[token] = word2vec.vocab[token].index
            else:
                oov_words_in_train.add(token)
                flagged += 1
    return flagged

In [25]:
train_df[['sentence1', 'sentence2']].apply(check_sent)

nan
nan
nan


sentence1    3116
sentence2    6053
dtype: int64

In [26]:
len(inv_words), len(oov_words_in_train)

(33509, 9166)

In [27]:
oov_words_not_train = set()

In [28]:
def check_sent_dev_test(s):
    """Scan dev/test sentences in `s`, recording tokens not met so far.

    Works like the training-set scan, but tokens missing from word2vec go to
    the module-level `oov_words_not_train`. Tokens already present in
    `inv_words`, `oov_words_in_train` or `oov_words_not_train` are ignored.
    New tokens that word2vec contains are added to `inv_words` with their
    word2vec vocabulary index.

    Entries that are not `str` are printed and skipped.

    Returns the number of non-string entries plus the number of tokens newly
    added to `oov_words_not_train` during this call.
    """
    total = 0
    for row in s:
        if type(row) is not str:
            # Not a sentence (e.g. a missing value): show it and count it.
            print(row)
            total += 1
            continue
        for tok in word_tokenize(row):
            known = (
                tok in inv_words
                or tok in oov_words_in_train
                or tok in oov_words_not_train
            )
            if known:
                continue
            if tok in word2vec:
                inv_words[tok] = word2vec.vocab[tok].index
                continue
            oov_words_not_train.add(tok)
            total += 1
    return total

In [22]:
dev_test_df = pd.concat([dev_df, test_df], ignore_index=True)
dev_test_df.shape

(20000, 14)

In [29]:
dev_test_df[['sentence1', 'sentence2']].apply(check_sent_dev_test)

sentence1    134
sentence2    161
dtype: int64

In [30]:
len(inv_words), len(oov_words_not_train)

(33988, 295)

#### Construct the word-to-id dictionary

In [52]:
# Word -> id mapping (insertion-ordered) and the next id to hand out.
dictionary = OrderedDict()
index = 0

In [53]:
# Words found in word2vec receive consecutive ids, in inv_words insertion order,
# starting at the current value of `index`.
for offset, word in enumerate(inv_words):
    dictionary[word] = index + offset
index += len(inv_words)

In [54]:
# OOV words that do not occur in train receive consecutive ids, continuing from
# the current value of `index`.
for offset, word in enumerate(oov_words_not_train):
    dictionary[word] = index + offset
index += len(oov_words_not_train)

In [55]:
index

34283

In [56]:
# OOV words seen in train receive consecutive ids, continuing from the current
# value of `index`.
for offset, word in enumerate(oov_words_in_train):
    dictionary[word] = index + offset
index += len(oov_words_in_train)

In [57]:
index

43449

In [61]:
# Persist the word -> id mapping as a pickle file.
dictionary_filename = './snli/dictionary.pkl'
with open(dictionary_filename, mode='wb') as out_file:
    pickle.dump(dictionary, out_file)

#### Construct the word embedding matrix W

In [31]:
inv_words

OrderedDict([('A', 73),
             ('person', 571),
             ('on', 5),
             ('horse', 3267),
             ('jumps', 10147),
             ('over', 63),
             ('broken', 2197),
             ('down', 119),
             ('airplane', 8792),
             ('Children', 2875),
             ('smiling', 8090),
             ('waving', 11553),
             ('at', 12),
             ('camera', 2652),
             ('boy', 1556),
             ('is', 4),
             ('jumping', 6345),
             ('skateboard', 34771),
             ('in', 1),
             ('the', 11),
             ('middle', 1221),
             ('red', 1618),
             ('bridge', 2683),
             ('An', 741),
             ('older', 1738),
             ('man', 251),
             ('sits', 5305),
             ('with', 8),
             ('his', 26),
             ('orange', 7442),
             ('juice', 12265),
             ('small', 428),
             ('table', 1757),
             ('coffee', 4126),
             

In [34]:
list(inv_words.keys())[:10]

['A',
 'person',
 'on',
 'horse',
 'jumps',
 'over',
 'broken',
 'down',
 'airplane',
 'Children']

In [35]:
list(inv_words.values())[:10]

[73, 571, 5, 3267, 10147, 63, 2197, 119, 8792, 2875]

In [36]:
inv_indices = list(inv_words.values())

In [37]:
inv_W = word2vec.syn0[inv_indices]

In [38]:
inv_W.shape

(33988, 300)

In [39]:
import numpy as np

In [44]:
rsg = np.random.RandomState(919)

In [46]:
# Random vectors for the OOV words that do not occur in train: one row per word,
# word2vec.vector_size columns. Each entry is (rand - 0.5) / 10.0, i.e. uniform on
# [-0.05, 0.05). Values come from the seeded `rsg`, so they depend on this cell
# running before any other draw from `rsg`.
oov_not_train_W = (rsg.rand(len(oov_words_not_train), word2vec.vector_size) - 0.5) / 10.0
oov_not_train_W.shape

(295, 300)

In [48]:
# Stack the word2vec rows on top of the random rows for OOV words not in train.
# This mirrors the id order used when `dictionary` is built (inv_words first,
# then oov_words_not_train).
unchanged_W = np.concatenate((inv_W, oov_not_train_W), axis=0)
unchanged_W.shape

(34283, 300)

In [49]:
# Random vectors for the OOV words seen in train: one row per word,
# word2vec.vector_size columns. Each entry is (rand - 0.5) / 10.0, i.e. uniform on
# [-0.05, 0.05). Drawn from the same seeded `rsg` as the other random matrix, so
# the values depend on the order in which the two cells are executed.
oov_in_train_W = (rsg.rand(len(oov_words_in_train), word2vec.vector_size) - 0.5) / 10.0
oov_in_train_W.shape

(9166, 300)

##### Check that the copied rows equal the original word2vec vectors

In [69]:
# Sanity check: the leading rows of unchanged_W must be exactly the word2vec
# vectors selected by inv_indices, in the same order. A single vectorized
# comparison replaces the Python-level loop over every row.
np.array_equal(unchanged_W[:len(inv_indices)], word2vec.syn0[inv_indices])

True

In [70]:
# Persist the stacked embedding matrix as a pickle file.
unchanged_W_filename = './snli/unchanged_W.pkl'
with open(unchanged_W_filename, mode='wb') as out_file:
    pickle.dump(unchanged_W, out_file)

In [71]:
# Persist the random vectors of the in-train OOV words as a pickle file.
oov_in_train_W_filename = './snli/oov_in_train_W.pkl'
with open(oov_in_train_W_filename, mode='wb') as out_file:
    pickle.dump(oov_in_train_W, out_file)

#### Convert each sentence to a list of word ids

In [73]:
def to_ids(r):
    """Return a copy of row `r` with both sentences replaced by lists of word ids.

    Parameters
    ----------
    r : pandas.Series
        One row holding at least the entries 'sentence1' and 'sentence2'.

    Returns
    -------
    pandas.Series
        A copy of `r` in which 'sentence1' and 'sentence2' are lists of ids
        looked up in the module-level `dictionary`. All other entries are
        unchanged. The input row itself is not modified.

    Raises
    ------
    KeyError
        If a token produced by `word_tokenize` is missing from `dictionary`.
    """
    def _encode(sentence):
        # A non-string entry (e.g. NaN for a missing sentence) has no tokens.
        if not isinstance(sentence, str):
            return []
        return [dictionary[w] for w in word_tokenize(sentence)]

    # Work on a copy: pandas does not support functions that mutate the object
    # handed to them by DataFrame.apply.
    row = r.copy()
    row.loc['sentence1'] = _encode(r.sentence1)
    row.loc['sentence2'] = _encode(r.sentence2)
    return row

In [75]:
train_df = train_df.fillna('')

In [76]:
converted_train = train_df.apply(to_ids, axis=1)

In [77]:
dev_df = dev_df.fillna('')

In [78]:
converted_dev = dev_df.apply(to_ids, axis=1)

In [79]:
test_df = test_df.fillna('')

In [81]:
converted_test = test_df.apply(to_ids, axis=1)

In [82]:
converted_train.columns

Index(['gold_label', 'sentence1_binary_parse', 'sentence2_binary_parse',
       'sentence1_parse', 'sentence2_parse', 'sentence1', 'sentence2',
       'captionID', 'pairID', 'label1', 'label2', 'label3', 'label4',
       'label5'],
      dtype='object')

In [83]:
saved_columns = ['sentence1', 'sentence2', 'gold_label']

In [84]:
converted_train = converted_train[saved_columns]
converted_dev = converted_dev[saved_columns]
converted_test = converted_test[saved_columns]

In [85]:
converted_train.columns

Index(['sentence1', 'sentence2', 'gold_label'], dtype='object')

In [86]:
# Persist the id-encoded training split as a pickle file.
converted_train_filename = './snli/converted_train.pkl'
with open(converted_train_filename, mode='wb') as out_file:
    pickle.dump(converted_train, out_file)

In [87]:
# Persist the id-encoded dev split as a pickle file.
converted_dev_filename = './snli/converted_dev.pkl'
with open(converted_dev_filename, mode='wb') as out_file:
    pickle.dump(converted_dev, out_file)

In [88]:
# Persist the id-encoded test split as a pickle file.
converted_test_filename = './snli/converted_test.pkl'
with open(converted_test_filename, mode='wb') as out_file:
    pickle.dump(converted_test, out_file)

In [89]:
converted_train.head()

Unnamed: 0,sentence1,sentence2,gold_label
0,"[0, 1, 2, 39993, 3, 4, 5, 39993, 6, 7, 8, 40257]","[0, 1, 15, 3186, 28, 3, 147, 39993, 1457, 40257]",neutral
1,"[0, 1, 2, 39993, 3, 4, 5, 39993, 6, 7, 8, 40257]","[0, 1, 15, 12, 39993, 2153, 35516, 3303, 81, 1...",contradiction
2,"[0, 1, 2, 39993, 3, 4, 5, 39993, 6, 7, 8, 40257]","[0, 1, 15, 337, 35516, 2, 39993, 3, 40257]",entailment
3,"[9, 10, 38944, 11, 12, 13]","[4802, 45, 10, 12, 124, 1305]",neutral
4,"[9, 10, 38944, 11, 12, 13]","[301, 45, 304, 4511]",entailment


In [90]:
converted_dev.head()

Unnamed: 0,sentence1,sentence2,gold_label
0,"[42, 44, 45, 1074, 35, 67, 35350, 985, 6934, 4...","[84, 3397, 45, 46, 66, 35, 67, 35350, 985, 693...",neutral
1,"[42, 44, 45, 1074, 35, 67, 35350, 985, 6934, 4...","[42, 107, 45, 67, 6934, 40257]",entailment
2,"[42, 44, 45, 1074, 35, 67, 35350, 985, 6934, 4...","[84, 359, 45, 1499, 102, 39993, 6530, 40257]",contradiction
3,"[42, 125, 304, 18, 269, 1221, 35516, 47, 27, 1...","[42, 479, 18, 2619, 1221, 9645, 124, 230, 40257]",entailment
4,"[42, 125, 304, 18, 269, 1221, 35516, 47, 27, 1...","[42, 479, 12, 39993, 4535, 9645, 124, 230, 40257]",neutral


In [91]:
converted_test.head()

Unnamed: 0,sentence1,sentence2,gold_label
0,"[1349, 239, 222, 1259, 35350, 19, 10778, 274, ...","[84, 239, 472, 12740, 18, 19, 3053, 40257]",neutral
1,"[1349, 239, 222, 1259, 35350, 19, 10778, 274, ...","[84, 239, 15, 248, 27, 3087, 40257]",entailment
2,"[1349, 239, 222, 1259, 35350, 19, 10778, 274, ...","[0, 222, 1135, 12, 39993, 761, 83, 40257]",contradiction
3,"[0, 107, 27, 39993, 132, 268, 35516, 269, 56, ...","[84, 107, 15, 125, 40257]",neutral
4,"[0, 107, 27, 39993, 132, 268, 35516, 269, 56, ...","[84, 107, 15, 1081, 1828, 40257]",entailment
