In [6]:
import torch
import torch.nn as nn

from torchtext import data
from torchtext.data import Dataset
from torchtext import datasets
from torchtext.vocab import GloVe

from nltk import word_tokenize
import time
import dill

import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [0]:
import os
from os.path import join

new_dir = []
project_dir_path = './'
data_dir_path = join(project_dir_path, 'data')
model_dir_path = join(project_dir_path, 'model')
new_dir.append(data_dir_path)
new_dir.append(model_dir_path)
snli_split_dir_path = join(data_dir_path, 'snli_split')
snli_train_examples_path = join(snli_split_dir_path, 'train_examples')
snli_dev_examples_path = join(snli_split_dir_path, 'dev_examples')
snli_test_examples_path = join(snli_split_dir_path, 'test_examples')

snli_split_path_lst = [snli_train_examples_path, snli_dev_examples_path, snli_test_examples_path]

snli_text_vocab_path = join(snli_split_dir_path, 'text_vocab')
snli_label_vocab_path = join(snli_split_dir_path, 'label_vocab')

new_dir.append(snli_split_dir_path)

for dir in new_dir:
    if not os.path.exists(dir):
        print('mkdir:', dir)
        os.mkdir(dir)

In [0]:
class SNLIDataset(Dataset):
    @staticmethod
    def sort_key(ex):
        return data.interleave_keys(
            len(ex.premise), len(ex.hypothesis))


class SNLI(object):
    def __init__(self, batch_size=4, gpu=torch.device(torch.cuda.current_device())):
        self.TEXT = data.Field(batch_first=True,
                               include_lengths=True,
                               tokenize=word_tokenize,
                               lower=True)

        self.LABEL = data.Field(sequential=False, unk_token=None)

        # Split Dataset
        if self.if_split_already():
            print('Loading splited data set...')
            fields = {'premise': self.TEXT, 'hypothesis': self.TEXT, 'label': self.LABEL}
            self.train, self.dev, self.test = self.load_split_datasets(fields)
        else:
            print('No local data set detected, spliting...')
            self.train, self.dev, self.test = datasets.SNLI.splits(self.TEXT, self.LABEL, root='data')
            self.dump_examples(self.train, self.dev, self.test)


        # Create Vocab
        print('Building Vocab...')
        # self.TEXT.build_vocab(self.train, self.dev, self.test, vectors=GloVe(name='840B', dim=300))
        # self.LABEL.build_vocab(self.train)
        if os.path.exists(snli_text_vocab_path) and os.path.exists(snli_label_vocab_path):
            print('Loading local Vocab...')
            with open(snli_text_vocab_path, 'rb')as f:
                self.TEXT.vocab = dill.load(f)
            with open(snli_label_vocab_path, 'rb')as f:
                self.LABEL.vocab = dill.load(f)
        else:
            print('No local Vocab detected, building...')
            self.TEXT.build_vocab(self.train, self.dev, self.test, vectors=GloVe(name='840B', dim=300))
            self.LABEL.build_vocab(self.train)
            with open(snli_text_vocab_path, 'wb')as f:
                dill.dump(self.TEXT.vocab, f)
            with open(snli_label_vocab_path, 'wb')as f:
                dill.dump(self.LABEL.vocab, f)


        # Generate batch iterator
        print('Generating batch iter...')
        self.train_iter, self.dev_iter, self.test_iter = \
            data.BucketIterator.splits((self.train, self.dev, self.test),
                                       batch_size=batch_size,
                                       device=gpu)

    def if_split_already(self):
        for path in snli_split_path_lst:
            if not os.path.exists(path):
                return False
        return True

    # Load dataset from local
    def load_split_datasets(self, fields):
        # Loading examples
        with open(snli_train_examples_path, 'rb')as f:
            train_examples = dill.load(f)
        with open(snli_dev_examples_path, 'rb')as f:
            dev_examples = dill.load(f)
        with open(snli_test_examples_path, 'rb')as f:
            test_examples = dill.load(f)

        # Recover dataset
        train = SNLIDataset(examples=train_examples, fields=fields)
        dev = SNLIDataset(examples=dev_examples, fields=fields)
        test = SNLIDataset(examples=test_examples, fields=fields)
        return train, dev, test

    # Save to local
    def dump_examples(self, train, dev, test):
        # Save examples
        if not os.path.exists(snli_train_examples_path):
            with open(snli_train_examples_path, 'wb')as f:
                dill.dump(train.examples, f)
        if not os.path.exists(snli_dev_examples_path):
            with open(snli_dev_examples_path, 'wb')as f:
                dill.dump(dev.examples, f)
        if not os.path.exists(snli_test_examples_path):
            with open(snli_test_examples_path, 'wb')as f:
                dill.dump(test.examples, f)

In [19]:
device = torch.device('cuda')
snli = SNLI(batch_size=32, gpu=device)

Loading splited data set...
Building Vocab...
Loading local Vocab...
Generating batch iter...


In [0]:
class Bowman(nn.Module):
    def __init__(self, vocab, premise_emb=300, hypothesis_emb=300, premise_d=100, hypothesis_d=100, lstm_layers=1, dropout=0.1):
        super(Bowman, self).__init__()

        self.embedding = nn.Embedding.from_pretrained(vocab.vectors)
        self.dropout = nn.Dropout(dropout)
        self.Premise_Enc = nn.LSTM(input_size=premise_emb, hidden_size=premise_d, num_layers=lstm_layers, batch_first=True)
        self.Hypothesis_Enc = nn.LSTM(input_size=hypothesis_emb, hidden_size=hypothesis_d, num_layers=lstm_layers, batch_first=True)
        self.tanh = nn.Tanh()
        self.out = nn.Linear(premise_d + hypothesis_d, 3)

    def forward(self, premise_seq, hypothesis_seq):
        premise_seq = self.embedding(premise_seq)
        hypothesis_seq = self.embedding(hypothesis_seq)
        premise_seq = self.dropout(premise_seq)
        hypothesis_seq = self.dropout(hypothesis_seq)

        premise_output, _  = self.Premise_Enc(premise_seq)
        hypothesis_output, _  = self.Hypothesis_Enc(hypothesis_seq)
        premise_output = torch.mean(premise_output, 1)
        hypothesis_output = torch.mean(hypothesis_output, 1)
        next_in = torch.cat((premise_output, hypothesis_output), 1)
        #next_in = torch.cat((premise_output[ :, -1, :],hypothesis_output[ :, -1, :]), 1)
        next_in = self.dropout(next_in)
        tanh_out = self.tanh(self.tanh(self.tanh(next_in)))
        output = self.out(tanh_out)
        return output

In [0]:
def bowman_train(model, dataset, criterion, optimizer, epoch_num=5):
    snli = dataset
    record = open("result.txt", "w")

    for epoch in range(epoch_num):

        running_loss = 0.0
        epoch_loss = 0.0
        i = 0
        for batch in snli.train_iter:
            i += 1
            #get data
        
            premise, _ = batch.premise
            premise.to(device)
            hypothesis, _ = batch.hypothesis
            hypothesis.to(device)
            label = batch.label
            label.to(device)

            # zeros the paramster gradients
            optimizer.zero_grad()       # 

            # forward + backward + optimize
            output = model(premise, hypothesis)
            loss = criterion(output, label)
            loss.backward()
            optimizer.step() 

            # print statistics
            running_loss += loss.item() 
            epoch_loss = loss.item()
            if i % 1000 == 999:
                print('[%d, %5d] loss: %.3f' % (epoch, i + 1, running_loss / 1000))
                running_loss = 0.0
        print('epoch %d loss: %.3f\n' % (epoch, epoch_loss))
        record.write('epoch %d loss: %.3f\n' % (epoch, epoch_loss))
        torch.save(model, './model/bowman_%d.pt'% (epoch))

    print('Finished Training')
    torch.save(model, './model/bowman_final.pt')

In [0]:
model = Bowman(snli.TEXT.vocab)
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adadelta(model.parameters(), lr=0.1)

bowman_train(model, snli, criterion, optimizer, epoch_num=10)

[0,  1000] loss: 1.087
[0,  2000] loss: 1.041
[0,  3000] loss: 0.987
[0,  4000] loss: 0.946
[0,  5000] loss: 0.927
[0,  6000] loss: 0.920
[0,  7000] loss: 0.905
[0,  8000] loss: 0.895
[0,  9000] loss: 0.892
[0, 10000] loss: 0.889
[0, 11000] loss: 0.884
[0, 12000] loss: 0.876
[0, 13000] loss: 0.861
[0, 14000] loss: 0.864
[0, 15000] loss: 0.865
[0, 16000] loss: 0.856
[0, 17000] loss: 0.846
epoch 0 loss: 1.059



  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


[1,  1000] loss: 0.844
[1,  2000] loss: 0.837
[1,  3000] loss: 0.839
[1,  4000] loss: 0.829
[1,  5000] loss: 0.837
[1,  6000] loss: 0.832
[1,  7000] loss: 0.827
[1,  8000] loss: 0.823
[1,  9000] loss: 0.822
[1, 10000] loss: 0.817
[1, 11000] loss: 0.814
[1, 12000] loss: 0.813
[1, 13000] loss: 0.818
[1, 14000] loss: 0.816
[1, 15000] loss: 0.813
[1, 16000] loss: 0.810
[1, 17000] loss: 0.808
epoch 1 loss: 0.817

[2,  1000] loss: 0.796
[2,  2000] loss: 0.809
[2,  3000] loss: 0.800
[2,  4000] loss: 0.799
[2,  5000] loss: 0.803
[2,  6000] loss: 0.801
[2,  7000] loss: 0.799
[2,  8000] loss: 0.793
[2,  9000] loss: 0.797
[2, 10000] loss: 0.792
[2, 11000] loss: 0.795
[2, 12000] loss: 0.787
[2, 13000] loss: 0.791
