# Bowman's Model

## Import necessary packages

In [2]:
import torch
import torch.nn as nn

from torchtext import data
from torchtext.data import Dataset
from torchtext import datasets
from torchtext.vocab import GloVe

from nltk import word_tokenize
import time
import dill

import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## Make directories
Make directories for saving data and model

In [0]:
import os
from os.path import join

# Directories used to cache the preprocessed data and the saved models.
new_dir = ['./data', './model', './data/snli_split']

# Create every directory that is missing. `makedirs(..., exist_ok=True)` also
# creates missing parents and tolerates the directory appearing between the
# existence check and the creation.
for dir_path in new_dir:
    if not os.path.exists(dir_path):
        print('mkdir:', dir_path)
        os.makedirs(dir_path, exist_ok=True)

## SNLI corpus data preprocessing
The class to preprocess SNLI corpus.

The first time the class is initialized, it downloads the SNLI corpus, splits it into train, dev and test sets, builds the vocabulary for the corpus using GloVe 840B 300d, and stores everything in local files. It also creates batch iterators for the three sets. This first run may take more than 5 minutes.

On later runs, the initializer checks whether the local files exist; if they do, it loads them directly from disk, which saves a lot of time (loading finishes within about 1 minute).

In [0]:
class SNLIDataset(Dataset):
    """Dataset whose examples carry ``premise`` and ``hypothesis`` fields.

    It only contributes the ``sort_key`` below.
    """

    @staticmethod
    def sort_key(ex):
        """Return a joint sort key built from the premise and hypothesis lengths."""
        premise_len = len(ex.premise)
        hypothesis_len = len(ex.hypothesis)
        return data.interleave_keys(premise_len, hypothesis_len)

# preprocess SNLI corpus to save time and give train, dev, test sets
class SNLI(object):
    """Load the SNLI corpus and cache the splits and vocabularies on disk.

    If the cached split files exist they are loaded with ``dill``; otherwise
    the corpus is split with ``datasets.SNLI.splits`` and the examples are
    dumped to the cache. The vocabularies are handled the same way (the text
    vocabulary is built with GloVe 840B 300d vectors).

    Attributes set by ``__init__``:
        snli_split_path_lst: cache files for the train, dev and test examples
            (in that order).
        text_vocab_path / label_vocab_path: cache files for the vocabularies.
        TEXT / LABEL: the torchtext fields for the sentences and the label.
        train / dev / test: the three datasets.
        train_iter / dev_iter / test_iter: the matching bucket iterators.
    """

    def __init__(self, batch_size=4, gpu=torch.device('cuda')):
        """Prepare the datasets, vocabularies and iterators.

        Args:
            batch_size: batch size passed to ``data.BucketIterator.splits``.
            gpu: device passed to ``data.BucketIterator.splits``.
        """
        # cache files for the train, dev and test examples (order matters)
        self.snli_split_path_lst = ['./data/snli_split/train', './data/snli_split/dev', './data/snli_split/test']
        # cache files for the two vocabularies
        self.text_vocab_path = './data/snli_split/text_vocab'
        self.label_vocab_path = './data/snli_split/label_vocab'

        # set data field for text and label
        self.TEXT = data.Field(batch_first=True, include_lengths=True, tokenize=word_tokenize, lower=True)
        self.LABEL = data.Field(sequential=False, unk_token=None)

        if self.if_splited():
            # the splits are already cached: rebuild the datasets from them
            fields = {'premise': self.TEXT, 'hypothesis': self.TEXT, 'label': self.LABEL}
            self.train, self.dev, self.test = self.load_split_datasets(fields)
        else:
            # split the corpus into train, dev, test sets and cache them
            self.train, self.dev, self.test = datasets.SNLI.splits(self.TEXT, self.LABEL, root='data')
            self.save_splited_sets(self.train, self.dev, self.test)

        self._load_or_build_vocab()

        # generate batch iterators
        self.train_iter, self.dev_iter, self.test_iter = data.BucketIterator.splits(
            (self.train, self.dev, self.test), batch_size=batch_size, device=gpu)

    @staticmethod
    def _load_pickle(path):
        """Return the object stored with ``dill`` in ``path``."""
        # NOTE(security): dill.load can execute arbitrary code; only load
        # cache files that were produced locally by this class.
        with open(path, 'rb') as f:
            return dill.load(f)

    @staticmethod
    def _dump_pickle(obj, path):
        """Store ``obj`` with ``dill`` in ``path``."""
        with open(path, 'wb') as f:
            dill.dump(obj, f)

    def _load_or_build_vocab(self):
        """Attach vocabularies to TEXT and LABEL, from cache when both files exist."""
        if os.path.exists(self.text_vocab_path) and os.path.exists(self.label_vocab_path):
            # local vocab exists: load it into the fields
            self.TEXT.vocab = self._load_pickle(self.text_vocab_path)
            self.LABEL.vocab = self._load_pickle(self.label_vocab_path)
        else:
            # build the vocab (text vocab over all three splits) and cache it
            self.TEXT.build_vocab(self.train, self.dev, self.test, vectors=GloVe(name='840B', dim=300))
            self.LABEL.build_vocab(self.train)
            self._dump_pickle(self.TEXT.vocab, self.text_vocab_path)
            self._dump_pickle(self.LABEL.vocab, self.label_vocab_path)

    # check local train, dev, test sets
    def if_splited(self):
        """Return True when every file in ``snli_split_path_lst`` exists."""
        return all(os.path.exists(path) for path in self.snli_split_path_lst)

    # load dataset from local
    def load_split_datasets(self, fields):
        """Rebuild the train, dev and test datasets from the cached examples.

        Args:
            fields: mapping of field name to field, passed to ``SNLIDataset``.

        Returns:
            The tuple ``(train, dev, test)`` of ``SNLIDataset`` objects.
        """
        # read from the same paths that if_splited() checks
        train_examples, dev_examples, test_examples = [
            self._load_pickle(path) for path in self.snli_split_path_lst
        ]

        # recover
        train = SNLIDataset(examples=train_examples, fields=fields)
        dev = SNLIDataset(examples=dev_examples, fields=fields)
        test = SNLIDataset(examples=test_examples, fields=fields)
        return train, dev, test

    # save datasets to local
    def save_splited_sets(self, train, dev, test):
        """Dump the examples of the three datasets to the cache files."""
        # write to the same paths that if_splited() checks
        for split, path in zip((train, dev, test), self.snli_split_path_lst):
            self._dump_pickle(split.examples, path)

## Initialize SNLI class and do preprocessing

In [0]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU so the
# notebook still runs on machines without a GPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
snli = SNLI(batch_size=32, gpu=device)

## Bowman's Model

In [0]:
class Bowman(nn.Module):
    """Two-LSTM sentence-pair classifier producing 3 class scores.

    The premise and the hypothesis are embedded with the pretrained vectors
    of ``vocab``, encoded by separate LSTMs, mean-pooled over time,
    concatenated and projected to 3 outputs.
    """

    def __init__(self, vocab, premise_emb=300, hypothesis_emb=300, premise_d=100, hypothesis_d=100, lstm_layers=1, dropout=0.1):
        """Build the layers.

        Args:
            vocab: vocab built for the corpus; ``vocab.vectors`` initializes
                the embedding table.
            premise_emb: word embedding size for tokens in the premise
                (input size of the premise LSTM).
            hypothesis_emb: word embedding size for tokens in the hypothesis
                (input size of the hypothesis LSTM).
            premise_d: sentence embedding size for the premise.
            hypothesis_d: sentence embedding size for the hypothesis.
            lstm_layers: number of layers of each LSTM.
            dropout: dropout rate for the model.
        """
        super(Bowman, self).__init__()
        # NOTE: both sentences share this table, so premise_emb and
        # hypothesis_emb must equal the width of vocab.vectors.
        self.embedding = nn.Embedding.from_pretrained(vocab.vectors)
        self.dropout = nn.Dropout(dropout)
        self.Premise_Enc = nn.LSTM(input_size=premise_emb, hidden_size=premise_d, num_layers=lstm_layers, batch_first=True)
        self.Hypothesis_Enc = nn.LSTM(input_size=hypothesis_emb, hidden_size=hypothesis_d, num_layers=lstm_layers, batch_first=True)
        self.tanh = nn.Tanh()
        self.out = nn.Linear(premise_d + hypothesis_d, 3) # batch_size x 3

    @staticmethod
    def _mean_pool(states, lengths=None):
        """Average ``states`` (batch x seq_len x dim) over the time axis.

        Without ``lengths`` every timestep is averaged. With ``lengths``
        (batch,) only the first ``lengths[i]`` timesteps of row ``i`` count,
        so trailing padding does not dilute the sentence vector.
        """
        if lengths is None:
            return torch.mean(states, 1)
        lengths = lengths.to(states.device)
        steps = torch.arange(states.size(1), device=states.device).unsqueeze(0)
        # batch x seq_len x 1, 1.0 on real tokens and 0.0 on padding
        mask = (steps < lengths.unsqueeze(1)).unsqueeze(2).to(states.dtype)
        # clamp guards against a zero length causing a division by zero
        denom = lengths.clamp(min=1).unsqueeze(1).to(states.dtype)
        return (states * mask).sum(1) / denom

    def forward(self, premise_seq, hypothesis_seq, premise_len=None, hypothesis_len=None):
        """Return the class scores (batch_size x 3) for a batch of pairs.

        Args:
            premise_seq: token ids, batch_size x seq_len.
            hypothesis_seq: token ids, batch_size x seq_len.
            premise_len: optional true lengths (batch_size,) of the premises;
                when given, padded positions are left out of the mean. This
                assumes padding sits at the end of each sequence.
            hypothesis_len: same for the hypotheses.
        """
        premise_seq = self.embedding(premise_seq) # batch_size x seq_len -> batch_size x seq_len x 300
        hypothesis_seq = self.embedding(hypothesis_seq) # batch_size x seq_len -> batch_size x seq_len x 300
        premise_seq = self.dropout(premise_seq)
        hypothesis_seq = self.dropout(hypothesis_seq)

        premise_output, _ = self.Premise_Enc(premise_seq) # batch_size x seq_len x 300 -> batch_size x seq_len x 100
        hypothesis_output, _ = self.Hypothesis_Enc(hypothesis_seq) # batch_size x seq_len x 300 -> batch_size x seq_len x 100
        premise_output = self._mean_pool(premise_output, premise_len) # batch_size x seq_len x 100 -> batch_size x 100
        hypothesis_output = self._mean_pool(hypothesis_output, hypothesis_len) # batch_size x seq_len x 100 -> batch_size x 100
        next_in = torch.cat((premise_output, hypothesis_output), 1)  # [batch_size x 100, batch_size x 100] -> batch_size x 200
        next_in = self.dropout(next_in)
        # NOTE(review): tanh is applied three times with no learnable layer in
        # between. Kept as-is so the parameter set (and saved models) do not
        # change; confirm whether three Linear+tanh layers were intended.
        tanh_out = self.tanh(self.tanh(self.tanh(next_in)))
        output = self.out(tanh_out) # batch_size x 200 -> batch_size x 3
        return output

## Train the model

In [0]:
def bowman_train(model, dataset, criterion, optimizer, epoch_num=5):
    """Train ``model`` on ``dataset.train_iter`` and checkpoint every epoch.

    Args:
        model: the model; called as ``model(premise, hypothesis)``.
        dataset: object exposing ``train_iter``, whose batches provide
            ``premise`` and ``hypothesis`` as ``(tensor, lengths)`` pairs and
            ``label``.
        criterion: loss function, called as ``criterion(output, label)``.
        optimizer: optimizer stepping the model parameters.
        epoch_num: number of epochs.

    Side effects:
        Writes the mean loss of each epoch (one value per line) to
        ``result.txt`` and saves the whole model to
        ``./model/bowman_<epoch>.pth`` after every epoch and to
        ``./model/bowman_final.pth`` at the end.
    """
    snli = dataset
    # switch to train mode
    model.train()
    # Unbuffered binary file: each epoch's loss reaches the disk at once.
    # The with-block closes it even when training is interrupted.
    with open("result.txt", "wb", buffering=0) as record:
        for epoch in range(epoch_num):
            running_loss = 0.0
            epoch_loss = 0.0
            batch_count = 0
            for batch_count, batch in enumerate(snli.train_iter, start=1):
                # get data (the lengths are not used here)
                premise, _ = batch.premise
                hypothesis, _ = batch.hypothesis
                label = batch.label

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward + backward + optimize step
                output = model(premise, hypothesis)
                loss = criterion(output, label)
                loss.backward()
                optimizer.step()

                # add loss for the batch
                batch_loss = loss.item()
                running_loss += batch_loss
                epoch_loss += batch_loss
                if batch_count % 1000 == 0:
                    # exactly 1000 batches were accumulated since the last report
                    print('[%d, %5d] loss: %.3f' % (epoch, batch_count, running_loss / 1000))
                    running_loss = 0.0

            # mean over the batches actually seen (0.0 for an empty iterator)
            mean_epoch_loss = epoch_loss / batch_count if batch_count else 0.0
            # print average loss for the epoch
            print('epoch %d loss: %.3f\n' % (epoch, mean_epoch_loss))
            # save average loss for the epoch
            record.write(b'%f\n' % mean_epoch_loss)
            # save trained model after the epoch
            torch.save(model, './model/bowman_%d.pth' % (epoch))

    # save final trained model
    torch.save(model, './model/bowman_final.pth')

Initialize the model, using cross-entropy loss as the criterion and Adadelta as the optimizer.

In [16]:
# Build the classifier from the corpus vocabulary and place it on the device.
model = Bowman(vocab=snli.TEXT.vocab).to(device)

# Cross-entropy loss, optimized with Adadelta.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adadelta(params=model.parameters(), lr=0.01)

# Run the training loop.
bowman_train(
    model=model,
    dataset=snli,
    criterion=criterion,
    optimizer=optimizer,
    epoch_num=1000,
)

[0,  1000] loss: 1.085
[0,  2000] loss: 1.029
[0,  3000] loss: 0.975
[0,  4000] loss: 0.948
[0,  5000] loss: 0.927
[0,  6000] loss: 0.916
[0,  7000] loss: 0.910
[0,  8000] loss: 0.898
[0,  9000] loss: 0.898
[0, 10000] loss: 0.884
[0, 11000] loss: 0.877
[0, 12000] loss: 0.879
[0, 13000] loss: 0.869
[0, 14000] loss: 0.860
[0, 15000] loss: 0.856
[0, 16000] loss: 0.853
[0, 17000] loss: 0.850
epoch 0 loss: 0.912



  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


## Evaluation

Evaluate the model on both the training set and the test set, then print the prediction accuracy on each.

In [0]:
def _bowman_accuracy(model, batches):
    """Return the fraction of examples in ``batches`` that ``model`` labels correctly.

    Returns ``nan`` when ``batches`` yields no examples.
    """
    correct = 0
    total = 0
    # no gradients are needed for evaluation; skipping them saves memory
    with torch.no_grad():
        for batch in batches:
            # get data (the lengths are not used here)
            premise, _ = batch.premise
            hypothesis, _ = batch.hypothesis
            label = batch.label

            # predicted class = index of the highest score
            output = model(premise, hypothesis)
            predict = torch.argmax(output, dim=1)

            total += predict.shape[0]
            correct += int(torch.sum(predict == label))
    return correct / total if total else float('nan')


def bowman_eval(model, dataset):
    """Print the accuracy of ``model`` on the training set and the test set.

    Args:
        model: the model; called as ``model(premise, hypothesis)``. It is
            switched to evaluation mode and left in that mode.
        dataset: object exposing ``train_iter`` and ``test_iter``, whose
            batches provide ``premise`` and ``hypothesis`` as
            ``(tensor, lengths)`` pairs and ``label``.
    """
    snli = dataset

    # switch to evaluation mode
    model.eval()

    print("Train acc.: %f" % _bowman_accuracy(model, snli.train_iter))
    print("Test acc.: %f" % _bowman_accuracy(model, snli.test_iter))

In [18]:
# To evaluate the saved final model instead of the in-memory one, uncomment:
#model = torch.load("./model/bowman_final.pth")
bowman_eval(model, snli)

Train acc.: 0.619719
Test acc.: 0.581230
