In [1]:
##
import sys
!{sys.executable} -m pip install sentence_transformers

Collecting sentence_transformers
  Using cached sentence_transformers-2.2.2-py3-none-any.whl
Collecting sentencepiece
  Using cached sentencepiece-0.1.97-cp39-cp39-win_amd64.whl (1.1 MB)
Installing collected packages: sentencepiece, sentence_transformers
Successfully installed sentence_transformers-2.2.2 sentencepiece-0.1.97




In [113]:
##
from sentence_transformers import SentenceTransformer
model_st = SentenceTransformer('all-mpnet-base-v2')

In [3]:
# import stuff
%load_ext autoreload
%autoreload 2
%matplotlib inline

import inspect
import os
from random import randint
import time

import argparse
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch import optim
from torch.autograd import Variable
import re
from nltk.tokenize import word_tokenize

In [4]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
# Keep it on CPU or put it on GPU
use_cuda = torch.cuda.is_available()
#or force not to use cuda
#use_cuda = False
model = model.cuda() if use_cuda else model

In [126]:
class NLINet(nn.Module):
    def __init__(self, config):
        super(NLINet, self).__init__()

        # classifier
        self.nonlinear_fc = config['nonlinear_fc']
        self.fc_dim = config['fc_dim']
        self.n_classes = config['n_classes']
        self.enc_lstm_dim = config['enc_lstm_dim']
        self.encoder_type = config['encoder_type']
        self.dpout_fc = config['dpout_fc']

        self.encoder = model_st  #eval(self.encoder_type)(config)
        self.inputdim = 4*self.enc_lstm_dim  ##4*2*self.enc_lstm_dim
        self.inputdim = self.inputdim if self.encoder_type in \
                        ["ConvNetEncoder", "InnerAttentionMILAEncoder"] else self.inputdim
        self.inputdim = self.inputdim/2 if self.encoder_type == "LSTMEncoder" \
                                        else self.inputdim
        if self.nonlinear_fc:
            self.classifier = nn.Sequential(
                nn.Dropout(p=self.dpout_fc),
                nn.Linear(self.inputdim, self.fc_dim),
                nn.Tanh(),
                nn.Dropout(p=self.dpout_fc),
                nn.Linear(self.fc_dim, self.fc_dim),
                nn.Tanh(),
                nn.Dropout(p=self.dpout_fc),
                nn.Linear(self.fc_dim, self.n_classes),
                )
        else:
            self.classifier = nn.Sequential(
                nn.Linear(self.inputdim, self.fc_dim),
                nn.Linear(self.fc_dim, self.fc_dim),
                nn.Linear(self.fc_dim, self.n_classes)
                )
        #print(nn.Linear(self.inputdim, self.fc_dim).weight.dtype)

    def forward(self, u, v):
        # s1 : (s1, s1_len)
        #u = torch.tensor(self.encoder.encode(s1)).cuda()
        #v = torch.tensor(self.encoder.encode(s2)).cuda()
        #print(u)

        ##s1 and s2 become u and v

        features = torch.cat((u, v, torch.abs(u-v), u*v), 1)
        output = self.classifier(features)
        return output

    def encode(self, s1):
        emb = self.encoder(s1)
        return emb


In [56]:
##
tmp1 = pd.read_csv('../dataset/esnli_train_1.csv', usecols=['gold_label', 'Sentence1', 'Sentence2'])
tmp2 = pd.read_csv('../dataset/esnli_train_2.csv', usecols=['gold_label', 'Sentence1', 'Sentence2'])
train = pd.concat([tmp1, tmp2], ignore_index=True)
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 549367 entries, 0 to 549366
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   gold_label  549367 non-null  object
 1   Sentence1   549367 non-null  object
 2   Sentence2   549361 non-null  object
dtypes: object(3)
memory usage: 12.6+ MB


In [57]:
##
valid = pd.read_csv('../dataset/esnli_dev.csv', usecols=['gold_label', 'Sentence1', 'Sentence2'])
valid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9842 entries, 0 to 9841
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   gold_label  9842 non-null   object
 1   Sentence1   9842 non-null   object
 2   Sentence2   9842 non-null   object
dtypes: object(3)
memory usage: 230.8+ KB


In [58]:
##
test = pd.read_csv('../dataset/esnli_test.csv', usecols=['gold_label', 'Sentence1', 'Sentence2'])
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9824 entries, 0 to 9823
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   gold_label  9824 non-null   object
 1   Sentence1   9824 non-null   object
 2   Sentence2   9824 non-null   object
dtypes: object(3)
memory usage: 230.4+ KB


In [59]:
#map label to int
label_to_int = {'entailment': 0, 'neutral': 1, 'contradiction': 2}

In [60]:
##
#add label int
train['label'] = train['gold_label'].apply(lambda x: label_to_int[x])
valid['label'] = valid['gold_label'].apply(lambda x: label_to_int[x])
test['label'] = test['gold_label'].apply(lambda x: label_to_int[x])

In [127]:
parser = argparse.ArgumentParser(description='NLI training')
# paths
parser.add_argument("--nlipath", type=str, default='dataset/SNLI/', help="NLI data path (SNLI or MultiNLI)")
parser.add_argument("--outputdir", type=str, default='../savedir/', help="Output directory")
parser.add_argument("--outputmodelname", type=str, default='model.pickle')
parser.add_argument("--word_emb_path", type=str, default="../dataset/GloVe/glove.840B.300d.txt", help="word embedding file path")

# training
parser.add_argument("--n_epochs", type=int, default=40)
parser.add_argument("--batch_size", type=int, default=128)  #64)
parser.add_argument("--dpout_model", type=float, default=0., help="encoder dropout")
parser.add_argument("--dpout_fc", type=float, default=0., help="classifier dropout")
parser.add_argument("--nonlinear_fc", type=float, default=1, help="use nonlinearity in fc")
parser.add_argument("--optimizer", type=str, default="adam,lr=0.001", help="adam or sgd,lr=0.1")
parser.add_argument("--lrshrink", type=float, default=5, help="shrink factor for sgd")
parser.add_argument("--decay", type=float, default=0.99, help="lr decay")
parser.add_argument("--minlr", type=float, default=1e-5, help="minimum lr")
parser.add_argument("--max_norm", type=float, default=5., help="max norm (grad clipping)")

# model
parser.add_argument("--encoder_type", type=str, default='InferSentV1', help="see list of encoders")
parser.add_argument("--enc_lstm_dim", type=int, default=768, help="encoder nhid dimension")  ##2048, help="encoder nhid dimension")
parser.add_argument("--n_enc_layers", type=int, default=1, help="encoder num layers")
parser.add_argument("--fc_dim", type=int, default=512, help="nhid of fc layers")
parser.add_argument("--n_classes", type=int, default=3, help="entailment/neutral/contradiction")
parser.add_argument("--pool_type", type=str, default='max', help="max or mean")

# gpu
parser.add_argument("--gpu_id", type=int, default=3, help="GPU ID")
parser.add_argument("--seed", type=int, default=1234, help="seed")

# data
parser.add_argument("--word_emb_dim", type=int, default=300, help="word embedding dimension")

params, _ = parser.parse_known_args()
config_nli_model = {
    'n_words'        :  1                     ,  ##len(word_vec)          ,
    'word_emb_dim'   :  params.word_emb_dim   ,
    'enc_lstm_dim'   :  params.enc_lstm_dim   ,
    'n_enc_layers'   :  params.n_enc_layers   ,
    'dpout_model'    :  params.dpout_model    ,
    'dpout_fc'       :  params.dpout_fc       ,
    'fc_dim'         :  params.fc_dim         ,
    'bsize'          :  params.batch_size     ,
    'n_classes'      :  params.n_classes      ,
    'pool_type'      :  params.pool_type      ,
    'nonlinear_fc'   :  params.nonlinear_fc   ,
    'encoder_type'   :  params.encoder_type   ,
    'use_cuda'       :  True                  ,

}
nli_net = NLINet(config_nli_model)
print(nli_net)

NLINet(
  (encoder): SentenceTransformer(
    (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
    (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
    (2): Normalize()
  )
  (classifier): Sequential(
    (0): Dropout(p=0.0, inplace=False)
    (1): Linear(in_features=3072, out_features=512, bias=True)
    (2): Tanh()
    (3): Dropout(p=0.0, inplace=False)
    (4): Linear(in_features=512, out_features=512, bias=True)
    (5): Tanh()
    (6): Dropout(p=0.0, inplace=False)
    (7): Linear(in_features=512, out_features=3, bias=True)
  )
)


In [128]:
"""
SEED
"""
np.random.seed(params.seed)
torch.manual_seed(params.seed)
torch.cuda.manual_seed(params.seed)

In [129]:
def get_optimizer(s):
    """
    Parse optimizer parameters.
    Input should be of the form:
        - "sgd,lr=0.01"
        - "adagrad,lr=0.1,lr_decay=0.05"
    """
    if "," in s:
        method = s[:s.find(',')]
        optim_params = {}
        for x in s[s.find(',') + 1:].split(','):
            split = x.split('=')
            assert len(split) == 2
            assert re.match("^[+-]?(\d+(\.\d*)?|\.\d+)$", split[1]) is not None
            optim_params[split[0]] = float(split[1])
    else:
        method = s
        optim_params = {}

    if method == 'adadelta':
        optim_fn = optim.Adadelta
    elif method == 'adagrad':
        optim_fn = optim.Adagrad
    elif method == 'adam':
        optim_fn = optim.Adam
    elif method == 'adamax':
        optim_fn = optim.Adamax
    elif method == 'asgd':
        optim_fn = optim.ASGD
    elif method == 'rmsprop':
        optim_fn = optim.RMSprop
    elif method == 'rprop':
        optim_fn = optim.Rprop
    elif method == 'sgd':
        optim_fn = optim.SGD
        assert 'lr' in optim_params
    else:
        raise Exception('Unknown optimization method: "%s"' % method)

    # check that we give good parameters to the optimizer
    #expected_args = inspect.getargspec(optim_fn.__init__)[0]
    #assert expected_args[:2] == ['self', 'params']
    #if not all(k in expected_args[2:] for k in optim_params.keys()):
    #    raise Exception('Unexpected parameters: expected "%s", got "%s"' % (
    #        str(expected_args[2:]), str(optim_params.keys())))

    return optim_fn, optim_params

In [130]:
# loss
weight = torch.FloatTensor(params.n_classes).fill_(1)
loss_fn = nn.CrossEntropyLoss(weight=weight)
loss_fn.size_average = False

# optimizer
optim_fn, optim_params = get_optimizer(params.optimizer)
optimizer = optim_fn(nli_net.parameters(), **optim_params)

# cuda by default
nli_net.cuda()
loss_fn.cuda()

CrossEntropyLoss()

In [131]:
"""
TRAIN
"""
val_acc_best = -1e10
adam_stop = False
stop_training = False
lr = optim_params['lr'] if 'sgd' in params.optimizer else None

In [132]:
def get_batch(batch):  ##, word_vec, emb_dim=300):
    # sent in batch in decreasing order of lengths (bsize, max_len, word_dim)
    #lengths = np.array([len(x) for x in batch])
    #max_len = np.max(lengths)
    #embed = np.zeros((max_len, len(batch), emb_dim))
    #embed = np.zeros((768, len(batch)), dtype=np.float32)
    ##embed = np.zeros((len(batch), 768), dtype=np.float32)

    ##for i in range(len(batch)):
    ##    #for j in range(len(batch[i])):
    ##        #embed[j, i, :] = batch[i][j]  ##word_vec[batch[i][j]]
    ##    #print(batch)
    ##    emb = nli_net.encoder.encode(str(batch[i]))
    ##    for j in range(768):
    ##      embed[i, j] = emb[j]
          #print(emb[j].dtype)
    #print(embed.dtype)
    #return torch.from_numpy(embed).float(), lengths
    ##return torch.tensor(embed).cuda()
    return torch.tensor(nli_net.encoder.encode(batch)).cuda()

In [136]:
def trainepoch(epoch):
    print('\nTRAINING : Epoch ' + str(epoch))
    nli_net.train()
    all_costs = []
    logs = []
    words_count = 0
    
    if epoch == 1:
        for param in nli_net.encoder.parameters():
            param.requires_grad = False

    last_time = time.time()
    correct = 0.
    # shuffle the data
    permutation = np.random.permutation(len(train['Sentence1']))

    #s1 = train['Sentence1'][permutation]
    #s2 = train['Sentence2'][permutation]
    #target = train['label'][permutation]
    s1, s2, target = [], [], []
    for idx in permutation:
      s1.append(train['Sentence1'][idx])
      s2.append(train['Sentence2'][idx])
      target.append(train['label'][idx])


    optimizer.param_groups[0]['lr'] = optimizer.param_groups[0]['lr'] * params.decay if epoch>1\
        and 'sgd' in params.optimizer else optimizer.param_groups[0]['lr']
    print('Learning rate : {0}'.format(optimizer.param_groups[0]['lr']))

    for stidx in range(0, len(s1), params.batch_size):
        # prepare batch
        #s1_batch, s1_len = get_batch(s1[stidx:stidx + params.batch_size],
        #                             word_vec, params.word_emb_dim)
        #s2_batch, s2_len = get_batch(s2[stidx:stidx + params.batch_size],
        #                             word_vec, params.word_emb_dim)
        #s1_batch, s2_batch = Variable(s1_batch.cuda()), Variable(s2_batch.cuda())
        #tgt_batch = Variable(torch.LongTensor(target[stidx:stidx + params.batch_size])).cuda()
        #k = s1_batch.size(1)  # actual batch size

        s1_batch = get_batch(s1[stidx:stidx + params.batch_size])
        s2_batch = get_batch(s2[stidx:stidx + params.batch_size])
        tgt_batch = torch.LongTensor(target[stidx:stidx + params.batch_size]).cuda()
        k = len(s1_batch)
        
        #u = torch.tensor(nli_net.encoder.encode(s1_batch)).float().cuda()
        #v = torch.tensor(nli_net.encoder.encode(s2_batch)).float().cuda()
        #print(f"u: {u.shape}")

        # model forward
        #output = nli_net((s1_batch, s1_len), (s2_batch, s2_len))
        output = nli_net(s1_batch, s2_batch)

        pred = output.data.max(1)[1]
        correct += pred.long().eq(tgt_batch.data.long()).cpu().sum()
        #print(len(pred))
        #print(len(s1[stidx:stidx + params.batch_size]))
        assert len(pred) == len(s1[stidx:stidx + params.batch_size])

        # loss
        loss = loss_fn(output, tgt_batch)
        #print(type(loss))
        all_costs.append(loss.item())  #.data[0])
        words_count += (s1_batch.nelement() + s2_batch.nelement()) / params.word_emb_dim

        # backward
        optimizer.zero_grad()
        loss.backward()

        # gradient clipping (off by default)
        #shrink_factor = 1
        #total_norm = 0

        #for p in nli_net.parameters():
        #    if p.requires_grad:
        #        p.grad.data.div_(k)  # divide by the actual batch size
        #        total_norm += p.grad.data.norm() ** 2
        #total_norm = np.sqrt(total_norm.cpu())

        #if total_norm > params.max_norm:
        #    shrink_factor = params.max_norm / total_norm
        #current_lr = optimizer.param_groups[0]['lr'] # current lr (no external "lr", for adam)
        #optimizer.param_groups[0]['lr'] = current_lr * shrink_factor # just for update

        # optimizer step
        optimizer.step()
        #optimizer.param_groups[0]['lr'] = current_lr

        if len(all_costs) == 100:
            logs.append('{0} ; loss {1} ; sentence/s {2} ; words/s {3} ; accuracy train : {4}'.format(
                            stidx, round(np.mean(all_costs), 2),
                            int(len(all_costs) * params.batch_size / (time.time() - last_time)),
                            int(words_count * 1.0 / (time.time() - last_time)),
                            100.*correct/(stidx+k)))
            print(logs[-1])
            last_time = time.time()
            words_count = 0
            all_costs = []
    train_acc = 100 * correct/len(s1)  #round(100 * correct/len(s1), 2)
    print('results : epoch {0} ; mean accuracy train : {1}'
          .format(epoch, train_acc))
    return train_acc

In [137]:
def evaluate(epoch, eval_type='valid', final_eval=False):
    nli_net.eval()
    correct = 0.
    global val_acc_best, lr, stop_training, adam_stop

    if eval_type == 'valid':
        print('\nVALIDATION : Epoch {0}'.format(epoch))

    s1 = valid['Sentence1'] if eval_type == 'valid' else test['Sentence1']
    s2 = valid['Sentence2'] if eval_type == 'valid' else test['Sentence2']
    target = valid['label'] if eval_type == 'valid' else test['label']
    #print(s1[0:0 + params.batch_size])
    #print('preloop')

    for q in range(0, len(s1), params.batch_size):
        # prepare batch
        #s1_batch, s1_len = get_batch(s1[i:i + params.batch_size], word_vec, params.word_emb_dim)
        #s2_batch, s2_len = get_batch(s2[i:i + params.batch_size], word_vec, params.word_emb_dim)
        #s1_batch, s2_batch = Variable(s1_batch.cuda()), Variable(s2_batch.cuda())
        #tgt_batch = Variable(torch.LongTensor(target[i:i + params.batch_size])).cuda()
        #print(s1[q:q + params.batch_size])
        #print(q)
        #print(params.batch_size)
        s1_batch = get_batch(s1[q:q + params.batch_size].tolist())
        s2_batch = get_batch(s2[q:q + params.batch_size].tolist())
        tgt_batch = torch.LongTensor(target[q:q + params.batch_size].tolist()).cuda()

        # model forward
        #output = nli_net((s1_batch, s1_len), (s2_batch, s2_len))
        output = nli_net(s1_batch, s2_batch)

        pred = output.data.max(1)[1]
        correct += pred.long().eq(tgt_batch.data.long()).cpu().sum()

        
    # save model
    eval_acc = 100 * correct/len(s1)  #round(100 * correct / len(s1), 2)
    if final_eval:
        print('finalgrep : accuracy {0} : {1}'.format(eval_type, eval_acc))
    else:
        print('togrep : results : epoch {0} ; mean accuracy {1} :\
              {2}'.format(epoch, eval_type, eval_acc))

    if eval_type == 'valid' and epoch <= params.n_epochs:
        if eval_acc > val_acc_best:
            print('saving model at epoch {0}'.format(epoch))
            if not os.path.exists(params.outputdir):
                os.makedirs(params.outputdir)
            torch.save(nli_net.state_dict(), os.path.join(params.outputdir,
                       params.outputmodelname))
            val_acc_best = eval_acc
        else:
            if 'sgd' in params.optimizer:
                optimizer.param_groups[0]['lr'] = optimizer.param_groups[0]['lr'] / params.lrshrink
                print('Shrinking lr by : {0}. New lr = {1}'
                      .format(params.lrshrink,
                              optimizer.param_groups[0]['lr']))
                if optimizer.param_groups[0]['lr'] < params.minlr:
                    stop_training = True
            if 'adam' in params.optimizer:
                # early stopping (at 2nd decrease in accuracy)
                stop_training = adam_stop
                adam_stop = True
    return eval_acc


In [110]:
#for contuation of model training when needed
#optimizer.param_groups[0]['lr'] = 0.001

In [138]:
"""
Train model on Natural Language Inference task
"""
epoch = 1

while not stop_training and epoch <= params.n_epochs:
    train_acc = trainepoch(epoch)
    eval_acc = evaluate(epoch, 'valid')
    epoch += 1


TRAINING : Epoch 1
Learning rate : 0.001
12672 ; loss 0.71 ; sentence/s 363 ; words/s 1861 ; accuracy train : 69.375
25472 ; loss 0.69 ; sentence/s 458 ; words/s 2346 ; accuracy train : 70.0859375
38272 ; loss 0.66 ; sentence/s 454 ; words/s 2327 ; accuracy train : 70.94010162353516
51072 ; loss 0.64 ; sentence/s 454 ; words/s 2326 ; accuracy train : 71.505859375
63872 ; loss 0.63 ; sentence/s 453 ; words/s 2323 ; accuracy train : 71.9124984741211
76672 ; loss 0.64 ; sentence/s 453 ; words/s 2319 ; accuracy train : 72.125
89472 ; loss 0.63 ; sentence/s 454 ; words/s 2325 ; accuracy train : 72.33928680419922
102272 ; loss 0.63 ; sentence/s 450 ; words/s 2304 ; accuracy train : 72.5400390625
115072 ; loss 0.62 ; sentence/s 451 ; words/s 2313 ; accuracy train : 72.75086975097656
127872 ; loss 0.62 ; sentence/s 459 ; words/s 2354 ; accuracy train : 72.9242172241211
140672 ; loss 0.61 ; sentence/s 453 ; words/s 2323 ; accuracy train : 73.0632095336914
153472 ; loss 0.62 ; sentence/s 459 ; 

89472 ; loss 0.53 ; sentence/s 443 ; words/s 2272 ; accuracy train : 78.33928680419922
102272 ; loss 0.52 ; sentence/s 449 ; words/s 2303 ; accuracy train : 78.3232421875
115072 ; loss 0.52 ; sentence/s 453 ; words/s 2319 ; accuracy train : 78.32465362548828
127872 ; loss 0.53 ; sentence/s 445 ; words/s 2279 ; accuracy train : 78.36405944824219
140672 ; loss 0.53 ; sentence/s 442 ; words/s 2265 ; accuracy train : 78.39346313476562
153472 ; loss 0.52 ; sentence/s 451 ; words/s 2310 ; accuracy train : 78.421875
166272 ; loss 0.52 ; sentence/s 445 ; words/s 2280 ; accuracy train : 78.40023803710938
179072 ; loss 0.52 ; sentence/s 446 ; words/s 2286 ; accuracy train : 78.43526458740234
191872 ; loss 0.53 ; sentence/s 451 ; words/s 2312 ; accuracy train : 78.41093444824219
204672 ; loss 0.51 ; sentence/s 442 ; words/s 2267 ; accuracy train : 78.45361328125
217472 ; loss 0.51 ; sentence/s 440 ; words/s 2254 ; accuracy train : 78.48023986816406
230272 ; loss 0.52 ; sentence/s 450 ; words/s 23

166272 ; loss 0.46 ; sentence/s 454 ; words/s 2328 ; accuracy train : 80.85816955566406
179072 ; loss 0.47 ; sentence/s 453 ; words/s 2322 ; accuracy train : 80.83258819580078
191872 ; loss 0.48 ; sentence/s 451 ; words/s 2309 ; accuracy train : 80.78437805175781
204672 ; loss 0.48 ; sentence/s 451 ; words/s 2311 ; accuracy train : 80.7666015625
217472 ; loss 0.48 ; sentence/s 449 ; words/s 2303 ; accuracy train : 80.76103210449219
230272 ; loss 0.48 ; sentence/s 454 ; words/s 2328 ; accuracy train : 80.73828125
243072 ; loss 0.48 ; sentence/s 445 ; words/s 2282 ; accuracy train : 80.7006607055664
255872 ; loss 0.47 ; sentence/s 451 ; words/s 2312 ; accuracy train : 80.69804382324219
268672 ; loss 0.47 ; sentence/s 454 ; words/s 2327 ; accuracy train : 80.70572662353516
281472 ; loss 0.48 ; sentence/s 447 ; words/s 2291 ; accuracy train : 80.68359375
294272 ; loss 0.48 ; sentence/s 451 ; words/s 2309 ; accuracy train : 80.69055938720703
307072 ; loss 0.48 ; sentence/s 446 ; words/s 228

243072 ; loss 0.44 ; sentence/s 443 ; words/s 2268 ; accuracy train : 81.99835205078125
255872 ; loss 0.44 ; sentence/s 448 ; words/s 2298 ; accuracy train : 82.00508117675781
268672 ; loss 0.44 ; sentence/s 435 ; words/s 2227 ; accuracy train : 82.01487731933594
281472 ; loss 0.45 ; sentence/s 444 ; words/s 2275 ; accuracy train : 82.00603485107422
294272 ; loss 0.45 ; sentence/s 443 ; words/s 2271 ; accuracy train : 82.01087188720703
307072 ; loss 0.45 ; sentence/s 446 ; words/s 2286 ; accuracy train : 82.00162506103516
319872 ; loss 0.46 ; sentence/s 444 ; words/s 2275 ; accuracy train : 81.97125244140625
332672 ; loss 0.46 ; sentence/s 427 ; words/s 2190 ; accuracy train : 81.94261169433594
345472 ; loss 0.46 ; sentence/s 441 ; words/s 2262 ; accuracy train : 81.92679595947266
358272 ; loss 0.44 ; sentence/s 444 ; words/s 2275 ; accuracy train : 81.92159271240234
371072 ; loss 0.44 ; sentence/s 447 ; words/s 2290 ; accuracy train : 81.93885040283203
383872 ; loss 0.44 ; sentence/s 

319872 ; loss 0.43 ; sentence/s 152 ; words/s 778 ; accuracy train : 82.96937561035156
332672 ; loss 0.43 ; sentence/s 154 ; words/s 789 ; accuracy train : 82.95222473144531
345472 ; loss 0.42 ; sentence/s 153 ; words/s 787 ; accuracy train : 82.96701049804688
358272 ; loss 0.43 ; sentence/s 155 ; words/s 797 ; accuracy train : 82.95870208740234
371072 ; loss 0.44 ; sentence/s 154 ; words/s 789 ; accuracy train : 82.94207763671875
383872 ; loss 0.43 ; sentence/s 153 ; words/s 786 ; accuracy train : 82.92500305175781
396672 ; loss 0.43 ; sentence/s 154 ; words/s 788 ; accuracy train : 82.9070053100586
409472 ; loss 0.43 ; sentence/s 155 ; words/s 794 ; accuracy train : 82.889892578125
422272 ; loss 0.43 ; sentence/s 153 ; words/s 784 ; accuracy train : 82.88967895507812
435072 ; loss 0.43 ; sentence/s 153 ; words/s 788 ; accuracy train : 82.88304138183594
447872 ; loss 0.43 ; sentence/s 153 ; words/s 784 ; accuracy train : 82.89151763916016
460672 ; loss 0.44 ; sentence/s 154 ; words/s 

396672 ; loss 0.41 ; sentence/s 441 ; words/s 2261 ; accuracy train : 83.91104125976562
409472 ; loss 0.41 ; sentence/s 447 ; words/s 2290 ; accuracy train : 83.89990234375
422272 ; loss 0.4 ; sentence/s 447 ; words/s 2290 ; accuracy train : 83.89725494384766
435072 ; loss 0.42 ; sentence/s 444 ; words/s 2277 ; accuracy train : 83.87293243408203
447872 ; loss 0.42 ; sentence/s 434 ; words/s 2223 ; accuracy train : 83.8531265258789
460672 ; loss 0.42 ; sentence/s 441 ; words/s 2260 ; accuracy train : 83.82660675048828
473472 ; loss 0.4 ; sentence/s 446 ; words/s 2283 ; accuracy train : 83.8342514038086
486272 ; loss 0.41 ; sentence/s 447 ; words/s 2290 ; accuracy train : 83.83038330078125
499072 ; loss 0.42 ; sentence/s 454 ; words/s 2328 ; accuracy train : 83.80989837646484
511872 ; loss 0.42 ; sentence/s 447 ; words/s 2289 ; accuracy train : 83.796875
524672 ; loss 0.41 ; sentence/s 451 ; words/s 2310 ; accuracy train : 83.79020690917969
537472 ; loss 0.41 ; sentence/s 453 ; words/s 2

In [None]:
# Run best model on test set.
nli_net.load_state_dict(torch.load(os.path.join(params.outputdir, params.outputmodelname)))

print('\nTEST : Epoch {0}'.format(epoch))
evaluate(1e6, 'valid', True)
evaluate(0, 'test', True)

# Save encoder instead of full model
torch.save(nli_net.encoder.state_dict(), os.path.join(params.outputdir, params.outputmodelname + '.encoder.pkl'))


TEST : Epoch 15

VALIDATION : Epoch 1000000.0
finalgrep : accuracy valid : 84.24100494384766
finalgrep : accuracy test : 84.41571807861328
