In [49]:
# import stuff
%load_ext autoreload
%autoreload 2
%matplotlib inline

import inspect
import os
from random import randint
import time

import argparse
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch import optim
from torch.autograd import Variable
import re

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load model

In [50]:
# Load model
from models import InferSent
model_version = 1
MODEL_PATH = "../encoder/infersent%s.pkl" % model_version
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': model_version}
model = InferSent(params_model)
model.load_state_dict(torch.load(MODEL_PATH))

<All keys matched successfully>

In [51]:
# Keep it on CPU or put it on GPU
use_cuda = torch.cuda.is_available()
#or force not to use cuda
#use_cuda = False
model = model.cuda() if use_cuda else model

In [52]:
# If infersent1 -> use GloVe embeddings. If infersent2 -> use InferSent embeddings.
W2V_PATH = '../GloVe/glove.840B.300d.txt' if model_version == 1 else '../fastText/crawl-300d-2M.vec'
model.set_w2v_path(W2V_PATH)

In [53]:
# Load embeddings of K most frequent words
#model.build_vocab_k_words(K=100000)

## Load sentences

In [54]:
# Load some sentences
#sentences = []
#with open('samples.txt') as f:
#    for line in f:
#        sentences.append(line.strip())
#print(len(sentences))

In [55]:
#sentences[:5]

## Encode sentences

In [56]:
# gpu mode : >> 1000 sentences/s
# cpu mode : ~100 sentences/s

In [57]:
#embeddings = model.encode(sentences, bsize=128, tokenize=False, verbose=True)
#print('nb sentences encoded : {0}'.format(len(embeddings)))

## Visualization

In [58]:
#np.linalg.norm(model.encode(['the cat eats.']))

In [59]:
#def cosine(u, v):
#    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

In [60]:
#cosine(model.encode(['the cat eats.'])[0], model.encode(['the cat drinks.'])[0])

In [61]:
#idx = randint(0, len(sentences))
#_, _ = model.visualize(sentences[idx])

In [62]:
#my_sent = 'The cat is drinking milk.'
#_, _ = model.visualize(my_sent)

In [63]:
#model.build_vocab_k_words(500000) # getting 500K words vocab
#my_sent = 'barack-obama is the former president of the United-States.'
#_, _ = model.visualize(my_sent)

**InferSent inference**

In [64]:
%ls

 Volume in drive C has no label.
 Volume Serial Number is F0F5-7230

 Directory of C:\Users\ktjam\YKT\MComp AI Classes\CS4248 Natural Language Processing\Github_project\4248-project\src

18/03/2023  06:38 pm    <DIR>          .
18/03/2023  04:59 pm    <DIR>          ..
18/03/2023  06:32 pm    <DIR>          .ipynb_checkpoints
18/03/2023  04:50 pm    <DIR>          __pycache__
18/03/2023  06:38 pm           123,974 demo_training.ipynb
18/03/2023  06:33 pm            10,486 eval_preds.py
01/03/2023  03:27 am            10,140 models.py
01/03/2023  03:27 am           590,791 samples.txt
18/03/2023  04:18 pm             4,395 test.ipynb
18/03/2023  04:19 pm           449,448 visualize.ipynb
               6 File(s)      1,189,234 bytes
               4 Dir(s)  47,275,888,640 bytes free


In [65]:
tmp1 = pd.read_csv('../dataset/esnli_train_1.csv', usecols=['gold_label', 'Sentence1', 'Sentence2'])
tmp2 = pd.read_csv('../dataset/esnli_train_2.csv', usecols=['gold_label', 'Sentence1', 'Sentence2'])
train = pd.concat([tmp1, tmp2], ignore_index=True)
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 549367 entries, 0 to 549366
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   gold_label  549367 non-null  object
 1   Sentence1   549367 non-null  object
 2   Sentence2   549361 non-null  object
dtypes: object(3)
memory usage: 12.6+ MB


In [66]:
valid = pd.read_csv('../dataset/esnli_dev.csv', usecols=['gold_label', 'Sentence1', 'Sentence2'])
valid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9842 entries, 0 to 9841
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   gold_label  9842 non-null   object
 1   Sentence1   9842 non-null   object
 2   Sentence2   9842 non-null   object
dtypes: object(3)
memory usage: 230.8+ KB


In [67]:
test = pd.read_csv('../dataset/esnli_test.csv', usecols=['gold_label', 'Sentence1', 'Sentence2'])
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9824 entries, 0 to 9823
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   gold_label  9824 non-null   object
 1   Sentence1   9824 non-null   object
 2   Sentence2   9824 non-null   object
dtypes: object(3)
memory usage: 230.4+ KB


In [68]:
#map label to int
label_to_int = {'entailment': 0, 'neutral': 1, 'contradiction': 2}

In [69]:
#add label int
train['label'] = train['gold_label'].apply(lambda x: label_to_int[x])
valid['label'] = valid['gold_label'].apply(lambda x: label_to_int[x])
test['label'] = test['gold_label'].apply(lambda x: label_to_int[x])

In [70]:
def build_vocab(sentences, glove_path):
    word_dict = get_word_dict(sentences)
    word_vec = get_glove(word_dict, glove_path)
    print('Vocab size : {0}'.format(len(word_vec)))
    return word_vec

def get_glove(word_dict, glove_path):
    # create word_vec with glove vectors
    word_vec = {}
    with open(glove_path, encoding='utf8') as f:
        for line in f:
            #print(line)
            #break
            word, vec = line.split(' ', 1)
            if word in word_dict:
                word_vec[word] = np.array(list(map(float, vec.split())))
    print('Found {0}(/{1}) words with glove vectors'.format(
                len(word_vec), len(word_dict)))
    return word_vec

def get_word_dict(sentences):
    # create vocab of words
    word_dict = {}
    for sent in sentences:
        for word in str(sent).split():
            if word not in word_dict:
                word_dict[word] = ''
    word_dict['<s>'] = ''
    word_dict['</s>'] = ''
    word_dict['<p>'] = ''
    return word_dict

In [71]:
glove_path = '../GloVe/glove.840B.300d.txt'

In [72]:
#converts DataFrames to dict
train = train.to_dict(orient='list')
valid = valid.to_dict(orient='list')
test = test.to_dict(orient='list')

In [73]:
for i in range(1):
    print(train['Sentence2'][i])

A person is training his horse for a competition.


In [74]:
word_vec = build_vocab(train['Sentence1'] + train['Sentence2'] +
                       valid['Sentence1'] + valid['Sentence2'] +
                       test['Sentence1'] + test['Sentence2'], glove_path)

Found 37925(/64300) words with glove vectors
Vocab size : 37925


In [75]:
class NLINet(nn.Module):
    def __init__(self, config):
        super(NLINet, self).__init__()

        # classifier
        self.nonlinear_fc = config['nonlinear_fc']
        self.fc_dim = config['fc_dim']
        self.n_classes = config['n_classes']
        self.enc_lstm_dim = config['enc_lstm_dim']
        self.encoder_type = config['encoder_type']
        self.dpout_fc = config['dpout_fc']

        self.encoder = model  #eval(self.encoder_type)(config)
        self.inputdim = 4*2*self.enc_lstm_dim
        self.inputdim = 4*self.inputdim if self.encoder_type in \
                        ["ConvNetEncoder", "InnerAttentionMILAEncoder"] else self.inputdim
        self.inputdim = self.inputdim/2 if self.encoder_type == "LSTMEncoder" \
                                        else self.inputdim
        if self.nonlinear_fc:
            self.classifier = nn.Sequential(
                nn.Dropout(p=self.dpout_fc),
                nn.Linear(self.inputdim, self.fc_dim),
                nn.Tanh(),
                nn.Dropout(p=self.dpout_fc),
                nn.Linear(self.fc_dim, self.fc_dim),
                nn.Tanh(),
                nn.Dropout(p=self.dpout_fc),
                nn.Linear(self.fc_dim, self.n_classes),
                )
        else:
            self.classifier = nn.Sequential(
                nn.Linear(self.inputdim, self.fc_dim),
                nn.Linear(self.fc_dim, self.fc_dim),
                nn.Linear(self.fc_dim, self.n_classes)
                )

    def forward(self, s1, s2):
        # s1 : (s1, s1_len)
        u = self.encoder(s1)
        v = self.encoder(s2)

        features = torch.cat((u, v, torch.abs(u-v), u*v), 1)
        output = self.classifier(features)
        return output

    def encode(self, s1):
        emb = self.encoder(s1)
        return emb


In [76]:
for split in ['Sentence1', 'Sentence2']:
    for data_type in ['train', 'valid', 'test']:
        eval(data_type)[split] = np.array([['<s>'] + \
            [word for word in str(sent).split() if word in word_vec] + \
            ['</s>'] for sent in eval(data_type)[split]])

  eval(data_type)[split] = np.array([['<s>'] + \


In [77]:
train['label'] = np.array(train['label'])

In [78]:
parser = argparse.ArgumentParser(description='NLI training')
# paths
parser.add_argument("--nlipath", type=str, default='dataset/SNLI/', help="NLI data path (SNLI or MultiNLI)")
parser.add_argument("--outputdir", type=str, default='../savedir/', help="Output directory")
parser.add_argument("--outputmodelname", type=str, default='model.pickle')
parser.add_argument("--word_emb_path", type=str, default="../dataset/GloVe/glove.840B.300d.txt", help="word embedding file path")

# training
parser.add_argument("--n_epochs", type=int, default=50)
parser.add_argument("--batch_size", type=int, default=64)  #64)
parser.add_argument("--dpout_model", type=float, default=0., help="encoder dropout")
parser.add_argument("--dpout_fc", type=float, default=0., help="classifier dropout")
parser.add_argument("--nonlinear_fc", type=float, default=0, help="use nonlinearity in fc")
parser.add_argument("--optimizer", type=str, default="sgd,lr=0.1", help="adam or sgd,lr=0.1")
parser.add_argument("--lrshrink", type=float, default=5, help="shrink factor for sgd")
parser.add_argument("--decay", type=float, default=0.99, help="lr decay")
parser.add_argument("--minlr", type=float, default=1e-5, help="minimum lr")
parser.add_argument("--max_norm", type=float, default=5., help="max norm (grad clipping)")

# model
parser.add_argument("--encoder_type", type=str, default='InferSentV1', help="see list of encoders")
parser.add_argument("--enc_lstm_dim", type=int, default=2048, help="encoder nhid dimension")
parser.add_argument("--n_enc_layers", type=int, default=1, help="encoder num layers")
parser.add_argument("--fc_dim", type=int, default=512, help="nhid of fc layers")
parser.add_argument("--n_classes", type=int, default=3, help="entailment/neutral/contradiction")
parser.add_argument("--pool_type", type=str, default='max', help="max or mean")

# gpu
parser.add_argument("--gpu_id", type=int, default=3, help="GPU ID")
parser.add_argument("--seed", type=int, default=1234, help="seed")

# data
parser.add_argument("--word_emb_dim", type=int, default=300, help="word embedding dimension")

params, _ = parser.parse_known_args()
config_nli_model = {
    'n_words'        :  len(word_vec)          ,
    'word_emb_dim'   :  params.word_emb_dim   ,
    'enc_lstm_dim'   :  params.enc_lstm_dim   ,
    'n_enc_layers'   :  params.n_enc_layers   ,
    'dpout_model'    :  params.dpout_model    ,
    'dpout_fc'       :  params.dpout_fc       ,
    'fc_dim'         :  params.fc_dim         ,
    'bsize'          :  params.batch_size     ,
    'n_classes'      :  params.n_classes      ,
    'pool_type'      :  params.pool_type      ,
    'nonlinear_fc'   :  params.nonlinear_fc   ,
    'encoder_type'   :  params.encoder_type   ,
    'use_cuda'       :  True                  ,

}
nli_net = NLINet(config_nli_model)
print(nli_net)

NLINet(
  (encoder): InferSent(
    (enc_lstm): LSTM(300, 2048, bidirectional=True)
  )
  (classifier): Sequential(
    (0): Linear(in_features=16384, out_features=512, bias=True)
    (1): Linear(in_features=512, out_features=512, bias=True)
    (2): Linear(in_features=512, out_features=3, bias=True)
  )
)


In [79]:
"""
SEED
"""
np.random.seed(params.seed)
torch.manual_seed(params.seed)
torch.cuda.manual_seed(params.seed)

In [80]:
def get_optimizer(s):
    """
    Parse optimizer parameters.
    Input should be of the form:
        - "sgd,lr=0.01"
        - "adagrad,lr=0.1,lr_decay=0.05"
    """
    if "," in s:
        method = s[:s.find(',')]
        optim_params = {}
        for x in s[s.find(',') + 1:].split(','):
            split = x.split('=')
            assert len(split) == 2
            assert re.match("^[+-]?(\d+(\.\d*)?|\.\d+)$", split[1]) is not None
            optim_params[split[0]] = float(split[1])
    else:
        method = s
        optim_params = {}

    if method == 'adadelta':
        optim_fn = optim.Adadelta
    elif method == 'adagrad':
        optim_fn = optim.Adagrad
    elif method == 'adam':
        optim_fn = optim.Adam
    elif method == 'adamax':
        optim_fn = optim.Adamax
    elif method == 'asgd':
        optim_fn = optim.ASGD
    elif method == 'rmsprop':
        optim_fn = optim.RMSprop
    elif method == 'rprop':
        optim_fn = optim.Rprop
    elif method == 'sgd':
        optim_fn = optim.SGD
        assert 'lr' in optim_params
    else:
        raise Exception('Unknown optimization method: "%s"' % method)

    # check that we give good parameters to the optimizer
    #expected_args = inspect.getargspec(optim_fn.__init__)[0]
    #assert expected_args[:2] == ['self', 'params']
    #if not all(k in expected_args[2:] for k in optim_params.keys()):
    #    raise Exception('Unexpected parameters: expected "%s", got "%s"' % (
    #        str(expected_args[2:]), str(optim_params.keys())))

    return optim_fn, optim_params

In [81]:
# loss
weight = torch.FloatTensor(params.n_classes).fill_(1)
loss_fn = nn.CrossEntropyLoss(weight=weight)
loss_fn.size_average = False

# optimizer
optim_fn, optim_params = get_optimizer(params.optimizer)
optimizer = optim_fn(nli_net.parameters(), **optim_params)

# cuda by default
nli_net.cuda()
loss_fn.cuda()

CrossEntropyLoss()

In [82]:
"""
TRAIN
"""
val_acc_best = -1e10
adam_stop = False
stop_training = False
lr = optim_params['lr'] if 'sgd' in params.optimizer else None

In [83]:
def trainepoch(epoch):
    print('\nTRAINING : Epoch ' + str(epoch))
    nli_net.train()
    all_costs = []
    logs = []
    words_count = 0

    last_time = time.time()
    correct = 0.
    # shuffle the data
    permutation = np.random.permutation(len(train['Sentence1']))

    s1 = train['Sentence1'][permutation]
    s2 = train['Sentence2'][permutation]
    target = train['label'][permutation]


    optimizer.param_groups[0]['lr'] = optimizer.param_groups[0]['lr'] * params.decay if epoch>1\
        and 'sgd' in params.optimizer else optimizer.param_groups[0]['lr']
    print('Learning rate : {0}'.format(optimizer.param_groups[0]['lr']))

    for stidx in range(0, len(s1), params.batch_size):
        # prepare batch
        s1_batch, s1_len = get_batch(s1[stidx:stidx + params.batch_size],
                                     word_vec, params.word_emb_dim)
        s2_batch, s2_len = get_batch(s2[stidx:stidx + params.batch_size],
                                     word_vec, params.word_emb_dim)
        s1_batch, s2_batch = Variable(s1_batch.cuda()), Variable(s2_batch.cuda())
        tgt_batch = Variable(torch.LongTensor(target[stidx:stidx + params.batch_size])).cuda()
        k = s1_batch.size(1)  # actual batch size
        
        # model forward
        output = nli_net((s1_batch, s1_len), (s2_batch, s2_len))

        pred = output.data.max(1)[1]
        correct += pred.long().eq(tgt_batch.data.long()).cpu().sum()
        assert len(pred) == len(s1[stidx:stidx + params.batch_size])

        # loss
        loss = loss_fn(output, tgt_batch)
        #print(type(loss))
        all_costs.append(loss.item())  #.data[0])
        words_count += (s1_batch.nelement() + s2_batch.nelement()) / params.word_emb_dim

        # backward
        optimizer.zero_grad()
        loss.backward()

        # gradient clipping (off by default)
        shrink_factor = 1
        total_norm = 0

        for p in nli_net.parameters():
            if p.requires_grad:
                p.grad.data.div_(k)  # divide by the actual batch size
                total_norm += p.grad.data.norm() ** 2
        total_norm = np.sqrt(total_norm.cpu())

        if total_norm > params.max_norm:
            shrink_factor = params.max_norm / total_norm
        current_lr = optimizer.param_groups[0]['lr'] # current lr (no external "lr", for adam)
        optimizer.param_groups[0]['lr'] = current_lr * shrink_factor # just for update

        # optimizer step
        optimizer.step()
        optimizer.param_groups[0]['lr'] = current_lr

        if len(all_costs) == 100:
            logs.append('{0} ; loss {1} ; sentence/s {2} ; words/s {3} ; accuracy train : {4}'.format(
                            stidx, round(np.mean(all_costs), 2),
                            int(len(all_costs) * params.batch_size / (time.time() - last_time)),
                            int(words_count * 1.0 / (time.time() - last_time)),
                            100.*correct/(stidx+k)))
            print(logs[-1])
            last_time = time.time()
            words_count = 0
            all_costs = []
    train_acc = 100 * correct/len(s1)  #round(100 * correct/len(s1), 2)
    print('results : epoch {0} ; mean accuracy train : {1}'
          .format(epoch, train_acc))
    return train_acc

In [84]:
def get_batch(batch, word_vec, emb_dim=300):
    # sent in batch in decreasing order of lengths (bsize, max_len, word_dim)
    lengths = np.array([len(x) for x in batch])
    max_len = np.max(lengths)
    embed = np.zeros((max_len, len(batch), emb_dim))

    for i in range(len(batch)):
        for j in range(len(batch[i])):
            embed[j, i, :] = word_vec[batch[i][j]]

    return torch.from_numpy(embed).float(), lengths

In [85]:
def evaluate(epoch, eval_type='valid', final_eval=False):
    nli_net.eval()
    correct = 0.
    global val_acc_best, lr, stop_training, adam_stop

    if eval_type == 'valid':
        print('\nVALIDATION : Epoch {0}'.format(epoch))

    s1 = valid['Sentence1'] if eval_type == 'valid' else test['Sentence1']
    s2 = valid['Sentence2'] if eval_type == 'valid' else test['Sentence2']
    target = valid['label'] if eval_type == 'valid' else test['label']

    for i in range(0, len(s1), params.batch_size):
        # prepare batch
        s1_batch, s1_len = get_batch(s1[i:i + params.batch_size], word_vec, params.word_emb_dim)
        s2_batch, s2_len = get_batch(s2[i:i + params.batch_size], word_vec, params.word_emb_dim)
        s1_batch, s2_batch = Variable(s1_batch.cuda()), Variable(s2_batch.cuda())
        tgt_batch = Variable(torch.LongTensor(target[i:i + params.batch_size])).cuda()

        # model forward
        output = nli_net((s1_batch, s1_len), (s2_batch, s2_len))

        pred = output.data.max(1)[1]
        correct += pred.long().eq(tgt_batch.data.long()).cpu().sum()

        
    # save model
    eval_acc = 100 * correct/len(s1)  #round(100 * correct / len(s1), 2)
    if final_eval:
        print('finalgrep : accuracy {0} : {1}'.format(eval_type, eval_acc))
    else:
        print('togrep : results : epoch {0} ; mean accuracy {1} :\
              {2}'.format(epoch, eval_type, eval_acc))

    if eval_type == 'valid' and epoch <= params.n_epochs:
        if eval_acc > val_acc_best:
            print('saving model at epoch {0}'.format(epoch))
            if not os.path.exists(params.outputdir):
                os.makedirs(params.outputdir)
            torch.save(nli_net.state_dict(), os.path.join(params.outputdir,
                       params.outputmodelname))
            val_acc_best = eval_acc
        else:
            if 'sgd' in params.optimizer:
                optimizer.param_groups[0]['lr'] = optimizer.param_groups[0]['lr'] / params.lrshrink
                print('Shrinking lr by : {0}. New lr = {1}'
                      .format(params.lrshrink,
                              optimizer.param_groups[0]['lr']))
                if optimizer.param_groups[0]['lr'] < params.minlr:
                    stop_training = True
            if 'adam' in params.optimizer:
                # early stopping (at 2nd decrease in accuracy)
                stop_training = adam_stop
                adam_stop = True
    return eval_acc


In [86]:
"""
Train model on Natural Language Inference task
"""
epoch = 1

while not stop_training and epoch <= params.n_epochs:
    train_acc = trainepoch(epoch)
    eval_acc = evaluate(epoch, 'valid')
    epoch += 1


TRAINING : Epoch 1
Learning rate : 0.1
6336 ; loss 1.1 ; sentence/s 330 ; words/s 17361 ; accuracy train : 34.046875
12736 ; loss 1.1 ; sentence/s 338 ; words/s 17634 ; accuracy train : 35.25
19136 ; loss 1.1 ; sentence/s 336 ; words/s 17780 ; accuracy train : 35.86979293823242
25536 ; loss 1.1 ; sentence/s 337 ; words/s 17631 ; accuracy train : 36.4140625
31936 ; loss 1.1 ; sentence/s 336 ; words/s 17569 ; accuracy train : 36.724998474121094
38336 ; loss 1.1 ; sentence/s 337 ; words/s 17707 ; accuracy train : 36.69791793823242
44736 ; loss 1.09 ; sentence/s 335 ; words/s 17772 ; accuracy train : 36.96205520629883
51136 ; loss 1.09 ; sentence/s 332 ; words/s 17605 ; accuracy train : 36.935546875
57536 ; loss 1.09 ; sentence/s 105 ; words/s 5412 ; accuracy train : 36.80208206176758
63936 ; loss 1.09 ; sentence/s 8 ; words/s 423 ; accuracy train : 36.85468673706055
70336 ; loss 1.09 ; sentence/s 293 ; words/s 15089 ; accuracy train : 37.041194915771484
76736 ; loss 1.09 ; sentence/s 340

44736 ; loss 0.97 ; sentence/s 336 ; words/s 17641 ; accuracy train : 60.47768020629883
51136 ; loss 0.97 ; sentence/s 333 ; words/s 17941 ; accuracy train : 60.509765625
57536 ; loss 0.97 ; sentence/s 334 ; words/s 17684 ; accuracy train : 60.54861068725586


KeyboardInterrupt: 

In [87]:
# Run best model on test set.
nli_net.load_state_dict(torch.load(os.path.join(params.outputdir, params.outputmodelname)))

print('\nTEST : Epoch {0}'.format(epoch))
evaluate(1e6, 'valid', True)
evaluate(0, 'test', True)

# Save encoder instead of full model
torch.save(nli_net.encoder.state_dict(), os.path.join(params.outputdir, params.outputmodelname + '.encoder.pkl'))


TEST : Epoch 2

VALIDATION : Epoch 1000000.0
finalgrep : accuracy valid : 60.77016830444336
finalgrep : accuracy test : 60.78990173339844
