# Demo Notebook

In [1]:
%load_ext autoreload
%autoreload 2

In [1]:
from __future__ import absolute_import, division, unicode_literals

import sys
import numpy as np
import logging
import sklearn
import torch

# Set PATHs
# path to senteval
PATH_TO_SENTEVAL = 'SentEval/'
# path to the NLP datasets 
PATH_TO_DATA = 'SentEval/data/'
# path to glove embeddings
PATH_TO_VEC = 'pretrained/glove.840B.300d.txt'

# import SentEval
sys.path.insert(0, PATH_TO_SENTEVAL)
import senteval
from encoders.word_embeddings_mean import WordEmbeddingsMeanEncoder
from utils.word_embeddings import create_dictionary, get_wordvec

In [3]:
import pickle

# 1. Run SentEval

In [15]:
encoder_to_load = "runs/exp_20240418_220928_lstm_256/model_5_checkpoint.pickle"

In [16]:
with open(encoder_to_load, 'rb') as f:
    sentence_encoder = torch.load(f).encoder

In [17]:
sentence_encoder

UnidirectionalLSTMEncoder(
  (lstm): LSTM(300, 256, batch_first=True)
)

In [18]:
from utils.word_embeddings import get_word_embeddings

In [19]:
# SentEval prepare and batcher
def prepare(params, samples):
    _, params.word2id = create_dictionary(samples)
    params.word_vec = get_wordvec(PATH_TO_VEC, params.word2id)
    params.wvec_dim = 300
    params.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 
    return

def batcher(params, batch):
    batch = [sent if sent != [] else ['.'] for sent in batch]
    embeddings = []

    for sent in batch:
        word_embeddings = torch.Tensor([get_word_embeddings(params.word_vec, sent)]).to(params.device)

        sentvec = sentence_encoder.forward((word_embeddings, [len(sent)]))[0].detach().cpu().numpy()
        embeddings.append(sentvec)

    embeddings = np.vstack(embeddings)
    return embeddings

In [22]:
params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 5, 'seed': 1111}
# Set up logger
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)

se = senteval.engine.SE(params_senteval, batcher, prepare)

# define transfer tasks
transfer_tasks = ['MR', 'CR', 'SUBJ', 'MPQA', 'SST', 'TREC',
                  'SICKRelatedness', 'SICKEntailment', 'MRPC', 'STS14']

results = se.eval(transfer_tasks)
print(results)

2024-04-18 22:27:36,883 : ***** Transfer task : MR *****


2024-04-18 22:27:42,055 : Found 18490 words with word vectors, out of         20328 words
2024-04-18 22:27:42,067 : Generating sentence embeddings
2024-04-18 22:28:42,398 : Generated sentence embeddings
2024-04-18 22:28:42,399 : Training pytorch-MLP-nhid0-adam-bs64 with (inner) 5-fold cross-validation


AssertionError: Torch not compiled with CUDA enabled

In [21]:
results # WordMean: 0.746 accuracy

{'MRPC': {'devacc': 69.97,
  'acc': 69.16,
  'f1': 79.54,
  'ndev': 4076,
  'ntest': 1725}}

# 2. Run SNLI

In [54]:
from utils.snli_data import preprocess_text

[nltk_data] Downloading package punkt to /home/technet/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [55]:
%%time
preprocess_text("""hello what's up? I miss you.""")

CPU times: user 8.93 ms, sys: 0 ns, total: 8.93 ms
Wall time: 9.09 ms


['hello', 'whats', 'up', 'i', 'miss', 'you']

# Train on SNLI

In [56]:
import torch.nn as nn
import torch

In [57]:
from utils.snli_data import get_snli_data
from heads.snli_model import SNLIClassifier

In [58]:
train = get_snli_data(split='train', sample=20_000)
valid = get_snli_data(split='dev')
test = get_snli_data(split='test')

reading train
sampling...
tokenizing sentence 1


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20000/20000 [00:01<00:00, 13154.55it/s]


tokenizing sentence 2


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20000/20000 [00:01<00:00, 16971.33it/s]


reading dev
tokenizing sentence 1


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 13494.51it/s]


tokenizing sentence 2


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 16647.49it/s]


reading test
tokenizing sentence 1


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 12932.83it/s]


tokenizing sentence 2


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 16143.49it/s]


In [59]:
all_sentences = list(train['sentence1']) + list(train['sentence2']) +\
                list(valid['sentence1']) + list(valid['sentence2']) +\
                list(test['sentence1']) + list(test['sentence2'])

In [60]:
id2word, word2id = create_dictionary(all_sentences)

In [61]:
def get_longest_sentence(sentences):
    return np.max([len(s) for s in sentences])

In [62]:
MAX_LEN = get_longest_sentence(all_sentences)
MAX_LEN

64

In [111]:
word2vec = get_wordvec(PATH_TO_VEC, word2id)

In [117]:
nli_model = SNLIClassifier(encoder=BidirectionalLSTMEncoder(encoding_lstm_dim=256, pooling_type='max'), embedding_dim=512)

In [123]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(params=nli_model.parameters(), lr=0.1)

In [124]:
BATCH_SIZE = 64

In [125]:
from train_snli import train_one_epoch
from utils.eval import evaluate_model

In [126]:
for epoch in range(10):
    print('epoch', epoch)
    loss_batches = train_one_epoch(nli_model, train, optimizer, loss_fn, word2vec, BATCH_SIZE, tuple_input=True)
    print(np.mean(loss_batches))
    print(evaluate_model(nli_model, valid, word2vec, BATCH_SIZE, tuple_input=True))

epoch 0


313it [02:18,  2.26it/s]                                                                                                                                            


1.0934857518528216


154it [00:13, 11.08it/s]                                                                                                                                            


{'accuracy': 0.3330623856939646}
epoch 1


313it [01:58,  2.64it/s]                                                                                                                                            


1.0346849916842038


154it [00:11, 13.87it/s]                                                                                                                                            


{'accuracy': 0.3382442592968909}
epoch 2


313it [01:56,  2.68it/s]                                                                                                                                            


0.872105002212829


154it [00:11, 13.85it/s]                                                                                                                                            


{'accuracy': 0.5308880308880309}
epoch 3


313it [02:04,  2.51it/s]                                                                                                                                            


0.7557682097910311


154it [00:10, 14.67it/s]                                                                                                                                            


{'accuracy': 0.5159520422678318}
epoch 4


313it [02:04,  2.51it/s]                                                                                                                                            


0.6746816810922691


154it [00:10, 14.62it/s]                                                                                                                                            


{'accuracy': 0.6588091851249746}
epoch 5


 24%|██████████████████████████████▎                                                                                               | 75/312 [00:30<01:36,  2.47it/s]


KeyboardInterrupt: 

In [None]:
X