# Demo Notebook

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from __future__ import absolute_import, division, unicode_literals

import sys
import numpy as np
import logging
import sklearn
import torch

from encoders.word_embeddings_mean import WordEmbeddingsMeanEncoder
from encoders.lstm.unidirectional_lstm import UnidirectionalLSTMEncoder
from encoders.lstm.bidirectional_lstm import BidirectionalLSTMEncoder
from utils.word_embeddings import create_dictionary, get_wordvec, get_word_embeddings
from utils.snli_data import check_and_load_or_save, preprocess_text

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /home/technet/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
PATH_TO_VEC = 'pretrained/glove.840B.300d.txt'

In [4]:
import pickle

# 1. Read 4 models and get word2vec

In [5]:
# Checkpoint file for each encoder variant compared in this notebook,
# keyed by the short model name used throughout the later cells.
model_paths = dict(
    word_mean="runs/exp_20240418_162806_mean_embeddings_2048/model_9_checkpoint.pickle",
    lstm="runs/exp_20240418_145108_lstm_2048/model_9_checkpoint.pickle",
    bilstm="runs/exp_20240418_171206_bilstm_2048/model_7_checkpoint.pickle",
    bilstm_max="runs/exp_20240418_184433_bilstm_max_2048/model_6_checkpoint.pickle",
)

In [6]:
models = {}

In [7]:
# Deserialize every checkpoint listed in `model_paths` and register the
# resulting object in `models` under the same key.
# `map_location` remaps the stored tensors onto the CPU.
# NOTE(review): torch.load unpickles the file, which can execute arbitrary
# code -- only load checkpoints that come from a trusted source.
# NOTE(review): the loaded objects are whole modules (see the `models` repr
# in the next cell), so PyTorch versions where `weights_only` defaults to
# True would presumably need `weights_only=False` here -- confirm against
# the installed version.
for model_name, path in model_paths.items():
    with open(path, 'rb') as f:
        model = torch.load(f, map_location=torch.device('cpu'))

    models[model_name] = model

In [8]:
models

{'word_mean': SNLIClassifier(
   (encoder): WordEmbeddingsMeanEncoder()
   (fn1): Linear(in_features=1200, out_features=512, bias=True)
   (fn2): Linear(in_features=512, out_features=512, bias=True)
   (fn3): Linear(in_features=512, out_features=3, bias=True)
 ),
 'lstm': SNLIClassifier(
   (encoder): UnidirectionalLSTMEncoder(
     (lstm): LSTM(300, 2048, batch_first=True)
   )
   (fn1): Linear(in_features=8192, out_features=512, bias=True)
   (fn2): Linear(in_features=512, out_features=512, bias=True)
   (fn3): Linear(in_features=512, out_features=3, bias=True)
 ),
 'bilstm': SNLIClassifier(
   (encoder): BidirectionalLSTMEncoder(
     (lstm): LSTM(300, 2048, bidirectional=True)
   )
   (fn1): Linear(in_features=16384, out_features=512, bias=True)
   (fn2): Linear(in_features=512, out_features=512, bias=True)
   (fn3): Linear(in_features=512, out_features=3, bias=True)
 ),
 'bilstm_max': SNLIClassifier(
   (encoder): BidirectionalLSTMEncoder(
     (lstm): LSTM(300, 2048, bidirectiona

In [9]:
# Get word2vec
# Fetch the three dataset splits through the project helper.
train = check_and_load_or_save('train')
valid = check_and_load_or_save('validation')
test = check_and_load_or_save('test')

# Pool every sentence into one flat list: split by split (train, validation,
# test), and within each split all of `sentence1` before all of `sentence2`.
all_sentences = [
    sentence
    for split in (train, valid, test)
    for column in ('sentence1', 'sentence2')
    for sentence in split[column]
]

# Align vocabulary: build the word -> id mapping from the pooled sentences,
# then look up vectors from the pretrained file for that vocabulary.
_, word2id = create_dictionary(all_sentences)
word2vec = get_wordvec(PATH_TO_VEC, word2id)

# 2. Infer the models

In [10]:
def infer_one_sample(model, premise, hypothesis):
    """Classify one premise/hypothesis pair with ``model``.

    Each sentence is tokenised with ``preprocess_text``, embedded with
    ``get_word_embeddings`` using the notebook-level ``word2vec`` table, and
    wrapped as a batch of one ``(embeddings, lengths)`` tuple before being
    passed to the model.

    Args:
        model: A torch module (e.g. one of the loaded ``SNLIClassifier``
            instances) that takes the premise and hypothesis tuples and
            returns logits with the class dimension last.
        premise: Raw premise sentence.
        hypothesis: Raw hypothesis sentence.

    Returns:
        A ``(probabilities, label)`` tuple: the softmax probabilities of the
        single sample as a list (index 0 = entailment, 1 = neutral,
        2 = contradiction) and the name of the highest-scoring class.
    """
    def _as_batch(tokens):
        # Batch of one: embeddings get a leading batch axis, lengths is a
        # one-element list holding the token count.
        embeddings = torch.Tensor(np.array([get_word_embeddings(word2vec, tokens)]))
        return embeddings, [len(tokens)]

    premise_torch = _as_batch(preprocess_text(premise))
    hypothesis_torch = _as_batch(preprocess_text(hypothesis))

    # Inference only: skip autograd bookkeeping, and go through __call__
    # (not .forward) so any registered module hooks still run.
    with torch.no_grad():
        logits = model(premise_torch, hypothesis_torch)

    probabilities = torch.softmax(logits, -1)
    # Arg-max over the class dimension of the first (only) sample, rather
    # than over the flattened tensor, so the index is always a class id.
    prediction = torch.argmax(logits, dim=-1)[0].item()

    label_map = {
        0: 'entailment',
        1: 'neutral',
        2: 'contradiction'
    }
    return list(probabilities.detach().numpy()[0]), label_map[prediction]

In [18]:
infer_one_sample(models['word_mean'], 'Man is eating a sandwich in the park', 'Man is reading a book')

([0.023055643, 0.3956095, 0.5813348], 'contradiction')

In [22]:
def infer_all_models(premise, hypothesis):
    """Run ``infer_one_sample`` on one sentence pair for every loaded model.

    Returns a dict keyed by model name (in the iteration order of the
    notebook-level ``models`` dict) whose values are the
    ``(probabilities, label)`` tuples produced by ``infer_one_sample``.
    """
    return {
        name: infer_one_sample(classifier, premise, hypothesis)
        for name, classifier in models.items()
    }

In [23]:
infer_all_models('Man is eating a sandwich in the park', 'Man is reading a book')

{'word_mean': ([0.023055643, 0.3956095, 0.5813348], 'contradiction'),
 'lstm': ([0.011156958, 0.13168061, 0.8571624], 'contradiction'),
 'bilstm': ([0.00020725105, 0.006338245, 0.9934546], 'contradiction'),
 'bilstm_max': ([1.0176414e-07, 9.8397606e-05, 0.99990153], 'contradiction')}

# 3. Error Analysis

```
Premise - “Two men sitting in the sun”
Hypothesis - “Nobody is sitting in the shade”
Label - Neutral (likely predicts contradiction)

Premise - “A man is walking a dog”
Hypothesis - “No cat is outside”
Label - Neutral (likely predicts contradiction)
```
Can you think of a possible reason why the model would fail in such cases?

In [24]:
infer_all_models('Two men sitting in the sun', 'Nobody is sitting in the shade')

{'word_mean': ([0.03053329, 0.11450356, 0.8549632], 'contradiction'),
 'lstm': ([0.026673684, 0.042974908, 0.9303514], 'contradiction'),
 'bilstm': ([6.738626e-06, 6.264182e-05, 0.9999306], 'contradiction'),
 'bilstm_max': ([1.926938e-07, 7.1777176e-05, 0.999928], 'contradiction')}

In [25]:
infer_all_models('A man is walking a dog', 'No cat is outside')

{'word_mean': ([0.010695878, 0.002042266, 0.98726183], 'contradiction'),
 'lstm': ([3.6138215e-05, 0.00019624927, 0.9997676], 'contradiction'),
 'bilstm': ([1.2980183e-07, 0.0001886283, 0.99981123], 'contradiction'),
 'bilstm_max': ([2.5464145e-12, 1.1662495e-08, 1.0], 'contradiction')}

In [30]:
infer_all_models('A man is walking a dog', 'A woman is in the park') # All of them are wrong

{'word_mean': ([0.04507692, 0.38261545, 0.57230765], 'contradiction'),
 'lstm': ([0.00014721137, 0.05157209, 0.94828075], 'contradiction'),
 'bilstm': ([4.082773e-06, 0.0044974606, 0.99549854], 'contradiction'),
 'bilstm_max': ([5.953556e-10, 0.00016672764, 0.9998332], 'contradiction')}

In [35]:
infer_all_models('A man is walking a dog', 'A priest is in the park') # More are correct

{'word_mean': ([0.018409774, 0.7107579, 0.27083236], 'neutral'),
 'lstm': ([0.0027956907, 0.7963909, 0.20081346], 'neutral'),
 'bilstm': ([0.0034117987, 0.87175035, 0.12483776], 'neutral'),
 'bilstm_max': ([6.9189315e-07, 0.16198319, 0.8380161], 'contradiction')}

In [31]:
infer_all_models('A man is walking a dog', 'A woman is walking a dog') # Only wordmean is wrong

{'word_mean': ([0.77577835, 0.10899025, 0.11523131], 'entailment'),
 'lstm': ([0.0013807209, 0.008810549, 0.9898087], 'contradiction'),
 'bilstm': ([1.6555672e-05, 0.0003328938, 0.9996506], 'contradiction'),
 'bilstm_max': ([6.8913764e-10, 2.8412676e-06, 0.99999714], 'contradiction')}

In [32]:
infer_all_models('A man is walking a dog', 'A woman is walking') # All of them are wrong

{'word_mean': ([0.35800305, 0.142009, 0.49998796], 'contradiction'),
 'lstm': ([0.0029365458, 0.028069228, 0.9689942], 'contradiction'),
 'bilstm': ([4.6427413e-06, 0.0005357696, 0.99945956], 'contradiction'),
 'bilstm_max': ([5.06737e-07, 0.00033902953, 0.9996605], 'contradiction')}

In [50]:
infer_all_models('My family wants to buy a bike for my sister', 'My parents will spend money on a gift') # The models behave randomly

{'word_mean': ([0.02939803, 0.9426449, 0.027957119], 'neutral'),
 'lstm': ([0.51986384, 0.22466709, 0.25546902], 'entailment'),
 'bilstm': ([0.26923728, 0.28195828, 0.44880447], 'contradiction'),
 'bilstm_max': ([0.58331126, 0.40902793, 0.007660786], 'entailment')}

In [49]:
infer_all_models('My family wants to buy a bike for my sisters birthday', 'My parents will spend money on a gift') # The models behave randomly

{'word_mean': ([0.08498787, 0.89756477, 0.017447362], 'neutral'),
 'lstm': ([0.7754753, 0.16892886, 0.05559586], 'entailment'),
 'bilstm': ([0.58978903, 0.22205001, 0.18816093], 'entailment'),
 'bilstm_max': ([0.9195709, 0.07983702, 0.0005921151], 'entailment')}

In [52]:
infer_all_models('My family dont want to buy anything for my sisters birthday', 'My parents will spend money on a gift') # Should be contradiction

{'word_mean': ([0.07764516, 0.8997866, 0.02256826], 'neutral'),
 'lstm': ([0.51014715, 0.24480397, 0.24504882], 'entailment'),
 'bilstm': ([0.5663025, 0.14585175, 0.28784576], 'entailment'),
 'bilstm_max': ([0.8618197, 0.13807319, 0.0001071088], 'entailment')}

In [40]:
infer_all_models('I love paris', 'I hate france') # Neutral

{'word_mean': ([0.0011365338, 0.99360716, 0.0052563264], 'neutral'),
 'lstm': ([0.019556528, 0.07358318, 0.9068603], 'contradiction'),
 'bilstm': ([0.00017161231, 0.0032375522, 0.99659085], 'contradiction'),
 'bilstm_max': ([5.5866956e-10, 1.6317714e-05, 0.99998367], 'contradiction')}

In [43]:
infer_all_models('I love paris', 'I love only small towns') # Neutral

{'word_mean': ([0.08336903, 0.6966773, 0.2199537], 'neutral'),
 'lstm': ([0.086162046, 0.7196629, 0.19417505], 'neutral'),
 'bilstm': ([0.0067749424, 0.3206998, 0.6725253], 'contradiction'),
 'bilstm_max': ([1.3852084e-06, 0.039719153, 0.9602795], 'contradiction')}

In [47]:
infer_all_models('My grandma is sad', '

{'word_mean': ([0.038690995, 0.86514705, 0.09616197], 'neutral'),
 'lstm': ([0.046770196, 0.3071708, 0.646059], 'contradiction'),
 'bilstm': ([0.11255712, 0.49272642, 0.39471638], 'neutral'),
 'bilstm_max': ([0.008850306, 0.09404798, 0.8971017], 'contradiction')}

In [54]:
infer_all_models('I am going on a movie date', 'I am buying two cinema tickets')

{'word_mean': ([0.00041773522, 0.64379215, 0.3557901], 'neutral'),
 'lstm': ([0.006686811, 0.43668097, 0.5566322], 'contradiction'),
 'bilstm': ([0.004699665, 0.5166638, 0.4786366], 'neutral'),
 'bilstm_max': ([0.00012682509, 0.42931905, 0.5705541], 'contradiction')}

In [55]:
infer_all_models('I am taking my girlfriend to cinema', 'I am buying two cinema tickets')

{'word_mean': ([0.0025424368, 0.7511353, 0.2463223], 'neutral'),
 'lstm': ([0.019003248, 0.25004435, 0.73095244], 'contradiction'),
 'bilstm': ([0.00025300932, 0.019398943, 0.98034805], 'contradiction'),
 'bilstm_max': ([0.0011349571, 0.84580195, 0.15306313], 'neutral')}

In [57]:
infer_all_models('I am taking my girlfriend to cinema', 'I need a cinema ticket')

{'word_mean': ([0.019097535, 0.641643, 0.3392595], 'neutral'),
 'lstm': ([0.22093424, 0.49208048, 0.28698525], 'neutral'),
 'bilstm': ([0.09309655, 0.62963855, 0.2772649], 'neutral'),
 'bilstm_max': ([0.07083751, 0.919686, 0.009476442], 'neutral')}

In [58]:
infer_all_models('I am going to cinema', 'I need a cinema ticket')

{'word_mean': ([0.027409172, 0.8130543, 0.15953659], 'neutral'),
 'lstm': ([0.18991871, 0.7525583, 0.05752297], 'neutral'),
 'bilstm': ([0.102032915, 0.87470984, 0.023257235], 'neutral'),
 'bilstm_max': ([0.0037163459, 0.9957919, 0.0004917366], 'neutral')}

In [62]:
infer_all_models('I am going to cinema', 'I am buying a cinema ticket')

{'word_mean': ([0.004963711, 0.9267899, 0.068246394], 'neutral'),
 'lstm': ([0.045607433, 0.76051766, 0.19387487], 'neutral'),
 'bilstm': ([0.014085018, 0.7057853, 0.28012976], 'neutral'),
 'bilstm_max': ([5.054657e-05, 0.9667839, 0.033165544], 'neutral')}

In [64]:
infer_all_models('I am going to cinema', 'I am buying a cinema ticket')

{'word_mean': ([0.004963711, 0.9267899, 0.068246394], 'neutral'),
 'lstm': ([0.045607433, 0.76051766, 0.19387487], 'neutral'),
 'bilstm': ([0.014085018, 0.7057853, 0.28012976], 'neutral'),
 'bilstm_max': ([5.054657e-05, 0.9667839, 0.033165544], 'neutral')}

In [65]:
infer_all_models('I am going to cinema', 'I have a cinema ticket')

{'word_mean': ([0.039097182, 0.86237335, 0.098529525], 'neutral'),
 'lstm': ([0.20357032, 0.72780764, 0.06862202], 'neutral'),
 'bilstm': ([0.14418097, 0.8466314, 0.00918759], 'neutral'),
 'bilstm_max': ([0.027642695, 0.9719738, 0.00038354637], 'neutral')}

In [66]:
infer_all_models('I am going to museum', 'I have a museum ticket')

{'word_mean': ([0.0042049983, 0.9486124, 0.047182575], 'neutral'),
 'lstm': ([0.1866317, 0.7707157, 0.042652562], 'neutral'),
 'bilstm': ([0.027998252, 0.96288687, 0.009114873], 'neutral'),
 'bilstm_max': ([0.01825204, 0.9810357, 0.0007122529], 'neutral')}

In [63]:
'cinema' in word2id

True

# 4. Calculate Result Metrics

In [4]:
import numpy as np

def macro(results):
    """Return the macro (unweighted) average dev accuracy over all tasks.

    Args:
        results: Mapping of task name to a dict containing a ``'devacc'``
            entry.

    Returns:
        The arithmetic mean of every task's ``devacc``.
    """
    # Read from the argument rather than the notebook-global
    # `transfer_results`, so the value depends only on what is passed in.
    return np.mean([task['devacc'] for task in results.values()])

def micro(results):
    """Return the micro (size-weighted) average dev accuracy over all tasks.

    Args:
        results: Mapping of task name to a dict containing ``'devacc'`` and
            ``'ndev'`` entries.

    Returns:
        The sum of ``devacc * ndev`` over all tasks divided by the sum of
        ``ndev``, i.e. each task's accuracy weighted by its ``ndev``.
    """
    # Read from the argument rather than the notebook-global
    # `transfer_results`, so the value depends only on what is passed in.
    tasks = list(results.values())
    sum_all_dev = np.sum([task['ndev'] for task in tasks])
    weighted_acc = np.sum([task['devacc'] * task['ndev'] for task in tasks])
    return weighted_acc / sum_all_dev

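### 1. Per-encoder transfer results

Note: the `transfer_results` values saved in the next cell average to 80.675 (macro), which matches the output of the cell labelled BiLSTM-Max below, not WordMean (78.365). The cell appears to have been overwritten and re-run once per encoder.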

In [5]:
# Per-task transfer results, one entry per task. `devacc` and `ndev` are the
# fields consumed by macro()/micro(); MRPC additionally carries an `f1` score.
# The macro average of the `devacc` values below is 80.675, matching the
# output of the cell labelled "BILSTM MAX".
transfer_results = {
    'MR': {'devacc': 75.26, 'acc': 75.31, 'ndev': 10662, 'ntest': 10662},
    'CR': {'devacc': 81.56, 'acc': 81.43, 'ndev': 3775, 'ntest': 3775},
    'SUBJ': {'devacc': 90.39, 'acc': 90.71, 'ndev': 10000, 'ntest': 10000},
    'MPQA': {'devacc': 85.43, 'acc': 85.61, 'ndev': 10606, 'ntest': 10606},
    'SST2': {'devacc': 78.56, 'acc': 79.35, 'ndev': 872, 'ntest': 1821},
    'TREC': {'devacc': 75.92, 'acc': 77.2, 'ndev': 5452, 'ntest': 500},
    'MRPC': {'devacc': 73.48, 'acc': 74.32, 'f1': 82.26, 'ndev': 4076, 'ntest': 1725},
    'SICKEntailment': {'devacc': 84.8, 'acc': 85.2, 'ndev': 500, 'ntest': 4927},
}

# Guard against a task being dropped when pasting in a new set of results.
assert len(transfer_results) == 8

In [6]:
# BILSTM MAX
macro(transfer_results), micro(transfer_results)

(80.675, 81.50548157499512)

In [25]:
# BILSTM (78.80375000000001, 79.52509936225323)
macro(transfer_results), micro(transfer_results)

(78.80375000000001, 79.52509936225323)

In [23]:
# LSTM (76.8725, 77.39084082449992)
macro(transfer_results), micro(transfer_results)

(76.8725, 77.39084082449992)

In [21]:
# WordMean (78.36500000000001, 79.68296671963084)
macro(transfer_results), micro(transfer_results)

(78.36500000000001, 79.68296671963084)