<a href="https://colab.research.google.com/github/kyunghyuncho/ammi-2019-nlp/blob/master/01-day-LM/neural_lm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Neural Language Modeling

In [1]:
import os
import sys
sys.path.append('utils/')
import loading_text_and_tokenization
import torch
import numpy as np
import torch.nn as nn
import random
import math

import utils.ngram_utils as ngram_utils
from utils.ngram_utils import NgramLM
from utils.amazon_dataset import AmazonDataset, pad, batchify
from torch.utils.data import DataLoader
from utils.neural_lm import BagOfNGrams, DecoderMLP, seq2seq
import utils.global_variables as gl
import torch
from tqdm import tqdm_notebook, tqdm
_tqdm = tqdm_notebook

In [2]:
torch.manual_seed(1)


<torch._C.Generator at 0x7f1b080207b0>

In [3]:
use_cuda = True
device = torch.device("cuda" if (torch.cuda.is_available() and use_cuda) else "cpu")


In [4]:
# Read data from .txt files and create lists of reviews
train_data = []
# create a list of all the reviews 
with open('../data/amazon_reviews_clothing_train.txt', 'r') as f:
    train_data = [review for review in f.read().split('\n') if review]
    
valid_data = []
# create a list of all the reviews 
with open('../data/amazon_reviews_clothing_valid.txt', 'r') as f:
    valid_data = [review for review in f.read().split('\n') if review]
    

In [5]:
# train_data = split_into_sentences(train_data)
# valid_data = split_into_sentences(valid_data)

In [6]:
len(train_data), len(valid_data)

(222919, 27869)

In [7]:
train_data[0], valid_data[0]
train_data = train_data[:100]
valid_data = valid_data[:10]
train_data[0], type(train_data), len(train_data), type(train_data[0])

("this is a great tutu and at a really great price . it doesn ' t look cheap at all . i ' m so glad i looked on amazon and found such an affordable tutu that isn ' t made poorly . a + + ",
 list,
 100,
 str)

In [8]:
# Tokenize the Datasets
# TODO: this takes a really long time !! why?
train_data_tokenized, all_tokens_train = ngram_utils.tokenize_dataset(train_data)
valid_data_tokenized, all_tokens_valid = ngram_utils.tokenize_dataset(valid_data)


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [9]:
train_data_tokenized[0], all_tokens_train[0]

(['this',
  'is',
  'a',
  'great',
  'tutu',
  'and',
  'at',
  'a',
  'really',
  'great',
  'price',
  '.'],
 'this')

In [10]:
N = 5

In [11]:
train_data_padded = ngram_utils.pad_dataset(train_data_tokenized, n=N)
valid_data_padded = ngram_utils.pad_dataset(valid_data_tokenized, n=N)

In [12]:
train_data_padded[0]

['<sos>',
 '<sos>',
 '<sos>',
 '<sos>',
 'this',
 'is',
 'a',
 'great',
 'tutu',
 'and',
 'at',
 'a',
 'really',
 'great',
 'price',
 '.',
 '<eos>',
 '<eos>',
 '<eos>',
 '<eos>']

In [13]:
vocab = ngram_utils.get_vocab(train_data_padded)
vocab_size = len(vocab)
vocab_size, vocab[:10]

(1980, ('<sos>', '<eos>', '.', ',', 'the', 'i', 'to', 'and', 'a', 'it'))

In [14]:
id2token, token2id = ngram_utils.get_dict(vocab)

In [15]:
train_data_ids = ngram_utils.get_ids(train_data_padded, token2id)
valid_data_ids = ngram_utils.get_ids(valid_data_padded, token2id)

In [16]:
train_dataset = AmazonDataset(train_data_ids, max_inp_length=None, use_cuda=True)
train_dataset_ngrams = []
for t in train_dataset:
    for i in range(len(t) - N):
        train_dataset_ngrams.append((t[i:i + N], t[i + N]))
train_loader = DataLoader(train_dataset_ngrams, batch_size=1024, collate_fn=batchify, shuffle=True)

100%|██████████| 844/844 [00:03<00:00, 278.40it/s]


In [17]:
# for t in train_dataset_ngrams:
#     print(t)
#     break

In [18]:
# train_data_ids[0]

In [19]:
# for i, (d, l) in enumerate(train_loader):
#     import pdb; pdb.set_trace()
#     print(d)
#     print(l)
#     break

In [20]:
valid_dataset = AmazonDataset(valid_data_ids, max_inp_length=None, use_cuda=True)
valid_dataset_ngrams = []
for t in valid_dataset:
    for i in range(len(t) - N):
        valid_dataset_ngrams.append((t[i:i + N], t[i + N]))
valid_loader = DataLoader(valid_dataset_ngrams, batch_size=1024, collate_fn=batchify, shuffle=True)


100%|██████████| 39/39 [00:00<00:00, 14661.46it/s]


In [21]:
train_loader

<torch.utils.data.dataloader.DataLoader at 0x7f1a32aa4b38>

In [22]:
num_train = len(train_dataset_ngrams)
num_valid = len(valid_dataset_ngrams)
num_train, num_valid

(18235, 659)

In [23]:
encoder = BagOfNGrams(len(id2token), emb_dim=300, hidden_size=512, out_size=256, activation='ReLU', nlayers=1, reduce='mean', dropout=0.1, batch_norm=False)
encoder

BagOfNGrams(
  (embedding): EmbeddingBag(1984, 300, mode=mean)
  (layers): ModuleList(
    (0): Linear(in_features=300, out_features=512, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.1)
    (3): Linear(in_features=512, out_features=256, bias=True)
  )
)

In [24]:
decoder = DecoderMLP(input_size=256, output_size=len(id2token), hidden_size=512)
decoder

DecoderMLP(
  (linear): Linear(in_features=256, out_features=512, bias=True)
  (out): Linear(in_features=512, out_features=1984, bias=True)
  (log_softmax): LogSoftmax()
)

In [25]:
model = seq2seq(encoder, decoder, id2token, use_cuda=False, lr=0.1, size_ngrams=N)
model

seq2seq(
  (encoder): BagOfNGrams(
    (embedding): EmbeddingBag(1984, 300, mode=mean)
    (layers): ModuleList(
      (0): Linear(in_features=300, out_features=512, bias=True)
      (1): ReLU()
      (2): Dropout(p=0.1)
      (3): Linear(in_features=512, out_features=256, bias=True)
    )
  )
  (decoder): DecoderMLP(
    (linear): Linear(in_features=256, out_features=512, bias=True)
    (out): Linear(in_features=512, out_features=1984, bias=True)
    (log_softmax): LogSoftmax()
  )
  (criterion): NLLLoss()
)

## Training

In [26]:
num_epochs = 3
log_interval = 1
best_eval_loss = np.inf

for epoch in range(num_epochs):
    # Train
    train_loss = 0        
    for i, (data, labels) in _tqdm(enumerate(train_loader), disable=True):
        prediction, loss = model.train_step(data, labels)
        train_loss += loss
    train_loss = train_loss / num_train
    print('| Epoch {:3d} | Train Loss {:5.2f} | Train PPL {:8.2f}'.format(
            epoch, train_loss, math.exp(train_loss)))

    # Eval
    if epoch % log_interval == 0:        
        eval_loss = 0
        for i, (data, labels) in _tqdm(enumerate(valid_loader), disable=True):
            prediction, loss = model.train_step(data, labels, eval_mode=True)
            eval_loss += loss
        eval_loss = eval_loss / num_valid
        print('-' * 89)
        print('| Epoch {:3d} | Valid Loss {:5.2f} | Valid PPL {:8.2f}'.format(
            epoch, eval_loss, math.exp(eval_loss)))
        print('-' * 89)
        print('-' * 89)

        # Save the model if the validation loss is the best we've seen so far.
        if not best_eval_loss or eval_loss < best_eval_loss:
            with open('neural_lm_amazon_model' + '.pt', 'wb') as f:
                torch.save(model, f)
            best_eval_loss = eval_loss        


| Epoch   0 | Train Loss  0.01 | Train PPL     1.01
-----------------------------------------------------------------------------------------
| Epoch   0 | Valid Loss  0.01 | Valid PPL     1.01
-----------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------
| Epoch   1 | Train Loss  0.01 | Train PPL     1.01
-----------------------------------------------------------------------------------------
| Epoch   1 | Valid Loss  0.01 | Valid PPL     1.01
-----------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------
| Epoch   2 | Train Loss  0.00 | Train PPL     1.00
-----------------------------------------------------------------------------------------
| Epoch   2 | Valid Loss  0.01 | Valid PPL     1.01
----------------------------------------------------------

## Scoring Sentences

In [32]:
def score_sentence(sent):
    import pdb; pdb.set_trace()
    tokenized, _ = ngram_utils.tokenize_dataset(sent)
    sent_ids = ngram_utils.get_ids(tokenized, token2id)
    sent_tensor = torch.LongTensor(sent_ids).to(device)
    generated, scores = model.eval_step(sent_tensor, score_only=True)
    ppl = math.exp(scores)
    return ppl

In [33]:
sent = ['this is a great tutu']
type(sent), len(sent), type(sent[0]), sent[0]
ppl = score_sentence(sent)
ppl

> <ipython-input-32-6f4884bc8cbd>(3)score_sentence()
-> tokenized, _ = ngram_utils.tokenize_dataset(sent)


(Pdb)  c


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


> /home/roberta/ammi-2019-nlp/01-day-LM/utils/neural_lm.py(156)eval_step()
-> if xs is None:


(Pdb)  c


> /home/roberta/ammi-2019-nlp/01-day-LM/utils/neural_lm.py(167)eval_step()
-> if score_only or not use_context:


(Pdb)  c


> /home/roberta/ammi-2019-nlp/01-day-LM/utils/neural_lm.py(214)eval_step()
-> predictions = [self.v2t(p) for p in predictions]


(Pdb)  predictions


[tensor([18]), tensor([16]), tensor([12]), tensor([63]), tensor([191])]


(Pdb)  predictions.shape


*** AttributeError: 'list' object has no attribute 'shape'


(Pdb)  len(predictions)


5


(Pdb)  c


0.0456609491681853

In [38]:
def generate_sentence(context=None):
    import pdb; pdb.set_trace()
    if context is None:
        dummy_context = torch.LongTensor([[0]]).to(device)
        generated, scores = model.eval_step(dummy_context, use_context=False)
    else:
        tokenized, _ = ngram_utils.tokenize_dataset(context)
        context_ids = ngram_utils.get_ids(tokenized, token2id)
        context_tensor = torch.LongTensor(context_ids).to(device)
        generated, scores = model.eval_step(context_tensor, use_context=True)
    
    ppl = math.exp(scores)
    return generated, scores

In [39]:
generated, scores = generate_sentence()
generated_str = [' '.join(g) for g in generated]
generated_str

> <ipython-input-38-137dbbbf69b1>(3)generate_sentence()
-> if context is None:


(Pdb)  c


> /home/roberta/ammi-2019-nlp/01-day-LM/utils/neural_lm.py(156)eval_step()
-> if xs is None:


(Pdb)  c


> /home/roberta/ammi-2019-nlp/01-day-LM/utils/neural_lm.py(167)eval_step()
-> if score_only or not use_context:


(Pdb)  c


> /home/roberta/ammi-2019-nlp/01-day-LM/utils/neural_lm.py(214)eval_step()
-> predictions = [self.v2t(p) for p in predictions]


(Pdb)  len(predictions)


10


(Pdb)  predictions[0]


tensor([7])


(Pdb)  c


[',', "'", "'", "'", "'", 't', 't', 't', '.', '<eos>']

In [40]:
generated, scores = generate_sentence(context=['this is a great tutu'])
generated_str = [' '.join(g) for g in generated]
generated_str

> <ipython-input-38-137dbbbf69b1>(3)generate_sentence()
-> if context is None:


(Pdb)  n


> <ipython-input-38-137dbbbf69b1>(7)generate_sentence()
-> tokenized, _ = ngram_utils.tokenize_dataset(context)


(Pdb)  context


['this is a great tutu']


(Pdb)  n


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


> <ipython-input-38-137dbbbf69b1>(8)generate_sentence()
-> context_ids = ngram_utils.get_ids(tokenized, token2id)


(Pdb)  n


> <ipython-input-38-137dbbbf69b1>(9)generate_sentence()
-> context_tensor = torch.LongTensor(context_ids).to(device)


(Pdb)  context_ids


[[18, 16, 12, 63, 191]]


(Pdb)  n


> <ipython-input-38-137dbbbf69b1>(10)generate_sentence()
-> generated, scores = model.eval_step(context_tensor, use_context=True)


(Pdb)  context_tensor


tensor([[ 18,  16,  12,  63, 191]], device='cuda:0')


(Pdb)  n


> /home/roberta/ammi-2019-nlp/01-day-LM/utils/neural_lm.py(156)eval_step()
-> if xs is None:


(Pdb)  n


> /home/roberta/ammi-2019-nlp/01-day-LM/utils/neural_lm.py(158)eval_step()
-> xs = xs.to(self.device)


(Pdb)  n


> /home/roberta/ammi-2019-nlp/01-day-LM/utils/neural_lm.py(159)eval_step()
-> bsz = xs.size(0)


(Pdb)  n


> /home/roberta/ammi-2019-nlp/01-day-LM/utils/neural_lm.py(162)eval_step()
-> self.encoder.eval()


(Pdb)  n


> /home/roberta/ammi-2019-nlp/01-day-LM/utils/neural_lm.py(163)eval_step()
-> self.decoder.eval()


(Pdb)  n


> /home/roberta/ammi-2019-nlp/01-day-LM/utils/neural_lm.py(165)eval_step()
-> import pdb; pdb.set_trace()


(Pdb)  n


> /home/roberta/ammi-2019-nlp/01-day-LM/utils/neural_lm.py(167)eval_step()
-> if score_only or not use_context:


(Pdb)  n


> /home/roberta/ammi-2019-nlp/01-day-LM/utils/neural_lm.py(171)eval_step()
-> encoder_input = xs   # this needs to be of shape bsz, self.size_ngrams


(Pdb)  n


> /home/roberta/ammi-2019-nlp/01-day-LM/utils/neural_lm.py(173)eval_step()
-> predictions = []


(Pdb)  encoder_input


tensor([[ 18,  16,  12,  63, 191]])


(Pdb)  c


> /home/roberta/ammi-2019-nlp/01-day-LM/utils/neural_lm.py(214)eval_step()
-> predictions = [self.v2t(p) for p in predictions]


(Pdb)  c


['.', '<eos>']

In [1]:
# use_context = True
# score_only = True
# K = 5
# for i, (data, labels) in _tqdm(enumerate(valid_loader), disable=True):
#     import pdb; pdb.set_trace()
#     generated, scores = model.eval_step(data, use_context=use_context, score_only=score_only)            # batch predictions
#     for k in range(K):
#         if use_context:
#             context = [model.v2t(d) for d in data][k]
#             context = [c[0] for c in context]
#             print("Context: ", ' '.join(context))  # print only one generated sentence out of the bsz 
#         generated_str = [' '.join(g) for g in generated] # convert them to more readable strings     
#         if not score_only:
#             print("Generated ", generated_str[k])  # print only one generated sentence out of the bsz 
#         print("Score:    ", math.exp(scores[k]))  # print only one generated sentence out of the bsz 
#         print("")

## Generation

### No Context

In [None]:
# use_context = False
# score_only = False
# for i, (data, labels) in _tqdm(enumerate(train_loader), disable=True):
#     generated, scores = model.eval_step(data, use_context=use_context, score_only=score_only)            # batch predictions
#     import pdb; pdb.set_trace()
#     for k in range(K):
#         if use_context:
#             context = [model.v2t(d) for d in data][0]
#             context = [c[0] for c in context]
#             print("Context: ", ' '.join(context))  # print only one generated sentence out of the bsz 
#         generated_str = [' '.join(g) for g in generated] # convert them to more readable strings     
#         print("Generated ", generated_str[0])  # print only one generated sentence out of the bsz 
#         print("Score:    ", math.exp(scores[0]))  # print only one generated sentence out of the bsz 
#         print("")

### Context

In [None]:
# use_context = True
# score_only = False
# for i, (data, labels) in _tqdm(enumerate(train_loader), disable=True):
#     generated, scores = model.eval_step(data, use_context=use_context, score_only=score_only)            # batch predictions

#     for k in range(K):
#         if use_context:
#             context = [model.v2t(d) for d in data][k]
#             context = [c[0] for c in context]
#             print("Context: ", ' '.join(context))  # print only one generated sentence out of the bsz 
#         generated_str = [' '.join(g) for g in generated] # convert them to more readable strings     
#         print("Generated ", generated_str[k])  # print only one generated sentence out of the bsz 
#         print("Score:    ", math.exp(scores[k]))  # print only one generated sentence out of the bsz 
#         print("")