<a href="https://colab.research.google.com/github/kyunghyuncho/ammi-2019-nlp/blob/master/01-day-LM/neural_lm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Neural Language Modeling

In [1]:
import os
import sys
sys.path.append('utils/')
import loading_text_and_tokenization
import torch
import numpy as np
import torch.nn as nn
import random
import math

import utils.ngram_utils as ngram_utils
from utils.amazon_dataset import AmazonDataset, pad, batchify
from torch.utils.data import DataLoader
from utils.neural_lm import BagOfNGrams, DecoderMLP, seq2seq
import utils.global_variables as gl
import torch
from tqdm import tqdm_notebook, tqdm
_tqdm = tqdm_notebook
from utils.ngram_utils import NgramLM

In [2]:
torch.manual_seed(1)


<torch._C.Generator at 0x7f9766dd07b0>

In [3]:
USE_CUDA = True
device = torch.device("cuda" if USE_CUDA else "cpu")
N = 10

In [4]:
# Read data from .txt files and create lists of reviews
train_data = []
# create a list of all the reviews 
with open('../data/amazon_reviews_clothing_train.txt', 'r') as f:
    train_data = [review for review in f.read().split('\n') if review]
    
valid_data = []
# create a list of all the reviews 
with open('../data/amazon_reviews_clothing_valid.txt', 'r') as f:
    valid_data = [review for review in f.read().split('\n') if review]
    

In [5]:
train_data = train_data[:20]
valid_data = valid_data[:2]

In [6]:
train_data[0], valid_data[0]

("this is a great tutu and at a really great price . it doesn ' t look cheap at all . i ' m so glad i looked on amazon and found such an affordable tutu that isn ' t made poorly . a + + ",
 "these serve their purpose . they are very thin . . . so theyre great for flats or other shoes that you ' d need like a dress sock for . as for wearing them w / tennis shoes , id personally go w / a thicker no show sock . these also slip and fall off often with certain shoes . ")

In [7]:
# Tokenize the Datasets
# TODO: this takes a really long time !! why?
train_data_tokenized, all_tokens_train = ngram_utils.tokenize_dataset(train_data)
valid_data_tokenized, all_tokens_valid = ngram_utils.tokenize_dataset(valid_data)


20it [00:00, 593.72it/s]
2it [00:00, 475.41it/s]


In [8]:
train_ngram_lm = NgramLM(train_data_tokenized, all_tokens_train, n=1)
valid_ngram_lm = NgramLM(valid_data_tokenized, all_tokens_valid, n=1)

In [9]:
# TODO: make sure this makes sense for the valid data -- shouldn't we use the same dict for it?
train_unigram = train_ngram_lm.ngram_data
train_id2token_unigram = train_ngram_lm.id2token

valid_unigram = valid_ngram_lm.ngram_data
valid_id2token_unigram = valid_ngram_lm.id2token 

In [10]:
N = 10  # N-grams

In [11]:
train_unigram[0][0]

('this',)

In [12]:
# TODO: make sure this makes sense for the valid data -- shouldn't we use the same dict for it?
train_data_id = train_ngram_lm.create_data_id(train_unigram)
train_data_id_merged = train_ngram_lm.create_data_id_merged(train_data_id, N)

valid_data_id = train_ngram_lm.create_data_id(valid_unigram)
valid_data_id_merged = train_ngram_lm.create_data_id_merged(valid_data_id, N)

In [13]:
len(train_data_id_merged), len(valid_data_id_merged), train_data_id_merged[0]

(1048, 115, ([14, 15, 10, 18, 22, 7, 57, 10, 160, 18], 39))

In [14]:
train_dataset = AmazonDataset(train_data_id_merged, max_inp_length=None, use_cuda=True)
train_loader = DataLoader(train_dataset, batch_size=1024, collate_fn=batchify, shuffle=True)

valid_dataset = AmazonDataset(valid_data_id_merged, max_inp_length=None, use_cuda=True)
valid_loader = DataLoader(valid_dataset, batch_size=1024, collate_fn=batchify, shuffle=True)

100%|██████████| 1048/1048 [00:02<00:00, 387.19it/s]
100%|██████████| 115/115 [00:00<00:00, 11636.23it/s]


In [15]:
num_train = len(train_dataset)
num_valid = len(valid_dataset)

In [16]:
train_loader, train_dataset

(<torch.utils.data.dataloader.DataLoader at 0x7f96ee674208>,
 <utils.amazon_dataset.AmazonDataset at 0x7f96ede640f0>)

In [17]:
vocab_size = len(train_id2token_unigram)
vocab_size

362

In [18]:
encoder = BagOfNGrams(vocab_size, emb_dim=300, hidden_size=512, out_size=256, activation='ReLU', nlayers=1, reduce='mean', dropout=0.1, batch_norm=False)
encoder

BagOfNGrams(
  (embedding): EmbeddingBag(362, 300, mode=mean)
  (layers): ModuleList(
    (0): Linear(in_features=300, out_features=512, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.1)
    (3): Linear(in_features=512, out_features=256, bias=True)
  )
)

In [19]:
decoder = DecoderMLP(input_size=256, output_size=vocab_size, hidden_size=512)
decoder

DecoderMLP(
  (linear): Linear(in_features=256, out_features=512, bias=True)
  (out): Linear(in_features=512, out_features=362, bias=True)
  (log_softmax): LogSoftmax()
)

In [20]:
model = seq2seq(encoder, decoder, train_id2token_unigram, use_cuda=False, lr=0.1, size_ngrams=N)
model

seq2seq(
  (encoder): BagOfNGrams(
    (embedding): EmbeddingBag(362, 300, mode=mean)
    (layers): ModuleList(
      (0): Linear(in_features=300, out_features=512, bias=True)
      (1): ReLU()
      (2): Dropout(p=0.1)
      (3): Linear(in_features=512, out_features=256, bias=True)
    )
  )
  (decoder): DecoderMLP(
    (linear): Linear(in_features=256, out_features=512, bias=True)
    (out): Linear(in_features=512, out_features=362, bias=True)
    (log_softmax): LogSoftmax()
  )
  (criterion): NLLLoss()
)

## Training

In [21]:
num_epochs = 10
log_interval = 1
best_eval_loss = np.inf

for epoch in range(num_epochs):
    # Train
    train_loss = 0        
    for i, (data, labels) in tqdm(enumerate(train_loader), disable=True):
        prediction, loss = model.train_step(data, labels)
        train_loss += loss
        
    if epoch % log_interval == 0:
        train_loss = train_loss / num_train
        print('| Epoch {:3d} | Train Loss {:5.2f} | Train PPL {:8.2f}'.format(
                epoch, train_loss, math.exp(train_loss)))

        # Eval
        eval_loss = 0
        for i, (data, labels) in _tqdm(enumerate(valid_loader), disable=True):
            prediction, loss = model.train_step(data, labels, eval_mode=True)
            eval_loss += loss
        eval_loss = eval_loss / num_valid
        print('-' * 89)
        print('| Epoch {:3d} | Valid Loss {:5.2f} | Valid PPL {:8.2f}'.format(
            epoch, eval_loss, math.exp(eval_loss)))
        print('-' * 89)
        print('-' * 89)

        # Save the model if the validation loss is the best we've seen so far.
        if not best_eval_loss or eval_loss < best_eval_loss:
            with open('neural_lm_amazon_model' + '.pt', 'wb') as f:
                torch.save(model, f)
            best_eval_loss = eval_loss        


| Epoch   0 | Train Loss  0.01 | Train PPL     1.01
-----------------------------------------------------------------------------------------
| Epoch   0 | Valid Loss  0.05 | Valid PPL     1.05
-----------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------
| Epoch   1 | Train Loss  0.01 | Train PPL     1.01
-----------------------------------------------------------------------------------------
| Epoch   1 | Valid Loss  0.05 | Valid PPL     1.05
-----------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------
| Epoch   2 | Train Loss  0.01 | Train PPL     1.01
-----------------------------------------------------------------------------------------
| Epoch   2 | Valid Loss  0.04 | Valid PPL     1.04
----------------------------------------------------------

## Scoring Sentences

In [29]:
use_context = True
score_only = True
K = 5
for i, (data, labels) in _tqdm(enumerate(valid_loader), disable=True):
    generated, scores = model.eval_step(data, use_context=use_context, score_only=score_only)            # batch predictions
    
    for k in range(K):
        if use_context:
            context = [model.v2t(d) for d in data][k]
            context = [c[0] for c in context]
            print("Context: ", ' '.join(context))  # print only one generated sentence out of the bsz 
        generated_str = [' '.join(g) for g in generated] # convert them to more readable strings     
        if not score_only:
            print("Generated ", generated_str[k])  # print only one generated sentence out of the bsz 
        print("Score:    ", math.exp(scores[k]))  # print only one generated sentence out of the bsz 
        print("")

Context:  dress < for . as for wearing them < <
Score:     0.3019151007147208

Context:  . these < < and < off often with <
Score:     0.2781901471450094

Context:  < a < < < < . these < <
Score:     0.47092630359412463

Context:  fits as < , < was very < , came
Score:     0.2812297585286127

Context:  came on < : the < is < < ,
Score:     0.28820639743830495



In [28]:
# test_sentences = ['dividend yields have been bolstered by stock declines', \
#              'stock bolstered declines dividend by yields have been', \
#              'artificial neural networks are computing systems vaguely inspired by the biological neural networks', \
#              'Cho is so cool', \
#              'roberta is so cool']
test_sentences = ['i like the', \
                 'Cho is so cool']
scores = score_sentences(test_sentences)
print(scores)

NameError: name 'score_sentences' is not defined

In [None]:
scores

## Generation

In [None]:
## No Context

In [30]:
use_context = False
score_only = False
for i, (data, labels) in _tqdm(enumerate(train_loader), disable=True):
    generated, scores = model.eval_step(data, use_context=use_context, score_only=score_only)            # batch predictions
    
    for k in range(K):
        if use_context:
            context = [model.v2t(d) for d in data][0]
            context = [c[0] for c in context]
            print("Context: ", ' '.join(context))  # print only one generated sentence out of the bsz 
        generated_str = [' '.join(g) for g in generated] # convert them to more readable strings     
        print("Generated ", generated_str[0])  # print only one generated sentence out of the bsz 
        print("Score:    ", math.exp(scores[0]))  # print only one generated sentence out of the bsz 
        print("")

Generated  . . . . . . . . . . . . . . . . . . . .
Score:     0.8308326368923998

Generated  . . . . . . . . . . . . . . . . . . . .
Score:     0.8308326368923998

Generated  . . . . . . . . . . . . . . . . . . . .
Score:     0.8308326368923998

Generated  . . . . . . . . . . . . . . . . . . . .
Score:     0.8308326368923998

Generated  . . . . . . . . . . . . . . . . . . . .
Score:     0.8308326368923998

Generated  . . . . . . . . . . . . . . . . . . . .
Score:     0.8308326368923998

Generated  . . . . . . . . . . . . . . . . . . . .
Score:     0.8308326368923998

Generated  . . . . . . . . . . . . . . . . . . . .
Score:     0.8308326368923998

Generated  . . . . . . . . . . . . . . . . . . . .
Score:     0.8308326368923998

Generated  . . . . . . . . . . . . . . . . . . . .
Score:     0.8308326368923998



In [None]:
## Context

In [32]:
use_context = True
score_only = False
for i, (data, labels) in _tqdm(enumerate(train_loader), disable=True):
    generated, scores = model.eval_step(data, use_context=use_context, score_only=score_only)            # batch predictions

    for k in range(K):
        if use_context:
            context = [model.v2t(d) for d in data][k]
            context = [c[0] for c in context]
            print("Context: ", ' '.join(context))  # print only one generated sentence out of the bsz 
        generated_str = [' '.join(g) for g in generated] # convert them to more readable strings     
        print("Generated ", generated_str[k])  # print only one generated sentence out of the bsz 
        print("Score:    ", math.exp(scores[k]))  # print only one generated sentence out of the bsz 
        print("")

Context:  turquios and they are vibrant and beautiful ! the tutu
Generated  . . . . . . . . . . . . . . . . . . . .
Score:     0.515628171842232

Context:  for this price and got exactly what i paid for
Generated  . . . . . . . . . . . . . . . . . . . .
Score:     0.629380253071013

Context:  very good too since some of these go for over
Generated  . . . . . . . . . . . . . . . . . . . .
Score:     0.6611461879324816

Context:  today and i ' m not a fan of it
Generated  . . . . . . . . . . . . . . . . . . . .
Score:     0.6009064061528239

Context:  and at a really great price . it doesn '
Generated  . . . . . . . . . . . . . . . . . . . .
Score:     0.7314167613070153

Context:  ! was hoping to order more in different colors .
Generated  . . . . . . . . . . . . . . . . . . . .
Score:     0.691386742321122

Context:  price . it isn ' t a & # 34
Generated  . . . . . . . . . . . . . . . . . . . .
Score:     0.5948209214048114

Context:  < < yr . old . it will be too
Generated  < < < < < 

#### Removed