<a href="https://colab.research.google.com/github/kyunghyuncho/ammi-2019-nlp/blob/master/01-day-LM/neural_lm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Neural Language Modeling

In [1]:
import os
import sys
sys.path.append('utils/')

In [2]:
import utils.ngram_utils as ngram_utils
from utils.amazon_dataset import AmazonDataset, pad, batchify
from torch.utils.data import DataLoader
from utils.neural_lm import BagOfNGrams, DecoderMLP, seq2seq
import utils.global_variables as gl
import torch
from tqdm import tqdm_notebook, tqdm
_tqdm = tqdm_notebook

In [3]:
torch.manual_seed(1)


<torch._C.Generator at 0x7f33b332a510>

In [4]:
# Read data from .txt files and create lists of reviews

train_data = []
# create a list of all the reviews 
with open('../data/amazon_reviews_clothing_train.txt', 'r') as f:
    train_data = [review for review in f.read().split('\n') if review]
    
valid_data = []
# create a list of all the reviews 
with open('../data/amazon_reviews_clothing_valid.txt', 'r') as f:
    valid_data = [review for review in f.read().split('\n') if review]
    

In [5]:
len(train_data), len(valid_data)

(222919, 27869)

In [6]:
train_data = train_data[:100000]
valid_data = valid_data[:10000]

In [7]:
# Tokenize the Datasets
# TODO: this takes a really long time !! why?
train_data_tokenized, all_tokens_train = ngram_utils.tokenize_dataset(train_data)
valid_data_tokenized, all_tokens_valid = ngram_utils.tokenize_dataset(valid_data)


100000it [00:27, 3627.82it/s]
10000it [00:01, 5109.15it/s]


In [8]:
gl.SOS_TOKEN
train_padded_uni = ngram_utils.pad_sentences(train_data_tokenized, 2)
train_padded_uni[0]

['<sos>',
 'this',
 'is',
 'a',
 'great',
 'tutu',
 'and',
 'at',
 'a',
 'really',
 'great',
 'price',
 '.',
 'it',
 'doesn',
 "'",
 't',
 'look',
 'cheap',
 'at',
 'all',
 '.',
 'i',
 "'",
 'm',
 'so',
 'glad',
 'i',
 'looked',
 'on',
 'amazon',
 'and',
 'found',
 'such',
 'an',
 'affordable',
 'tutu',
 'that',
 'isn',
 "'",
 't',
 'made',
 'poorly',
 '.',
 'a',
 '+',
 '+',
 '<eos>']

In [9]:
# n = 1
train_padded_uni = ngram_utils.pad_sentences(train_data_tokenized, 1)
train_unigram = ngram_utils.find_ngrams(train_padded_uni, 1)
train_vocab_unigram, train_count_unigram = ngram_utils.ngram_counts(train_unigram)
train_id2token_unigram, train_token2id_unigram = ngram_utils.ngram_dict(train_vocab_unigram)

valid_padded_uni = ngram_utils.pad_sentences(valid_data_tokenized, 1)
valid_unigram = ngram_utils.find_ngrams(valid_padded_uni, 1)
valid_vocab_unigram, count_unigram = ngram_utils.ngram_counts(valid_unigram)
valid_id2token_unigram, valid_token2id_unigram = ngram_utils.ngram_dict(valid_vocab_unigram)

In [10]:
train_id2token_unigram[:10]

['<pad>',
 '<unk>',
 '<sos>',
 '<eos>',
 ('.',),
 ('the',),
 ('i',),
 (',',),
 ('and',),
 ('a',)]

In [11]:
N = 10
train_data_id = ngram_utils.create_data_id(train_unigram, train_token2id_unigram)
train_data_id_merged = ngram_utils.create_data_id_merged(train_data_id, train_token2id_unigram, N)

valid_data_id = ngram_utils.create_data_id(valid_unigram, valid_token2id_unigram)
valid_data_id_merged = ngram_utils.create_data_id_merged(valid_data_id, valid_token2id_unigram, N)

In [20]:
len(train_data_id_merged), len(valid_data_id_merged), train_data_id_merged[0]

(6668265, 558113, ([19, 13, 9, 41, 4174, 8, 57, 9, 72, 41], 91))

In [13]:
train_dataset = AmazonDataset(train_data_id_merged, max_inp_length=None, use_cuda=True)
train_loader = DataLoader(train_dataset, batch_size=4096, collate_fn=batchify, shuffle=True)

valid_dataset = AmazonDataset(valid_data_id_merged, max_inp_length=None, use_cuda=True)
valid_loader = DataLoader(valid_dataset, batch_size=4096, collate_fn=batchify, shuffle=True)

100%|██████████| 6668265/6668265 [05:10<00:00, 21481.12it/s]
100%|██████████| 558113/558113 [00:24<00:00, 22555.39it/s]


In [19]:
train_dataset, train_loader

(<utils.amazon_dataset.AmazonDataset at 0x7f3297f797b8>,
 <torch.utils.data.dataloader.DataLoader at 0x7f3316e343c8>)

In [21]:
train_data_id_merged[:5]

[([19, 13, 9, 41, 4174, 8, 57, 9, 72, 41], 91),
 ([13, 9, 41, 4174, 8, 57, 9, 72, 41, 91], 4),
 ([9, 41, 4174, 8, 57, 9, 72, 41, 91, 4], 11),
 ([41, 4174, 8, 57, 9, 72, 41, 91, 4, 11], 228),
 ([4174, 8, 57, 9, 72, 41, 91, 4, 11, 228], 12)]

In [22]:
train_dataset[0], train_dataset[0][0].shape, \
valid_dataset[0], valid_dataset[0][0].shape

((tensor([  19,   13,    9,   41, 4174,    8,   57,    9,   72,   41],
         device='cuda:0'), tensor([91], device='cuda:0'), 10),
 torch.Size([10]),
 (tensor([  33, 2555,  309,  936,    4,   21,   22,   25,  143,    4],
         device='cuda:0'), tensor([4], device='cuda:0'), 10),
 torch.Size([10]))

In [27]:
len(train_id2token_unigram), train_id2token_unigram[:10]

(44631,
 ['<pad>',
  '<unk>',
  '<sos>',
  '<eos>',
  ('.',),
  ('the',),
  ('i',),
  (',',),
  ('and',),
  ('a',)])

In [None]:
num_epochs = 500
encoder = BagOfNGrams(len(train_id2token_unigram), emb_dim=300, hidden_size=512, out_size=256, activation='Tanh', nlayers=1, reduce='mean', dropout=0.0, batch_norm=False)
decoder = DecoderMLP(input_size=256, output_size=len(train_id2token_unigram), hidden_size=512)
model = seq2seq(encoder, decoder, train_id2token_unigram, use_cuda=False, lr=1e-1)

for epoch in range(num_epochs):
    # Train
    train_loss_epoch = 0
    for i, (data, labels) in tqdm(enumerate(train_loader), disable=True):
        prediction, loss = model.train_step(data, labels)
        train_loss_epoch += loss
    if epoch % 1 == 0:
        print("Epoch {}: Loss {}".format(epoch, train_loss_epoch))
#         print("prediction ", prediction)

        generated = model.eval_step()
        generated_str = ' '.join([g[0] for g in generated])
        print("Generated Sentence: ", generated_str)
        print("")
        
