<a href="https://colab.research.google.com/github/kyunghyuncho/ammi-2019-nlp/blob/master/01-day-LM/neural_lm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Neural Language Modeling

In [1]:
import os
import sys
sys.path.append('utils/')

In [2]:
import utils.ngram_utils as ngram_utils
from utils.amazon_dataset import AmazonDataset, pad, batchify
from torch.utils.data import DataLoader
from utils.neural_lm import BagOfNGrams, DecoderMLP, seq2seq
import utils.global_variables as gl
import torch
from tqdm import tqdm_notebook, tqdm
_tqdm = tqdm_notebook
from utils.ngram_utils import NgramLM

In [3]:
# Seed PyTorch's global random number generator so repeated runs draw the
# same random numbers.
SEED = 1
torch.manual_seed(SEED)


<torch._C.Generator at 0x7f1fc1974530>

In [4]:
# Read data from .txt files and create lists of reviews

def read_reviews(path):
    """Return the non-empty lines of the text file at `path`.

    Parameters
    ----------
    path : str
        Location of a text file holding one review per line.

    Returns
    -------
    list of str
        The lines in file order, without their trailing newline. Empty
        lines are skipped.
    """
    # Decode explicitly instead of relying on the platform's locale default.
    # NOTE(review): assumes the data files are UTF-8 -- confirm.
    with open(path, 'r', encoding='utf-8') as f:
        # Iterate line by line rather than loading the whole file as one string.
        stripped = (line.rstrip('\n') for line in f)
        return [review for review in stripped if review]


train_data = read_reviews('../data/amazon_reviews_clothing_train.txt')
valid_data = read_reviews('../data/amazon_reviews_clothing_valid.txt')
    

In [5]:
# Number of reviews in each split, displayed as (train, valid).
tuple(map(len, (train_data, valid_data)))

(222919, 27869)

In [6]:
# Keep only the first few reviews of each split (presumably so the rest of the
# notebook runs quickly while experimenting).
N_TRAIN_REVIEWS = 100
N_VALID_REVIEWS = 10
train_data, valid_data = train_data[:N_TRAIN_REVIEWS], valid_data[:N_VALID_REVIEWS]

In [7]:
# Tokenize both splits. Each `tokenize_dataset` call returns a pair that is
# unpacked into the tokenized reviews and an `all_tokens_*` value (presumably
# the flat list of every token in the split -- confirm in utils/ngram_utils).
# TODO: this step was observed to take a really long time -- find out why.
train_data_tokenized, all_tokens_train = ngram_utils.tokenize_dataset(train_data)
valid_data_tokenized, all_tokens_valid = ngram_utils.tokenize_dataset(valid_data)


100it [00:00, 525.79it/s]
10it [00:00, 1291.83it/s]


In [8]:
# Build one NgramLM with n=1 (unigrams) per split, each from that split's own
# tokenized reviews and token list.
# NOTE(review): the validation split gets a separate NgramLM instance, so it
# presumably builds its own vocabulary -- confirm whether that is intended.
train_ngram_lm = NgramLM(train_data_tokenized, all_tokens_train, n=1)
valid_ngram_lm = NgramLM(valid_data_tokenized, all_tokens_valid, n=1)

In [9]:
# Pull the n-gram data out of each NgramLM. Only the training model's
# `id2token` is kept; it is the vocabulary handed to the neural model later.
# TODO: make sure this makes sense for the valid data -- shouldn't we use the same dict for it?
train_unigram = train_ngram_lm.ngram_data
train_id2token_unigram = train_ngram_lm.id2token

valid_unigram = valid_ngram_lm.ngram_data
# The validation model's own id2token is deliberately left unused:
# valid_id2token_unigram = valid_ngram_lm.id2token

In [10]:
N = 10  # N-grams: passed to create_data_id_merged(...) and to seq2seq(size_ngrams=N) below

In [11]:
# Run each split's unigram data through `create_data_id` and then through
# `create_data_id_merged(..., N)` (presumably token -> id conversion followed
# by grouping into N-sized examples -- confirm in utils/ngram_utils).
# Both splits are processed by `train_ngram_lm`, so the validation data goes
# through the training model's methods rather than `valid_ngram_lm`'s.
# TODO: make sure this makes sense for the valid data -- shouldn't we use the same dict for it?
train_data_id = train_ngram_lm.create_data_id(train_unigram)
train_data_id_merged = train_ngram_lm.create_data_id_merged(train_data_id, N)

valid_data_id = train_ngram_lm.create_data_id(valid_unigram)
valid_data_id_merged = train_ngram_lm.create_data_id_merged(valid_data_id, N)

In [12]:
# train_data_id[:2], train_data_id_merged[:2]

In [13]:
# len(train_data_id_merged), len(valid_data_id_merged), train_data_id_merged[0]

In [14]:
BATCH_SIZE = 1048576  # 2**20


def _build_loader(data_id_merged):
    """Wrap merged id data in an AmazonDataset and a shuffling DataLoader.

    Returns the `(dataset, loader)` pair. The loader collates batches with
    `batchify` and shuffles on every pass.
    """
    # NOTE(review): the dataset is created with use_cuda=True while the model
    # cell passes use_cuda=False to seq2seq -- confirm the two are compatible.
    dataset = AmazonDataset(data_id_merged, max_inp_length=None, use_cuda=True)
    loader = DataLoader(dataset, batch_size=BATCH_SIZE, collate_fn=batchify, shuffle=True)
    return dataset, loader


train_dataset, train_loader = _build_loader(train_data_id_merged)
valid_dataset, valid_loader = _build_loader(valid_data_id_merged)

100%|██████████| 15129/15129 [00:03<00:00, 3903.13it/s]
100%|██████████| 459/459 [00:00<00:00, 8763.39it/s]


In [15]:
# train_dataset, train_loader

In [16]:
# train_data_id_merged[:5]

In [17]:
# train_dataset[0], train_dataset[0][0].shape, \
# valid_dataset[0], valid_dataset[0][0].shape

In [18]:
# Build the encoder/decoder model and train it, periodically printing the
# epoch loss and one generated sentence.
# N = 10 from above
# (BagOfNGrams, DecoderMLP and seq2seq are already imported in the import cell.)

num_epochs = 500
# Report every `report_every` epochs (the original test `epoch % 1 == 0` was
# always true, i.e. every epoch).
report_every = 1
# The original loop always hit `break` right after its first report, so only
# one epoch was ever trained. Keep that behaviour by default, but make it
# explicit; set to False to train for all `num_epochs`.
stop_after_first_report = True
# Passed to eval_step as `use_context`; when True the context is printed too.
context = True

vocab_size = len(train_id2token_unigram)
encoder = BagOfNGrams(
    vocab_size,
    emb_dim=300,
    hidden_size=512,
    out_size=256,
    activation='Tanh',
    nlayers=1,
    reduce='mean',
    dropout=0.0,
    batch_norm=False,
)
# The decoder's input_size matches the encoder's out_size (256).
decoder = DecoderMLP(input_size=256, output_size=vocab_size, hidden_size=512)
model = seq2seq(encoder, decoder, train_id2token_unigram, use_cuda=False, lr=1e-1, size_ngrams=N)

for epoch in range(num_epochs):
    # Train
    train_loss_epoch = 0
    # Reset so a stale `data` from an earlier run of this cell can never be
    # used for generation when the loader yields no batches.
    data = None
    for data, labels in train_loader:
        prediction, loss = model.train_step(data, labels)
        train_loss_epoch += loss

    if data is None:
        raise ValueError("train_loader produced no batches; nothing to train on")

    if epoch % report_every == 0:
        print("Epoch {}: Loss {}".format(epoch, train_loss_epoch))
        # Generate from the last training batch of this epoch.
        generated = model.eval_step(data, use_context=context)            # batch predictions
        if context:
            # Only the first context is shown, so convert just that element
            # instead of the whole batch.
            print("Context: ", model.v2t(next(iter(data))))  # print only one context out of the bsz
        generated_str = [' '.join(g) for g in generated] # convert them to more readable strings
        print("Generated Sentence: ", generated_str[0])  # print only one generated sentence out of the bsz
        print("")
        if stop_after_first_report:
            break
        


Epoch 0: Loss 7.605658054351807
[('the',), ('computer',), ('speakers',), ('played',), ('the',), ('audio',), (',',), ('so',), ('why',), ('do',)]
Context:  [('the',), ('computer',), ('speakers',), ('played',), ('the',), ('audio',), (',',), ('so',), ('why',), ('do',)]
Generated Sentence:  fretting weekends future unsuccessfully pass pass pass cds seem seem seem desktop differences shirts shirts desktop italy shirt shirt shirt



In [19]:
# NOTE: the text below is console output that was pasted into this code cell
# (apparently from earlier runs of the training cell). As bare text it is not
# Python and the cell failed with "SyntaxError: invalid syntax", so it is kept
# here as comments only.
#
# Epoch 0: Loss 7.605658054351807
# [[('the',), ('computer',), ('speakers',), ('played',), ('the',), ('audio',), (',',), ('so',), ('why',), ('do',)], [('ok',), (',',), ('what',), ('i',), ('don',), ("'",), ('t',), ('like',), ('-',), ('the',)], [('it',), ('is',), ('well',), ('built',), ('and',), ('we',), ('hope',), ('she',), ('gets',), ('lots',)]]
# Generated Sentence:  fretting weekends future unsuccessfully pass pass pass cds seem seem seem desktop differences shirts shirts desktop italy shirt shirt shirt
#
#
# Epoch 0: Loss 7.605658054351807
# ['<sos>', '<sos>', '<sos>', '<sos>', '<sos>', '<sos>', '<sos>', '<sos>', '<sos>', '<sos>']
# Generated Sentence:  13 weekends weekends stash stash boring giving high high high purse purse holding participation finish finish corsican corsican corsican limited

In [None]:
# len(generated), generated[0], generated[1], len(data[:2])

In [None]:
# generated_str, len(generated_str), generated_str[0]