# Amazon review dataset routines

In [55]:
#!pip install stanfordnlp

In [2]:
import torch
import torchtext
import torchtext.data as data
from torchtext import vocab
from collections import Counter
import re
from torchtext.data import TabularDataset 

class AmazonReviewsDataset(TabularDataset):
    """Amazon review category files from snap.stanford.edu as a ``TabularDataset``.

    The class only declares class-level attributes; everything else is
    inherited from ``TabularDataset``. Only the "Amazon Instant Video"
    archive is enabled -- uncomment further entries in ``urls`` to fetch
    more review categories.
    """

    # Dataset name and sub-directory name.
    # NOTE(review): presumably read by the inherited ``download`` classmethod
    # to build local paths -- confirm against the torchtext version in use.
    name = "amazonreviews"
    dirname = "processed"

    # Archives to download. Disabled categories are kept here for convenience.
    urls = [
        # 'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Books_5.json.gz',
        # 'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz',
        # 'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Movies_and_TV_5.json.gz',
        # 'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_CDs_and_Vinyl_5.json.gz',
        # 'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Clothing_Shoes_and_Jewelry_5.json.gz',
        # 'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Home_and_Kitchen_5.json.gz',
        # 'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Kindle_Store_5.json.gz',
        # 'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Sports_and_Outdoors_5.json.gz',
        # 'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Cell_Phones_and_Accessories_5.json.gz',
        # 'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Health_and_Personal_Care_5.json.gz',
        # 'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Toys_and_Games_5.json.gz',
        # 'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Video_Games_5.json.gz',
        # 'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Tools_and_Home_Improvement_5.json.gz',
        # 'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Beauty_5.json.gz',
        # 'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Apps_for_Android_5.json.gz',
        # 'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Office_Products_5.json.gz',
        # 'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Pet_Supplies_5.json.gz',
        # 'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Automotive_5.json.gz',
        # 'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Grocery_and_Gourmet_Food_5.json.gz',
        # 'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Patio_Lawn_and_Garden_5.json.gz',
        # 'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Baby_5.json.gz',
        # 'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Digital_Music_5.json.gz',
        # 'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Musical_Instruments_5.json.gz',
        "http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Amazon_Instant_Video_5.json.gz",
    ]

For now we download only one section; check the class attribute `urls` and uncomment more entries if you want more.

I think we could just choose the funniest section there, and it will be enough for us.

In [3]:
# Download and unpack the archives listed in AmazonReviewsDataset.urls.
#
# - `check` is expected to be a directory path (or None), not a bool. With
#   check=True the "already downloaded" directory test does not succeed, so
#   the archive is re-extracted on every run. Leaving it at its default lets
#   torchtext look for <root>/amazonreviews itself.
# - The root is '../data/' so the extracted file lands where the loading cell
#   reads it from ('../data/amazonreviews/...').
#
# NOTE: download() returns the dataset path rather than a success flag; the
# variable name is kept unchanged for compatibility with any later cells.
download_done = AmazonReviewsDataset.download(root='../data/')

The default tokenizer for `data.Field()` is `str.split()`, so we define a regex-based tokenizer instead.

In [4]:
# Token pattern: a run of word characters, OR one character that is neither a
# word character nor whitespace, OR a newline.
RETOK = re.compile(r'\w+|[^\w\s]|\n', re.UNICODE)


def tokenize(s):
    """Split the string ``s`` into tokens matched by ``RETOK``.

    Whitespace other than newlines is dropped; each punctuation character
    becomes its own token (so "didn't" yields "didn", "'", "t").

    Returns:
        list of str: the matched tokens in order of appearance.
    """
    # The pattern has no capture groups, so group(0) is the whole token.
    return [match.group(0) for match in RETOK.finditer(s)]

# Single field shared by the review text and the summary columns: tokenized
# with the regex tokenizer above, lower-cased, numericalized through a
# vocabulary, and batched batch-first together with the sequence lengths.
text_field = data.Field(
    tokenize=tokenize,
    lower=True,
    sequential=True,
    use_vocab=True,
    batch_first=True,
    include_lengths=True,
)

In [5]:
# Load the extracted Instant Video reviews. Only the 'reviewText' and
# 'summary' JSON keys are kept, both processed by the shared text_field.
dataset = AmazonReviewsDataset(
    path='../data/amazonreviews/reviews_Amazon_Instant_Video_5.json',
    format='json',
    fields={
        'reviewText': ('reviewText', text_field),
        'summary': ('summary', text_field),
    },
)

In [123]:
# Sanity check: report the dataset size and show the first tokenized example
# (formatted with f-strings).
print(f'Number of samples : {len(dataset.examples)}')

# A one-element slice stands in for "loop and break after the first item".
for ex in dataset.examples[:1]:
    print(f'Review: \n {ex.reviewText} \n\n Summary: \n {ex.summary}')

Number of samples : 37126
Review: 
 ['i', 'had', 'big', 'expectations', 'because', 'i', 'love', 'english', 'tv', ',', 'in', 'particular', 'investigative', 'and', 'detective', 'stuff', 'but', 'this', 'guy', 'is', 'really', 'boring', '.', 'it', 'didn', "'", 't', 'appeal', 'to', 'me', 'at', 'all', '.'] 

 Summary: 
 ['a', 'little', 'bit', 'boring', 'for', 'me']


In [124]:
# The FastText vectors are about 6.6 GB, but I guess Google Colab has a super fast link

In [125]:
# English FastText word vectors from torchtext's `vocab` module; `out` is
# passed to build_vocab below so the vocabulary gets vectors attached.
# NOTE(review): presumably downloaded and cached on first use -- confirm the
# cache location for the torchtext version in use.
out = vocab.FastText(language='en')

In [126]:
# building the vocabulary

In [127]:
# Build the vocabulary of the shared field from the dataset, limited by
# max_size, and attach the FastText vectors loaded above.
text_field.build_vocab(
    dataset,
    vectors=out,
    max_size=30_000,
)

In [128]:
# Batch iterator over the dataset: batches of 4 on the CPU, keyed by review
# length and sorted within each batch, making a single pass (repeat=False).
train_loader = data.BucketIterator(
    dataset=dataset,
    batch_size=4,
    device=torch.device('cpu'),
    sort_key=lambda example: len(example.reviewText),
    sort_within_batch=True,
    repeat=False,
)

In [129]:
# Pull the first batch from a fresh pass over the iterator and print its
# summary (field names with tensor sizes).
batch = next(iter(train_loader))
print(batch)


[torchtext.data.batch.Batch of size 4 from AMAZONREVIEWS]
	[.reviewText]:('[torch.LongTensor of size 4x34]', '[torch.LongTensor of size 4]')
	[.summary]:('[torch.LongTensor of size 4x9]', '[torch.LongTensor of size 4]')


In [135]:
def _vec2txt(vec):
    """Translate a sequence of vocabulary indices back into token strings.

    Each index is looked up in ``text_field.vocab.itos``; nothing is filtered
    out, so every index in ``vec`` produces exactly one token.

    Returns:
        list: the tokens, in the same order as the indices in ``vec``.
    """
    tokens = []
    for idx in vec:
        tokens.append(text_field.vocab.itos[idx])
    return tokens

In [136]:
# batch.reviewText is a pair (the batch printout above shows a 4x34 tensor and
# a size-4 tensor). The first [0] picks the 4x34 tensor, the second [0] its
# first row; that row is printed as raw indices and then mapped back to tokens.
print(batch.reviewText[0][0])
print(_vec2txt(batch.reviewText[0][0]))

tensor([   46,  1923,     8,  2034, 24716,     2,   282,     7,   339,     3,
          454,   101,  1495,   120,    35,   173,   454,   140,    22,    26,
           61,   510,     6,   170,    66,     3,   325,     8,     3,   936,
           14,    13,    39,     2])
['great', 'variety', 'of', 'items', 'pawned', '.', 'hard', 'to', 'believe', 'the', 'money', 'people', 'loose', 'because', 'they', 'want', 'money', 'now', '!', 'you', 'can', 'learn', 'a', 'lot', 'about', 'the', 'history', 'of', 'the', 'south', 'in', 'this', 'one', '.']
