### Reading Data

In [288]:
import ndjson
import numpy as np
import random

In [7]:
# The file holds one JSON-encoded review per line (newline-delimited JSON).
# Read it explicitly as UTF-8 so the result does not depend on the
# platform's default text encoding.
with open("data/Tools_and_Home_Improvement_5.json", encoding="utf-8") as f:
    data = ndjson.load(f)

In [56]:
data[:2]

[{'overall': 5.0,
  'verified': True,
  'reviewTime': '01 28, 2018',
  'reviewerID': 'AL19QO4XLBQPU',
  'asin': '0982085028',
  'style': {'Style:': ' 1) IR30 POU (30A/3.4kW/110v)'},
  'reviewerName': 'J. Mollenkamp',
  'reviewText': 'returned, decided against this product',
  'summary': 'Five Stars',
  'unixReviewTime': 1517097600},
 {'overall': 5.0,
  'verified': True,
  'reviewTime': '11 30, 2017',
  'reviewerID': 'A1I7CVB7X3T81E',
  'asin': '0982085028',
  'style': {'Style:': ' 3) IR260 POU (30A/6kW/220v)'},
  'reviewerName': 'warfam',
  'reviewText': 'Awesome heater for the electrical requirements! Makes an awesome preheater for my talnkless system',
  'summary': 'Five Stars',
  'unixReviewTime': 1512000000}]

In [54]:
len(data)

2070831

In [243]:
# Keep only the review bodies; records without a 'reviewText' field are skipped.
review_texts = [review['reviewText'] for review in data if 'reviewText' in review]

In [244]:
len(review_texts)

2070309

In [219]:
review_texts[10:15]

['Good for overflow filter. Works good. White is easy to see how dirty it is. Like that fact. Works better than the course one, for catching finer particles. 100% satisfied. Will buy it again.',
 'Super fast and easy to install.',
 'I used these to hold the solar panel onto the roof of my cargo trailer.  It was a pretty straight forward installation and holds the solar panel in place nicely.',
 'Was a little skeptical at first . a flow heater is a different creature than a tank but after a few lessons on how to get hot water I had to turn the beast down !  If it lasts with out service problems this will be the best buy of the year on Amazon . congrats to the designer',
 'I bought this for my neighbor and he loves it. Came exactly as described, very sturdy.']

In [220]:
' '.join(review_texts[10:15])

'Good for overflow filter. Works good. White is easy to see how dirty it is. Like that fact. Works better than the course one, for catching finer particles. 100% satisfied. Will buy it again. Super fast and easy to install. I used these to hold the solar panel onto the roof of my cargo trailer.  It was a pretty straight forward installation and holds the solar panel in place nicely. Was a little skeptical at first . a flow heater is a different creature than a tank but after a few lessons on how to get hot water I had to turn the beast down !  If it lasts with out service problems this will be the best buy of the year on Amazon . congrats to the designer I bought this for my neighbor and he loves it. Came exactly as described, very sturdy.'

### Text Preprocessing

In [58]:
import re
from collections import Counter

In [62]:
import string
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [226]:
class TextPreProcessor:
    """Lower-case a text, replace punctuation with tokens and count the words.

    Attributes:
        text: the lower-cased input with each handled punctuation mark
            replaced by a space-padded ``<TOKEN>`` marker.
        words: ``text`` split on whitespace.
        word_counts: ``Counter`` mapping every entry of ``words`` to its count.
    """

    # (punctuation, token) pairs. None of the tokens contains a character
    # that is itself replaced, so the substitutions are independent.
    _PUNCTUATION_TOKENS = (
        ('.', '<PERIOD>'),
        (',', '<COMMA>'),
        ('"', '<QUOTATION_MARK>'),
        (';', '<SEMICOLON>'),
        ('!', '<EXCLAMATION_MARK>'),
        ('?', '<QUESTION_MARK>'),
        ('(', '<LEFT_PAREN>'),
        (')', '<RIGHT_PAREN>'),
        ('--', '<HYPHENS>'),
        (':', '<COLON>'),
        ('%', '<PERCENTAGE>'),
        ('#', '<NUMBERSIGN>'),
    )

    def __init__(self, text):
        # Lower-casing happens first so the upper-case tokens survive intact.
        normalised = text.lower()
        for symbol, token in self._PUNCTUATION_TOKENS:
            normalised = normalised.replace(symbol, ' ' + token + ' ')

        self.text = normalised
        self.words = normalised.split()
        self.word_counts = Counter(self.words)

    def refine_words(self, min_occ):
        """Return the words, in order and with repeats, seen at least ``min_occ`` times."""
        counts = self.word_counts
        return [token for token in self.words if counts[token] >= min_occ]

    def lookup_tables(self):
        """Build the id <-> word dictionaries.

        Ids are assigned by descending word count (0 is the most frequent
        word); words with equal counts keep their first-seen order.

        Returns:
            (int_to_vocab, vocab_to_int)
        """
        ranked = list(self.word_counts)
        # list.sort is stable, so ties keep the Counter's insertion order.
        ranked.sort(key=self.word_counts.get, reverse=True)

        int_to_vocab = dict(enumerate(ranked))
        vocab_to_int = {token: index for index, token in enumerate(ranked)}

        return int_to_vocab, vocab_to_int
        

In [227]:
# Build the vocabulary from the whole corpus: all reviews are joined into a
# single string so that word counts (and therefore ids) are corpus-wide.
tp = TextPreProcessor(' '.join(review_texts))
int_to_vocab , vocab_to_int = tp.lookup_tables()

In [232]:
len(int_to_vocab) == len(vocab_to_int)

True

In [234]:
len(int_to_vocab)

400363

In [252]:
# Tokenise every review on its own, then map each token to its integer id
# from the corpus-wide vocabulary.
processed_reviews = [TextPreProcessor(review).words for review in review_texts]
docs = [[vocab_to_int[token] for token in tokens] for tokens in processed_reviews]

In [254]:
docs[:3]

[[804, 2, 567, 776, 11, 61],
 [362, 779, 10, 1, 722, 3682, 25, 215, 60, 362, 52125, 10, 15, 112873, 380],
 [733, 1, 5437, 9, 72, 308, 1014, 6, 14, 17, 0, 7062, 96, 0]]

### Making and Generating Batches

In [166]:
def get_target(words, idx, max_window_size = 5):
    """Return the context words surrounding ``words[idx]``.

    A window radius R is drawn uniformly from 1..max_window_size; the result
    is up to R words before ``idx`` followed by up to R words after it
    (fewer when the window runs past either end of ``words``).
    """
    radius = np.random.randint(1, max_window_size+1)
    # Clamp the left edge so a negative start never wraps around the list.
    left_edge = max(idx - radius, 0)
    preceding = words[left_edge:idx]
    following = words[idx + 1:idx + radius + 1]
    return preceding + following

In [255]:
def get_batches(docs, batch_size = 32, window_size = 5):
    """Yield skip-gram (input, target) id lists, one pair per full batch.

    Each document is cut into consecutive chunks of ``batch_size`` words;
    trailing words that do not fill a chunk (and documents shorter than
    ``batch_size``) produce nothing. Context words are drawn only from
    inside the chunk, using ``get_target`` with ``window_size`` as the
    maximum window radius.

    Yields:
        (x, y): ``x`` repeats each centre word once per context word and
        ``y`` holds the matching context words.
    """
    for doc in docs:
        full_batches = len(doc) // batch_size
        usable = doc[:full_batches*batch_size]

        for offset in range(0, full_batches * batch_size, batch_size):
            chunk = usable[offset:offset + batch_size]
            inputs, labels = [], []
            for position in range(batch_size):
                context = get_target(chunk, position, max_window_size=window_size)
                labels.extend(context)
                inputs.extend([chunk[position]]*len(context))
            yield inputs, labels

In [283]:
int_text = [[i for i in range(20)],[j for j in range(30,40)]]

In [280]:
batch_generator = get_batches(int_text, batch_size=6, window_size=5)

In [287]:
# Print every (x, y) batch generated from the toy documents so the
# input/target pairing can be checked by eye.
for x,y in get_batches(int_text, batch_size=6, window_size=5):
    print('x\n',x)
    print('y\n',y)

#while(next(batch_generator, None)):
#    print('x\n', x)
#    print('y\n', y)

x
 [0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5]
y
 [1, 2, 3, 4, 0, 2, 3, 4, 5, 1, 3, 0, 1, 2, 4, 5, 2, 3, 5, 1, 2, 3, 4]
x
 [6, 7, 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 9, 10, 10, 11, 11, 11]
y
 [7, 6, 8, 9, 10, 11, 6, 7, 9, 10, 6, 7, 8, 10, 11, 9, 11, 8, 9, 10]
x
 [12, 12, 12, 13, 13, 14, 14, 14, 14, 14, 15, 15, 16, 16, 17, 17]
y
 [13, 14, 15, 12, 14, 12, 13, 15, 16, 17, 14, 16, 15, 17, 15, 16]
x
 [30, 30, 31, 31, 31, 32, 32, 33, 33, 33, 33, 33, 34, 34, 35, 35]
y
 [31, 32, 30, 32, 33, 31, 33, 30, 31, 32, 34, 35, 33, 35, 33, 34]


### Subsampling

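Each word $w_i$ is discarded from the training data with probability

$$P(w_i) = 1 - \sqrt{\frac{t}{f(w_i)}}$$

where $f(w_i)$ is the relative frequency of $w_i$ in the corpus and $t$ is the subsampling threshold (`tr` in the code below).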

In [278]:
from collections import Counter
import numpy as np

def get_train_words(words, tr=1e-5):
    """Randomly drop frequent words (word2vec subsampling).

    Every occurrence of a word ``w`` is discarded with probability
    ``1 - sqrt(tr / f(w))``, where ``f(w)`` is the word's relative frequency
    in ``words``. Words with ``f(w) <= tr`` get a non-positive probability
    and are therefore effectively always kept.

    Args:
        words: sequence of hashable tokens (e.g. word ids).
        tr: subsampling threshold.

    Returns:
        A new list with the surviving tokens in their original order.
    """
    total = len(words)
    words_count = Counter(words)
    # One probability per *distinct* word; the original comprehension had no
    # ``for`` clause and raised NameError.
    discard_prob = {
        w: 1 - np.sqrt(tr / (count / total))
        for w, count in words_count.items()
    }

    return [word for word in words if np.random.random() > discard_prob[word]]
    

### Word-Embedding Model

In [216]:
import torch
from torch import nn
from torch import optim
import torch.nn.functional as F

In [218]:
class Word2VevModel(nn.Module):
    """Skip-gram style model: embedding lookup -> linear layer -> log-softmax.

    Args:
        n_vocab: vocabulary size (rows of the embedding table and width of
            the output layer).
        emvec_size: dimensionality of the embedding vectors.
    """

    def __init__(self, n_vocab, emvec_size):
        super().__init__()
        self.n_vocab = n_vocab
        self.emvec_size = emvec_size

        self.embedding = nn.Embedding(num_embeddings=n_vocab, embedding_dim=emvec_size)
        self.outLayer = nn.Linear(emvec_size, n_vocab)
        # Must be the module form: F.log_softmax is a function that needs an
        # input tensor, so calling it here with only ``dim`` raises TypeError.
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Return log-probabilities over the vocabulary for each input id.

        Args:
            x: 1-D integer tensor of word ids, shape ``(batch,)``. The
                log-softmax runs over dim 1, so the scores must be 2-D.

        Returns:
            Tensor of shape ``(batch, n_vocab)`` with log-probabilities.
        """
        emVectors = self.embedding(x)
        scores = self.outLayer(emVectors)
        log_probs = self.softmax(scores)

        return log_probs
        

### Validation

In [297]:
def cosine_similarity(embedding, val_size=32, cuda_available = False):
    """Score random validation words against every word in the vocabulary.

    Half of the validation ids are sampled from ``range(100)`` and half from
    ``range(1000, 1100)``; with ids assigned by descending frequency this
    mixes very frequent and less frequent words.

    Args:
        embedding: embedding layer exposing ``.weight`` of shape
            ``(n_vocab, dim)`` and callable on a tensor of ids; needs at
            least 1100 rows.
        val_size: total number of validation words (``val_size // 2`` per
            range, so at most 200).
        cuda_available: move the sampled ids to the GPU before the lookup.

    Returns:
        (val_examples, similarities): the sampled ids as a LongTensor and a
        ``(len(val_examples), n_vocab)`` tensor of dot products divided by
        the norm of each vocabulary vector. The validation vector's own norm
        is not divided out, which leaves the ranking within a row unchanged.

    Raises:
        ValueError: if the embedding table has fewer than 1100 rows.
    """
    embed_vectors = embedding.weight

    if embed_vectors.shape[0] < 1100:
        raise ValueError(
            "cosine_similarity samples ids up to 1099; "
            "embedding has only %d rows" % embed_vectors.shape[0]
        )

    magnitudes = embed_vectors.pow(2).sum(dim=1).sqrt()

    # Pick val_size//2 ids from [0, 100) and val_size//2 from [1000, 1100);
    # a lower id implies a more frequent word.
    half = val_size // 2
    val_ids = random.sample(range(100), half)
    val_ids += random.sample(range(1000, 1100), half)

    val_examples = torch.tensor(val_ids, dtype=torch.long)

    if cuda_available:
        val_examples = val_examples.cuda()

    val_vectors = embedding(val_examples)

    similarities = torch.mm(val_vectors, embed_vectors.t()) / magnitudes

    return val_examples, similarities