### Reading Data

In [1]:
import ndjson
import numpy as np
import random

In [2]:
# The reviews file is JSON-lines; JSON text is UTF-8, so decode it explicitly
# instead of relying on the platform's default locale encoding.
with open("data/Tools_and_Home_Improvement_5.json", encoding="utf-8") as f:
    data = ndjson.load(f)

In [3]:
data[:2]

[{'overall': 5.0,
  'verified': True,
  'reviewTime': '01 28, 2018',
  'reviewerID': 'AL19QO4XLBQPU',
  'asin': '0982085028',
  'style': {'Style:': ' 1) IR30 POU (30A/3.4kW/110v)'},
  'reviewerName': 'J. Mollenkamp',
  'reviewText': 'returned, decided against this product',
  'summary': 'Five Stars',
  'unixReviewTime': 1517097600},
 {'overall': 5.0,
  'verified': True,
  'reviewTime': '11 30, 2017',
  'reviewerID': 'A1I7CVB7X3T81E',
  'asin': '0982085028',
  'style': {'Style:': ' 3) IR260 POU (30A/6kW/220v)'},
  'reviewerName': 'warfam',
  'reviewText': 'Awesome heater for the electrical requirements! Makes an awesome preheater for my talnkless system',
  'summary': 'Five Stars',
  'unixReviewTime': 1512000000}]

In [4]:
len(data)

2070831

In [5]:
# Collect the review bodies; records that have no 'reviewText' key are skipped.
review_texts = [record['reviewText'] for record in data if 'reviewText' in record]

In [6]:
len(review_texts)

2070309

In [7]:
review_texts[10:15]

['Good for overflow filter. Works good. White is easy to see how dirty it is. Like that fact. Works better than the course one, for catching finer particles. 100% satisfied. Will buy it again.',
 'Super fast and easy to install.',
 'I used these to hold the solar panel onto the roof of my cargo trailer.  It was a pretty straight forward installation and holds the solar panel in place nicely.',
 'Was a little skeptical at first . a flow heater is a different creature than a tank but after a few lessons on how to get hot water I had to turn the beast down !  If it lasts with out service problems this will be the best buy of the year on Amazon . congrats to the designer',
 'I bought this for my neighbor and he loves it. Came exactly as described, very sturdy.']

In [8]:
' '.join(review_texts[10:15])

'Good for overflow filter. Works good. White is easy to see how dirty it is. Like that fact. Works better than the course one, for catching finer particles. 100% satisfied. Will buy it again. Super fast and easy to install. I used these to hold the solar panel onto the roof of my cargo trailer.  It was a pretty straight forward installation and holds the solar panel in place nicely. Was a little skeptical at first . a flow heater is a different creature than a tank but after a few lessons on how to get hot water I had to turn the beast down !  If it lasts with out service problems this will be the best buy of the year on Amazon . congrats to the designer I bought this for my neighbor and he loves it. Came exactly as described, very sturdy.'

### Text Preprocessing

In [9]:
import re
from collections import Counter

In [10]:
import string
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [11]:
class TextPreProcessor:
    """Lower-case a text, spell punctuation out as tokens and split it into words.

    Attributes:
        text: the lower-cased input with each listed punctuation symbol
            replaced by a space-padded token such as ' <PERIOD> '.
        min_occ: minimum number of occurrences a word needs to be kept.
        word_counts: Counter over every whitespace-separated word of `text`
            (computed before the `min_occ` filter is applied).
        words: the words of `text`, in order, whose count is >= `min_occ`.
    """

    # (symbol, token) pairs, applied one after another in this order.
    _PUNCTUATION_TOKENS = (
        ('.', ' <PERIOD> '),
        (',', ' <COMMA> '),
        ('"', ' <QUOTATION_MARK> '),
        (';', ' <SEMICOLON> '),
        ('!', ' <EXCLAMATION_MARK> '),
        ('?', ' <QUESTION_MARK> '),
        ('(', ' <LEFT_PAREN> '),
        (')', ' <RIGHT_PAREN> '),
        ('--', ' <HYPHENS> '),
        (':', ' <COLON> '),
        ('%', ' <PERCENTAGE> '),
        ('#', ' <NUMBERSIGN> '),
    )

    def __init__(self, text, min_occ=1):
        """Tokenize `text` and keep the words occurring at least `min_occ` times."""
        self.min_occ = min_occ

        # Replacing punctuations with tokens
        tokenized = text.lower()
        for symbol, token in self._PUNCTUATION_TOKENS:
            tokenized = tokenized.replace(symbol, token)
        self.text = tokenized

        every_word = tokenized.split()
        self.word_counts = Counter(every_word)
        self.words = [w for w in every_word if self.word_counts[w] >= min_occ]

    def lookup_tables(self):
        """Return `(int_to_vocab, vocab_to_int)` for the words meeting `min_occ`.

        Ids are assigned by descending count, so id 0 is the most frequent
        word; words with equal counts keep the order of first appearance.
        """
        kept = [
            word
            for word, count in self.word_counts.most_common()
            if count >= self.min_occ
        ]

        int_to_vocab = dict(enumerate(kept))
        vocab_to_int = {word: idx for idx, word in enumerate(kept)}

        return int_to_vocab, vocab_to_int
        

In [13]:
# Build one vocabulary over all reviews joined into a single string;
# words seen fewer than 5 times are left out.
tp = TextPreProcessor(text=' '.join(review_texts), min_occ=5)
int_to_vocab, vocab_to_int = tp.lookup_tables()
# The whole corpus as a flat list of word ids.
valid_words = list(map(vocab_to_int.__getitem__, tp.words))

In [14]:
len(int_to_vocab) == len(vocab_to_int)

True

In [15]:
len(int_to_vocab)

76278

In [None]:
# Tokenize each review on its own (same punctuation handling as the corpus).
processed_reviews = [
    TextPreProcessor(review).text.split()
    for review in review_texts
]
# Map tokens to vocabulary ids, dropping tokens that are not in the vocabulary.
docs = [
    [vocab_to_int[token] for token in tokens if token in vocab_to_int]
    for tokens in processed_reviews
]

In [None]:
docs[:3]

### Subsampling

In [None]:
from collections import Counter
import numpy as np

def get_discard_prob(words, tr=1e-5):
    """Compute the subsampling discard value of every distinct word.

    For a word with relative frequency ``f`` the value is
    ``1 - sqrt(tr / f)``; it is negative for words with ``f < tr``.

    Args:
        words: iterable of hashable tokens (e.g. word ids) making up the corpus.
        tr: frequency threshold used in the formula above.

    Returns:
        dict mapping each distinct token to its discard value, keyed in order
        of first appearance. Empty input gives an empty dict.
    """
    words_count = Counter(words)
    # Derive the corpus size from the counts so plain iterators work too.
    total = sum(words_count.values())

    # Iterate over distinct words only: the value depends solely on the count,
    # so looping over every token would recompute the same entry repeatedly.
    discard_prob = {}
    for word, count in words_count.items():
        freq = count / total
        discard_prob[word] = 1 - np.sqrt(tr / freq)

    return discard_prob
    

In [None]:
discard_prob = get_discard_prob(valid_words)

In [None]:
len(discard_prob)

### Making and Generating Batches

In [None]:
def get_target(words, idx, max_window_size = 5):
    """Return the context words around position `idx` of `words`.

    A radius is drawn uniformly from 1..max_window_size (inclusive) and the
    elements up to that many positions before and after `idx` are returned,
    clipped at the ends of `words`; the element at `idx` itself is excluded.
    """
    radius = np.random.randint(1, max_window_size+1)
    # Never let the left edge go negative, which would wrap around the list.
    left_edge = max(idx - radius, 0)
    before = words[left_edge:idx]
    after = words[idx + 1:idx + radius + 1]
    return before + after

In [None]:
def get_batches(docs, batch_size = 32, window_size = 5):
    """Yield skip-gram `(x, y)` training pairs, document by document.

    Each document is first subsampled using the module-level `discard_prob`
    mapping (a word is kept when a fresh uniform draw exceeds its discard
    value). The surviving words are cut into consecutive chunks of exactly
    `batch_size` words; leftover words are dropped, so a document with fewer
    than `batch_size` surviving words yields nothing. Context words are taken
    from within the chunk only.

    Yields:
        x: list where each centre word is repeated once per context word.
        y: list of the matching context words, aligned with `x`.
    """
    for doc in docs:
        kept = []
        for token in doc:
            if np.random.random() > discard_prob[token]:
                kept.append(token)

        full_chunks = len(kept) // batch_size

        for chunk_no in range(full_chunks):
            offset = chunk_no * batch_size
            chunk = kept[offset:offset + batch_size]

            centres, contexts = [], []
            for pos, centre in enumerate(chunk):
                neighbours = get_target(chunk, pos, max_window_size=window_size)
                contexts.extend(neighbours)
                centres.extend([centre] * len(neighbours))
            yield centres, contexts

In [None]:
# Two toy "documents" of integer ids for trying out the batch generator.
int_text = [list(range(20)), list(range(30, 40))]

In [None]:
batch_generator = get_batches(int_text, batch_size=6, window_size=5)

In [None]:
#for x,y in get_batches(int_text, batch_size=6, window_size=5):
#    print('x\n',x)
#    print('y\n',y)

#while(next(batch_generator, None)):
#    print('x\n', x)
#    print('y\n', y)

### Word-Embedding Model

In [None]:
import torch
from torch import nn
from torch import optim
import torch.nn.functional as F

In [None]:
class Word2VevModel(nn.Module):
    """Embedding layer followed by a linear layer and a log-softmax.

    Args:
        n_vocab: number of rows of the embedding table and number of output
            scores per input id.
        emvec_size: length of each embedding vector.
    """

    def __init__(self, n_vocab, emvec_size):
        super().__init__()
        self.n_vocab, self.emvec_size = n_vocab, emvec_size

        self.embedding = nn.Embedding(n_vocab, emvec_size)
        self.outLayer = nn.Linear(in_features=emvec_size, out_features=n_vocab)
        # Normalises over dim 1, i.e. the vocabulary axis for a 1-D batch of ids.
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Return log-probabilities over the vocabulary for each id in `x`."""
        return self.softmax(self.outLayer(self.embedding(x)))
        

### Validation

In [None]:
def cosine_similarity(embedding, val_size=32, device='cpu'):
    """Cosine similarity between random validation words and the whole vocabulary.

    Half of the validation ids are sampled from 0..99 and half from
    1000..1099, so `embedding` must hold at least 1100 rows and
    `val_size // 2` must not exceed 100.

    Args:
        embedding: an `nn.Embedding` whose rows are the word vectors.
        val_size: total number of validation words (rounded down to even).
        device: device on which the validation id tensor is created.

    Returns:
        val_examples: LongTensor of the sampled word ids.
        similarities: tensor of shape (len(val_examples), n_rows) holding
            true cosine similarities; it is detached from autograd.
    """
    # pick N words from our ranges (0,window) and (1000,1000+window). lower id implies more frequent
    half = val_size // 2
    sampled_ids = random.sample(range(100), half) + random.sample(range(1000, 1100), half)
    val_examples = torch.tensor(sampled_ids, dtype=torch.long, device=device)

    # Validation only inspects the vectors, so no autograd graph is needed.
    with torch.no_grad():
        embed_vectors = embedding.weight
        val_vectors = embedding(val_examples)

        # Clamp the norms so an all-zero vector gives similarity 0 instead of NaN.
        tiny = 1e-12
        magnitudes = embed_vectors.pow(2).sum(dim=1).sqrt().clamp_min(tiny)
        val_magnitudes = val_vectors.pow(2).sum(dim=1, keepdim=True).sqrt().clamp_min(tiny)

        # Divide by BOTH norms; dividing by the vocabulary norms alone leaves
        # each row scaled by the length of its validation vector.
        similarities = torch.mm(val_vectors, embed_vectors.t()) / (val_magnitudes * magnitudes)

    return val_examples, similarities

### Training

In [None]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

epochs = 50
emsize = 300  # length of each embedding vector
n_vocab = len(int_to_vocab)

model = Word2VevModel(n_vocab=n_vocab, emvec_size=emsize)
model = model.to(device)
# The model outputs log-probabilities, which is what NLLLoss expects.
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [None]:
print(model)

In [None]:
print_every = 500
step = 0

In [None]:
# Train the skip-gram model; every `print_every` optimisation steps, print the
# five nearest vocabulary words for a random sample of validation words.
for epoch in range(epochs):

    for inputs, targets in get_batches(docs=docs, batch_size=32, window_size=4):
        step+=1
        inputs, targets = torch.LongTensor(inputs).to(device), torch.LongTensor(targets).to(device)
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if step % print_every == 0:
            # Validation only reads the embeddings: disable autograd so no
            # graph is built for the (validation words x vocabulary) matrix.
            with torch.no_grad():
                val_examples, similarities = cosine_similarity(model.embedding, device=device)
                # Take `.indices` rather than unpacking into `_`, which would
                # shadow IPython's last-output variable.
                topids = similarities.topk(5).indices
            val_examples, topids = val_examples.to('cpu'), topids.to('cpu')
            for idx, ex in enumerate(val_examples):
                words = [int_to_vocab[ii] for ii in topids[idx].tolist()]
                print(int_to_vocab[ex.item()] + " | " + ' '.join(words) + '\n')
            print("******")