# Amazon Reviews using Pytorch and TorchText

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import torchtext
from torchtext import data
from torch import optim
import torch
from torch import nn
import spacy
%matplotlib inline

In [2]:
df = pd.read_csv('/home/jupyter/work/data/Reviews_Sentiment.csv')
df.head()

Unnamed: 0,Text,Sentiment
0,I have bought several of the Vitality canned d...,1
1,Product arrived labeled as Jumbo Salted Peanut...,0
2,This is a confection that has been around a fe...,1
3,If you are looking for the secret ingredient i...,0
4,Great taffy at a great price. There was a wid...,1


In [3]:
df['Sentiment'].value_counts()

1    443777
0    124677
Name: Sentiment, dtype: int64

Uncomment the below step if you don't have spacy's model downloaded

In [4]:
#!python -m spacy download en_core_web_sm

Collecting en_core_web_sm==2.0.0 from https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz#egg=en_core_web_sm==2.0.0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz (37.4MB)
[K    100% |████████████████████████████████| 37.4MB 100.4MB/s ta 0:00:01
[?25hInstalling collected packages: en-core-web-sm
  Running setup.py install for en-core-web-sm ... [?25ldone
[?25hSuccessfully installed en-core-web-sm-2.0.0

[93m    Linking successful[0m
    /opt/anaconda3/lib/python3.7/site-packages/en_core_web_sm -->
    /opt/anaconda3/lib/python3.7/site-packages/spacy/data/en_core_web_sm

    You can now load the model via spacy.load('en_core_web_sm')



In [4]:
# create a tokenizer function
spacy_en = spacy.load('en_core_web_sm')

def tokenizer(text): 
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [5]:
TEXT = data.Field(sequential=True, tokenize=tokenizer, lower=True, batch_first=True)
LABEL = data.Field(sequential=False, use_vocab=False, is_target=True)

In [6]:
train = torchtext.data.TabularDataset(path='/home/jupyter/work/data/Reviews_Sentiment.csv', format='csv',
                                      fields=[('Text', TEXT), ('Sentiment', LABEL)])

In [7]:
len(train)

568455

In [8]:
train[0].__dict__.keys()

dict_keys(['Text', 'Sentiment'])

In [9]:
ex = train[10]
type(ex)

torchtext.data.example.Example

In [10]:
print(train[10].__dict__.keys())
print(train[10].Text[:10])

dict_keys(['Text', 'Sentiment'])
['this', 'is', 'a', 'very', 'healthy', 'dog', 'food', '.', 'good', 'for']


Uncomment the below 2 steps if you dont have Glove word embeddings

In [12]:
#!wget http://nlp.stanford.edu/data/glove.6B.zip

--2019-04-03 14:39:18--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2019-04-03 14:39:18--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2019-04-03 14:40:45 (9.49 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]



In [13]:
#!unzip /home/jupyter/work/data/glove.6B.zip

Archive:  /home/jupyter/work/data/glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [11]:
TEXT.build_vocab(train, min_freq=3)
TEXT.vocab.load_vectors(torchtext.vocab.Vectors('/home/jupyter/work/data/glove.6B.300d.txt'))
print(TEXT.vocab.vectors.shape)

torch.Size([79337, 300])


In [12]:
TEXT.vocab.vectors[TEXT.vocab.stoi['food']]

tensor([ 3.8544e-01,  3.4247e-01,  2.9599e-01, -2.6200e-01,  3.7383e-02,
         4.5544e-01,  4.9097e-01,  1.1481e-01, -1.1437e-01, -1.9067e+00,
         3.5563e-02, -1.1094e+00, -2.6512e-01,  6.4418e-01, -3.1008e-02,
        -3.5130e-01, -1.0547e-03,  7.4658e-02, -3.0369e-01, -2.8188e-01,
        -3.4342e-01,  3.6205e-01,  7.1009e-01,  3.0243e-01,  7.0325e-02,
         2.9492e-01, -1.6233e-01,  3.0998e-01,  1.3705e-01,  1.1847e-01,
        -6.8642e-01,  4.3305e-01, -6.1518e-01,  2.3643e-01, -8.4174e-01,
         1.4667e-01, -9.6616e-02, -2.0908e-01, -4.2296e-01, -2.7254e-01,
        -7.9343e-01, -6.2781e-01,  6.4804e-01,  1.1541e-01, -3.3486e-01,
        -1.4101e-01,  1.2864e-01, -2.5123e-01, -2.6515e-01,  3.0876e-01,
        -6.3111e-02,  1.7893e-01,  4.1197e-01,  1.9621e-02, -1.5406e-01,
         1.7542e-01,  3.9268e-01,  8.8817e-02,  1.8012e-02, -2.2508e-01,
        -3.1832e-01,  2.2296e-02,  5.9453e-01,  5.6538e-02, -7.2464e-01,
        -3.1751e-01, -3.8650e-01,  3.3806e-01, -1.6

In [31]:
random.seed(123)
train, val = train.split(split_ratio=0.9, random_state=random.getstate())

In [14]:
class BiLSTM(nn.Module):
    def __init__(self, pretrained_lm, padding_idx, static=True, hidden_dim=128, lstm_layer=2, dropout=0.2):
        super(BiLSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.dropout = nn.Dropout(p=dropout)
        self.embedding = nn.Embedding.from_pretrained(pretrained_lm)
        self.embedding.padding_idx = padding_idx
        if static:
            self.embedding.weight.requires_grad = False
        self.lstm = nn.LSTM(input_size=self.embedding.embedding_dim,
                            hidden_size=hidden_dim,
                            num_layers=lstm_layer, 
                            dropout = dropout,
                            bidirectional=True)
        self.hidden2label = nn.Linear(hidden_dim*lstm_layer*2, 1)
    
    def forward(self, sents):
        x = self.embedding(sents)
        x = torch.transpose(x, dim0=1, dim1=0)
        lstm_out, (h_n, c_n) = self.lstm(x)
        y = self.hidden2label(self.dropout(torch.cat([c_n[i,:, :] for i in range(c_n.shape[0])], dim=1)))
        return y

In [39]:
def training(epoch, model, eval_every, loss_func, optimizer, train_iter, val_iter, early_stop=1, warmup_epoch=2):
    
    step = 0
    max_loss = 1e5
    no_improve_epoch = 0
    no_improve_in_previous_epoch = False
    fine_tuning = False
    train_record = []
    val_record = []
    losses = []
    for e in range(epoch):
        train_iter.init_epoch()
        for train_batch in iter(train_iter):
            step += 1
            model.train()
            x = train_batch.Text.cuda()
            y = train_batch.Sentiment.type(torch.Tensor).cuda()
            #print(y.cpu().data.numpy())
            model.zero_grad()
            pred = model.forward(x).view(-1)
            loss = loss_function(pred, y)
            losses.append(loss.cpu().data.numpy())
            train_record.append(loss.cpu().data.numpy())
            loss.backward()
            optimizer.step()
            if step % eval_every == 0:
                model.eval()
                model.zero_grad()
                val_loss = []
                val_accuracy = []
                for val_batch in iter(val_iter):
                    val_x = val_batch.Text.cuda()
                    val_y = val_batch.Sentiment.type(torch.Tensor).cuda()
                    val_pred = model.forward(val_x).view(-1)
                    #m = nn.Sigmoid()
                    #print('val_pred ', val_pred)
                    #val_pred_sigmoid = m(val_pred)
                    #print('val_pred sigmoid ', val_pred_sigmoid)
                    #output = (val_pred_sigmoid > 0.5 ).float()
                    #correct = (output == val_y).float().sum()
                    val_loss.append(loss_function(val_pred, val_y).cpu().data.numpy())
                    #val_accuracy.append(correct.cpu().data.numpy())
                val_record.append({'step': step, 'loss': np.mean(val_loss)})
                print('epoch {:02} - step {:06} - train_loss {:.4f} - val_loss {:.4f}'.format(
                            e, step, np.mean(losses), val_record[-1]['loss']))

In [32]:
batch_size = 64
train_iter = torchtext.data.BucketIterator(dataset=train,
                                               batch_size=batch_size,
                                               sort_key=lambda x: x.TEXT.__len__(),
                                               shuffle=True,
                                               sort=False)
val_iter = torchtext.data.BucketIterator(dataset=val,
                                             batch_size=batch_size,
                                             sort_key=lambda x: x.TEXT.__len__(),
                                             train=False,
                                             sort=False)

In [17]:
len(train_iter), len(val_iter)

(7994, 889)

In [18]:
batch = next(iter(train_iter)) # BucketIterator return a batch object
print(type(batch))

<class 'torchtext.data.batch.Batch'>


In [19]:
print(batch.Sentiment.shape)

torch.Size([64])


In [20]:
print(batch.Text.shape)

torch.Size([64, 310])


In [21]:
valid_batch = next(iter(val_iter)) # BucketIterator return a batch object
print(type(valid_batch))

<class 'torchtext.data.batch.Batch'>


In [22]:
print(valid_batch.Sentiment)

tensor([1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0,
        1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1])


In [23]:
print(valid_batch.Text.shape)

torch.Size([64, 735])


In [24]:
model = BiLSTM(TEXT.vocab.vectors, lstm_layer=1, padding_idx=TEXT.vocab.stoi[TEXT.pad_token], hidden_dim=128).cuda()

  "num_layers={}".format(dropout, num_layers))


In [25]:
loss_function = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()),
                    lr=1e-3)

In [40]:
training(model=model, epoch=2, eval_every=1000,
         loss_func=loss_function, optimizer=optimizer, train_iter=train_iter,
        val_iter=val_iter)

epoch 00 - step 001000 - train_loss 0.1709 - val_loss 0.1691
epoch 00 - step 002000 - train_loss 0.1686 - val_loss 0.1664
epoch 00 - step 003000 - train_loss 0.1678 - val_loss 0.1631
epoch 00 - step 004000 - train_loss 0.1662 - val_loss 0.1603


ValueError: invalid literal for int() with base 10: 'Sentiment'