<a href="https://colab.research.google.com/github/ivyclare/Project-50_Projects_In_Deep_Learning/blob/master/MovieReviewsWithRNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns

import torch
import torch.nn.functional as F
from torchtext import datasets
from torchtext import data
import torch.optim as optim
from torch import nn,optim
import torch.nn.functional as F
from torch.utils.data import *

import random
import matplotlib.pyplot as plt
%matplotlib inline

import time
import json
import copy
import os
import glob

from PIL import Image

# check if CUDA is available
train_on_gpu = torch.cuda.is_available()

if not train_on_gpu:
  device = torch.device('cpu') 
  print('CUDA is not available.  Training on CPU ...')
else:
  device = torch.device('cuda')
  print('CUDA is available!  Training on GPU ...')
    
device


CUDA is available!  Training on GPU ...


device(type='cuda')

In [0]:
# set up fields
TEXT = data.Field(tokenize='spacy')
LABEL = data.LabelField(dtype=torch.float)

In [0]:
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
print('Number of training examples:', len(train_data))
print('Number of testing examples:', len(test_data))

aclImdb_v1.tar.gz:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:03<00:00, 21.9MB/s]


Number of training examples: 25000
Number of testing examples: 25000


In [0]:
print (vars(train_data.examples[2]))


{'text': ['This', 'is', 'still', 'the', 'benchmark', 'to', 'judge', 'all', 'Golden', 'Age', 'whodunnits', 'by', ',', 'and', 'taking', 'into', 'account', 'the', 'limited', 'technology', 'and', 'dubious', 'ethical', 'standards', 'of', 'the', 'authorities', '(', 'on', 'screen', ')', 'bears', 'up', 'well', 'against', 'all', 'generations', 'of', 'similar', 'attempts', 'since', 'on', 'film', 'and', 'TV', '.', 'Fast', 'and', 'furious', 'with', 'plenty', 'of', 'Warner', 'Bros', 'wipes', ',', 'and', 'thankfully', 'no', 'time', 'for', 'a', 'love', 'interest', 'it', 'gallops', 'along', ',', 'taking', 'the', 'splendid', 'cast', 'with', 'it', 'to', 'the', 'violent', 'end', '.', 'I', 'never', 'understood', 'why', 'the', 'DA', 'had', 'to', 'trail', 'Vance', 'around', 'everywhere', ',', 'I', 'always', 'thought', 'they', 'were', 'deskbound', '.', 'Palette', 'as', 'the', 'detective', 'but', 'especially', 'Girardot', 'as', 'the', 'doctor', 'are', 'delightfully', 'eccentric', 'and', 'un', '-', 'PC', '-', 

In [0]:
train_data, val_data = train_data.split(split_ratio=0.8)
print('Number of training examples:', len(train_data))
print('Number of validation examples:', len(val_data))


Number of training examples: 20000
Number of validation examples: 5000


### Glove

In [0]:
TEXT.build_vocab(train_data, max_size=25000, vectors="glove.6B.100d")
LABEL.build_vocab(train_data)

.vector_cache/glove.6B.zip: 862MB [00:22, 38.9MB/s]                           
100%|█████████▉| 399495/400000 [00:23<00:00, 17055.76it/s]

In [0]:
print('Unique tokens in TEXT vocabulary:', len(TEXT.vocab))
print('Unique tokens in LABEL vocabulary:',len(LABEL.vocab))

Unique tokens in TEXT vocabulary: 25002
Unique tokens in LABEL vocabulary: 2


In [0]:
pretrained_embeddings = TEXT.vocab.vectors
print(pretrained_embeddings.shape)

torch.Size([25002, 100])


In [0]:
print(TEXT.vocab.freqs.most_common(20))


[('the', 231074), (',', 220101), ('.', 188225), ('and', 125208), ('a', 124540), ('of', 115232), ('to', 106672), ('is', 86890), ('in', 70193), ('I', 61533), ('it', 60824), ('that', 55987), ('"', 50261), ("'s", 49451), ('this', 48132), ('-', 42522), ('/><br', 40805), ('was', 39925), ('as', 34669), ('with', 34123)]


In [0]:
BATCH_SIZE = 64

train_iterator, val_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, val_data, test_data),
    batch_size=BATCH_SIZE,
    device=device)

In [0]:
class LSTM_Model(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, hidden_dim, batch_size):
      
      super(LSTM_Model, self).__init__()
      self.num_layers = 1
      self.batch_size = batch_size
      self.hidden_dim = hidden_dim
      
      self.word_embeddings = nn.Embedding(vocab_size, embedding_dim) 
      # The LSTM takes word embeddings as inputs, and outputs hidden states
      # with dimensionality hidden_dim.
      self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=self.num_layers) 
      self.fc = nn.Linear(hidden_dim, 1)
      self.hidden = self.init_hidden()      
      
    def forward(self, sentence):
      
        embeds = self.word_embeddings(sentence)
        # [sent_len, batch_size] --> [sent_len, batch_size, emb_dim]
        lstm_out, self.hidden = self.lstm(embeds, self.hidden) 
        # [sent_len, batch_size, emb_dim] --> [seq_len, batch, num_directions*hidden_size]
        (hidden, cell) =  self.hidden
        preds = self.fc(lstm_out[-1].squeeze(0))
        # [batch, num_directions*hidden_size] --> [batch_size, 1]
        return preds     
      
    def init_hidden(self):
      # Before we've done anything, we dont have any hidden state.
      # The axes semantics are (num_layers, minibatch_size, hidden_dim)
      return (torch.zeros(self.num_layers, self.batch_size, self.hidden_dim).to(device),
                torch.zeros(self.num_layers, self.batch_size, self.hidden_dim).to(device))

In [0]:
model = LSTM_Model(vocab_size=len(TEXT.vocab), embedding_dim=300, hidden_dim=128, batch_size=BATCH_SIZE)
model.to(device)

100%|█████████▉| 399495/400000 [00:40<00:00, 17055.76it/s]

LSTM_Model(
  (word_embeddings): Embedding(25002, 300)
  (lstm): LSTM(300, 128)
  (fc): Linear(in_features=128, out_features=1, bias=True)
)

In [0]:
optimizer = optim.Adam(model.parameters(), lr=1e-3, amsgrad=True)
criterion = nn.BCEWithLogitsLoss()

In [0]:
# Create training and validation dataloaders
dataloaders_dict = {'train': train_iterator, 
                    'val': val_iterator}

In [0]:
def train_model(model, dataloaders, criterion, optimizer, num_epochs, batch_size=BATCH_SIZE):
    since = time.time()

    history = dict()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    skip_count = 0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0
            

            # Iterate over data.
            for data in dataloaders[phase]:
                inputs, labels = data.text, data.label
                inputs = inputs.to(device)
                labels = labels.to(device)
                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    # Get model outputs and calculate loss

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        # we need to clear out the hidden state of the LSTM,
                        # detaching it from its history on the last instance.
                        model.batch_size = inputs.shape[1]
                        model.hidden = model.init_hidden()
                        
                        outputs = model(inputs).squeeze(1)
                        loss = criterion(outputs, labels)
                        loss.backward()
                        optimizer.step()
                        
                    else:
                        model.batch_size = inputs.shape[1]
                        model.hidden = model.init_hidden()
                        outputs = model(inputs).squeeze(1)
                        loss = criterion(outputs, labels)


                # statistics
                running_loss += loss.item()
                outputs = torch.round(torch.sigmoid(outputs))
                corrects = (outputs == labels).float()
                acc = corrects.sum()/len(corrects)
                running_corrects += acc.item()

            epoch_loss = running_loss / len(dataloaders[phase])
            epoch_acc = running_corrects / len(dataloaders[phase])

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))

            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
            
            if phase+'_acc' in history:
                # append the new number to the existing array at this slot 
                                   history[phase+'_acc'].append(epoch_acc)
            else:
                # create a new array in this slot
                history[phase+'_acc'] = [epoch_acc]
            
            if phase+'_loss' in history:
                # append the new number to the existing array at this slot
                history[phase+'_loss'].append(epoch_loss)
            else:
                # create a new array in this slot
                history[phase+'_loss'] = [epoch_loss]            

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model, history
                                   

In [0]:
model, history = train_model(model, dataloaders_dict, criterion, optimizer, num_epochs=10)

Epoch 0/9
----------
train Loss: 0.6947 Acc: 0.5007
val Loss: 0.6941 Acc: 0.4792
Epoch 1/9
----------
train Loss: 0.6930 Acc: 0.4963
val Loss: 0.6986 Acc: 0.4939
Epoch 2/9
----------
train Loss: 0.6916 Acc: 0.5047
val Loss: 0.6917 Acc: 0.5180
Epoch 3/9
----------
train Loss: 0.6894 Acc: 0.5041
val Loss: 0.6989 Acc: 0.5239
Epoch 4/9
----------
train Loss: 0.6884 Acc: 0.4971
val Loss: 0.7014 Acc: 0.5275
Epoch 5/9
----------
train Loss: 0.6870 Acc: 0.5094
val Loss: 0.7044 Acc: 0.5281
Epoch 6/9
----------
train Loss: 0.6870 Acc: 0.5118
val Loss: 0.7080 Acc: 0.5105
Epoch 7/9
----------
train Loss: 0.6866 Acc: 0.5051
val Loss: 0.7067 Acc: 0.5487
Epoch 8/9
----------
train Loss: 0.6858 Acc: 0.5001
val Loss: 0.7183 Acc: 0.5336
Epoch 9/9
----------
train Loss: 0.6793 Acc: 0.5338
val Loss: 0.6679 Acc: 0.6230
Training complete in 14m 3s
Best val Acc: 0.623022


In [0]:
# summarize history for accuracy
plt.plot(history['train_acc'])
plt.plot(history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()
# summarize history for loss
plt.plot(history['train_loss'])
plt.plot(history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()