In [1]:
import torch
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Set default to run on the GPU if available (for the speed up)
if torch.cuda.is_available():
    torch.cuda.set_device(device)
torch.set_default_tensor_type('torch.cuda.FloatTensor')


from torch.utils.data import Dataset, DataLoader, random_split

As a first step we prepare the data. We will do this with the help of keras (because it has done a lot of the heavy lifting for us) 

In [2]:
import tensorflow.keras.datasets.imdb as imdb

In [3]:
### we use a dataset, but we cheat a little by using keras
import torch

maxlen = 250
num_words = 2500
class ImdbDatasetTrain(Dataset):
    
    '''
    Datasets needs an __init__, __get_item__ and __len__
    '''
    
    def __init__(self):
        self.samples = []
        (x_train, y_train), (x_test, y_test) = imdb.load_data(skip_top=10, num_words=num_words, oov_char=2, maxlen=maxlen)

        x_train = [x + [0] * (maxlen- len(x)) for x in x_train]
        
        
        for x, y in zip(x_train, y_train):
            self.samples.append((torch.LongTensor(x).to(device), y))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]
    

In [4]:
dataset = ImdbDatasetTrain()
dataloader = DataLoader(dataset, batch_size=64, shuffle=True, )

In [5]:
for i, (review, cat) in enumerate(dataloader):
    print(review)

tensor([[  2,  60, 151,  ...,   0,   0,   0],
        [  2,   2, 868,  ...,   0,   0,   0],
        [  2,  61,   2,  ...,   0,   0,   0],
        ...,
        [  2,  14,   2,  ...,   0,   0,   0],
        [  2,  13,  28,  ...,   0,   0,   0],
        [  2,   2,  20,  ...,   0,   0,   0]])
tensor([[   2,  207,  110,  ...,    0,    0,    0],
        [   2,   39,    2,  ...,    0,    0,    0],
        [   2,  449,   61,  ...,    0,    0,    0],
        ...,
        [   2,   13,   16,  ...,    0,    0,    0],
        [   2,    2, 1080,  ...,    0,    0,    0],
        [   2,    2,    2,  ...,    0,    0,    0]])
tensor([[  2,  14,  20,  ...,   0,   0,   0],
        [  2,   2, 711,  ...,   0,   0,   0],
        [  2,  14,  16,  ...,   0,   0,   0],
        ...,
        [  2,   2, 687,  ...,   0,   0,   0],
        [  2,  45, 210,  ...,   0,   0,   0],
        [  2,  19,  14,  ...,   0,   0,   0]])
tensor([[  2,  13, 115,  ...,   0,   0,   0],
        [  2,  14,  20,  ...,   0,   0,   0],
  

        [   2, 1238,   24,  ...,    0,    0,    0]])
tensor([[   2,    2,  612,  ...,    0,    0,    0],
        [   2, 2476, 1782,  ...,    0,    0,    0],
        [   2,   14,    2,  ...,    0,    0,    0],
        ...,
        [   2,   14,   20,  ...,    0,    0,    0],
        [   2,    2,    2,  ...,    0,    0,    0],
        [   2, 1318,    2,  ...,    0,    0,    0]])
tensor([[   2,   12,   16,  ...,    0,    0,    0],
        [   2,   14,   20,  ...,    0,    0,    0],
        [   2,   13, 2083,  ...,    0,    0,    0],
        ...,
        [   2,   13,   28,  ...,    0,    0,    0],
        [   2,   14, 2158,  ...,    0,    0,    0],
        [   2,    2,    2,  ...,    0,    0,    0]])
tensor([[  2,   2,   2,  ...,   0,   0,   0],
        [  2,  14,  20,  ...,   0,   0,   0],
        [  2,   2,   2,  ...,   0,   0,   0],
        ...,
        [  2,  14, 218,  ...,   0,   0,   0],
        [  2,  14,   2,  ...,   0,   0,   0],
        [  2,   2,   2,  ...,   0,   0,   0]])
tenso

        [   2,  146,    2,  ...,    0,    0,    0]])
tensor([[   2, 1064,    2,  ...,    0,    0,    0],
        [   2,   17,    2,  ...,    0,    0,    0],
        [   2,  133,  266,  ...,    0,    0,    0],
        ...,
        [   2, 1038,   45,  ...,    0,    0,    0],
        [   2,   14,   20,  ...,    0,    0,    0],
        [   2,   14,   86,  ...,    0,    0,    0]])
tensor([[   2,    2,    2,  ...,    0,    0,    0],
        [   2,  308,    2,  ...,    0,    0,    0],
        [   2,  988, 2069,  ...,    0,    0,    0],
        ...,
        [   2,   13, 1610,  ...,    0,    0,    0],
        [   2,    2,    2,  ...,    0,    0,    0],
        [   2,  207,  110,  ...,    0,    0,    0]])
tensor([[   2,   13,  377,  ...,    0,    0,    0],
        [   2,    2,    2,  ...,    0,    0,    0],
        [   2,    2,  427,  ...,    0,    0,    0],
        ...,
        [   2,   12,  203,  ...,    0,    0,    0],
        [   2,   81,   25,  ...,    0,    0,    0],
        [   2,    2, 1

        [   2,   13,   92,  ...,    0,    0,    0]])
tensor([[   2,    2, 1171,  ...,    0,    0,    0],
        [   2,  237,   14,  ...,    0,    0,    0],
        [   2,  869,    2,  ...,    0,    0,    0],
        ...,
        [   2,    2,   64,  ...,    0,    0,    0],
        [   2,   13,   92,  ...,    0,    0,    0],
        [   2,  137,   13,  ...,    0,    0,    0]])
tensor([[   2,    2,  108,  ...,    0,    0,    0],
        [   2, 1758,    2,  ...,    0,    0,    0],
        [   2,    2, 1180,  ...,    0,    0,    0],
        ...,
        [   2, 1977,    2,  ...,    0,    0,    0],
        [   2,   13,  219,  ...,    0,    0,    0],
        [   2,   13,   92,  ...,    0,    0,    0]])
tensor([[ 2, 13, 43,  ...,  0,  0,  0],
        [ 2, 14,  2,  ...,  0,  0,  0],
        [ 2, 54, 13,  ...,  0,  0,  0],
        ...,
        [ 2, 51, 16,  ...,  0,  0,  0],
        [ 2, 13, 28,  ...,  0,  0,  0],
        [ 2,  2,  2,  ...,  0,  0,  0]])
tensor([[  2,  13, 165,  ...,   0,   0,  

        [  2,  48,  25,  ...,   0,   0,   0]])
tensor([[   2,   13,   62,  ...,    0,    0,    0],
        [   2,   35,    2,  ...,   31,    0,    0],
        [   2,   18,   49,  ...,    0,    0,    0],
        ...,
        [   2,    2,  392,  ...,    0,    0,    0],
        [   2, 1722,    2,  ...,    0,    0,    0],
        [   2,    2, 2499,  ...,    0,    0,    0]])
tensor([[   2,   31,    2,  ...,    0,    0,    0],
        [   2, 1490, 2350,  ...,    0,    0,    0],
        [   2,   48,    2,  ...,    0,    0,    0],
        ...,
        [   2,   13,  197,  ...,    0,    0,    0],
        [   2,   14,   20,  ...,    0,    0,    0],
        [   2,   13,  244,  ...,    0,    0,    0]])
tensor([[  2, 280,  54,  ...,   0,   0,   0],
        [  2,  50,   2,  ...,   0,   0,   0],
        [  2,  14, 509,  ...,   0,   0,   0],
        ...,
        [  2,  13, 219,  ...,   0,   0,   0],
        [  2, 207, 332,  ...,   0,   0,   0],
        [  2,  35,   2,  ...,   0,   0,   0]])
tensor([[  

        [  2,  14,  16,  ...,   0,   0,   0]])
tensor([[   2,  670, 1848,  ...,    0,    0,    0],
        [   2,   23,    2,  ...,    0,    0,    0],
        [   2,    2,    2,  ...,    0,    0,    0],
        ...,
        [   2,   13,   43,  ...,    0,    0,    0],
        [   2,   13, 1033,  ...,    0,    0,    0],
        [   2,   13,  520,  ...,    0,    0,    0]])
tensor([[   2,  308,    2,  ...,    0,    0,    0],
        [   2,  488,    2,  ...,    0,    0,    0],
        [   2,    2,  235,  ...,    0,    0,    0],
        ...,
        [   2,   92, 1632,  ...,    0,    0,    0],
        [   2,   13,   92,  ...,    0,    0,    0],
        [   2,   13,  258,  ...,    0,    0,    0]])
tensor([[   2,  153,  596,  ...,    0,    0,    0],
        [   2,  314,    2,  ...,    0,    0,    0],
        [   2,   25,  104,  ...,    0,    0,    0],
        ...,
        [   2,   61,  931,  ...,    0,    0,    0],
        [   2, 1402,   14,  ...,    0,    0,    0],
        [   2,    2,  486,  

In [6]:
#Next we need to create 

In [7]:
import torch.nn as nn 
from torch.nn import functional as F
n_hidden = 128


class LSTMModel(nn.Module):

    def __init__(self, num_words, n_hidden):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(num_words, n_hidden)
        self.LSTM = nn.LSTM(n_hidden, n_hidden, batch_first=True,)
        self.fc = nn.Linear(n_hidden, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, review):
        
        x = self.embedding(review)
        
        rnn_out, _ = self.LSTM(x)
     
        fc_out = self.fc(rnn_out[:, -1 ]) # Only need the last output of the rnn
        
        return fc_out
        
    
model = LSTMModel(num_words + 2, n_hidden)
model.to(device)

LSTMModel(
  (embedding): Embedding(2502, 128)
  (LSTM): LSTM(128, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [8]:
import torch
lr=0.01

criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
n_epochs = 50

In [9]:
loss_history = []
test_history = []
acc_history = []
n_epochs = 20
for epoch in range(1, n_epochs + 1):
    epoch_total_loss = 0
    for i, (review, cat) in enumerate(dataloader):

        optimizer.zero_grad() # Clears existing gradients from previous epoch
        output = model(review)

  
        loss = criterion(output, cat.float().view(-1, 1))
        loss.backward() # Does backpropagation and calculates gradients
        optimizer.step() # Updates the weights accordingly

        epoch_total_loss += loss.item() # Keep track of the total loss
    loss_history.append(epoch_total_loss/len(dataloader))


    print('Epoch: {}/{}.............'.format(epoch, n_epochs), end=' ')
    print("Loss: {:.4f}".format(epoch_total_loss/ len(dataloader)))

Epoch: 1/20............. Loss: 0.6990
Epoch: 2/20............. Loss: 0.6934
Epoch: 3/20............. Loss: 0.6623
Epoch: 4/20............. Loss: 0.5556
Epoch: 5/20............. Loss: 0.5100
Epoch: 6/20............. Loss: 0.4842
Epoch: 7/20............. Loss: 0.4530
Epoch: 8/20............. Loss: 0.4152
Epoch: 9/20............. Loss: 0.3950
Epoch: 10/20............. Loss: 0.3809
Epoch: 11/20............. Loss: 0.3693
Epoch: 12/20............. Loss: 0.3491
Epoch: 13/20............. Loss: 0.3474
Epoch: 14/20............. Loss: 0.3365
Epoch: 15/20............. Loss: 0.3249
Epoch: 16/20............. Loss: 0.3154
Epoch: 17/20............. Loss: 0.3074
Epoch: 18/20............. Loss: 0.3006
Epoch: 19/20............. Loss: 0.2932
Epoch: 20/20............. Loss: 0.2834


### Pre trained word embeddings.

next we will use pretrained word embeddings to get a better result.

In [17]:
word_indexes = imdb.get_word_index()
reversed_word_indexes = {index: key for key, index in word_indexes.items()}

In [21]:
reversed_word_indexes[2]

'and'

In [26]:
### we use a dataset, but we cheat a little by using keras
import torch

maxlen = 250
num_words = 2500
class ImdbDatasetWordsTrain(Dataset):
    
    '''
    Datasets needs an __init__, __get_item__ and __len__
    '''
    
    def __init__(self,):
        self.samples = []
        (x_train, y_train), (x_test, y_test) = imdb.load_data(skip_top=10, num_words=num_words, oov_char=2, maxlen=maxlen)

        x_train = [x + [0] * (maxlen- len(x)) for x in x_train]

        for x, y in zip(x_train, y_train):
            
            self.samples.append(self.get_word_from_index(x).to(device), y))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]
    
    
    def get_word_from_index(self, index):
        word = ""
        try:
            word = reversed_word_indexes[index]
        except:
            word = "<unk>"
        return word

In [27]:
dataset = ImdbDatasetWordsTrain()
dataloader = DataLoader(dataset, batch_size=64, shuffle=True, )

TypeError: new(): invalid data type 'str'

In [42]:
from torchtext.vocab import Vectors 

In [43]:
glove = Vectors(name="./glove/glove.6B.100d.txt")

100%|██████████████████████████████████████████████████████████████████████▉| 399999/400000 [00:38<00:00, 10489.28it/s]


BadZipFile: File is not a zip file

In [44]:
print(dir(glove))

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', 'cache', 'dim', 'get_vecs_by_tokens', 'itos', 'stoi', 'unk_init', 'vectors']


In [1]:
from torchtext import data, datasets
# set up fields
TEXT = data.Field(lower=True, include_lengths=True, batch_first=True)
LABEL = data.Field(sequential=False)

# make splits for data
train, test = datasets.IMDB.splits(TEXT, LABEL)

# build the vocabulary
TEXT.build_vocab(train, vectors=glove)
LABEL.build_vocab(train)

# make iterator for splits
train_iter, test_iter = data.BucketIterator.splits(
    (train, test), batch_size=3, device=0)

NameError: name 'glove' is not defined

In [65]:
for i, batch in enumerate(train_iter):
    print(batch.text)
    print(TEXT.vocab.vectors[batch.text])
    break

(tensor([[1952, 2188,   48,  ...,    1,    1,    1],
        [   9,   82,  200,  ...,   12,   39, 4229],
        [  10,    7,   30,  ...,    1,    1,    1]]), tensor([194, 361, 187]))


IndexError: shape mismatch: indexing tensors could not be broadcast together with shapes [3, 361], [3]

In [73]:
TEXT.vocab.vectors[TEXT.vocab.stoi['']]

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.])

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'cache',
 'dim',
 'get_vecs_by_tokens',
 'itos',
 'stoi',
 'unk_init',
 'vectors']

In [111]:
type(train_iter)

torchtext.data.iterator.BucketIterator

In [138]:
vocab_size = len(TEXT.vocab)
embedding_dim = 100
n_hidden = 64
n_out = 2

class EmbeddingModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_hidden, n_out, pretrained_vec):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.n_hidden = n_hidden
        self.n_out = n_out
        
        
        self.emb = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.emb.weight.data.copy_(pretrained_vec) # load pretrained vectors
        self.emb.weight.requires_grad = False # make embedding non trainable
        self.gru = nn.GRU(self.embedding_dim, self.n_hidden)
        self.out = nn.Linear(self.n_hidden, self.n_out)
        
    def forward(self, seq):
        embs = self.emb(seq)
        gru_out, self.h = self.gru(embs)
        outp = self.out(gru_out[:, -1])
        return outp
    

          

In [139]:
m = EmbeddingModel(vocab_size, embedding_dim, n_hidden, 1, 
                             train.fields['text'].vocab.vectors)

In [140]:
train_loader = torch.utils.data.DataLoader(dataset=train_iter, batch_size=3, shuffle=True)


In [145]:
loss_history = []
test_history = []
acc_history = []
n_epochs = 5

criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, m.parameters()), 1e-3)
for epoch in range(1, n_epochs + 1):
    epoch_total_loss = 0
    for i, batch in enumerate(train_iter):

        optimizer.zero_grad() # Clears existing gradients from previous epoch
        
        output = m(batch.text[0])
        
        
        
        loss = criterion(output, batch.label.float().view(-1, 1) - 1)
        loss.backward() # Does backpropagation and calculates gradients
        optimizer.step() # Updates the weights accordingly

        epoch_total_loss += loss.item() # Keep track of the total loss
    loss_history.append(epoch_total_loss/len(dataloader))

        
    print('Epoch: {}/{}.............'.format(epoch, n_epochs), end=' ')
    print("Loss: {:.4f}".format(epoch_total_loss/ len(dataloader)))

Epoch: 1/5............. Loss: 24314.4346
Epoch: 2/5............. Loss: 2148.3592
Epoch: 3/5............. Loss: 31.3222
Epoch: 4/5............. Loss: 28.5721
Epoch: 5/5............. Loss: 28.4550
