In [2]:
import torch
from torch import nn
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
import logging 
from d2l import torch as d2l

# Ask d2l for the list of devices to use; the result is kept in `devices`
# and printed so the available hardware is visible in the notebook output.
devices = d2l.try_all_gpus()

print(devices)

[device(type='cuda', index=0)]


In [4]:
import os
import numpy as np
# Source of the helpers below (MXNet variant of d2l):
# https://classic.d2l.ai/_modules/d2l/mxnet.html#load_data_imdb

# Register the IMDb review archive under the key 'aclImdb' in d2l.DATA_HUB as
# a (url, hex digest) pair; load_data_imdb passes this key to
# d2l.download_extract.
# NOTE(review): the second element is presumably the archive's SHA-1
# checksum -- confirm against d2l.download.
d2l.DATA_HUB['aclImdb'] = (
    'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz',
    '01ada507287d82875905620988597833ad4e0903')

def read_imdb(data_dir, is_train):
    """Load the raw IMDb reviews and their sentiment labels from disk.

    Every file found under ``<data_dir>/<train|test>/pos`` and then
    ``<data_dir>/<train|test>/neg`` is read as bytes, decoded as UTF-8 and
    stripped of newline characters.

    Args:
        data_dir: Root directory of the extracted dataset.
        is_train: If truthy read the ``train`` split, otherwise ``test``.

    Returns:
        A ``(reviews, labels)`` pair of equally long lists; a label is 1 for
        a review from the ``pos`` folder and 0 for one from ``neg``.
    """
    split = 'train' if is_train else 'test'
    reviews, sentiments = [], []
    for polarity, target in (('pos', 1), ('neg', 0)):
        review_dir = os.path.join(data_dir, split, polarity)
        for name in os.listdir(review_dir):
            # Read in binary mode so no newline translation happens before
            # the explicit '\n' removal below.
            with open(os.path.join(review_dir, name), 'rb') as handle:
                raw = handle.read()
            reviews.append(raw.decode('utf-8').replace('\n', ''))
            sentiments.append(target)
    return reviews, sentiments

def load_data_imdb(batch_size, num_steps=500):
    """Return data iterators and the vocabulary of the IMDb review dataset.

    Downloads/extracts the 'aclImdb' archive, tokenizes the reviews by word,
    builds a vocabulary from the training tokens (tokens seen fewer than 5
    times are dropped) and truncates/pads every review to ``num_steps``
    token indices.

    Args:
        batch_size: Minibatch size passed to ``d2l.load_array``.
        num_steps: Fixed sequence length each review is truncated/padded to.

    Returns:
        ``(train_iter, test_iter, vocab)``.
    """
    data_dir = d2l.download_extract('aclImdb', 'aclImdb')
    train_data = read_imdb(data_dir, True)
    test_data = read_imdb(data_dir, False)
    train_tokens = d2l.tokenize(train_data[0], token='word')
    test_tokens = d2l.tokenize(test_data[0], token='word')
    vocab = d2l.Vocab(train_tokens, min_freq=5)
    # Features and labels must be torch tensors: the PyTorch flavour of
    # d2l.load_array wraps them in a TensorDataset, which rejects NumPy
    # arrays and plain Python lists (the original code was the MXNet variant).
    train_features = torch.tensor([d2l.truncate_pad(
        vocab[line], num_steps, vocab['<pad>']) for line in train_tokens])
    test_features = torch.tensor([d2l.truncate_pad(
        vocab[line], num_steps, vocab['<pad>']) for line in test_tokens])
    train_iter = d2l.load_array((train_features, torch.tensor(train_data[1])),
                                batch_size)
    test_iter = d2l.load_array((test_features, torch.tensor(test_data[1])),
                               batch_size,
                               is_train=False)
    return train_iter, test_iter, vocab


In [None]:
# Minibatch size used for both the training and the test iterator.
batch_size = 64
# NOTE(review): this calls the load_data_imdb shipped with the d2l package,
# not the function of the same name defined earlier in this notebook --
# confirm which one is intended.
train_iter, test_iter, vocab = d2l.load_data_imdb(batch_size)

In [4]:
class BiRNN(nn.Module):
  """Bidirectional LSTM classifier over token-index sequences.

  Token indices are embedded, run through a multi-layer bidirectional LSTM,
  and the encoder outputs at the first and last time step are concatenated
  and mapped to 2 output logits by a linear layer.

  Args:
    vocab_size: Number of rows in the embedding table.
    embed_size: Dimension of each token embedding.
    num_hiddens: Hidden size of the LSTM (per direction).
    num_layers: Number of stacked LSTM layers.
    **kwargs: Forwarded to ``nn.Module.__init__``.
  """

  def __init__(self, vocab_size, embed_size, num_hiddens, num_layers, **kwargs):
    super(BiRNN, self).__init__(**kwargs)

    # self.embedding = BertModel.from_pretrained('bert-base-uncased')
    self.embedding = nn.Embedding(vocab_size, embed_size)
    # `bidirectional=True` doubles the feature size of the encoder output
    # (forward and backward hidden states are concatenated).
    self.encoder = nn.LSTM(embed_size, num_hiddens, num_layers=num_layers,
                           bidirectional=True)

    # Input is the concatenation of two time steps, each 2 * num_hiddens wide.
    self.decoder = nn.Linear(4 * num_hiddens, 2)

  def forward(self, inputs):
    """Compute class logits for a batch of token-index sequences.

    Args:
      inputs: Integer tensor of shape (batch, steps).

    Returns:
      Tensor of shape (batch, 2) holding the decoder outputs.
    """
    # The LSTM is not batch-first, so transpose to (steps, batch) before
    # embedding; embeddings has shape (steps, batch, embed_size).
    embeddings = self.embedding(inputs.T)
    self.encoder.flatten_parameters()
    # outputs shape = (steps, batch, 2 * num_hiddens)
    outputs, _ = self.encoder(embeddings)

    # Join the first and last time steps: (batch, 4 * num_hiddens).
    encoding = torch.cat((outputs[0], outputs[-1]), dim=1)

    outs = self.decoder(encoding)

    return outs
 

In [None]:
# Model hyperparameters and the devices to run on.
embed_size, num_hiddens, num_layers, devices = 100, 100, 2, d2l.try_all_gpus()
# BiRNN expects (vocab_size, embed_size, num_hiddens, num_layers); embed_size
# was previously omitted, which shifted the remaining arguments and left
# num_layers unset.
net = BiRNN(len(vocab), embed_size, num_hiddens, num_layers)