In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.optim import SGD, Adagrad # Adam doesn't currently support autograd with embedding layers
import numpy as np
from tflearn.data_utils import pad_sequences
import pickle as pkl
from keras.utils import to_categorical
import json
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from pytorch_imdb_datahandle import lazy_load_imdb_data
#from pytorch_TextRNN_model2 import TextRNN


# Notebook-wide hyperparameters.
MAX_FEATURE = 10000      # vocabulary size: loader max_features, embedding-matrix rows, TextRNN vocab_size
SENTENCE_LEN = 300       # sentence_len handed to lazy_load_imdb_data
EMBEDDING_DIMS = 100     # GloVe vector width; also the TextRNN embedding_dim
BATCH_SIZE = 50          # batch size for both DataLoaders
NGRAME_RANGE = 1         # n-gram range; same value as the ngram_range given to lazy_load_imdb_data
N_FILTERS = 100          # NOTE(review): not referenced by the visible cells - presumably from a CNN variant, confirm
FILTER_SIZES = [3,4,5]   # NOTE(review): not referenced by the visible cells - presumably from a CNN variant, confirm
OUTPUT_DIM = 2           # number of logits produced by TextRNN
HIDDEN_DIM = 256         # LSTM hidden size
DROPOUT = 0.5            # dropout probability passed to TextRNN
N_LAYERS = 2             # number of stacked LSTM layers
BIDIRECTIONAL = True     # whether the LSTM is bidirectional

Using TensorFlow backend.


In [3]:

def create_glove_embeddings(embedding_dims, max_feature, glove_path=None,
                            word_index_path='imdb_word_index.json'):
    """Build an embedding matrix for the IMDB vocabulary from pretrained GloVe vectors.

    The word index read from ``word_index_path`` is shifted by 3 so that
    indices 0-3 can hold the reserved tokens ``<PAD>``, ``<START>``,
    ``<UNK>`` and ``<UNUSED>``. Row ``i`` of the result holds the GloVe
    vector of the word with (shifted) index ``i``; rows for words missing
    from GloVe, and for the reserved tokens, stay all-zero.

    Args:
        embedding_dims: Width of the GloVe vectors (also selects the default file).
        max_feature: Number of rows in the matrix; words whose shifted index
            is ``>= max_feature`` are skipped.
        glove_path: Path of the GloVe text file. Defaults to the original
            hard-coded ``glove.6B.<dims>d.txt`` location.
        word_index_path: Path of the JSON ``{word: index}`` file.

    Returns:
        A ``torch`` tensor of shape ``(max_feature, embedding_dims)`` with the
        dtype of ``np.zeros`` (float64).
    """
    if glove_path is None:
        glove_path = '/liruishaer/Work2/NLP_models/glove.6B/glove.6B.%id.txt' % embedding_dims

    # A dictionary mapping words to an integer index: {word: index}
    with open(word_index_path, encoding='utf-8') as f:
        word_index = json.load(f)

    # The first indices are reserved
    word_index = {k: (v + 3) for k, v in word_index.items()}
    word_index["<PAD>"] = 0
    word_index["<START>"] = 1
    word_index["<UNK>"] = 2  # unknown
    word_index["<UNUSED>"] = 3

    print('Pretrained embeddings GloVe is loading...')

    embeddings_index = {}
    # Context manager closes the file even if a line fails to parse; the
    # explicit encoding avoids depending on the platform default.
    with open(glove_path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                continue  # tolerate blank lines (e.g. a trailing newline)
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
    print('Found %s word vectors in GloVe embedding' % len(embeddings_index))

    embedding_matrix = np.zeros((max_feature, embedding_dims))

    for word, i in word_index.items():
        if i >= max_feature:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return torch.tensor(embedding_matrix)


class MyData(Dataset):
    """Dataset that pairs row ``idx`` of ``x`` with entry ``idx`` of ``y``.

    Both arrays must expose ``.shape``; the shapes are printed on construction.
    """

    def __init__(self, x, y):
        self.x, self.y = x, y
        print(f"x.shape: {self.x.shape}")
        print(f"y.shape: {self.y.shape}")

    def __len__(self):
        # The number of labels defines the dataset size.
        n_items = self.y.shape[0]
        return n_items

    def __getitem__(self, idx):
        # The label is wrapped in a one-element list, so "y" has shape (1,).
        target = torch.LongTensor([self.y[idx]])
        features = torch.LongTensor(self.x[idx].tolist())
        return dict(x=features, y=target)


In [4]:

def test():
    """Evaluate the global ``model`` on ``testing_loader`` and print loss and accuracy.

    Uses the notebook globals ``model``, ``testing_loader`` and ``binary_loss``.
    The reported loss is the per-sample average: each batch loss (a batch mean,
    as produced by the default ``CrossEntropyLoss`` reduction) is weighted by
    the batch size before the total is divided by the sample count.
    Accuracy is reported in percent.
    """
    model.eval()
    total_loss = 0.0
    correct = 0
    n_samples = 0
    # No gradients are needed for evaluation; this avoids building the graph.
    with torch.no_grad():
        for batch in tqdm(testing_loader):
            batch_x = batch["x"]
            # Labels arrive as (batch, 1); the loss expects a flat (batch,) vector.
            batch_y = batch["y"].reshape(-1)
            outputs = model(batch_x)

            batch_size = batch_y.size(0)
            # Undo the batch mean so batches of different size are weighted correctly.
            total_loss += binary_loss(outputs, batch_y).item() * batch_size

            prediction = outputs.argmax(dim=1)
            correct += (prediction == batch_y).sum().item()
            n_samples += batch_size

    if n_samples == 0:
        print('No test samples to evaluate.')
        return

    test_loss = total_loss / n_samples
    accuracy = 100. * correct / n_samples
    print(f'Average Test loss: {test_loss}')
    print(f'Accuracy: {accuracy}')


In [5]:

# Load the train/test splits using the notebook-level hyperparameters
# (NGRAME_RANGE instead of a second hard-coded 1).
(x_train, y_train), (x_test, y_test) = lazy_load_imdb_data(
    ngram_range=NGRAME_RANGE, max_features=MAX_FEATURE, sentence_len=SENTENCE_LEN)
training_data = MyData(x_train, y_train)
testing_data = MyData(x_test, y_test)
# Shuffle the training batches so each epoch sees a different batch composition;
# evaluation order is left untouched.
training_loader = DataLoader(training_data, batch_size=BATCH_SIZE, shuffle=True)
testing_loader = DataLoader(testing_data, batch_size=BATCH_SIZE)


# Pretrained GloVe vectors arranged to match the vocabulary indices.
embedding_matrix = create_glove_embeddings(EMBEDDING_DIMS, MAX_FEATURE)


data-1-10000-300.pkl
lazy loading...
Lazy load successful
x.shape: (25000, 300)
y.shape: (25000,)
x.shape: (25000, 300)
y.shape: (25000,)
Pretrained embeddings GloVe is loading...
Found 400000 word vectors in GloVe embedding


In [11]:
import time

def train(epoch):
    """Run one pass over ``training_loader``, updating the global ``model``.

    Relies on the notebook globals ``model``, ``training_loader``,
    ``optimizer`` and ``binary_loss``. For every batch it prints the time
    taken by the forward/backward/optimizer step, then the batch loss and
    batch accuracy.

    Args:
        epoch: Zero-based epoch index; printed as ``epoch + 1``.
    """
    model.train()
    print('-' * 10)
    print(f'Epoch: {epoch+1}')

    for batch_num, batch in enumerate(training_loader, start=1):
        # Inputs as-is; labels flattened from (batch, 1) to (batch,).
        inputs = Variable(batch["x"])
        targets = Variable(batch['y'].reshape(1, -1).squeeze(0))

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Timed section: forward pass, loss, backward pass, parameter update.
        print('-------------------')
        t_begin = time.time()
        outputs = model(inputs)
        loss = binary_loss(outputs, targets)
        loss.backward()
        optimizer.step()
        t_finish = time.time()
        print('use_time:',t_finish-t_begin)
        print('************')

        # Accuracy on the batch that was just used for the update.
        _, prediction = outputs.data.max(1, keepdim=True)
        label = batch['y'].data
        n_correct = prediction.eq(torch.LongTensor(label)).sum()
        train_acc = n_correct.float() / len(inputs)
        print('batch:',batch_num,'\ttrain_loss:',loss.data,'\ttrain_acc:',train_acc)



In [12]:
class TextRNN(nn.Module):
    """LSTM text classifier: embedding -> (bi)LSTM -> dropout -> linear layer.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each embedding vector.
        hidden_dim: LSTM hidden size per direction.
        output_dim: Number of output logits.
        n_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM runs in both directions.
        dropout: Dropout probability applied to the embeddings, between LSTM
            layers (only when ``n_layers > 1``) and to the final hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()

        num_directions = 2 if bidirectional else 1

        self.embeds = nn.Embedding(vocab_size, embedding_dim)
        # Inter-layer dropout is meaningless for a single layer and makes
        # PyTorch emit a warning, so it is only enabled for stacked LSTMs.
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers,
                           bidirectional=bidirectional,
                           dropout=dropout if n_layers > 1 else 0.0)
        # The classifier input width follows the number of directions
        # instead of always assuming a bidirectional LSTM.
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return logits of shape ``[batch size, output_dim]``.

        Args:
            x: Integer token ids of shape ``[batch size, sent len]``.
        """
        # The LSTM is sequence-first: [batch size, sent len] -> [sent len, batch size]
        x = x.permute(1, 0)

        # embedded = [sent len, batch size, emb dim]
        embedded = self.dropout(self.embeds(x))

        # hidden = [num layers * num directions, batch size, hid dim]
        _, (hidden, _) = self.rnn(embedded)

        if self.rnn.bidirectional:
            # Last layer: final forward (hidden[-2]) and backward (hidden[-1]) states.
            final_state = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            # Last layer's only direction.
            final_state = hidden[-1, :, :]

        # final_state = [batch size, hid dim * num directions]
        # No squeeze here: squeezing dim 0 would drop the batch axis for a
        # batch of size 1 and break the loss computation.
        return self.fc(self.dropout(final_state))


In [13]:
# Build the classifier from the notebook-level hyperparameters.
model = TextRNN(
    vocab_size=MAX_FEATURE,
    embedding_dim=EMBEDDING_DIMS,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT,
)
# Overwrite the embedding table in place with the GloVe matrix.
model.embeds.weight.data.copy_(embedding_matrix)


tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [ 0.5569,  0.3345,  0.0683,  ...,  0.0375, -0.5230,  0.5233],
        [ 0.0453,  0.3146,  0.6410,  ..., -0.1689, -1.0540,  0.4726],
        [ 0.3994,  0.5463,  0.3801,  ...,  0.4579, -0.1834,  0.1226]])

In [14]:
# Adam with its default hyperparameters over every model parameter.
optimizer = optim.Adam(params=model.parameters())
# Cross-entropy on the raw logits returned by the model.
binary_loss = nn.CrossEntropyLoss()

In [18]:
# n_epochs = 1
# for i in range(n_epochs):
#     train(i)

In [50]:
# Manual single training step on the first batch only (debugging aid).
batch = next(iter(training_loader), None)
batch_num = 0 if batch is None else 1

# Inputs as-is; labels flattened from (batch, 1) to (batch,).
batch_x = Variable(batch["x"])
batch_y = Variable(batch['y'].reshape(1, -1).squeeze(0))

# Clear any stale gradients before the step.
optimizer.zero_grad()

# Timed section: forward pass, loss, backward pass, parameter update.
print('-------------------')
start = time.time()
outputs = model(batch_x)
loss = binary_loss(outputs, batch_y)
loss.backward()
optimizer.step()
end = time.time()
print('use_time:',end-start)
print('************')

# Accuracy on the batch that was just used for the update.
_, prediction = outputs.data.max(1, keepdim=True)
label = batch['y'].data
correct = prediction.eq(torch.LongTensor(label)).sum()
train_acc = correct.float() / len(batch_x)
print('batch:',batch_num,'\ttrain_loss:',loss.data,'\ttrain_acc:',train_acc)

-------------------
use_time: 23.362332344055176
************
batch: 1 	train_loss: tensor(0.6678) 	train_acc: tensor(0.5400)


In [57]:
# Inspect the correct-prediction count left by the single-batch step (a 0-dim tensor).
correct

tensor(27)

In [56]:
 # Batch accuracy as a plain Python float; divide by the actual batch length
 # rather than a hard-coded 50 so it stays right if the batch size changes.
 correct.item() / len(batch_x)

0.54

In [49]:
# Batch accuracy as a float tensor; divide by the actual batch length
# rather than a hard-coded 50 so it stays right if the batch size changes.
newc = correct.float() / len(batch_x)
newc

tensor(0.5200)

In [59]:
# Recount the correct predictions of the inspected batch as a plain Python int.
matches = prediction.eq(torch.LongTensor(label))
aa = matches.sum().item()
aa

27

In [54]:
# Loss of the single-batch step as a plain Python float.
loss.item()

0.6677905321121216