In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.optim import SGD, Adagrad # Adam doesn't currently support autograd with embedding layers
import numpy as np
from tflearn.data_utils import pad_sequences
import pickle as pkl
from keras.utils import to_categorical
import json
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from pytorch_imdb_datahandle import lazy_load_imdb_data


Using TensorFlow backend.


In [52]:

MAX_FEATURE = 10000
SENTENCE_LEN = 250
EMBEDDING_DIMS = 100
NGRAME_RANGE = 1
N_FILTERS = 50
FILTER_SIZES = [3,4,5]
OUTPUT_DIM = 2
DROPOUT = 0.5

In [53]:

class MyData(Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y
        print(f"x.shape: {self.x.shape}")
        print(f"y.shape: {self.y.shape}")

    def __len__(self):
        return self.y.shape[0]

    def __getitem__(self, idx):
        #         y_i = torch.FloatTensor(self.y[idx, :])
        #         x_i = torch.LongTensor(self.x[idx].tolist())

        y_i = torch.LongTensor([self.y[idx]])
        x_i = torch.LongTensor(self.x[idx].tolist())

        return {"x": x_i, "y": y_i}



In [54]:


def create_glove_embeddings(embedding_dims, max_feature):
    def get_word2index_dict(path='imdb_word_index.json'):
        with open(path) as f:
            return json.load(f)

    # A dictionary mapping words to an integer index
    word_index = get_word2index_dict()  # {word:index}

    # The first indices are reserved
    word_index = {k: (v + 3) for k, v in word_index.items()}
    word_index["<PAD>"] = 0
    word_index["<START>"] = 1
    word_index["<UNK>"] = 2  # unknown
    word_index["<UNUSED>"] = 3

    print('Pretrained embeddings GloVe is loading...')

    embeddings_index = {}
    f = open('/liruishaer/Work2/NLP_models/glove.6B/glove.6B.%id.txt' % embedding_dims)
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()
    print('Found %s word vectors in GloVe embedding' % len(embeddings_index))

    embedding_matrix = np.zeros((max_feature, embedding_dims))
    # embedding_matrix = torch.zeros(MAX_FEATURE, EMBEDDING_DIMS)

    for word, i in word_index.items():
        if i >= max_feature:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
            # embedding_matrix[i,:] = torch.from_numpy(embedding_vector)

    embedding_matrix = torch.tensor(embedding_matrix)
    return embedding_matrix



In [55]:
embedding_matrix = create_glove_embeddings(EMBEDDING_DIMS,MAX_FEATURE)

Pretrained embeddings GloVe is loading...
Found 400000 word vectors in GloVe embedding


In [56]:

(x_train, y_train), (x_test, y_test) = lazy_load_imdb_data(ngram_range=1, max_features=MAX_FEATURE, sentence_len=SENTENCE_LEN)
training_data = MyData(x_train, y_train)
testing_data = MyData(x_test, y_test)
training_loader = DataLoader(training_data, batch_size=444)
testing_loader = DataLoader(testing_data, batch_size=444)


data-1-10000-250.pkl
lazy loading...
Lazy load successful
x.shape: (25000, 250)
y.shape: (25000,)
x.shape: (25000, 250)
y.shape: (25000,)


## model

In [57]:
# binary_loss = nn.BCELoss()
binary_loss = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

In [58]:

from torch import nn
import torch
import torch.nn.functional as F



class TextCNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, dropout):
        super().__init__()

        self.embeds = nn.Embedding(vocab_size, embedding_dim)
        self.convs = nn.ModuleList(
            [nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size=(fs, embedding_dim)) for fs in filter_sizes])
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)
        print('init model.....')

    def forward(self, x):
        # x = [batch size, sent len]

        embedded = self.embeds(x)

        # embedded = [batch size, sent len, emb dim]

        embedded = embedded.unsqueeze(1)

        # embedded = [batch size, 1, sent len, emb dim]

        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]

        # conv_n = [batch size, n_filters, sent len - filter_sizes[n]]

        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        
        # pooled_n = [batch size, n_filters]

        cat = self.dropout(torch.cat(pooled, dim=1))

        # cat = [batch size, n_filters * len(filter_sizes)]
        
        z = self.fc(cat)

        z = z.squeeze(1)

        return z

In [59]:

def train(epoch):
    print('-' * 10)
    print(f'Epoch: {epoch+1}')
    # for batch in tqdm(training_loader):
    for batch in training_loader:
        # Get the inputs and wrap as Variables
        batch_x = Variable(batch["x"])
        # batch_y = Variable(batch["y"])
        batch_y = Variable(batch['y'].reshape(1, -1).squeeze(0))
        # zero the parameter gradients
        optimizer.zero_grad()
        # forward + backward + optimize
        outputs = model(batch_x)

#         print("outputs.shape:",outputs.shape)
#         print("batch_y.shape:",batch_y.shape)
#         print("outputs",outputs)
#         print("batch_y",batch_y)

        loss = binary_loss(outputs, batch_y)
        loss.backward()
        optimizer.step()
        
        test()


In [60]:

def test():
    model.eval()
    test_loss = 0
    correct = 0
    for batch in tqdm(testing_loader):
        batch_x = Variable(batch["x"])
        outputs = model(batch_x)
        batch_y = Variable(batch['y'].reshape(1,-1).squeeze(0))
        test_loss += binary_loss(outputs, batch_y)

        prediction = outputs.data.max(1, keepdim=True)[1]
        #label = batch_y.data.max(1, keepdim=True)[1]
        label = batch['y'].data
        correct += prediction.eq(torch.LongTensor(label)).sum()
    
    test_loss /= len(testing_loader.dataset)
    accuracy = 100. * correct / len(testing_loader.dataset)
    
    print(f'loss: {test_loss.data[0]} \t Accuracy: {accuracy}')
    
#     print(f'Average Test loss: {test_loss.data[0]}')
#     print(f'Accuracy: {accuracy}')

In [61]:
model = TextCNN(MAX_FEATURE, EMBEDDING_DIMS,N_FILTERS,FILTER_SIZES, OUTPUT_DIM, DROPOUT)
model.embeds.weight.data.copy_(embedding_matrix)

init model.....


tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [ 0.5569,  0.3345,  0.0683,  ...,  0.0375, -0.5230,  0.5233],
        [ 0.0453,  0.3146,  0.6410,  ..., -0.1689, -1.0540,  0.4726],
        [ 0.3994,  0.5463,  0.3801,  ...,  0.4579, -0.1834,  0.1226]])

In [None]:
n_epochs = 5

for i in range(n_epochs):
    train(i)

----------
Epoch: 1


 37%|███▋      | 21/57 [00:31<04:40,  7.80s/it]