In [313]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.optim import SGD, Adagrad # Adam doesn't currently support autograd with embedding layers
import numpy as np
from tflearn.data_utils import pad_sequences
import pickle as pkl
from keras.utils import to_categorical
import json
from tqdm import tqdm

In [2]:
MAX_FEATURE = 10000
SENTENCE_LEN = 300
NGRAME_RANGE = 1
EMBEDDING_DIMS = 100

In [79]:
def create_ngram_set(input_list, ngram_value=2):
    return set(zip(*[input_list[i:] for i in range(ngram_value)]))


def add_ngram(sequences, token_indice, ngram_range=2):
    new_sequences = []
    for input_list in sequences:
        new_list = input_list[:]
        for ngram_value in range(2, ngram_range + 1):
            for i in range(len(new_list) - ngram_value + 1):
                ngram = tuple(new_list[i:i + ngram_value])
                if ngram in token_indice:
                    new_list.append(token_indice[ngram])
        new_sequences.append(new_list)

    return new_sequences

def load_imdb_data():
    def load_data(path='imdb.npz', num_words=None, skip_top=0, seed=113, start_char=1, oov_char=2, index_from=3):
        # 1. load data
        with np.load(path) as f:
            x_train, labels_train = f['x_train'], f['y_train']
            x_test, labels_test = f['x_test'], f['y_test']

        # 2. shuffle train/test
        np.random.seed(seed)
        indices = np.arange(len(x_train))
        np.random.shuffle(indices)
        x_train = x_train[indices]
        labels_train = labels_train[indices]

        indices = np.arange(len(x_test))
        np.random.shuffle(indices)
        x_test = x_test[indices]
        labels_test = labels_test[indices]

        xs = np.concatenate([x_train, x_test])
        labels = np.concatenate([labels_train, labels_test])

        # 保留前3个index
        if start_char is not None:
            xs = [[start_char] + [w + index_from for w in x] for x in xs]
        elif index_from:
            xs = [[w + index_from for w in x] for x in xs]

        if not num_words:
            num_words = max([max(x) for x in xs])

        # by convention, use 2 as OOV word
        # reserve 'index_from' (=3 by default) characters:
        # 0 (padding), 1 (start), 2 (OOV)
        if oov_char is not None:
            xs = [[w if (skip_top <= w < num_words) else oov_char for w in x] for x in xs]
        else:
            xs = [[w for w in x if skip_top <= w < num_words] for x in xs]

        idx = len(x_train)
        x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
        x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])

        return (x_train, y_train), (x_test, y_test)

    
    global MAX_FEATURE
    print('MAX_FEATURE:', MAX_FEATURE)
        
    
    # 1. load original data
    print('loading data...')
    (trainX, trainY), (testX, testY) = load_data(num_words=MAX_FEATURE)
    print('train_data length:',len(trainX))
    print('test_data length:',len(testX))
    
    # 2. add n-gram
    if NGRAME_RANGE > 1:
        print('Adding {}-gram features'.format(NGRAME_RANGE))
        # Create set of unique n-gram from the training set.
        ngram_set = set()
        for input_list in trainX:
            for i in range(2, NGRAME_RANGE + 1):
                set_of_ngram = create_ngram_set(input_list, ngram_value=i)
                ngram_set.update(set_of_ngram)

        # Dictionary mapping n-gram token to a unique integer.
        # Integer values are greater than max_features in order
        # to avoid collision with existing features.
        print('MAX_FEATURE:', MAX_FEATURE)
        start_index = MAX_FEATURE + 1
        token_indice = {v: k + start_index for k, v in enumerate(ngram_set)}
        indice_token = {token_indice[k]: k for k in token_indice}

        # max_features is the highest integer that could be found in the dataset.
        MAX_FEATURE = np.max(list(indice_token.keys())) + 1

        # Augmenting x_train and x_test with n-grams features
        trainX = add_ngram(trainX, token_indice, NGRAME_RANGE)
        testX = add_ngram(testX, token_indice, NGRAME_RANGE)
        print('Average train sequence length: {}'.format(np.mean(list(map(len, trainX)), dtype=int)))
        print('Average test sequence length: {}'.format(np.mean(list(map(len, testX)), dtype=int)))


    # 3.Data preprocessing      Sequence padding
    print("start padding & transform to one hot...")
    trainX = pad_sequences(trainX, maxlen=SENTENCE_LEN, value=0.)  # padding to max length
    testX = pad_sequences(testX, maxlen=SENTENCE_LEN, value=0.)  # padding to max length
    print('x_train shape:', trainX.shape)
    print('x_test shape:', testX.shape)

    print("end padding & transform to one hot...")
    return (trainX,trainY),(testX,testY)
#     return (trainX, to_categorical(trainY)), (testX, to_categorical(testY))



def lazy_load_imdb_data(ngram_range=1, max_features=20000, maxlen=400):
    filename = "-".join(["data", str(ngram_range), str(max_features), str(maxlen)])
    filename += ".pkl"
    print(filename)

    try:
        with open(filename, "rb") as source:
            print('lazy loading...')
            data = pkl.load(source)
            print("Lazy load successful")
            return data
    except FileNotFoundError:
#         data = fetch_imdb_data(ngram_range, max_features, maxlen)
        data = load_imdb_data()
        with open(filename, "wb") as target:
            pkl.dump(data, target)
        return data



In [82]:
def get_word2index_dict(path='imdb_word_index.json'):
    with open(path) as f:
        return json.load(f)

# A dictionary mapping words to an integer index
word_index = get_word2index_dict()  # {word:index}

# The first indices are reserved
word_index = {k:(v+3) for k,v in word_index.items()}
word_index["<PAD>"] = 0
word_index["<START>"] = 1
word_index["<UNK>"] = 2  # unknown
word_index["<UNUSED>"] = 3

In [83]:
def create_glove_embeddings():
    print('Pretrained embeddings GloVe is loading...')

    embeddings_index = {}
    f = open('/liruishaer/Work2/NLP_models/glove.6B/glove.6B.%id.txt' % EMBEDDING_DIMS)
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()
    print('Found %s word vectors in GloVe embedding' % len(embeddings_index))

    embedding_matrix = np.zeros((MAX_FEATURE, EMBEDDING_DIMS))
    #embedding_matrix = torch.zeros(MAX_FEATURE, EMBEDDING_DIMS)

    for word, i in word_index.items():
        if i >= MAX_FEATURE:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
            #embedding_matrix[i,:] = torch.from_numpy(embedding_vector)
    
    embedding_matrix = torch.tensor(embedding_matrix)
    return embedding_matrix



Pretrained embeddings GloVe is loading...
Found 400000 word vectors in GloVe embedding


In [80]:
(x_train, y_train), (x_test, y_test) = lazy_load_imdb_data()
x_train.shape

data-1-20000-400.pkl
MAX_FEATURE: 10000
loading data...
train_data length: 25000
test_data length: 25000
start padding & transform to one hot...
x_train shape: (25000, 300)
x_test shape: (25000, 300)
end padding & transform to one hot...


(25000, 300)

In [None]:
embedding_matrix = create_glove_embeddings()

In [260]:
from torch.utils.data import Dataset, DataLoader

class MyData(Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y
        print(f"x.shape: {self.x.shape}")
        print(f"y.shape: {self.y.shape}")
        
    def __len__(self):
        return self.y.shape[0]
    
    def __getitem__(self, idx):
#         y_i = torch.FloatTensor(self.y[idx, :])
#         x_i = torch.LongTensor(self.x[idx].tolist())
        
        y_i = torch.LongTensor([self.y[idx]])
        x_i = torch.LongTensor(self.x[idx].tolist())

        return {"x":x_i, "y":y_i}

In [282]:

def test():
    model.eval()
    test_loss = 0
    correct = 0
    for batch in tqdm(testing_loader):
        batch_x = Variable(batch["x"])
        outputs = model(batch_x)
        batch_y = Variable(batch['y'].reshape(1,-1).squeeze(0))
        test_loss += binary_loss(outputs, batch_y)

        prediction = outputs.data.max(1, keepdim=True)[1]
        #label = batch_y.data.max(1, keepdim=True)[1]
        label = batch['y'].data
        correct += prediction.eq(torch.LongTensor(label)).sum()
    
    test_loss /= len(testing_loader.dataset)
    accuracy = 100. * correct / len(testing_loader.dataset)
    print(f'Average Test loss: {test_loss.data[0]}')
    print(f'Accuracy: {accuracy}')

In [331]:

def train(epoch):
    print('-'*10)
    print(f'Epoch: {epoch+1}')
    #for batch in tqdm(training_loader):
    for batch in training_loader:
        # Get the inputs and wrap as Variables
        batch_x = Variable(batch["x"])
        #batch_y = Variable(batch["y"])
        batch_y = Variable(batch['y'].reshape(1,-1).squeeze(0))
    
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(batch_x)
        
#         print("outputs.shape:",outputs.shape)
#         print("batch_y.shape:",batch_y.shape)
#         print("outputs",outputs)
#         print("batch_y",batch_y)
        
        loss = binary_loss(outputs, batch_y)
        loss.backward()
        optimizer.step()

In [261]:
training_data = MyData(x_train, y_train)
testing_data = MyData(x_test, y_test)

training_loader = DataLoader(training_data, batch_size=5)
testing_loader = DataLoader(testing_data, batch_size=5)

x.shape: (25000, 300)
y.shape: (25000,)
x.shape: (25000, 300)
y.shape: (25000,)


In [333]:
class TorchFastText(nn.Module):
    
    def __init__(self, max_features, embedding_dims, maxlen, num_classes=2):
        super(TorchFastText, self).__init__()
        self.max_features = max_features
        self.embedding_dims = embedding_dims
        self.maxlen = maxlen
        self.num_classes = num_classes
        
#         self.embeds = nn.EmbeddingBag(max_features, embedding_dims)
#         self.linear = nn.Linear(self.embedding_dims, self.num_classes)
        
        self.embeds = nn.Embedding(max_features, embedding_dims)
        self.linear = nn.Linear(self.embedding_dims, self.num_classes)
        
    def forward(self, x):
        embedded_sentence = self.embeds(x)
        print('embeded.shape:',embedded_sentence.shape)
        
        pooled = F.avg_pool2d(embedded_sentence, (embedded_sentence.shape[1], 1)).squeeze(1)         
        print('pooled.shape:',pooled.shape)
        
        predicted = self.linear(pooled)
        print('predicted.shape:',predicted.shape)
        
        return predicted


model = TorchFastText(MAX_FEATURE, EMBEDDING_DIMS, SENTENCE_LEN)

In [334]:
model.embeds.weight.data.copy_(embedding_matrix)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [ 0.5569,  0.3345,  0.0683,  ...,  0.0375, -0.5230,  0.5233],
        [ 0.0453,  0.3146,  0.6410,  ..., -0.1689, -1.0540,  0.4726],
        [ 0.3994,  0.5463,  0.3801,  ...,  0.4579, -0.1834,  0.1226]])

In [335]:
import torch.optim as optim

#binary_loss = nn.BCELoss()
binary_loss = nn.CrossEntropyLoss()
# optimizer = Adagrad(model.parameters(), lr=0.01)
optimizer = optim.Adam(model.parameters())


In [336]:
n_epochs = 1

for i in range(n_epochs):
    train(i)
#     test()
# test()

----------
Epoch: 1
embeded.shape: torch.Size([5, 300, 100])
pooled.shape: torch.Size([5, 100])
predicted.shape: torch.Size([5, 2])
embeded.shape: torch.Size([5, 300, 100])
pooled.shape: torch.Size([5, 100])
predicted.shape: torch.Size([5, 2])
embeded.shape: torch.Size([5, 300, 100])
pooled.shape: torch.Size([5, 100])
predicted.shape: torch.Size([5, 2])
embeded.shape: torch.Size([5, 300, 100])
pooled.shape: torch.Size([5, 100])
predicted.shape: torch.Size([5, 2])
embeded.shape: torch.Size([5, 300, 100])
pooled.shape: torch.Size([5, 100])
predicted.shape: torch.Size([5, 2])
embeded.shape: torch.Size([5, 300, 100])
pooled.shape: torch.Size([5, 100])
predicted.shape: torch.Size([5, 2])
embeded.shape: torch.Size([5, 300, 100])
pooled.shape: torch.Size([5, 100])
predicted.shape: torch.Size([5, 2])
embeded.shape: torch.Size([5, 300, 100])
pooled.shape: torch.Size([5, 100])
predicted.shape: torch.Size([5, 2])
embeded.shape: torch.Size([5, 300, 100])
pooled.shape: torch.Size([5, 100])
predicte

embeded.shape: torch.Size([5, 300, 100])
pooled.shape: torch.Size([5, 100])
predicted.shape: torch.Size([5, 2])
embeded.shape: torch.Size([5, 300, 100])
pooled.shape: torch.Size([5, 100])
predicted.shape: torch.Size([5, 2])
embeded.shape: torch.Size([5, 300, 100])
pooled.shape: torch.Size([5, 100])
predicted.shape: torch.Size([5, 2])
embeded.shape: torch.Size([5, 300, 100])
pooled.shape: torch.Size([5, 100])
predicted.shape: torch.Size([5, 2])
embeded.shape: torch.Size([5, 300, 100])
pooled.shape: torch.Size([5, 100])
predicted.shape: torch.Size([5, 2])
embeded.shape: torch.Size([5, 300, 100])
pooled.shape: torch.Size([5, 100])
predicted.shape: torch.Size([5, 2])
embeded.shape: torch.Size([5, 300, 100])
pooled.shape: torch.Size([5, 100])
predicted.shape: torch.Size([5, 2])
embeded.shape: torch.Size([5, 300, 100])
pooled.shape: torch.Size([5, 100])
predicted.shape: torch.Size([5, 2])
embeded.shape: torch.Size([5, 300, 100])
pooled.shape: torch.Size([5, 100])
predicted.shape: torch.Size(

embeded.shape: torch.Size([5, 300, 100])
pooled.shape: torch.Size([5, 100])
predicted.shape: torch.Size([5, 2])
embeded.shape: torch.Size([5, 300, 100])
pooled.shape: torch.Size([5, 100])
predicted.shape: torch.Size([5, 2])
embeded.shape: torch.Size([5, 300, 100])
pooled.shape: torch.Size([5, 100])
predicted.shape: torch.Size([5, 2])
embeded.shape: torch.Size([5, 300, 100])
pooled.shape: torch.Size([5, 100])
predicted.shape: torch.Size([5, 2])
embeded.shape: torch.Size([5, 300, 100])
pooled.shape: torch.Size([5, 100])
predicted.shape: torch.Size([5, 2])
embeded.shape: torch.Size([5, 300, 100])
pooled.shape: torch.Size([5, 100])
predicted.shape: torch.Size([5, 2])
embeded.shape: torch.Size([5, 300, 100])
pooled.shape: torch.Size([5, 100])
predicted.shape: torch.Size([5, 2])
embeded.shape: torch.Size([5, 300, 100])
pooled.shape: torch.Size([5, 100])
predicted.shape: torch.Size([5, 2])
embeded.shape: torch.Size([5, 300, 100])
pooled.shape: torch.Size([5, 100])
predicted.shape: torch.Size(

KeyboardInterrupt: 