In [1]:
import random
import re
import sys
import time
from collections import defaultdict
from functools import reduce

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F

In [22]:
h = [3, 4, 5]
feature = 100
p = 0.5
s = 3
batch_size = 50
k = 300

In [3]:
def clean_str(string):
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

In [4]:
# read MR dataset
print("loading data...", end=' ')
files = ["rt-polarity.neg", "rt-polarity.pos"]
revs = []
vocab = defaultdict(float)
max_l = 0
for i in range(2):
    with open(files[i], "r", encoding="cp1252") as f:
        for line in f:
            orig_rev = clean_str(line.strip())
            words = set(orig_rev.split())
            for word in words:
                vocab[word] += 1
            datum  = {"y":i,
                    "text": orig_rev,
                    "num_words": len(orig_rev.split())}
            if datum["num_words"] > max_l:
                max_l = datum["num_words"]
            revs.append(datum)
print("data loaded!")

print("number of sentences: " + str(len(revs)))             # 10662
print("vocab size: " + str(len(vocab)))                     # 18764
print("max sentence length: " + str(max_l))                 # 56

loading data... data loaded!
number of sentences: 10662
vocab size: 18764
max sentence length: 56


In [5]:
# read pre-trained word2vec
print("loading word2vec vectors...", end=' ')
word_vecs = {}
with open("GoogleNews-vectors-negative300.bin", "rb") as f:
    header = f.readline()
    vocab_size, _ = map(int, header.split())  # 3000000, 300
    binary_len = 4 * k
    for line in range(vocab_size):
        word = []
        while True:
            ch = f.read(1).decode("latin1")
            if ch == ' ':
                word = ''.join(word)
                break
            if ch != '\n':
                word.append(ch)
        if word in vocab:
            word_vecs[word] = torch.from_numpy(np.frombuffer(f.read(binary_len), dtype='float32'))
        else:
            f.read(binary_len)
print("word2vec loaded!")

print("num words already in word2vec: " + str(len(word_vecs)))    # 16448

loading word2vec vectors... word2vec loaded!
num words already in word2vec: 16448


In [6]:
# Embedding layer
embedding = nn.Embedding(len(vocab)+1, k, padding_idx=0)
W = {}                                                      # torch.Size([18765, 300])
word_idx_map = {}
W["rand"] = W["vec"] = embedding(torch.LongTensor(range(len(vocab))))
for word, i in zip(word_vecs, range(1,len(word_vecs)+1)):
    W["vec"][i] = word_vecs[word]
    word_idx_map[word] = i
print("dataset created!")

dataset created!


In [7]:
def get_idx_from_sent(sent, word_idx_map, max_l, k=300):
    """
    Transforms sentence into a list of indices. Pad with zeroes.
    """
    x = []
    words = sent.split()
    for word in words:
        if word in word_idx_map:
            x.append(word_idx_map[word])
    while len(x) < max_l:
        x.append(0)
    return x

In [8]:
non_static = [True, False, True]
U = ["rand", "vec", "vec"]
results = []
train_x_idx, train_y, test_x_idx, test_y = [], [], [], []
random.shuffle(revs)
for rev, i in zip(revs, range(len(revs))):
    sent = get_idx_from_sent(rev["text"], word_idx_map, max_l, k)
    if i < len(revs) / 10:
        test_x_idx.append(sent)
        test_y.append(rev["y"])
    else:
        train_x_idx.append(sent)
        train_y.append(rev["y"])
train_x_idx = torch.tensor(train_x_idx)
train_y = torch.tensor(train_y)
test_x_idx = torch.tensor(test_x_idx)
test_y = torch.tensor(test_y)

torch.Size([9595, 56]) torch.Size([9595])


In [9]:
x = [W["rand"][idx] for idx in [sent for sent in train_x_idx]] # 9595 * 64 * 300
print(len(x), x[0].size())

9595 torch.Size([56, 300])


In [10]:
train_x = torch.Tensor(train_x_idx.size()[0], 1, max_l, k)
print(train_x.size())

torch.Size([9595, 1, 56, 300])


In [11]:
for i in range(len(x)):
    train_x[i][0] = x[i]

In [12]:
train_x = train_x.reshape(train_x.size()[0], 1, max_l * k)
print(train_x.size())

torch.Size([9595, 1, 16800])


In [13]:
train = TensorDataset(train_x, train_y)
print(train[0])
train_loader = DataLoader(train, batch_size=batch_size)

(tensor([[ 0.1094,  0.1406, -0.0317,  ...,  0.0000,  0.0000,  0.0000]],
       grad_fn=<SelectBackward>), tensor(0))


In [20]:
class CNN(nn.Module):
    def __init__(self, hs, feature, k, p):
        super(CNN, self).__init__()
        for h in hs:
            conv = nn.Conv1d(1, feature, h * k)
            setattr(self, 'conv%d' % h, conv)
        self.pool = nn.AdaptiveMaxPool1d(feature)
        self.drop = nn.Dropout(p)
        self.fc = nn.Linear(len(hs) * feature, 2)
        
    def forward(self, x, hs):
        outs = []
        for h in hs:
            conv = getattr(self, 'conv%d' % h)
            out = self.dropout(nn.ReLu(conv(x)))
            out = self.pool(out)
            outs += out
        outs = torch.cat(out, dim=-1)
        outs = self.Linear(outs)
        return nn.LogSoftmax(outs)
    
model = CNN(h, feature, k, p)

In [None]:
total_loss, total_param_norm, total_grad_norm = 0, 0, 0
avg_loss, avg_param_norm, avg_grad_norm = 0, 0, 0
sample_cnt = 0

progress_bar = tqdm(train, 
                    desc='Training: ', 
                    unit='batch'
                    ) if verbose is VERBOSE_BATCH_WISE else train
# Iterate whole train-set.
for idx, mini_batch in enumerate(progress_bar):
    x, y = mini_batch.text, mini_batch.label
    # Don't forget make grad zero before another back-prop.
    optimizer.zero_grad()

    y_hat = self.model(x)

    loss = self.get_loss(y_hat, y)
    loss.backward()

    total_loss += loss
    total_param_norm += utils.get_parameter_norm(self.model.parameters())
    total_grad_norm += utils.get_grad_norm(self.model.parameters())

    # Caluclation to show status
    avg_loss = total_loss / (idx + 1)
    avg_param_norm = total_param_norm / (idx + 1)
    avg_grad_norm = total_grad_norm / (idx + 1)

    if verbose is VERBOSE_BATCH_WISE:
        progress_bar.set_postfix_str('|param|=%.2f |g_param|=%.2f loss=%.4e' % (avg_param_norm,
                                                                                avg_grad_norm,
                                                                                avg_loss
                                                                                ))

    optimizer.step()

    sample_cnt += mini_batch.text.size(0)
    if sample_cnt >= len(train.dataset.examples):
        break

if verbose is VERBOSE_BATCH_WISE:
    progress_bar.close()

return avg_loss, avg_param_norm, avg_grad_norm