In [78]:
import random
import re
import sys
import time
from collections import defaultdict

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm_notebook

In [79]:
def clean_str(string):
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

In [80]:
def build_data():
    """
    Loads data
    """
    files = ["rt-polarity.neg", "rt-polarity.pos"]
    revs = []
    vocab = defaultdict(float)
    max_l = 0
    for i in range(2):
        with open(files[i], "r", encoding="cp1252") as f:
            for line in f:
                orig_rev = clean_str(line.strip())
                words = set(orig_rev.split())
                for word in words:
                    vocab[word] += 1
                datum  = {"y":i, "text": orig_rev}
                if len(orig_rev.split()) > max_l:
                    max_l = len(orig_rev.split())
                revs.append(datum)
    return revs, vocab, max_l

In [81]:
# read MR dataset
print("loading data...", end=' ')
revs, vocab, max_l = build_data()
print("data loaded!")

print("number of sentences: " + str(len(revs)))             # 10662
print("vocab size: " + str(len(vocab)))                     # 18764
print("max sentence length: " + str(max_l))                 # 56

loading data... data loaded!
number of sentences: 10662
vocab size: 18764
max sentence length: 56


In [82]:
def load_bin_vec(fname, vocab):
    """
    Loads 300x1 word vecs from Google (Mikolov) word2vec
    """
    word_vecs = {}
    with open(fname, "rb") as f:
        header = f.readline()
        vocab_size, layer1_size = map(int, header.split())  # 3000000, 300
        binary_len = 4 * layer1_size
        for line in tqdm_notebook(range(vocab_size), desc='load_bin_vec'):
            word = []
            while True:
                ch = f.read(1).decode("latin1")
                if ch == ' ':
                    word = ''.join(word)
                    break
                if ch != '\n':
                    word.append(ch)
            if word in vocab:
                word_vecs[word] = torch.from_numpy(np.frombuffer(f.read(binary_len), dtype='float32'))
            else:
                f.read(binary_len)
    return word_vecs, layer1_size

In [83]:
# read pre-trained word2vec
print("loading word2vec vectors...", end=' ')
word_vecs, k = load_bin_vec("GoogleNews-vectors-negative300.bin", vocab)
print("word2vec loaded!")

print("num words already in word2vec: " + str(len(word_vecs)))    # 16448

loading word2vec vectors... 

HBox(children=(IntProgress(value=0, description='load_bin_vec', max=3000000, style=ProgressStyle(description_w…

word2vec loaded!
num words already in word2vec: 16448


In [84]:
# Embedding layer
embedding = nn.Embedding(len(vocab)+1, k, padding_idx=0)
W = {}
word_idx_map = {}
W["rand"] = W["vec"] = embedding(torch.LongTensor(range(len(vocab)+1))) # torch.Size([18765, 300])
for word, i in zip(vocab, range(1,len(vocab)+1)):
    if word in word_vecs:
        W["vec"][i] = word_vecs[word]
    word_idx_map[word] = i
print("dataset created!")

dataset created!


In [85]:
def get_idx_from_sent(sent, word_idx_map, max_l=56, k=300):
    """
    Transforms sentence into a list of indices. Pad with zeroes.
    """
    x = []
    words = sent.split()
    for word in words:
        if word in word_idx_map:
            x.append(word_idx_map[word])
    while len(x) < max_l:
        x.append(0)
    return x

In [86]:
def make_idx_data(revs, word_idx_map, max_l=56, k=300):
    """
    Transforms sentences into a 2-d matrix.
    """
    train_x_idx, train_y, test_x_idx, test_y = [], [], [], []
    random.shuffle(revs)
    for rev, i in zip(revs, range(len(revs))):
        sent = get_idx_from_sent(rev["text"], word_idx_map, max_l, k)
        if i < len(revs) / 10:
            test_x_idx.append(sent)
            test_y.append(rev["y"])
        else:
            train_x_idx.append(sent)
            train_y.append(rev["y"])
    train_y = torch.LongTensor(train_y)
    test_y = torch.LongTensor(test_y)
    return train_x_idx, train_y, test_x_idx, test_y

In [87]:
def make_data(x_idx, W, max_l=56, k=300):
    x = torch.Tensor(len(x_idx), 1, max_l * k)
    for i, sent in enumerate(x_idx):
        xx = []
        for idx in sent:
            xx.append(W[idx])
        x[i] = torch.cat(tuple(xx))
    return x

In [88]:
class CNN(nn.Module):
    def __init__(self, hs, feature, k, p):
        super(CNN, self).__init__()
        for h in hs:
            conv = nn.Conv1d(1, feature, h * k, stride=k)
            setattr(self, 'conv%d' % h, conv)
        self.relu = nn.ReLU()
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.drop = nn.Dropout(p)
        self.fc = nn.Linear(len(hs) * feature, 2)
        self.loss = nn.LogSoftmax(dim=-1)
        self.hs = hs
        
    def forward(self, x):
        outs = []
        for h in self.hs:
            conv = getattr(self, 'conv%d' % h)
            out = self.drop(self.relu(conv(x)))
            out = self.pool(out)
            outs.append(out)
        outs = torch.cat(outs, dim=1).reshape(-1, 300)
        outs = self.fc(outs)
        return self.loss(outs)

In [89]:
def cnn_trainer(train_loader, test_x, test_y, W, non_static, h=[3,4,5], feature=100, p=0.5, s=3, k=300):
    criterion = nn.CrossEntropyLoss()
    model = CNN(h, feature, k, p)
    optimizer = optim.SGD(model.parameters(), lr=0.01)

    total_loss=0
    for epoch in tqdm_notebook(range(25), desc='epoch', leave=False):
        total_loss = 0
        for train_x, train_y in tqdm_notebook(train_loader, desc='train', leave=False):
            train_x, train_y = Variable(train_x), Variable(train_y)
            optimizer.zero_grad()
            output = model(train_x)
            loss = criterion(output, train_y)
            loss.backward()
            optimizer.step()
            total_loss += loss.data
        if (epoch+1) % 5 == 0:
            print(epoch+1, total_loss)
    
    test_x, test_y = Variable(test_x), Variable(test_y)
    result = torch.max(model(test_x).data, 1)[1]
    accuracy = sum(test_y.data.numpy() == result.numpy()) / len(test_y.data.numpy())
    
    return accuracy

In [90]:
non_static = [True, False, True]
U = ["rand", "vec", "vec"]
accuracies = []
train_x_idx, train_y, test_x_idx, test_y = make_idx_data(revs, word_idx_map, max_l=max_l, k=k)    # 9595 1067 X 56
for i in tqdm_notebook(range(3), desc='i', leave=False):
    train_x = make_data(train_x_idx, W[U[i]], max_l=max_l, k=k) # 9595, 1, 16800
    test_x = make_data(test_x_idx, W[U[i]], max_l=max_l, k=k)
    train = TensorDataset(train_x, train_y)
    train_loader = DataLoader(train, batch_size=50)
    accuracy = cnn_trainer(train_loader, test_x, test_y, W[U[i]], non_static[i], h=[3,4,5], feature=100, p=0.5, s=3, k=k)
    accuracies.append(accuracy)
    print(accuracy)

HBox(children=(IntProgress(value=0, description='i', max=3, style=ProgressStyle(description_width='initial')),…

HBox(children=(IntProgress(value=0, description='epoch', max=25, style=ProgressStyle(description_width='initia…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

5 tensor(122.1577)


HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

10 tensor(105.1105)


HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

15 tensor(92.8276)


HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

20 tensor(85.3687)


HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

25 tensor(77.5863)
0.7207122774133083


HBox(children=(IntProgress(value=0, description='epoch', max=25, style=ProgressStyle(description_width='initia…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

5 tensor(122.0233)


HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

10 tensor(105.8668)


HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

15 tensor(93.7513)


HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

20 tensor(84.0120)


HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

25 tensor(77.5493)
0.7357075913776945


HBox(children=(IntProgress(value=0, description='epoch', max=25, style=ProgressStyle(description_width='initia…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

5 tensor(121.6137)


HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

10 tensor(105.8335)


HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

15 tensor(93.8257)


HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

20 tensor(85.0816)


HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='train', max=192, style=ProgressStyle(description_width='initi…

25 tensor(76.8355)
0.7263355201499532


In [91]:
print(accuracies)

[0.7207122774133083, 0.7357075913776945, 0.7263355201499532]


In [92]:
conv1 = nn.Conv1d(1, 10, 10, 5)
conv2 = nn.Conv1d(2, 10, 10, 5)
i1 = torch.randn(10, 1, 20)
i2 = torch.randn(10, 2, 20)
c1 = conv1(i1)
c2 = conv2(i2)
print(c1.size(), c2.size())
print(c1)
print(c2)

torch.Size([10, 10, 3]) torch.Size([10, 10, 3])
tensor([[[-4.5743e-01, -2.8455e-01, -9.7749e-02],
         [-1.2987e+00,  1.1283e+00,  8.4845e-01],
         [ 3.0455e-01, -4.5823e-01,  1.0398e-01],
         [ 8.4749e-01, -7.7719e-01,  5.1993e-02],
         [-1.1690e+00, -9.6750e-01,  2.4085e-01],
         [ 2.8571e-01,  1.7029e-01, -4.3393e-01],
         [ 7.4379e-01, -8.5026e-01, -1.0433e+00],
         [-3.1117e-01,  7.1916e-03, -3.7647e-02],
         [ 4.7953e-01, -5.5254e-01,  1.7203e-02],
         [ 7.4999e-01,  9.4306e-01, -4.0194e-01]],

        [[ 8.2265e-01,  5.4181e-01, -9.4665e-03],
         [-5.2915e-01,  1.5278e-01,  1.3603e+00],
         [ 7.0269e-02,  4.0740e-01, -8.9061e-01],
         [-2.5486e-01,  5.9061e-01, -7.9181e-01],
         [-1.2759e+00,  1.2534e-01,  5.3991e-02],
         [ 9.4586e-01, -1.0107e+00,  4.2233e-01],
         [-2.9826e-01,  5.2023e-01, -1.2679e+00],
         [ 2.8073e-01,  2.5267e-01, -9.0995e-01],
         [-6.6271e-01, -2.9449e-02,  6.9448e-01],
