In [7]:
import codecs
import random
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

# True when PyTorch reports an available CUDA device; checked later in this
# notebook before moving the model and input tensors with .cuda().
use_cuda = torch.cuda.is_available()

In [2]:
def load_data(fpath, label):
    """Read a text file and pair every line with ``label``.

    Each line is right-stripped and then split on single space characters, so
    consecutive spaces produce empty-string tokens and a blank line produces
    ``['']``.  Bytes that are not valid UTF-8 are dropped while decoding
    (``errors='ignore'``).

    Args:
        fpath: path of the file to read.
        label: value attached to every line of the file.

    Returns:
        A list of ``(tokens, label)`` tuples in file order.
    """
    with codecs.open(fpath, 'r', 'utf-8', errors='ignore') as reader:
        raw_lines = reader.readlines()
    return [(raw.rstrip().split(' '), label) for raw in raw_lines]
# Label convention: 1 for the positive-review file, 0 for the negative one.
pos, neg = (
    load_data(path, label)
    for path, label in (
        ('./dataset/rt-polaritydata/rt-polarity.pos', 1),
        ('./dataset/rt-polaritydata/rt-polarity.neg', 0),
    )
)
# Positive samples first, followed by the negative ones.
data = pos + neg

In [3]:
# Length (in tokens) of the longest sentence in the whole data set.
max_sentence_len = max(len(sentence) for sentence, _ in data)
print('sentence maxlen', max_sentence_len)

# Collect the distinct tokens with a set (constant-time membership) instead of
# scanning a growing list for every word, then sort so that the token -> index
# assignment is deterministic.
vocab = sorted({w for d, _ in data for w in d})
vocab_size = len(vocab)
print('vocab examples:', vocab[:10])
print('vocab size', len(vocab))

# Lookup tables in both directions: token -> index and index -> token.
w2i = {w: i for i, w in enumerate(vocab)}
i2w = dict(enumerate(vocab))

sentence maxlen 60
vocab examples: ['', '!', '"', '#3', '#9', '$1', '$100', '$20', '$40', '$50-million']
vocab size 21384


In [4]:
# Split the samples into train and test data: the first 80% of the shuffled
# list is used for training, the remainder for testing.
# NOTE: random.shuffle reorders `data` in place and no seed is set in this
# cell, so the split differs from run to run.
random.shuffle(data)
div_idx = int(len(data) * 0.8)
train_data, test_data = data[:div_idx], data[div_idx:]
print('n_train', len(train_data))
print('n_test', len(test_data))

n_train 8529
n_test 2133


In [12]:
class Net(nn.Module):
    """CNN sentence classifier.

    Pipeline: embedding -> one convolution per filter height -> ReLU ->
    max pooling over the whole sequence -> dropout -> linear -> sigmoid.

    Args:
        vocab_size: number of rows of the embedding table.
        embd_size: embedding dimension; also the width of every kernel.
        out_chs: number of output channels of each convolution.
        filter_heights: sequence of kernel heights (window sizes in tokens);
            one ``nn.Conv2d`` is created per entry.
    """

    def __init__(self, vocab_size, embd_size, out_chs, filter_heights):
        super(Net, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embd_size)
        # nn.Conv2d(in_channels, out_channels, kernel_size, stride=1, padding=0, ...
        # Every kernel is (fh, embd_size): it covers the full embedding width,
        # so it can only slide along the sequence axis.
        self.conv = nn.ModuleList([nn.Conv2d(1, out_chs, (fh, embd_size)) for fh in filter_heights])
        self.dropout = nn.Dropout(.5)
        self.fc1 = nn.Linear(out_chs*len(filter_heights), 1)

    def forward(self, x):
        """Return one probability per input sequence.

        Args:
            x: integer index tensor of shape (N, seq_len).

        Returns:
            Tensor of shape (N, 1) holding sigmoid outputs.
        """
        x = self.embedding(x)  # (N, seq_len, embd_dim)
        x = x.unsqueeze(1)  # (N, Cin=1, seq_len, embd_dim), insert channel-in dim

        # A kernel of height fh needs at least fh rows.  Zero-pad the end of the
        # sequence axis when the input is shorter than the tallest kernel so
        # short inputs do not make the convolution raise.
        tallest = max((conv.kernel_size[0] for conv in self.conv), default=0)
        deficit = tallest - x.size(2)
        if deficit > 0:
            x = F.pad(x, (0, 0, 0, deficit))

        # Conv2d
        #    Input : (N, Cin,  Hin,  Win)
        #    Output: (N, Cout, Hout, Wout) with Hout = seq_len - fh + 1, Wout = 1
        # squeeze(3) drops Wout: one (N, Cout, Hout) tensor per filter height.
        x = [F.relu(conv(x)).squeeze(3) for conv in self.conv]
        # max_pool1d over the full remaining length:
        # (N, Cout, Hout) --(max_pool1d)--> (N, Cout, 1) --(squeeze(2))--> (N, Cout)
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]
        x = torch.cat(x, 1)  # (N, Cout*len(filter_heights))
        x = self.dropout(x)
        x = self.fc1(x)
        # torch.sigmoid is the same function as F.sigmoid without the
        # deprecation warning the functional alias emits on some releases.
        probs = torch.sigmoid(x)
        return probs

In [14]:
def train(model, data, batch_size, n_epoch):
    """Train ``model`` with Adam and binary cross-entropy.

    Reads the module-level names ``w2i`` (token -> index), ``max_sentence_len``
    (every sentence is padded / truncated to this length) and ``use_cuda``.

    Args:
        model: module mapping a LongTensor of shape (batch, max_sentence_len)
            to probabilities of shape (batch, 1).
        data: list of ``(tokens, label)`` samples.  The list itself is not
            modified; a private copy is reshuffled every epoch.
        batch_size: number of samples per optimisation step.  Only full
            batches are used; a trailing incomplete batch is dropped.
        n_epoch: number of passes over ``data``.

    Returns:
        ``(model, losses)`` where ``losses[k]`` is the sum of the batch losses
        of epoch ``k``.

    Raises:
        KeyError: if a token is missing from ``w2i``.
    """
    model.train()  # training mode; only matters for modules such as Dropout or BatchNorm
    if use_cuda:
        model.cuda()
    # Shuffle a copy so the caller's list keeps its order.
    data = list(data)
    losses = []
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)
    for epoch in range(n_epoch):
        epoch_loss = 0.0
        random.shuffle(data)
        # `+ 1` keeps the last *full* batch when len(data) is an exact multiple
        # of batch_size; an incomplete trailing batch is still discarded.
        for i in range(0, len(data) - batch_size + 1, batch_size):
            in_data, labels = [], []
            for sentence, label in data[i: i+batch_size]:
                index_vec = [w2i[w] for w in sentence]
                # Right-pad with index 0, then truncate, so every row has
                # exactly max_sentence_len entries.
                index_vec += [0] * max(0, max_sentence_len - len(index_vec))
                in_data.append(index_vec[:max_sentence_len])
                labels.append(label)
            sent_var = Variable(torch.LongTensor(in_data))
            target_var = Variable(torch.Tensor(labels).unsqueeze(1))  # (batch, 1) float targets
            if use_cuda:
                sent_var = sent_var.cuda()
                target_var = target_var.cuda()
            optimizer.zero_grad()
            probs = model(sent_var)
            loss = F.binary_cross_entropy(probs, target_var)
            loss.backward()
            optimizer.step()

            # The loss is a 0-dim tensor: indexing it with [0] raises on
            # current PyTorch, .item() returns the Python float.
            epoch_loss += loss.item()
        print('epoch: {:d}, loss: {:.3f}'.format(epoch, epoch_loss))
        losses.append(epoch_loss)
    if losses:  # no average to report when n_epoch == 0
        print('Training avg loss: {:.3f}'.format(sum(losses)/len(losses)))

    return model, losses

def test(model, data, n_test, min_sentence_len):
    model.eval()
    loss = 0
    correct = 0
    for sentence, label in data[:n_test]:
        if len(sentence) < min_sentence_len:  # to short for CNN's filter
            continue
        index_vec = [w2i[w] for w in sentence]
        sent_var = Variable(torch.LongTensor([index_vec]))
        if use_cuda: sent_var = sent_var.cuda()
        out = model(sent_var)
        score = out.data[0][0]
        pred = 1 if score > .5 else 0
        if pred == label:
            correct += 1
        loss += math.pow((label-score), 2)
    print('Test acc: {:.3f} ({:d}/{:d})'.format(correct/n_test, correct, n_test))
    print('Test loss: {:.3f}'.format(loss/n_test))
        
# Hyper-parameters shared by every run.
out_ch, embd_size = 100, 128
batch_size, n_epoch = 64, 50

# Each entry lists the convolution filter heights of one model; a fresh model
# is trained and evaluated per entry.
filter_variations = [[1], [1,2], [1,2,3,4]]
for fil in filter_variations:
    print('filter:', fil)
    model, losses = train(Net(vocab_size, embd_size, out_ch, fil), train_data, batch_size, n_epoch)
    # max(fil) is passed as the minimum sentence length accepted by test().
    test(model, test_data, len(test_data), max(fil))
    print('')

filter: [1]
epoch: 0, loss: 94.857
epoch: 1, loss: 89.101
epoch: 2, loss: 85.506
epoch: 3, loss: 82.430
epoch: 4, loss: 79.546
epoch: 5, loss: 76.811
epoch: 6, loss: 73.923
epoch: 7, loss: 70.147
epoch: 8, loss: 64.702
epoch: 9, loss: 60.813
epoch: 10, loss: 56.459
epoch: 11, loss: 51.050
epoch: 12, loss: 46.446
epoch: 13, loss: 41.079
epoch: 14, loss: 36.618
epoch: 15, loss: 31.984
epoch: 16, loss: 28.151
epoch: 17, loss: 24.910
epoch: 18, loss: 21.588
epoch: 19, loss: 19.572
epoch: 20, loss: 16.534
epoch: 21, loss: 15.342
epoch: 22, loss: 13.573
epoch: 23, loss: 11.835
epoch: 24, loss: 11.524
epoch: 25, loss: 11.072
epoch: 26, loss: 9.760
epoch: 27, loss: 9.034
epoch: 28, loss: 8.402
epoch: 29, loss: 8.444
epoch: 30, loss: 7.833
epoch: 31, loss: 7.316
epoch: 32, loss: 7.276
epoch: 33, loss: 6.863
epoch: 34, loss: 6.509
epoch: 35, loss: 5.915
epoch: 36, loss: 6.797
epoch: 37, loss: 5.833
epoch: 38, loss: 5.886
epoch: 39, loss: 5.647
epoch: 40, loss: 6.045
epoch: 41, loss: 5.416
epoch: