In [1]:
import codecs
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

# True when a CUDA device is available; train() and test() consult this flag
# before moving the model and input tensors onto the GPU.
use_cuda = torch.cuda.is_available()

In [2]:
def load_data(fpath, label):
    """Read a one-sentence-per-line text file into labelled token lists.

    Each line has its trailing whitespace stripped and is then split on
    single spaces, so consecutive spaces (and blank lines) yield empty-string
    tokens. Bytes that are not valid UTF-8 are dropped while decoding.

    Args:
        fpath: path of the text file to read.
        label: value attached to every sentence from this file.

    Returns:
        A list of ``(tokens, label)`` tuples, one per line, in file order.
    """
    with codecs.open(fpath, 'r', 'utf-8', errors='ignore') as handle:
        return [(raw.rstrip().split(' '), label) for raw in handle.readlines()]
# Load both polarity files: positive sentences are labelled 1, negative 0.
pos = load_data('./dataset/rt-polaritydata/rt-polarity.pos', 1)
neg = load_data('./dataset/rt-polaritydata/rt-polarity.neg', 0)
# Single list of (tokens, label) pairs; all positives precede the negatives here.
data = pos + neg

In [3]:
# Longest sentence in tokens; train() pads or truncates every sentence to this length.
max_sentence_len = max(len(sentence) for sentence, _ in data)
print('sentence maxlen', max_sentence_len)

# Collect the unique tokens with a set (constant-time membership) instead of
# scanning a growing list for every token, then sort so that word indices are
# deterministic across runs.
vocab = sorted({w for d, _ in data for w in d})
vocab_size = len(vocab)
print('vocab examples:', vocab[:10])
print('vocab size', len(vocab))

# Forward (word -> index) and reverse (index -> word) lookup tables.
w2i = {w: i for i, w in enumerate(vocab)}
i2w = dict(enumerate(vocab))

sentence maxlen 60
vocab examples: ['', '!', '"', '#3', '#9', '$1', '$100', '$20', '$40', '$50-million']
vocab size 21384


In [4]:
# 80/20 split: fix the cut point, shuffle the corpus in place, then slice it.
div_idx = int(len(data) * 0.8)
random.shuffle(data)
train_data, test_data = data[:div_idx], data[div_idx:]
print('n_train', len(train_data))
print('n_test', len(test_data))

n_train 8529
n_test 2133


In [5]:
class Net(nn.Module):
    """Convolutional sentence classifier that outputs one probability per sentence.

    Token indices are embedded, run through one convolution per filter height
    (every kernel spans the full embedding width), max-pooled over the
    sequence axis, concatenated, passed through dropout and projected to a
    single sigmoid output.

    Args:
        vocab_size: number of rows in the embedding table.
        embd_size: embedding dimension; also the width of every conv kernel.
        out_chs: number of output channels of each convolution.
        filter_heights: kernel heights, i.e. how many consecutive tokens each
            filter covers. Inputs must be at least ``max(filter_heights)``
            tokens long.
    """

    def __init__(self, vocab_size, embd_size, out_chs, filter_heights):
        super(Net, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embd_size)
        # nn.Conv2d(in_channels, out_channels, kernel_size, stride=1, padding=0, ...)
        # One convolution per filter height; kernel size is (fh, embd_size).
        self.conv = nn.ModuleList([nn.Conv2d(1, out_chs, (fh, embd_size)) for fh in filter_heights])
        # NOTE(review): p=.9 zeroes 90% of the pooled features during training;
        # confirm this is intended rather than a keep-probability.
        self.dropout = nn.Dropout(.9)
        self.fc1 = nn.Linear(out_chs*len(filter_heights), 1)

    def forward(self, x):
        """Map a (N, W) LongTensor of token indices to (N, 1) probabilities."""
        x = self.embedding(x)  # (N, W, embd_dim)
        x = x.unsqueeze(1)  # (N, Ci=1, W, embd_dim)
        # Each conv collapses the embedding axis to size 1, which is squeezed away.
        x = [F.relu(conv(x)).squeeze(3) for conv in self.conv]  # [(N, Co, W-fh+1), ...]
        # Max over the whole remaining sequence axis.
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]  # [(N, Co), ...]
        x = torch.cat(x, 1)  # (N, Co * len(filter_heights))
        x = self.dropout(x)
        x = self.fc1(x)  # (N, 1)
        probs = torch.sigmoid(x)
        return probs


In [19]:
def train(model, data, batch_size, n_epoch):
    """Train ``model`` with Adam and binary cross-entropy on labelled sentences.

    Relies on the notebook globals ``w2i`` (word -> index), ``max_sentence_len``
    (every sentence is truncated/padded with index 0 to this length) and
    ``use_cuda``.

    Args:
        model: module mapping a (N, W) LongTensor to (N, 1) probabilities.
        data: sequence of ``(tokens, label)`` pairs. It is not modified;
            shuffling is done on an internal copy.
        batch_size: number of sentences per optimisation step. Only full
            batches are used; a trailing partial batch is discarded.
        n_epoch: number of passes over ``data``.

    Returns:
        ``(model, losses)`` where ``losses[k]`` is the sum of the per-batch
        mean losses of epoch ``k``.
    """
    model.train()  # enable training-time behaviour of Dropout/BatchNorm
    if use_cuda:
        model.cuda()
    # Work on a copy so the caller's list keeps its order.
    data = list(data)
    losses = []
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    for epoch in range(n_epoch):
        epoch_loss = 0.0
        random.shuffle(data)
        # Full batches only. The +1 keeps the final batch when len(data) is an
        # exact multiple of batch_size (it used to be dropped).
        for i in range(0, len(data) - batch_size + 1, batch_size):
            in_data, labels = [], []
            for sentence, label in data[i: i+batch_size]:
                # Truncate, then right-pad with index 0 so all rows share one length.
                index_vec = [w2i[w] for w in sentence][:max_sentence_len]
                index_vec += [0] * (max_sentence_len - len(index_vec))
                in_data.append(index_vec)
                labels.append(label)

            sent_var = torch.LongTensor(in_data)  # (N, max_sentence_len)
            target_var = torch.Tensor(labels).unsqueeze(1)  # (N, 1), float targets
            if use_cuda:
                sent_var = sent_var.cuda()
                target_var = target_var.cuda()

            optimizer.zero_grad()
            probs = model(sent_var)
            loss = F.binary_cross_entropy(probs, target_var)
            loss.backward()
            optimizer.step()

            # The loss is a 0-dim tensor; .item() extracts the Python float
            # (indexing it with [0] raises on current PyTorch).
            epoch_loss += loss.item()
        print('epoch: {:d}, loss: {:.3f}'.format(epoch, epoch_loss))
        losses.append(epoch_loss)
    if losses:  # avoid dividing by zero when n_epoch == 0
        print('Training avg loss: {:.3f}'.format(sum(losses)/len(losses)))

    return model, losses

def test(model, data, n_test, min_sentence_len):
    model.eval()
    loss = 0
    correct = 0
    for sentence, label in data[:n_test]:
        if len(sentence) < min_sentence_len:  # to short for CNN
            continue
        index_vec = [w2i[w] for w in sentence]
        sent_var = Variable(torch.LongTensor([index_vec]))
        if use_cuda: sent_var = sent_var.cuda()
        out = model(sent_var)
        pred = 1 if out.data[0][0] > .5 else 0
        if pred == label:
            correct += 1
    print('Test acc: {:.3f} ({:d}/{:d})'.format(correct/n_test, correct, n_test))
        
# Hyperparameters shared by every run below.
out_ch = 100      # output channels per convolution
embd_size = 64    # embedding dimension
batch_size = 32
n_epoch = 10
# Each entry is the list of filter heights for one model variant.
filter_variations = [[1], [1,2], [1,2,3,4]]
for fil in filter_variations:
    print('filter', fil)
    # A fresh model is built for every variant so the runs do not share weights.
    model = Net(vocab_size, embd_size, out_ch, fil)
    # print(model)
    model, losses = train(model, train_data, batch_size, n_epoch)
    # max(fil) is passed as the minimum sentence length for evaluation.
    test(model, test_data, len(test_data), max(fil))

filter [1]
epoch: 0, loss: 212.266
epoch: 1, loss: 200.748
epoch: 2, loss: 197.840
epoch: 3, loss: 192.947
epoch: 4, loss: 188.110
epoch: 5, loss: 176.461
epoch: 6, loss: 163.787
epoch: 7, loss: 156.926
epoch: 8, loss: 136.554
epoch: 9, loss: 128.524
Training avg loss: 175.416
Test acc: 0.679 (1448/2133)
filter [1, 2]
epoch: 0, loss: 257.047
epoch: 1, loss: 240.418
epoch: 2, loss: 228.181
epoch: 3, loss: 218.170
epoch: 4, loss: 197.908
epoch: 5, loss: 171.740
epoch: 6, loss: 144.759
epoch: 7, loss: 126.194
epoch: 8, loss: 104.050
epoch: 9, loss: 93.226
Training avg loss: 178.169
Test acc: 0.681 (1452/2133)
filter [1, 2, 3, 4]
epoch: 0, loss: 445.860
epoch: 1, loss: 348.354
epoch: 2, loss: 276.306
epoch: 3, loss: 262.734
epoch: 4, loss: 229.092
epoch: 5, loss: 196.226
epoch: 6, loss: 157.506
epoch: 7, loss: 133.629
epoch: 8, loss: 120.108
epoch: 9, loss: 113.173
Training avg loss: 228.299
Test acc: 0.686 (1464/2133)
