In [94]:
import codecs
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

use_cuda = torch.cuda.is_available()

In [7]:
def load_data(fpath, label):
    data = []
    with codecs.open(fpath, 'r', 'utf-8', errors='ignore') as f:
        lines = f.readlines()
        for l in lines:
            l = l.rstrip()
            data.append((l.split(' '), label))
    return data
pos = load_data('./dataset/rt-polaritydata/rt-polarity.pos', 1)
neg = load_data('./dataset/rt-polaritydata/rt-polarity.neg', 0)
data = pos + neg

In [9]:
sentence_maxlen = max(map(len, (d for d, _ in data)))

print('sentence maxlen', sentence_maxlen)
vocab = []
for d, _ in data:
    for w in d:
        if w not in vocab: vocab.append(w)
vocab = sorted(vocab)
vocab_size = len(vocab)
print('vocab examples:', vocab[:10])

print('vocab size', len(vocab))
w2i = {w:i for i,w in enumerate(vocab)}
i2w = {i:w for i,w in enumerate(vocab)}
w2i['character']

sentence maxlen 60
vocab examples: ['', '!', '"', '#3', '#9', '$1', '$100', '$20', '$40', '$50-million']
vocab size 21384


3550

In [63]:
class Net(nn.Module):
    def __init__(self, vocab_size, embd_size, out_chs, filter_heights=[1]):
        super(Net, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embd_size)
        self.conv = nn.ModuleList([nn.Conv2d(1, out_chs, (fh, embd_size)) for fh in filter_heights])
        # nn.Conv1d(in_channels, out_channels, kernel_size, stride=1, padding=0, ...
        # Input: (N,Cin,Lin)
        self.dropout = nn.Dropout(.8)
        self.fc1 = nn.Linear(out_chs*len(filter_heights), 1)
        
    def forward(self, x):
        x = self.embedding(x) # (N, W, embd_dim)
#         print('embd:', x.size())
        x = x.unsqueeze(1) # (N, Ci, W, embd_dim)
#         print('after unsq (add Ci)', x.size())
        x = [F.relu(conv(x)).squeeze(3) for conv in self.conv] #[(N,Co,W), ...]*len(Ks)
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x] #[(N,Co), ...]*len(Ks)
        x = torch.cat(x, 1)
#         print('after cat each conv', x.size())
#         x = self.dropout(x)
        x = self.fc1(x)
#         print('befo sigmodi', x.size())
        probs = F.sigmoid(x)
        return probs


In [97]:
max_sentence_len = 5
def train(model):
    if use_cuda:
        model.cuda()
    losses = []
    for epoch in range(10):
        print('epoch:', epoch)
        epoch_loss = 0.0
        bs = 32
        optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
        random.shuffle(data)
#         for sentence, label in data:
        for i in range(0, len(data)-bs, bs):
            in_data, labels = [], []
            for sentence, label in data[i: i+bs]:
                index_vec = [w2i[w] for w in sentence]
                pad_len = max(0, max_sentence_len - len(index_vec))
                index_vec += [0] * pad_len
                index_vec = index_vec[:max_sentence_len] ## TBD for same len
                in_data.append(index_vec)
                labels.append(label)
#             print('input len:', len(index_vec))
            sent_var = Variable(torch.LongTensor(in_data))
            if use_cuda: sent_var = sent_var.cuda()

            target_var = Variable(torch.Tensor(labels))
            if use_cuda: target_var = target_var.cuda()
            
            optimizer.zero_grad()
            probs = model(sent_var)
            loss = F.binary_cross_entropy(probs, target_var)
            loss.backward()
            optimizer.step()
            
            epoch_loss += loss.data[0]
        print('loss:', epoch_loss)
        losses.append(epoch_loss)
        
    return model, losses
        
out_ch = 100
embd_size = 64
model = Net(vocab_size, embd_size, out_ch)
print(model)
model, losses = train(model)
print(losses)

Net (
  (embedding): Embedding(21384, 64)
  (conv): ModuleList (
    (0): Conv2d(1, 100, kernel_size=(1, 64), stride=(1, 1))
  )
  (dropout): Dropout (p = 0.8)
  (fc1): Linear (100 -> 1)
)
epoch: 0


  "Please ensure they have the same size.".format(target.size(), input.size()))


loss: 225.07881835103035
epoch: 1
loss: 176.28074941039085
epoch: 2
loss: 137.1305878609419
epoch: 3
loss: 112.6871062591672
epoch: 4
loss: 96.55870608985424
epoch: 5
loss: 81.01027216762304
epoch: 6
loss: 71.85481908358634
epoch: 7
loss: 62.37454151362181
epoch: 8
loss: 55.18156692944467
epoch: 9
loss: 51.54497100180015
[225.07881835103035, 176.28074941039085, 137.1305878609419, 112.6871062591672, 96.55870608985424, 81.01027216762304, 71.85481908358634, 62.37454151362181, 55.18156692944467, 51.54497100180015]
