In [47]:
from encoder import Encoder
import torch
import numpy as np
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.autograd import Variable
from nltk.tokenize import sent_tokenize as ST

In [48]:
W2V_PATH = "/home/jingjing/Desktop/InferSent-master/dataset/GloVe/glove.840B.300d.txt"

In [49]:
paragraphs = ["Adult education is essential for Democracy of India. The number of grown up illiterates is great. All college and senior School students should come forward to visit villages in the summer vacation. Each one will teach one there. This will remove illiteracy and strengthen our democracy.",
             "I saw a man climbing down a water pipe. He had a knife in his hand. I hit his hand with a brick. He fell down on the ground and I jumped upon him. Soon others reached there and we handed him over to the police."]

In [50]:
f = Encoder()
sentences = ['The Moon is filled wit craters.', 'It has no light of its own.', 'It gets its light from the Sun.']


data = []
for p in paragraphs:
    data.append(ST(p))

print(data)
f.zero_grad()

f.set_w2v_path(W2V_PATH)

[['Adult education is essential for Democracy of India.', 'The number of grown up illiterates is great.', 'All college and senior School students should come forward to visit villages in the summer vacation.', 'Each one will teach one there.', 'This will remove illiteracy and strengthen our democracy.'], ['I saw a man climbing down a water pipe.', 'He had a knife in his hand.', 'I hit his hand with a brick.', 'He fell down on the ground and I jumped upon him.', 'Soon others reached there and we handed him over to the police.']]


In [51]:
def make_target(context_size, dim):
    targets = np.zeros((dim, dim))
    ctxt_sent_pos = list(range(-context_size, context_size+1))
    ctxt_sent_pos.remove(0)
    for ctxt in ctxt_sent_pos:
        targets += np.eye(dim, k=ctxt)
    targets_sum = np.sum(targets,axis=1, keepdims=True)
    targets = targets / targets_sum
    targets = torch.from_numpy(targets)
    return targets

In [52]:
def loss_fn(pred, target):
    mask = 1 - torch.diag(torch.ones(pred.size(1)))
    npred = pred * mask
    s_pred = F.softmax(npred, -1)
    losses = F.binary_cross_entropy_with_logits(s_pred, target, reduce=True)
    return losses

In [53]:
with torch.no_grad():
    f.build_vocab(sentences, True)
    embeddings = f.encode(sentences, len(sentences))
    targets = make_target(1, len(sentences))
    loss = loss_fn(embeddings, targets.float())
    print("loss before training: ", loss)

Found 20(/20) words with w2v vectors
Vocab size : 20
loss before training:  tensor(0.6965)


In [54]:
optimizer = optim.Adam(f.parameters(), lr=0.1)

In [55]:
for epoch in range(20):
    for instance in data:
        optimizer.zero_grad()
        if epoch == 0:
            f.build_vocab(instance, True)
        targets = make_target(1, len(instance))
        scores = f.encode(instance, len(instance))
        loss = loss_fn(scores, targets.float())
        print(loss)
        loss.backward()
        optimizer.step()

Found 44(/44) words with w2v vectors
Vocab size : 44
tensor(0.7465)
Found 37(/37) words with w2v vectors
Vocab size : 37
tensor(0.7586)
tensor(0.7372)
tensor(0.8172)
tensor(0.7371)
tensor(0.8171)
tensor(0.7372)
tensor(0.8154)
tensor(0.7366)
tensor(0.8168)
tensor(0.7371)
tensor(0.8124)
tensor(0.7368)
tensor(0.7941)
tensor(0.7370)
tensor(0.7948)
tensor(0.7361)
tensor(0.7948)
tensor(0.7372)
tensor(0.7847)
tensor(0.7372)
tensor(0.7825)
tensor(0.7355)
tensor(0.7794)
tensor(0.7348)
tensor(0.7808)
tensor(0.7361)
tensor(0.7814)
tensor(0.7354)
tensor(0.7948)
tensor(0.7372)
tensor(0.7949)
tensor(0.7364)
tensor(0.7948)
tensor(0.7370)
tensor(0.7948)
tensor(0.7371)
tensor(0.7948)
tensor(0.7370)
tensor(0.7946)


In [36]:
with torch.no_grad():
    f.build_vocab(sentences, True)
    embeddings = f.encode(sentences, len(sentences))
    targets = make_target(1, len(sentences))
    loss = loss_fn(embeddings, targets.float())
    print("loss after training: ", loss)

Found 20(/20) words with w2v vectors
Vocab size : 20
here tensor([[   0.0000,  843.1820,  830.8679],
        [ 843.1820,    0.0000,  842.6080],
        [ 830.8679,  842.6080,    0.0000]])
loss after training:  tensor(0.6161)
