In [6]:
from encoder import Encoder
import torch
import numpy as np
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.autograd import Variable
from nltk.tokenize import sent_tokenize as ST

In [7]:
W2V_PATH = "/home/jingjing/Desktop/InferSent-master/dataset/GloVe/glove.840B.300d.txt"

In [8]:
paragraphs = ["Adult education is essential for Democracy of India. The number of grown up illiterates is great. All college and senior School students should come forward to visit villages in the summer vacation. Each one will teach one there. This will remove illiteracy and strengthen our democracy.",
             "I saw a man climbing down a water pipe. He had a knife in his hand. I hit his hand with a brick. He fell down on the ground and I jumped upon him. Soon others reached there and we handed him over to the police."]

In [9]:
f = Encoder()
sentences = ['The Moon is filled wit craters.', 'It has no light of its own.', 'It gets its light from the Sun.']


data = [['Memories of childhood are unforgettable.', 'I was four years old when my grandfather died.',
             'I clearly remember how everybody in the house was weeping.'], ['Today is sunny', 'We should go out for a picnic.', 'Love the weather.']]

for p in paragraphs:
    data.append(ST(p))

print(data)
f.zero_grad()

f.set_w2v_path(W2V_PATH)

[['Memories of childhood are unforgettable.', 'I was four years old when my grandfather died.', 'I clearly remember how everybody in the house was weeping.'], ['Today is sunny', 'We should go out for a picnic.', 'Love the weather.'], ['Adult education is essential for Democracy of India.', 'The number of grown up illiterates is great.', 'All college and senior School students should come forward to visit villages in the summer vacation.', 'Each one will teach one there.', 'This will remove illiteracy and strengthen our democracy.'], ['I saw a man climbing down a water pipe.', 'He had a knife in his hand.', 'I hit his hand with a brick.', 'He fell down on the ground and I jumped upon him.', 'Soon others reached there and we handed him over to the police.']]


In [10]:
def make_target(context_size, dim):
    targets = np.zeros((dim, dim))
    ctxt_sent_pos = list(range(-context_size, context_size+1))
    ctxt_sent_pos.remove(0)
    for ctxt in ctxt_sent_pos:
        targets += np.eye(dim, k=ctxt)
    targets_sum = np.sum(targets,axis=1, keepdims=True)
    targets = targets / targets_sum
    targets = torch.from_numpy(targets)
    return targets

In [16]:
def loss_fn(pred, target):
    m = nn.Softmax(dim=-1)
    s_pred = m(pred)
    losses = F.binary_cross_entropy_with_logits(s_pred, target)
    return losses

In [17]:
with torch.no_grad():
    f.build_vocab(sentences, True)
    embeddings = f.encode(sentences, len(sentences))
    targets = make_target(1, len(sentences))
    loss = loss_fn(embeddings, targets.float())
    print("loss before training: ", loss)

Found 20(/20) words with w2v vectors
Vocab size : 20
loss before training:  tensor(6.0987)


In [18]:
optimizer = optim.Adam(f.parameters(), lr=0.0005)

In [19]:
for epoch in range(20):
    for instance in data:
        optimizer.zero_grad()
        if epoch==0:
            f.build_vocab(instance, True)
        targets = make_target(1, len(instance))
        scores = f.encode(instance, len(instance))
        loss = loss_fn(scores, targets.float())
        print(loss)
        loss.backward()
        optimizer.step()

Found 25(/25) words with w2v vectors
Vocab size : 25
tensor(6.0987)
Found 16(/16) words with w2v vectors
Vocab size : 16
tensor(6.0987)
Found 44(/44) words with w2v vectors
Vocab size : 44
tensor(18.9293)
Found 37(/37) words with w2v vectors
Vocab size : 37
tensor(18.9293)
tensor(6.0987)
tensor(6.0987)
tensor(18.8995)
tensor(18.9293)
tensor(6.3744)
tensor(6.0988)
tensor(18.7983)
tensor(18.9293)
tensor(6.0987)
tensor(6.0987)
tensor(18.9293)
tensor(18.9293)
tensor(6.1215)
tensor(6.0987)
tensor(18.9292)
tensor(18.9293)
tensor(6.0987)
tensor(6.0987)
tensor(18.9293)
tensor(18.9293)
tensor(6.0987)
tensor(6.0987)
tensor(18.9293)
tensor(18.9293)
tensor(6.0987)
tensor(6.0987)
tensor(18.9293)
tensor(18.9293)
tensor(6.0987)
tensor(6.0987)
tensor(18.9293)
tensor(18.9293)
tensor(6.0987)
tensor(6.0987)
tensor(18.9293)
tensor(18.9293)
tensor(6.0987)
tensor(6.0987)
tensor(18.9293)
tensor(18.9293)
tensor(6.0987)
tensor(6.0987)
tensor(18.9293)
tensor(18.9293)
tensor(6.0987)
tensor(6.0987)
tensor(18.9293

In [20]:
with torch.no_grad():
    f.build_vocab(sentences, True)
    embeddings = f.encode(sentences, len(sentences))
    targets = make_target(1, len(sentences))
    loss = loss_fn(embeddings, targets.float())
    print("loss after training: ", loss)

Found 20(/20) words with w2v vectors
Vocab size : 20
loss after training:  tensor(6.0987)
