In [37]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.autograd as autograd
import codecs
from collections import defaultdict
import logging
from operator import itemgetter, methodcaller
import random

In [2]:
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

In [3]:
ptb_train = "../ptb.train.txt"
ptb_valid = "../ptb.valid.txt"
ptb_test = "../ptb.test.txt"

In [4]:
class Vocabulary(object):
    def __init__(self, filepath, freq):
        self.unk_word = "<unk>"
        self.unk_id = 0
        self.word2idx = {}
        self.idx2word = {}
        self.freq = freq
        self.text = filepath
        self.word2idx[self.unk_word] = self.unk_id
        self.idx2word[self.unk_id] = self.unk_word
    
    def buildDict(self):
        logger.info("Start extract words")
        counter = defaultdict(int)
        for line in codecs.open(self.text, 'r', encoding='utf-8'):
            for word in line.strip().split():
                counter[word] += 1
        logger.info("Collect word finished")
        
        wordFreqList = [(word, cnt) for word, cnt in counter.items()
                       if cnt > self.freq]
        sortedWordFreqList = sorted(wordFreqList, key = itemgetter(1), reverse=True)
        for word, freq in sortedWordFreqList:
            if word not in self.word2idx:
                wordIndex = len(self.word2idx)
                self.word2idx[word] = wordIndex
                self.idx2word[wordIndex] = word
        logger.info("Vocabulary building finished")
    
    def toIdx(self, word):
        if word in self.word2idx:
            return self.word2idx[word]
        else:
            logger.warning("unknown word {}".format(word))
            return self.unk_id
    
    def toWord(self, idx):
        if isinstance(idx, int):
            if idx in self.idx2word:
                return self.idx2word[idx]
            else:
                logger.warning("idx {} not found".format(idx))

In [5]:
vocab = Vocabulary(ptb_train, 0)
vocab.buildDict()
display(len(vocab.word2idx))

I0719 094734 <ipython-input-4-071325c9a8da>:13] Start extract words
I0719 094734 <ipython-input-4-071325c9a8da>:18] Collect word finished
I0719 094734 <ipython-input-4-071325c9a8da>:28] Vocabulary building finished


9999

In [6]:
class CBOWDataSet(object):
    def __init__(self, filepath, vocab, window_size):
        self.window_size = window_size
        self.filepath = filepath
        self.vocab = vocab
    
    def buildData(self):
        self.datasets = []
        for line in codecs.open(self.filepath, 'r', 'utf-8'):
            words = line.strip().split()
            
            wordPos2Idx = {}
            for wordPos, word in enumerate(words):
                wordPos2Idx[wordPos] = self.vocab.toIdx(word)

            for wordPos in range(len(words)):
                features = []
                label = wordPos2Idx[wordPos]
                for offset in range(-self.window_size, self.window_size+1):
                    if offset == 0:
                        continue
                    realWordPos = wordPos + offset
                    if (realWordPos < 0) or (realWordPos >= len(words)):
                        continue
                    features.append(wordPos2Idx[wordPos+offset])
                if len(features) > 0:
                    self.datasets.append((features, label))
                if len(self.datasets) % 100000 == 0:
                    logger.info("Datasets has {} samples".format(len(self.datasets)))
    
    def getDataSet(self):
        return self.datasets

In [7]:
trainData = CBOWDataSet(ptb_train, vocab, 2)
trainData.buildData()
display(len(trainData.getDataSet()))
validData = CBOWDataSet(ptb_valid, vocab, 2)
validData.buildData()
display(len(validData.getDataSet()))
testData = CBOWDataSet(ptb_test, vocab, 2)
testData.buildData()
display(len(testData.getDataSet()))

I0719 094748 <ipython-input-6-085db318dc5d>:29] Datasets has 100000 samples
I0719 094748 <ipython-input-6-085db318dc5d>:29] Datasets has 200000 samples
I0719 094749 <ipython-input-6-085db318dc5d>:29] Datasets has 300000 samples
I0719 094749 <ipython-input-6-085db318dc5d>:29] Datasets has 400000 samples
I0719 094750 <ipython-input-6-085db318dc5d>:29] Datasets has 500000 samples
I0719 094751 <ipython-input-6-085db318dc5d>:29] Datasets has 600000 samples
I0719 094751 <ipython-input-6-085db318dc5d>:29] Datasets has 700000 samples
I0719 094752 <ipython-input-6-085db318dc5d>:29] Datasets has 800000 samples


887384

70377

78664

In [8]:
trainData.getDataSet()[0:10]

[([9970, 9971], 9969),
 ([9969, 9971, 9972], 9970),
 ([9969, 9970, 9972, 9973], 9971),
 ([9970, 9971, 9973, 9974], 9972),
 ([9971, 9972, 9974, 9975], 9973),
 ([9972, 9973, 9975, 9976], 9974),
 ([9973, 9974, 9976, 9977], 9975),
 ([9974, 9975, 9977, 9978], 9976),
 ([9975, 9976, 9978, 9979], 9977),
 ([9976, 9977, 9979, 9980], 9978)]

In [33]:
class CBOWModel(nn.Module):
    def __init__(self, embed_size, vocab_size):
        super(CBOWModel, self).__init__()
        self.embed = nn.EmbeddingBag(num_embeddings=vocab_size, embedding_dim=embed_size, mode='sum')
        self.project = nn.Linear(in_features=embed_size, out_features=vocab_size)
        self.offsets = torch.LongTensor([0])
    
    def forward(self, context):
        context_tensor = torch.LongTensor(context)
        return F.log_softmax(self.project(self.embed(context_tensor, self.offsets)))

In [53]:
def evaluate_ppl(model, dataset):
    dataSize = len(dataset)
    log_val = 0
    with torch.no_grad():
        for item in dataset:
            predict = model(item[0])
            log_val += predict[0, item[1]]
        print("negative log-likehood {} / ppl {}".format(log_val, torch.exp(-1.0*log_val/dataSize)))

In [62]:
model = CBOWModel(16, len(vocab.word2idx))
optimizer = optim.SGD(model.parameters(), lr=1e-3)
loss = nn.NLLLoss()

In [None]:
epochs = 10
iter = 0
logger.info("Evaluation without any training")
evaluate_ppl(model, validData.getDataSet())
for i in range(10):
    logger.info("Start of {} epochs".format(i))
    data = trainData.getDataSet()
    indexes = list(range(len(data)))
    random.shuffle(indexes)    
    for idx in indexes:
        iter += 1
        if iter%100000 == 0:
            logger.info("Iteration {}".format(iter))
            logger.info("Performance on evaluation data")
            evaluate_ppl(model, validData.getDataSet())
            logger.info("Performance on test data")
            evaluate_ppl(model, testData.getDataSet())
        optimizer.zero_grad()
        label = torch.LongTensor([data[idx][1]])
        idx_loss = loss(model(data[idx][0]), label)
        idx_loss.backward()
        optimizer.step()
    
    evaluate_ppl(model, validData.getDataSet())

I0720 095829 <ipython-input-63-7d3482511f50>:3] Evaluation without any training
I0720 095847 <ipython-input-63-7d3482511f50>:6] Start of 0 epochs


negative log-likehood -697692.3125 / ppl 20204.103515625


I0720 101011 <ipython-input-63-7d3482511f50>:13] Iteration 100000
I0720 101011 <ipython-input-63-7d3482511f50>:14] Performance on evaluation data
I0720 101028 <ipython-input-63-7d3482511f50>:16] Performance on test data


negative log-likehood -560200.0625 / ppl 2864.03759765625
negative log-likehood -622617.5 / ppl 2737.765625


I0720 102233 <ipython-input-63-7d3482511f50>:13] Iteration 200000
I0720 102233 <ipython-input-63-7d3482511f50>:14] Performance on evaluation data
I0720 102252 <ipython-input-63-7d3482511f50>:16] Performance on test data


negative log-likehood -527640.5 / ppl 1803.2449951171875
negative log-likehood -586004.0625 / ppl 1718.928955078125


I0720 103358 <ipython-input-63-7d3482511f50>:13] Iteration 300000
I0720 103358 <ipython-input-63-7d3482511f50>:14] Performance on evaluation data
I0720 103415 <ipython-input-63-7d3482511f50>:16] Performance on test data


negative log-likehood -509649.59375 / ppl 1396.4754638671875
negative log-likehood -565990.0 / ppl 1332.7930908203125


In [64]:
evaluate_ppl(model, testData.getDataSet())

negative log-likehood -474286.5625 / ppl 415.41192626953125
