In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np 
import urllib.request
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk import word_tokenize
import sklearn
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import euclidean_distances


In [3]:
torch.manual_seed(1)
CONTEXT_SIZE = 3
EMBEDDING_DIM = 10

In [4]:
def get_key(word_id):
    for key, val in word_to_ix.items():
        if (val == word_id):
            return key
    return ''

In [5]:
def read_data(file_path, remove_stopwords = False):
    tokenizer = RegexpTokenizer(r'\w+')
    if file_path.lower().startswith('http'):
        data = urllub.request.urlopen(file_path)
        data = data.read().decode('utf8')
    else:
        data = open(file_path, encoding = 'utf8').read()
    tokenized_data = word_tokenize(data)
    if remove_stopwords:
        stop_words = set(stopwords.words('english'))
    else:
        stop_words = set([])
        
    stop_words.update(['.',',',':',';','(',')','#','--','...','"'])
    cleaned_words = [ i for i in tokenized_data if i not in stop_words ]
    return cleaned_words

In [6]:
test_sentence = read_data('./nlp_data/word2vec_test.txt')


In [7]:
ngrams = []
for i in range(len(test_sentence) - CONTEXT_SIZE):
    tup = [test_sentence[j] for j in np.arange(i, i + CONTEXT_SIZE)]
    ngrams.append((tup,test_sentence[i + CONTEXT_SIZE]))
print(ngrams[0], ngrams[1])

(['Empathy', 'for', 'the'], 'poor') (['for', 'the', 'poor'], 'may')


In [8]:
vocab = set(test_sentence)
print('單字個數: ', len(vocab))

word_to_ix = {word: i for i, word in enumerate(vocab)}

單字個數:  192


In [9]:
class CBOWModeler(nn.Module):
    def __init__(self, vocab_size, embedding_dim, context_size):
        super(CBOWModeler, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)
        
    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((1, -1))
        out1 = F.relu(self.linear1(embeds))
        out2 = self.linear2(out1)
        log_probs = F.log_softmax(out2, dim = 1)
        return log_probs
    
    def predict(self, input):
        context_idxs = torch.LongTensor([word_to_ix[w] for w in input])
        res = self.forward(context_idxs)
        res_arg = torch.argmax(res)
        res_val, res_ind = res.sort(descending = True)
        res_val = res_val[0][:3]
        res_ind = res_ind[0][:3]
        for arg in zip(res_val, res_ind):
            print([(key, val, arg[0]) for key, val in word_to_ix.items() if val == arg[1]])

In [10]:
losses = []
loss_function = nn.NLLLoss()
model = CBOWModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr = 0.001)
for epoch in range(400):
    total_loss = 0
    for context, target in ngrams:
        context_idxs = torch.LongTensor([word_to_ix[w] for w in context])
        
        model.zero_grad()
        log_probs = model(context_idxs)
        loss = loss_function(log_probs, torch.LongTensor([word_to_ix[target]]))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    losses.append(total_loss)

In [11]:
model.predict(['of','all','human'])


[('afflictions', 127, tensor(-0.0418, grad_fn=<UnbindBackward0>))]
[('health', 168, tensor(-3.9975, grad_fn=<UnbindBackward0>))]
[('book', 82, tensor(-5.4046, grad_fn=<UnbindBackward0>))]


In [12]:
ngrams = []
for i in range(len(test_sentence) - CONTEXT_SIZE):
    tup = [test_sentence[j] for j in np.arange(i + 1, i + CONTEXT_SIZE + 1)]
    ngrams.append((test_sentence[i], tup))
print(ngrams[0], ngrams[1])

('Empathy', ['for', 'the', 'poor']) ('for', ['the', 'poor', 'may'])


In [13]:
class SkipgramModeler(nn.Module):
    def __init__(self, vocab_size, embedding_dim, context_size):
        super(SkipgramModeler, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim, 128)
        self.linear2 = nn.Linear(128, context_size * vocab_size)
    
    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((1, -1))
        out1 = F.relu(self.linear1(embeds))
        out2 = self.linear2(out1)
        log_probs = F.log_softmax(out2, dim = 1).view(CONTEXT_SIZE, -1)
        return log_probs
    
    def predict(self, input):
        context_idxs = torch.LongTensor([word_to_ix[input]])
        res = self.forward(context_idxs)
        res_arg = torch.argmax(res)
        res_val, res_ind = res.sort(descending = True)
        indices = [res_ind[i][0] for i in np.arange(0,3)]
        for arg in indices:
            print([(key, val) for key, val in word_to_ix.items() if val == arg])

In [15]:
losses = []
loss_function = nn.NLLLoss()
model = SkipgramModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr = .001)
for epoch in range(550):
    total_loss = 0
    
    for context, target in ngrams:
        context_idxs = torch.LongTensor([word_to_ix[context]])
        model.zero_grad()
        log_probs = model(context_idxs)
        target_list = torch.LongTensor([word_to_ix[w] for w in target])
        loss = loss_function(log_probs, target_list)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        
    losses.append(total_loss)

In [16]:
model.predict('psychologically')


[('and', 89)]
[('physically', 68)]
[('incapacitating', 132)]
