## Import libraries & GPU setting

In [1]:
!echo $LD_LIBRARY_PATH

/usr/local/cuda/lib64:/usr/local/lib:/usr/lib64


In [2]:
# Check GPU
!nvidia-smi

Wed Jun 21 17:55:36 2017       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 367.48                 Driver Version: 367.48                    |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce GT 610      Off  | 0000:01:00.0     N/A |                  N/A |
| 40%   49C    P0    N/A /  N/A |      0MiB /   963MiB |     N/A      Default |
+-------------------------------+----------------------+----------------------+
|   1  TITAN X (Pascal)    Off  | 0000:02:00.0     Off |                  N/A |
| 25%   46C    P0    58W / 250W |      0MiB / 12189MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  TITAN X (Pascal)    Off  | 0000:03:00.0     Off |                  N/

In [3]:
# GPU setting
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2,3,4"

In [4]:
%pylab
%matplotlib inline

Using matplotlib backend: Qt5Agg
Populating the interactive namespace from numpy and matplotlib


In [5]:
import matplotlib

import numpy as np
import collections
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.nn.init import xavier_normal, constant

from sklearn.metrics.pairwise import cosine_similarity

In [6]:
cuda_available = torch.cuda.is_available()
print(cuda_available)

True


In [7]:
torch.manual_seed(1)
if cuda_available:
    torch.cuda.manual_seed(1)

## Parameters

In [8]:
# Set parameters
context_size = 7
embed_size = 300

x_max = 50
alpha = 0.75
batch_size = 2000
l_rate = 0.001
num_epochs = 3

## Dataset

In [9]:
from six.moves.urllib import request
host = 'https://raw.githubusercontent.com/tomsercu/lstm/master/data/'
for name in ["train", "valid", "test"]:
    request.urlretrieve(
        "{0}/ptb.{1}.txt".format(host,name),
        "ptb.{0}.txt".format(name)
    )

In [10]:
def clean_str(string):
    # Tips for handling string in python : http://agiantmind.tistory.com/31
    string = string.lower()
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip()

In [11]:
# https://stackoverflow.com/questions/19130512/stopword-removal-with-nltk
nltk.download('stopwords')
stop = set(stopwords.words('english'))
word_list = list()
with open('ptb.train.txt') as f:
    for line in f:
        line = clean_str(line)
        for word in line.split():
            if word not in stop:
                word_list.append(word)

[nltk_data] Downloading package stopwords to /home/kdrl/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
# This step might take time
vocab = np.unique(word_list)
word_to_index = {word: index for index, word in enumerate(vocab)}
index_to_word = {index: word for index, word in enumerate(vocab)}
word_list_size = len(word_list)
vocab_size = len(vocab)
print("word_list_size", word_list_size)
print("vocab_size", vocab_size)

word_list_size 555226
vocab_size 9519


In [13]:
# Construct co-occurence matrix : This won't be generated if vocab_size is too big due to memory problem.
co_occurence_matrix = np.zeros((vocab_size, vocab_size))
for i in range(word_list_size):
    for j in range(1, context_size + 1):
        index = word_to_index[word_list[i]]
        if i-j > 0:
            left_index = word_to_index[word_list[i-j]]
            co_occurence_matrix[index, left_index] += 1.0/j
            co_occurence_matrix[left_index, index] += 1.0/j
        if i+j < word_list_size:
            right_index = word_to_index[word_list[i+j]]
            co_occurence_matrix[index, right_index] += 1.0/j
            co_occurence_matrix[right_index, index] += 1.0/j

In [14]:
# co_oc Matrix is shifted in order to prevent having log(0)
co_occurence_matrix = co_occurence_matrix + 1.0
[num_classes, _] = co_occurence_matrix.shape

## Model

In [15]:
class GloVe(nn.Module):
    def __init__(self, num_classes, embed_size):

        super(GloVe, self).__init__()

        self.num_classes = num_classes
        self.embed_size = embed_size
        
        self.in_embed = nn.Embedding(self.num_classes, self.embed_size)
        self.in_embed.weight = xavier_normal(self.in_embed.weight)

        self.in_bias = nn.Embedding(self.num_classes, 1)
        self.in_bias.weight = xavier_normal(self.in_bias.weight)

        self.out_embed = nn.Embedding(self.num_classes, self.embed_size)
        self.out_embed.weight = xavier_normal(self.out_embed.weight)

        self.out_bias = nn.Embedding(self.num_classes, 1)
        self.out_bias.weight = xavier_normal(self.out_bias.weight)

    def forward(self, word_u, word_v):

        word_u_embed = self.in_embed(word_u)
        word_u_bias = self.in_bias(word_u)
        word_v_embed = self.out_embed(word_v)
        word_v_bias = self.out_bias(word_v)
        
        return ((word_u_embed * word_v_embed).sum(1) + word_u_bias + word_v_bias).squeeze(1)
    
    def embeddings(self):
        return self.in_embed.weight.data.cpu().numpy() + self.out_embed.weight.data.cpu().numpy()

In [16]:
glove = GloVe(num_classes, embed_size)
if cuda_available:
    # glove = glove.cuda() # one gpu
    glove = torch.nn.DataParallel(glove, device_ids=[0,1,2,3]).cuda() # we set gpu in 1,2,3,4 slot. And it will be counted here with index 0 to 3.
glove

DataParallel (
  (module): GloVe (
    (in_embed): Embedding(9519, 300)
    (in_bias): Embedding(9519, 1)
    (out_embed): Embedding(9519, 300)
    (out_bias): Embedding(9519, 1)
  )
)

In [17]:
for p in glove.parameters():
    print(p.size())

torch.Size([9519, 300])
torch.Size([9519, 1])
torch.Size([9519, 300])
torch.Size([9519, 1])


In [18]:
optimizer = optim.Adam(glove.parameters(), l_rate)

In [19]:
def weight_func(x):
    return 1 if x > x_max else (x / x_max) ** alpha

In [20]:
# https://discuss.pytorch.org/t/operation-between-tensor-and-variable/1286/4
def next_batch(batch_size,word_u,word_v):

    words_co_occurences = np.array([co_occurence_matrix[word_u[i], word_v[i]] for i in range(batch_size)])
    words_weights = np.array([weight_func(var) for var in words_co_occurences])
    
    words_co_occurences = Variable(torch.from_numpy(words_co_occurences).cuda()).float()
    words_weights = Variable(torch.from_numpy(words_weights).cuda()).float()

    word_u = Variable(torch.from_numpy(word_u).cuda())
    word_v = Variable(torch.from_numpy(word_v).cuda())

    return word_u, word_v, words_co_occurences, words_weights

In [21]:
def most_similar(word_embeddings_array, word, result_num = 1):
    data = []
    num = word_embeddings_array.shape[0]
    target_index = word_to_index[word]
    for i in range(num):
        if i != target_index:
            data.append((index_to_word[i],cosine_similarity([word_embeddings_array[target_index]],[word_embeddings_array[i]])[0][0]))
    data.sort(key=lambda tup: tup[1], reverse=True)
    return data[:result_num]

## Train model

In [22]:
for epoch in range(num_epochs):
    # 각 에폭 당 한단어에 대해서 batch_size만큼의 짝을 찾아 batch_size번 업데이트함
    losses = []
    
    random_word_u_indexes = np.random.permutation(num_classes)
        
    for word_u_position in range(0, num_classes, batch_size):
        
        word_u = random_word_u_indexes[word_u_position:(word_u_position + batch_size) if (word_u_position+batch_size) < num_classes else -1]
        cycle_size = word_u.shape[0]
        
        random_word_v_indexes = np.random.permutation(num_classes)
        
        for word_v_position in range(0, num_classes, cycle_size):
            
            word_v = random_word_v_indexes[word_v_position:(word_v_position + cycle_size) if (word_v_position+cycle_size) < num_classes else -1]
            
            if cycle_size != word_v.shape[0]:
                continue
            
            word_u_variable, word_v_variable, words_co_occurences, words_weights = next_batch(cycle_size, word_u, word_v)
            forward_output = glove(word_u_variable, word_v_variable)
            
#             loss = sum([torch.mul((torch.dot(word_u_embed[i], word_v_embed[i]) +
#                     word_u_bias[i] + word_v_bias[i] - np.log(words_co_occurences[i]))**2,
#                     words_weights[i]) for i in range(cycle_size)])
            
#             print(type(((word_u_embed * word_v_embed).sum(1) + word_u_bias + word_v_bias).squeeze(1) - torch.log(words_co_occurences)))
#             print(((((word_u_embed * word_v_embed).sum(1) + word_u_bias + word_v_bias).squeeze(1) - torch.log(words_co_occurences)) * words_weights).size())
#             A = ((word_u_embed * word_v_embed).sum(1) + word_u_bias + word_v_bias).squeeze(1) - torch.log(words_co_occurences)
#             print(forward_output.size())
            loss = (torch.pow((forward_output - torch.log(words_co_occurences)), 2) * words_weights).sum()
#             print(loss.size())
            losses.append(loss.data[0])
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        
    print('Train Epoch: {} \t Loss: {:.6f}'.format(epoch + 1, np.mean(losses)))
    print("Similar words for 'programming' : ", most_similar(glove.module.embeddings(),"programming",result_num=5))
    torch.save(glove.module.state_dict(), "./glove.model")

Train Epoch: 1 	 Loss: 10.318272
Similar words for 'programming' :  [('burned', 0.23667076), ('plane', 0.23507056), ('repairs', 0.19748446), ('hk', 0.19657624), ('plc', 0.19187272)]
Train Epoch: 2 	 Loss: 15.368985
Similar words for 'programming' :  [('plane', 0.23771545), ('burned', 0.23043594), ('repairs', 0.22823566), ('lsi', 0.19684684), ('limited', 0.19132639)]
Train Epoch: 3 	 Loss: 11.808201
Similar words for 'programming' :  [('bankamerica', 0.2707361), ('limited', 0.20636576), ('burned', 0.20176192), ('repairs', 0.2016439), ('maybe', 0.19468692)]


## Utilize model

In [28]:
glove2 = GloVe(num_classes, embed_size)
glove2.load_state_dict(torch.load("./glove.model"))
most_similar(glove2.embeddings(),"programming",result_num=5)

[('bankamerica', 0.2707361),
 ('limited', 0.20636576),
 ('burned', 0.20176192),
 ('repairs', 0.2016439),
 ('maybe', 0.19468692)]