## GPU setting

In [1]:
!echo $LD_LIBRARY_PATH

/usr/local/cuda/lib64:/usr/local/lib:/usr/lib64


In [2]:
# Check GPU
!nvidia-smi

Wed Jun 21 14:03:14 2017       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 367.48                 Driver Version: 367.48                    |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce GT 610      Off  | 0000:01:00.0     N/A |                  N/A |
| 40%   44C    P8    N/A /  N/A |      0MiB /   963MiB |     N/A      Default |
+-------------------------------+----------------------+----------------------+
|   1  TITAN X (Pascal)    Off  | 0000:02:00.0     Off |                  N/A |
| 26%   42C    P8    17W / 250W |      0MiB / 12189MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  TITAN X (Pascal)    Off  | 0000:03:00.0     Off |                  N/

In [3]:
# GPU setting
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2,3,4"

In [4]:
%pylab
%matplotlib inline

Using matplotlib backend: Qt5Agg
Populating the interactive namespace from numpy and matplotlib


In [5]:
import matplotlib

import numpy as np
from sklearn.datasets import fetch_20newsgroups
import nltk
from nltk.tokenize import word_tokenize
import re

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.nn.init import xavier_normal, constant

from sklearn.metrics.pairwise import cosine_similarity

In [6]:
cuda_available = torch.cuda.is_available()
print(cuda_available)

True


In [7]:
torch.manual_seed(1)
if cuda_available:
    torch.cuda.manual_seed(1)

## Parameters

In [8]:
# Set parameters
context_size = 5
embed_size = 1000

x_max = 10
alpha = 0.75
batch_size = 1000
l_rate = 0.001
num_epochs = 10

## Dataset

In [9]:
categories = ['comp.sys.ibm.pc.hardware']
raw_data = fetch_20newsgroups(subset = "train", categories = categories)
print(list(raw_data.target_names))

['comp.sys.ibm.pc.hardware']


In [10]:
data_num = len(raw_data.data)
print("data_num",data_num)

data_num 590


## Create vocabulary and word lists

In [11]:
def clean_str(string):
    # Original : https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    # Tips for handling string in python : http://agiantmind.tistory.com/31
    string = string.lower()
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip()
clean_str(raw_data.data[0])

"from badry cs ualberta ca ( badry jason theodore ) subject chaining ide drives summary trouble with master slave drives nntp posting host cab009 cs ualberta ca organization university of alberta , edmonton canada lines 16 hi i am trying to set up a conner 3184 and a quantum 80at drive i have the conner set to the master , and the quantum set to the slave ( does n't work the other way around ) i am able to access both drives if i boot from a floppy , but the drives will not boot themselves i am running msdos 6 , and have the conner partitioned as primary dos , and is formatted with system files i have tried all different types of setups , and even changed ide controller cards if i boot from a floppy , everything works great ( except the booting part ) ) the system does n't report an error message or anything , just hangs there does anyone have any suggestions , or has somebody else run into a similar problem ? i was thinking that i might have to update the bios on one of the drives ( i

In [12]:
# Below line is needed to use nltk tokenizer
# https://stackoverflow.com/questions/37101114/what-to-download-in-order-to-make-nltk-tokenize-word-tokenize-work
# nltk.download("punkt")
print("All documents will be tokenized like -> ",*word_tokenize(clean_str(raw_data.data[0]))[:5],"...")

All documents will be tokenized like ->  from badry cs ualberta ca ...


In [13]:
# This step might take time
word_list = [word for data in raw_data.data for word in word_tokenize(clean_str(data))]
vocab = np.unique(word_list)
word_to_index = {word: index for index, word in enumerate(vocab)}
index_to_word = {index: word for index, word in enumerate(vocab)}
word_list_size = len(word_list)
vocab_size = len(vocab)
print("word_list_size", word_list_size)
print("vocab_size", vocab_size)

word_list_size 145379
vocab_size 11703


In [14]:
# Construct co-occurence matrix : This won't be generated if vocab_size is too big due to memory problem.
co_occurence_matrix = np.zeros((vocab_size, vocab_size))
for i in range(word_list_size):
    for j in range(1, context_size + 1):
        index = word_to_index[word_list[i]]
        if i-j > 0:
            left_index = word_to_index[word_list[i-j]]
            co_occurence_matrix[index, left_index] += 1.0/j
        if i+j < word_list_size:
            right_index = word_to_index[word_list[i+j]]
            co_occurence_matrix[index, right_index] += 1.0/j

In [15]:
# co_oc Matrix is shifted in order to prevent having log(0)
co_occurence_matrix = co_occurence_matrix + 1.0

[num_classes, _] = co_occurence_matrix.shape

In [16]:
class GloVe(nn.Module):
    def __init__(self, num_classes, embed_size):

        super(GloVe, self).__init__()

        self.num_classes = num_classes
        self.embed_size = embed_size
        
        self.in_embed = nn.Embedding(self.num_classes, self.embed_size)
        self.in_embed.weight = xavier_normal(self.in_embed.weight)

        self.in_bias = nn.Embedding(self.num_classes, 1)
        self.in_bias.weight = xavier_normal(self.in_bias.weight)

        self.out_embed = nn.Embedding(self.num_classes, self.embed_size)
        self.out_embed.weight = xavier_normal(self.out_embed.weight)

        self.out_bias = nn.Embedding(self.num_classes, 1)
        self.out_bias.weight = xavier_normal(self.out_bias.weight)

    def forward(self, word_u, word_v):

        word_u_embed = self.in_embed(word_u)
        word_u_bias = self.in_bias(word_u)
        word_v_embed = self.out_embed(word_v)
        word_v_bias = self.out_bias(word_v)
        
        return ((word_u_embed * word_v_embed).sum(1) + word_u_bias + word_v_bias).squeeze(1)
    
    def embeddings(self):
        return self.in_embed.weight.data.cpu().numpy() + self.out_embed.weight.data.cpu().numpy()

In [17]:
glove = GloVe(num_classes, embed_size)
if cuda_available:
    # glove = glove.cuda() # one gpu
    glove = torch.nn.DataParallel(glove, device_ids=[0,1,2,3]).cuda() # we set gpu in 1,2,3,4 slot. And it will be counted here with index 0 to 3.
glove

DataParallel (
  (module): GloVe (
    (in_embed): Embedding(11703, 1000)
    (in_bias): Embedding(11703, 1)
    (out_embed): Embedding(11703, 1000)
    (out_bias): Embedding(11703, 1)
  )
)

In [18]:
for p in glove.parameters():
    print(p.size())

torch.Size([11703, 1000])
torch.Size([11703, 1])
torch.Size([11703, 1000])
torch.Size([11703, 1])


In [19]:
optimizer = optim.Adam(glove.parameters(), l_rate)

In [20]:
def weight_func(x):
    return 1 if x > x_max else (x / x_max) ** alpha

In [21]:
# https://discuss.pytorch.org/t/operation-between-tensor-and-variable/1286/4
def next_batch(batch_size,word_u,word_v):

    words_co_occurences = np.array([co_occurence_matrix[word_u[i], word_v[i]] for i in range(batch_size)])
    words_weights = np.array([weight_func(var) for var in words_co_occurences])
    
    words_co_occurences = Variable(torch.from_numpy(words_co_occurences).cuda()).float()
    words_weights = Variable(torch.from_numpy(words_weights).cuda()).float()

    word_u = Variable(torch.from_numpy(word_u).cuda())
    word_v = Variable(torch.from_numpy(word_v).cuda())

    return word_u, word_v, words_co_occurences, words_weights

In [22]:
def most_similar(word_embeddings_array, word, result_num = 1):
    data = []
    num = word_embeddings_array.shape[0]
    target_index = word_to_index[word]
    for i in range(num):
        if i != target_index:
            data.append((index_to_word[i],cosine_similarity([word_embeddings_array[target_index]],[word_embeddings_array[i]])[0][0]))
    data.sort(key=lambda tup: tup[1], reverse=True)
    return data[:result_num]

In [23]:
for epoch in range(num_epochs):
    # 각 에폭 당 한단어에 대해서 batch_size만큼의 짝을 찾아 batch_size번 업데이트함
    losses = []
    
    random_word_u_indexes = np.random.permutation(num_classes)
        
    for word_u_position in range(0, num_classes, batch_size):
        
        word_u = random_word_u_indexes[word_u_position:(word_u_position + batch_size) if (word_u_position+batch_size) < num_classes else -1]
        cycle_size = word_u.shape[0]
        
        random_word_v_indexes = np.random.permutation(num_classes)
        
        for word_v_position in range(0, num_classes, cycle_size):
            
            word_v = random_word_v_indexes[word_v_position:(word_v_position + cycle_size) if (word_v_position+cycle_size) < num_classes else -1]
            
            if cycle_size != word_v.shape[0]:
                continue
            
            word_u_variable, word_v_variable, words_co_occurences, words_weights = next_batch(cycle_size, word_u, word_v)
            forward_output = glove(word_u_variable, word_v_variable)
            
#             loss = sum([torch.mul((torch.dot(word_u_embed[i], word_v_embed[i]) +
#                     word_u_bias[i] + word_v_bias[i] - np.log(words_co_occurences[i]))**2,
#                     words_weights[i]) for i in range(cycle_size)])
            
#             print(type(((word_u_embed * word_v_embed).sum(1) + word_u_bias + word_v_bias).squeeze(1) - torch.log(words_co_occurences)))
#             print(((((word_u_embed * word_v_embed).sum(1) + word_u_bias + word_v_bias).squeeze(1) - torch.log(words_co_occurences)) * words_weights).size())
#             A = ((word_u_embed * word_v_embed).sum(1) + word_u_bias + word_v_bias).squeeze(1) - torch.log(words_co_occurences)
#             print(forward_output.size())
            loss = (torch.pow((forward_output - torch.log(words_co_occurences)), 2) * words_weights).sum()
#             print(loss.size())
            losses.append(loss.data[0])
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        
    print('Train Epoch: {} \t Loss: {:.6f}'.format(epoch + 1, np.mean(losses)))
    print("Similar words for 'system' : ", most_similar(glove.module.embeddings(),"system",result_num=5))

Train Epoch: 1 	 Loss: 1.053742
Similar words for 'system' :  [('massachusetts', 0.32696742), ('psu', 0.19651088), ('sharpness', 0.17754231), ('spl', 0.1699675), ('25130', 0.15909022)]
Train Epoch: 2 	 Loss: 1.363482
Similar words for 'system' :  [('cylinders', 0.42983997), ('xt', 0.4211376), ('extended', 0.28729102), ('sb', 0.196087), ('v6', 0.16135693)]
Train Epoch: 3 	 Loss: 1.502517
Similar words for 'system' :  [('tower', 0.45388857), ('midwest', 0.39337867), ('extended', 0.3279146), ('cylinders', 0.32209003), ('9', 0.31404656)]
Train Epoch: 4 	 Loss: 1.308296
Similar words for 'system' :  [('tower', 0.45125547), ('midwest', 0.33681771), ('9', 0.30914477), ('cylinders', 0.28699273), ('xt', 0.27460074)]
Train Epoch: 5 	 Loss: 1.205730
Similar words for 'system' :  [('tower', 0.43317813), ('9', 0.30084562), ('midwest', 0.29741156), ('xt', 0.26172471), ('extended', 0.25178793)]
Train Epoch: 6 	 Loss: 1.200561
Similar words for 'system' :  [('tower', 0.42451388), ('9', 0.25426871), ('