## GPU setting

In [1]:
# Print the dynamic-linker search path (presumably to confirm the CUDA libraries are on it).
!echo $LD_LIBRARY_PATH

/usr/local/cuda/lib64:/usr/local/lib:/usr/lib64


In [2]:
# Check GPU: print the driver's table of installed GPUs with their memory use and utilisation.
!nvidia-smi

Wed Jun 21 01:20:22 2017       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 367.48                 Driver Version: 367.48                    |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce GT 610      Off  | 0000:01:00.0     N/A |                  N/A |
| 40%   45C    P8    N/A /  N/A |      0MiB /   963MiB |     N/A      Default |
+-------------------------------+----------------------+----------------------+
|   1  TITAN X (Pascal)    Off  | 0000:02:00.0     Off |                  N/A |
| 33%   50C    P8    19W / 250W |      0MiB / 12189MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  TITAN X (Pascal)    Off  | 0000:03:00.0     Off |                  N/

In [3]:
# GPU setting
import os

# Number CUDA devices by PCI bus id and expose only devices 1-4 to this
# process (the nvidia-smi table above lists device 0 as a GeForce GT 610).
# Presumably this has to run before CUDA is initialised - keep it ahead of the torch import.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "1,2,3,4",
})

In [4]:
# NOTE(review): %pylab populates the interactive namespace from numpy and matplotlib
# (see the message it prints), which hides where names come from. Consider dropping it
# in favour of the explicit imports in the next cell - confirm no later cell relies on it.
%pylab
# Render matplotlib figures inline in the notebook.
%matplotlib inline

Using matplotlib backend: Qt5Agg
Populating the interactive namespace from numpy and matplotlib


In [5]:
import matplotlib

import numpy as np
from sklearn.datasets import fetch_20newsgroups
import nltk
from nltk.tokenize import word_tokenize
import re

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.nn.init import xavier_normal

In [6]:
# Record once whether CUDA is usable; later cells branch on this flag.
cuda_available = torch.cuda.is_available()
print(cuda_available)

True


In [7]:
# Seed every random number generator the notebook draws from so runs are repeatable.
# PyTorch drives the weight initialisation; NumPy drives the batch sampling in
# next_batch (np.random.choice), so it has to be seeded as well.
torch.manual_seed(1)
np.random.seed(1)
if cuda_available:
    torch.cuda.manual_seed(1)

## Parameters

In [8]:
# Set parameters
context_size = 4    # co-occurrence window: neighbours up to this many positions away on each side are counted
embed_size = 300    # width of each word-embedding vector

x_max = 100         # co-occurrence value above which weight_func returns 1
alpha = 0.75        # exponent applied to x / x_max inside weight_func
batch_size = 100    # number of (word_u, word_v) pairs drawn per training step
l_rate = 0.001      # learning rate handed to the Adam optimizer
num_epochs = 20     # number of iterations of the outer training loop

## Dataset

In [9]:
# Restrict the corpus to the two computer-hardware newsgroups.
categories = ['comp.sys.ibm.pc.hardware','comp.sys.mac.hardware']
# Load only the "train" subset of 20 Newsgroups for those categories.
raw_data = fetch_20newsgroups(subset = "train", categories = categories)
# Show the category names of the loaded dataset.
print(list(raw_data.target_names))

['comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware']


In [10]:
# Number of documents loaded.
data_num = len(raw_data.data)
print("data_num",data_num)

data_num 1168


## Create vocabulary and word lists

In [11]:
def clean_str(string):
    """Normalise raw text so that it can be split into tokens on whitespace.

    Characters other than letters, digits, parentheses, commas, '!', '?',
    apostrophes and backticks become spaces; the English clitics ('s, 've,
    n't, 're, 'd, 'll) and the punctuation marks , ! ( ) ? are separated
    from neighbouring text by spaces; runs of whitespace are then collapsed.

    Adapted from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    String-handling tips: http://agiantmind.tistory.com/31

    Args:
        string: the raw text.

    Returns:
        The cleaned text without leading or trailing whitespace.
    """
    # Ordered (pattern, replacement) rules, applied from top to bottom.
    rewrite_rules = (
        (r"[^A-Za-z0-9(),!?\'\`]", " "),
        (r"\'s", " \'s"),
        (r"\'ve", " \'ve"),
        (r"n\'t", " n\'t"),
        (r"\'re", " \'re"),
        (r"\'d", " \'d"),
        (r"\'ll", " \'ll"),
        (r",", " , "),
        (r"!", " ! "),
        (r"\(", " ( "),
        (r"\)", " ) "),
        (r"\?", " ? "),
        (r"\s{2,}", " "),
    )
    cleaned = string
    for pattern, replacement in rewrite_rules:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
# Preview the cleaned text of the first document.
clean_str(raw_data.data[0])

"From randy lynx msc cornell edu UUCP ( Randall Jay Ellingson , 199 Clark , 55915 , ) Subject Re IDE vs SCSI Originator randy msc2 msc cornell edu Organization Cornell Materials Science Center Lines 47 From article 1qq7i1INNdqc dns1 NMSU Edu , by bgrubb dante nmsu edu ( GRUBB ) wlsmith valve heart rri uwo ca ( Wayne Smith ) write In article 1qpu0uINNbt1 dns1 NMSU Edu bgrubb dante nmsu edu ( GRUBB ) writes wlsmith valve heart rri uwo ca ( Wayne Smith ) writes Since the Mac uses ONLY SCSI 1 for hard drives YES the figure includes a hundred for SCSI drivers This is sloppy people and DUMB What group is this ? This is not a MAC group Nice of you to DELETE BOTH YOUR responce and the item that prompted it to whit I just bought at Quantum 240 for my mac at home I paid 369 for it I Tons of stuff deleted on SCSI vs IDE question Wow , you guys are really going wild on this IDE vs SCSI thing , and I think it 's great ! Like lots of people , I 'd really like to increase my data transfer rate from t

In [12]:
# word_tokenize needs NLTK's "punkt" data; uncomment the download line below if it is not installed yet.
# https://stackoverflow.com/questions/37101114/what-to-download-in-order-to-make-nltk-tokenize-word-tokenize-work
# nltk.download("punkt")
# Sanity check: the first five tokens of the first cleaned document.
print(word_tokenize(clean_str(raw_data.data[0]))[:5])

['From', 'randy', 'lynx', 'msc', 'cornell']


In [13]:
# This step might take time: every document is cleaned and tokenized.
# The generator yields one token list per document; the comprehension
# flattens them into a single corpus-wide token sequence.
_tokenized_docs = (word_tokenize(clean_str(doc)) for doc in raw_data.data)
word_list = [token for tokens in _tokenized_docs for token in tokens]
word_list_size = len(word_list)

# Sorted array of the distinct tokens; a token's position in it is its integer id.
vocab = np.unique(word_list)
vocab_size = len(vocab)
word_to_index = dict(zip(vocab, range(vocab_size)))

print("word_list_size", word_list_size)
print("vocab_size", vocab_size)

word_list_size 272692
vocab_size 21536


In [14]:
# Construct the co-occurrence matrix. It is a dense vocab_size x vocab_size float64
# array, so it cannot be built when the vocabulary is too big to fit in memory.
# A neighbour j positions away from the centre word contributes 1/j to the pair's count.
co_occurence_matrix = np.zeros((vocab_size, vocab_size))
for i in range(word_list_size):
    # The centre word's row does not depend on the offset j, so look it up once.
    index = word_to_index[word_list[i]]
    for j in range(1, context_size + 1):
        # `>= 0` (not `> 0`): position 0 is a valid left neighbour. Excluding it
        # dropped the first token's left-context counts and made the matrix asymmetric.
        if i - j >= 0:
            left_index = word_to_index[word_list[i - j]]
            co_occurence_matrix[index, left_index] += 1.0 / j
        if i + j < word_list_size:
            right_index = word_to_index[word_list[i + j]]
            co_occurence_matrix[index, right_index] += 1.0 / j

In [15]:
# The co-occurrence matrix is shifted by one so that log() is never taken of 0.
# The shift is done in place: the matrix is dense (one row and one column per
# vocabulary word), so building a shifted copy would briefly double its memory use.
# NOTE: this cell is not idempotent - run it exactly once after building the matrix.
co_occurence_matrix += 1.0

# Read the row count directly instead of unpacking into `_`, which would
# shadow IPython's last-output variable.
num_classes = co_occurence_matrix.shape[0]

In [16]:
class GloVe(nn.Module):
    """Embedding tables of a GloVe model.

    Every word id owns two vectors and two scalar biases: an "in" pair that is
    looked up when the word is the first element of a pair (word_u) and an
    "out" pair that is looked up when it is the second element (word_v).
    All four tables are initialised with xavier_normal.
    """

    def __init__(self, num_classes, embed_size):
        """Create the four lookup tables.

        Args:
            num_classes: number of rows in every table (one per word id).
            embed_size: width of the embedding vectors.
        """
        super(GloVe, self).__init__()

        self.num_classes = num_classes
        self.embed_size = embed_size

        # The creation order below fixes the order of parameters():
        # in_embed, in_bias, out_embed, out_bias.
        self.in_embed = self._xavier_table(self.embed_size)
        self.in_bias = self._xavier_table(1)
        self.out_embed = self._xavier_table(self.embed_size)
        self.out_bias = self._xavier_table(1)

    def _xavier_table(self, width):
        """Return an nn.Embedding with num_classes rows of `width` columns, re-initialised with xavier_normal."""
        table = nn.Embedding(self.num_classes, width)
        table.weight = xavier_normal(table.weight)
        return table

    def forward(self, word_u, word_v):
        """Look up the vectors and biases for a batch of word-id pairs.

        Args:
            word_u: ids looked up in the "in" tables.
            word_v: ids looked up in the "out" tables.

        Returns:
            Tuple (word_u_embed, word_v_embed, word_u_bias, word_v_bias).
        """
        u_vectors, u_offsets = self.in_embed(word_u), self.in_bias(word_u)
        v_vectors, v_offsets = self.out_embed(word_v), self.out_bias(word_v)
        return u_vectors, v_vectors, u_offsets, v_offsets

    def embeddings(self):
        """Return the sum of the "in" and "out" embedding tables as a NumPy array."""
        in_weights, out_weights = (
            table.weight.data.cpu().numpy()
            for table in (self.in_embed, self.out_embed)
        )
        return in_weights + out_weights

In [17]:
# Instantiate the model with num_classes rows per table (the co-occurrence matrix's row count).
glove = GloVe(num_classes, embed_size)
if cuda_available:
    glove = glove.cuda() # one gpu
    # Multi-GPU alternative: CUDA_VISIBLE_DEVICES exposes GPUs 1-4, which are addressed here as devices 0-3.
    # glove = torch.nn.DataParallel(glove, device_ids=[0,1,2,3]).cuda()
# Display the module summary.
glove

GloVe (
  (in_embed): Embedding(21536, 300)
  (in_bias): Embedding(21536, 1)
  (out_embed): Embedding(21536, 300)
  (out_bias): Embedding(21536, 1)
)

In [18]:
# List the shape of every trainable tensor of the model.
for p in glove.parameters():
    print(p.size())

torch.Size([21536, 300])
torch.Size([21536, 1])
torch.Size([21536, 300])
torch.Size([21536, 1])


In [19]:
# Adam over all model parameters; l_rate is passed positionally as the learning rate.
optimizer = optim.Adam(glove.parameters(), l_rate)

In [20]:
def weight_func(x):
    """Return the loss weight for a co-occurrence value x.

    The weight is 1 when x exceeds the notebook-level x_max and
    (x / x_max) ** alpha otherwise; x_max and alpha are read from the
    enclosing namespace.
    """
    if x > x_max:
        return 1
    return (x / x_max) ** alpha

In [21]:
def next_batch(batch_size):
    """Draw a random batch of word-id pairs with their co-occurrence targets.

    Two sets of `batch_size` distinct ids are sampled independently from
    range(num_classes) and paired element-wise.

    Args:
        batch_size: number of pairs to draw; must not exceed num_classes
            because each side is sampled without replacement.

    Returns:
        Tuple (word_u, word_v, words_co_occurences, words_weights) of
        Variables: the two id vectors, the float co-occurrence values of the
        pairs and their float loss weights. The tensors are moved to the GPU
        only when cuda_available is true.

    Raises:
        ValueError: from np.random.choice when batch_size > num_classes.
    """
    word_u = np.random.choice(np.arange(num_classes), size=batch_size, replace=False)
    word_v = np.random.choice(np.arange(num_classes), size=batch_size, replace=False)

    # One fancy-indexing lookup fetches matrix[word_u[i], word_v[i]] for every i.
    words_co_occurences = co_occurence_matrix[word_u, word_v]
    words_weights = np.array([weight_func(var) for var in words_co_occurences])

    def to_variable(array):
        # Honour the notebook-wide CUDA flag instead of calling .cuda()
        # unconditionally, which fails on a machine without a GPU.
        tensor = torch.from_numpy(array)
        if cuda_available:
            tensor = tensor.cuda()
        return Variable(tensor)

    words_co_occurences = to_variable(words_co_occurences).float()
    words_weights = to_variable(words_weights).float()

    word_u = to_variable(word_u)
    word_v = to_variable(word_v)

    return word_u, word_v, words_co_occurences, words_weights

In [22]:
# Number of optimisation steps per epoch. Batches are random word pairs, so this
# only sets how much work one "epoch" is; it does not depend on the epoch and
# is therefore computed once.
num_batches = word_list_size // batch_size
for epoch in range(num_epochs):
    percent = 19.9  # next progress threshold (in %) at which a log line is printed
    for batch_cycle in range(num_batches):
        word_u, word_v, words_co_occurences, words_weights = next_batch(batch_size)
        word_u_embed, word_v_embed, word_u_bias, word_v_bias = glove(word_u, word_v)
        # Flatten every term to shape (batch,) before adding. Relying on sum(1)
        # keeping the reduced dimension breaks on PyTorch versions where it is
        # dropped: (batch,) + (batch, 1) would silently broadcast to
        # (batch, batch) and produce a wrong loss.
        prediction = (
            (word_u_embed * word_v_embed).sum(1).view(-1)
            + word_u_bias.view(-1)
            + word_v_bias.view(-1)
        )
        # Weighted squared error between the model's score and log(co-occurrence).
        error = prediction - torch.log(words_co_occurences)
        loss = (torch.pow(error, 2) * words_weights).sum()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        progress = 100. * batch_cycle / num_batches
        if progress >= percent:
            # view(-1)[0] reads the scalar whether the loss is a 1-element or a 0-dim tensor.
            loss_value = float(loss.data.view(-1)[0])
            print('Train Epoch: {} \t progress: {} \t Loss: {:.6f}'.format(epoch, progress, loss_value))
            percent += 20.0

Train Epoch: 0 	 progress: 40.0 	 Loss: 0.000725
Train Epoch: 0 	 progress: 60.0 	 Loss: 0.026821
Train Epoch: 0 	 progress: 80.0 	 Loss: 0.003946
Train Epoch: 0 	 progress: 100.0 	 Loss: 0.007126
Train Epoch: 1 	 progress: 40.0 	 Loss: 0.010466
Train Epoch: 1 	 progress: 60.0 	 Loss: 0.012776
Train Epoch: 1 	 progress: 80.0 	 Loss: 0.017000
Train Epoch: 1 	 progress: 100.0 	 Loss: 0.020834
Train Epoch: 2 	 progress: 40.0 	 Loss: 0.018568
Train Epoch: 2 	 progress: 60.0 	 Loss: 0.022954
Train Epoch: 2 	 progress: 80.0 	 Loss: 0.024475
Train Epoch: 2 	 progress: 100.0 	 Loss: 0.057465
Train Epoch: 3 	 progress: 40.0 	 Loss: 0.036505
Train Epoch: 3 	 progress: 60.0 	 Loss: 0.038287
Train Epoch: 3 	 progress: 80.0 	 Loss: 0.037493
Train Epoch: 3 	 progress: 100.0 	 Loss: 0.039739
Train Epoch: 4 	 progress: 40.0 	 Loss: 0.053809
Train Epoch: 4 	 progress: 60.0 	 Loss: 0.048651
Train Epoch: 4 	 progress: 80.0 	 Loss: 0.042766
Train Epoch: 4 	 progress: 100.0 	 Loss: 0.050464
Train Epoch: 5 

KeyboardInterrupt: 

In [None]:
# Final word vectors: the sum of the model's "in" and "out" embedding tables, as a NumPy array.
word_embeddings = glove.embeddings()  
# Display the array's shape.
word_embeddings.shape