In [1]:
import matplotlib

import numpy as np
import collections
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.nn.init import xavier_normal, constant

from sklearn.metrics.pairwise import cosine_similarity

# Set parameters
# Hyper-parameters shared by the cells below. Only embed_size, x_max and alpha
# are read by the code visible in this notebook; the remaining values are
# presumably consumed by a training loop that is not part of it -- TODO confirm.
print("Set parameters...")
context_size = 10  # presumably the co-occurrence window size -- not referenced in the visible cells
embed_size = 500   # dimensionality of each word vector (passed to GloVe)
x_max = 100        # co-occurrence count above which weight_func returns 1
alpha = 0.75       # exponent applied by weight_func to x / x_max
batch_size = 50    # presumably the number of word pairs per training batch -- TODO confirm
l_rate = 0.001     # presumably the optimizer learning rate -- TODO confirm
num_epochs = 30    # presumably the number of training epochs -- TODO confirm

# define methods and classes
print("Define methods and classes...")
def clean_str(string):
    """Normalize a line of text to lower-case ASCII words separated by single spaces.

    Args:
        string: the raw text to normalize.

    Returns:
        The lower-cased text in which every character other than an ASCII
        letter has been replaced by a space, runs of whitespace have been
        collapsed to one space, and leading/trailing whitespace removed.
        Contractions are therefore split ("don't" -> "don t").
    """
    # Tips for handling string in python : http://agiantmind.tistory.com/31
    string = string.lower()
    # Everything that is not a letter (digits, punctuation, apostrophes,
    # newlines, ...) becomes a space. Because nothing but letters and spaces
    # survives this step, the former per-punctuation substitutions
    # ('s, 've, n't, commas, parentheses, ...) could never match and have
    # been removed; the result is unchanged.
    string = re.sub(r"[^A-Za-z]", " ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip()

def weight_func(x):
    """Return the weight applied to a co-occurrence count ``x``.

    Counts above the module-level ``x_max`` get the constant weight 1;
    all other counts get ``(x / x_max) ** alpha``, where ``alpha`` is also
    read from module scope.
    """
    if x > x_max:
        return 1
    return (x / x_max) ** alpha

def next_batch(batch_size,word_u,word_v):
    """Turn a batch of word-index pairs into the tensors used for training.

    Reads the module-level ``co_occurence_matrix`` (indexed as
    ``co_occurence_matrix[u, v]``) and ``weight_func``.
    NOTE(review): ``co_occurence_matrix`` is not defined in the visible cells
    of this notebook; it must exist before this function is called.

    Args:
        batch_size: maximum number of leading pairs to look up.
        word_u: numpy integer array of first-word indices.
        word_v: numpy integer array of second-word indices, aligned with
            ``word_u``.

    Returns:
        A tuple ``(word_u, word_v, words_co_occurences, words_weights)``:
        the two index arrays as tensors, the co-occurrence value of each
        pair as a float tensor, and ``weight_func`` of each value as a float
        tensor. Tensors are placed on the GPU when CUDA is available and
        left on the CPU otherwise.
    """
    # Never index past the pairs actually supplied: a final batch shorter
    # than batch_size used to raise IndexError.
    pair_count = min(batch_size, len(word_u))

    words_co_occurences = np.array(
        [co_occurence_matrix[word_u[i], word_v[i]] for i in range(pair_count)])
    words_weights = np.array([weight_func(var) for var in words_co_occurences])

    # Only move data to the GPU when one exists, so the function also works
    # on CPU-only machines.
    use_cuda = torch.cuda.is_available()

    def _as_variable(array):
        tensor = torch.from_numpy(array)
        return Variable(tensor.cuda() if use_cuda else tensor)

    words_co_occurences = _as_variable(words_co_occurences).float()
    words_weights = _as_variable(words_weights).float()

    word_u = _as_variable(word_u)
    word_v = _as_variable(word_v)

    return word_u, word_v, words_co_occurences, words_weights

def most_similar(word_embeddings_array, word, result_num = 1):
    """Return the words whose embeddings are closest to ``word`` by cosine similarity.

    Uses the module-level ``word_to_index`` / ``index_to_word`` mappings to
    translate between words and row indices of ``word_embeddings_array``.

    Args:
        word_embeddings_array: 2-D array with one embedding per row.
        word: the query word.
        result_num: maximum number of neighbours to return.

    Returns:
        A list of ``(word, similarity)`` tuples sorted by decreasing
        similarity; the query word itself is excluded.

    Raises:
        KeyError: if ``word`` is not a key of ``word_to_index``.
    """
    num = word_embeddings_array.shape[0]
    target_index = word_to_index[word]

    # One call scores the query against every row, instead of one
    # cosine_similarity call per vocabulary entry.
    similarities = cosine_similarity(
        [word_embeddings_array[target_index]], word_embeddings_array)[0]

    data = [(index_to_word[i], similarities[i])
            for i in range(num) if i != target_index]
    data.sort(key=lambda tup: tup[1], reverse=True)
    return data[:result_num]

class GloVe(nn.Module):
    """GloVe scoring model: two embedding tables plus a scalar bias per word.

    For an index pair ``(u, v)`` the model outputs
    ``in_embed[u] . out_embed[v] + in_bias[u] + out_bias[v]``.
    """

    def __init__(self, num_classes, embed_size):
        """Create and Xavier-initialize the four lookup tables.

        Args:
            num_classes: number of rows in every table (the notebook passes
                the vocabulary size).
            embed_size: dimensionality of each word vector.
        """
        super(GloVe, self).__init__()

        self.num_classes = num_classes
        self.embed_size = embed_size

        self.in_embed = nn.Embedding(self.num_classes, self.embed_size)
        self.in_embed.weight = xavier_normal(self.in_embed.weight)

        self.in_bias = nn.Embedding(self.num_classes, 1)
        self.in_bias.weight = xavier_normal(self.in_bias.weight)

        self.out_embed = nn.Embedding(self.num_classes, self.embed_size)
        self.out_embed.weight = xavier_normal(self.out_embed.weight)

        self.out_bias = nn.Embedding(self.num_classes, 1)
        self.out_bias.weight = xavier_normal(self.out_bias.weight)

    def forward(self, word_u, word_v):
        """Score each ``(word_u[i], word_v[i])`` pair.

        Args:
            word_u: integer index tensor for the ``in`` tables
                (assumed 1-D of length B).
            word_v: integer index tensor for the ``out`` tables, aligned
                with ``word_u``.

        Returns:
            Tensor with one score per pair (shape ``(B,)`` for 1-D inputs).
        """
        word_u_embed = self.in_embed(word_u)
        word_u_bias = self.in_bias(word_u)
        word_v_embed = self.out_embed(word_v)
        word_v_bias = self.out_bias(word_v)

        # keepdim=True keeps the dot product as (B, 1) so it lines up with the
        # (B, 1) biases. Without it the (B,) sum broadcasts against the biases
        # into a (B, B) matrix and the squeeze below has no effect.
        dot_product = (word_u_embed * word_v_embed).sum(1, keepdim=True)

        return (dot_product + word_u_bias + word_v_bias).squeeze(1)

    def embeddings(self):
        """Return the final word vectors as a numpy array: ``in_embed + out_embed``."""
        return self.in_embed.weight.data.cpu().numpy() + self.out_embed.weight.data.cpu().numpy()
    
# prepare data
# Read the corpus, keep every cleaned token that is longer than one character
# and is not an English stop word, then derive the sorted vocabulary and the
# word <-> index lookup tables from those tokens.
print("Prepare data...")

stop = set(stopwords.words('english'))
word_list = []

with open('ptb.train.txt') as corpus_file:
    for raw_line in corpus_file:
        word_list.extend(
            token
            for token in clean_str(raw_line).split()
            if len(token) > 1 and token not in stop
        )

# np.unique returns the distinct tokens in sorted order; a word's position in
# that array is its index.
vocab = np.unique(word_list)
index_to_word = dict(enumerate(vocab))
word_to_index = {token: position for position, token in index_to_word.items()}
word_list_size = len(word_list)
vocab_size = len(vocab)

print("word_list_size", word_list_size)
print("vocab_size", vocab_size)

Set parameters...
Define methods and classes...
Prepare data...
word_list_size 503550
vocab_size 9462


In [2]:
# Rebuild the model with one row per vocabulary word and restore the trained
# weights from disk.
num_classes = vocab_size
glove = GloVe(num_classes, embed_size)
# map_location keeps every restored storage on the CPU. If the checkpoint was
# saved from a CUDA model, loading it without this fails on a machine that
# has no GPU; the model built above lives on the CPU either way.
# NOTE(review): torch.load unpickles the file -- only load checkpoints from a
# trusted source.
glove.load_state_dict(torch.load("./glove.model", map_location=lambda storage, loc: storage))
# Final word vectors (in_embed + out_embed) as a numpy array.
embedding_result = glove.embeddings()

In [3]:
# Sanity check: pick an arbitrary vocabulary entry (index 6000), show it, and
# list its five nearest neighbours by cosine similarity.
word = index_to_word[6000]
print(word)
print(most_similar(embedding_result,word,result_num=5))

parking
[('con', 0.35182893), ('elliott', 0.34038278), ('psychological', 0.33453405), ('blues', 0.32255015), ('beautiful', 0.32028121)]


In [5]:
# Five nearest neighbours of a hand-picked query word.
print(most_similar(embedding_result,"hardware",result_num=5))

[('mainframe', 0.27991056), ('compaq', 0.2648688), ('sunnyvale', 0.25959182), ('digital', 0.25702393), ('equipment', 0.24739763)]
