In [31]:
import torch
from random import *
from collections import Counter
import argparse
from huffman import HuffmanCoding
import time
import math
from tqdm import tqdm_notebook

In [33]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
# Run configuration.
#   mode : "SG" for skipgram, "CBOW" for CBOW
#   part : "part" to train on a part of the corpus, "full" to train on the full corpus
#   ns   : 0 for hierarchical softmax; any other value is the number of negative samples
mode, part, ns = "SG", "part", 20

## Loading text8

In [4]:
# Load the corpus from the file 'text8'; only its first line is used.
#   part == "part": keep the first 1,000,000 characters (for debugging)
#   part == "full": keep the whole line (for submission)
# Any other value raises ValueError instead of calling exit(), which would
# tear down the notebook kernel without a usable error message.
if part not in ("part", "full"):
    raise ValueError("Unknown argument : " + part)

# The context manager closes the file handle; readline() avoids building a
# list of every line just to take the first one.
with open('text8', mode='r') as corpus_file:
    text = corpus_file.readline()

if part == "part":
    text = text[:1000000]

## Preprocessing

In [5]:
# Subsample frequent words: an occurrence of `word` is dropped with
# probability p = 1 - sqrt(t / (f[word] / l)), where f[word] / l is the
# word's relative frequency in the raw token sequence.
word_seq = text.split()
t = 10e-5  # subsampling threshold (note: 10e-5 == 1e-4)
f = Counter(word_seq)  # raw occurrence count per word
l = len(word_seq)  # total number of tokens

corpus = []
for word in word_seq:
    p = 1 - math.sqrt(t / (f[word] / l))  # negative for rare words, so they are always kept
    if random() < p:
        continue
    corpus.append(word)

# Occurrence counts after subsampling.
stats = Counter(corpus)
words = []

## Discard rare words

In [6]:
# Keep only tokens whose post-subsampling count is greater than 4.
# `words` is rebuilt from scratch here (rather than appended to), so the cell
# can be re-run without duplicating every token.
words = [word for word in corpus if stats[word] > 4]
vocab = set(words)

## Give an index number to a word

In [7]:
# Word <-> index lookup tables. Index 0 is reserved for the padding token " ".
# The vocabulary is sorted before numbering: iterating the raw set would give
# a different word -> index assignment in every kernel session (string hashing
# is randomised per process), making results impossible to reproduce.
w2i = {" ": 0}
for index, word in enumerate(sorted(vocab), start=1):
    w2i[word] = index

# Inverse mapping: index -> word.
i2w = {index: word for word, index in w2i.items()}

## Code dict for hierarchical softmax

In [21]:
# Frequencies keyed by word index, used to build the Huffman tree.
# The padding index 0 is given a fixed pseudo-count of 10; every vocabulary
# word uses its post-subsampling count.
freqdict = {0: 10}
freqdict.update((w2i[word], stats[word]) for word in vocab)

# The training cells read these as codes[word_index] (a string of '0'/'1'
# characters) and nodes[code_prefix] (the index of an inner node).
codes, nodes = HuffmanCoding().build(freqdict)

## Frequency table for negative sampling

In [9]:
# Sampling table for negative sampling: each in-vocabulary word index appears
# int(count ** 0.75) times, so drawing uniformly from the table draws words in
# proportion to count^0.75. The table starts with three padding entries (0).
freqtable = [0, 0, 0]
for word, count in stats.items():
    # Words removed by the rare-word filter have no index; test once per word
    # instead of once per repetition.
    if word in w2i:
        freqtable.extend([w2i[word]] * int(count ** 0.75))

## Build training set

In [12]:
# Build the training pairs.
#   CBOW     : cinput_set[n] is the list of 2*window_size context indices,
#              ctarget_set[n] is the centre word index.
#   skipgram : sinput_set / starget_set are flat lists of (centre, context)
#              index pairs, 2*window_size pairs per position.
# Context slots that fall outside the corpus are filled with the padding
# index 0. Left context is listed nearest-first.
window_size = 5
cinput_set, ctarget_set = [], []
sinput_set, starget_set = [], []

n_tokens = len(words)
for j in range(n_tokens):
    if j < window_size:
        # Start of the corpus: pad the missing left context.
        left = [0] * (window_size - j) + [w2i[words[k]] for k in range(j)]
        right = [w2i[words[j + k + 1]] for k in range(window_size)]
    elif j >= n_tokens - window_size:
        # End of the corpus: remaining words (taken from the last one back
        # towards j), then padding.
        left = [w2i[words[j - k - 1]] for k in range(window_size)]
        right = [w2i[words[n_tokens - k - 1]] for k in range(n_tokens - j - 1)]
        right += [0] * (j + window_size - n_tokens + 1)
    else:
        left = [w2i[words[j - k - 1]] for k in range(window_size)]
        right = [w2i[words[j + k + 1]] for k in range(window_size)]

    context = left + right
    center = w2i[words[j]]

    cinput_set.append(context)
    ctarget_set.append(center)

    sinput_set += [center] * (window_size * 2)
    starget_set += context

In [13]:
# Report the number of entries in w2i (this count includes the padding entry).
print("Vocabulary size", len(w2i), "", sep="\n")

Vocabulary size
3971



In [28]:
def sigmoid(x):
    """Element-wise logistic function 1 / (1 + exp(-x)) of a tensor.

    Delegates to the built-in ``torch.sigmoid`` instead of a hand-written
    formula.

    Args:
        x: input tensor of any shape.

    Returns:
        Tensor of the same shape as ``x`` with values in [0, 1].
    """
    return torch.sigmoid(x)

def cosine(v1, v2, eps=1e-8):
    """Cosine similarity between two tensors of the same shape.

    Args:
        v1: first tensor.
        v2: second tensor (same shape as ``v1``).
        eps: lower bound applied to the product of the two norms, so that a
            zero vector gives similarity 0 instead of NaN (0 / 0).

    Returns:
        Scalar tensor: sum(v1 * v2) / (||v1|| * ||v2||).
    """
    dot = torch.sum(v1 * v2)
    norm_product = torch.sqrt(torch.sum(v1 * v1) * torch.sum(v2 * v2))
    return dot / torch.clamp(norm_product, min=eps)

In [23]:
# Hyper-parameters read by the training cells.
dimension = 64  # number of columns of each weight matrix
learning_rate = 0.025  # SGD step size
numwords = len(w2i)  # number of rows of each weight matrix

## CBOW HS

In [36]:
# CBOW with hierarchical softmax, trained by SGD with batch size 1.

# Random normal initialisation of both weight matrices, scaled by 1/sqrt(D).
W_in = torch.randn(numwords, dimension).to(device) / (dimension**0.5)
W_out = torch.randn(numwords, dimension).to(device) / (dimension**0.5)
losses=[]

times = []
start_time = time.time()

print("# of training samples")
print(len(cinput_set))
print()

for i, (contextWords, output) in tqdm_notebook(enumerate(zip(cinput_set,ctarget_set)), total=len(cinput_set)):
    # Huffman path of the target word: one inner node per code bit, looked up
    # by the code prefix that leads to it.
    centerCode = codes[output]
    activated = [nodes[centerCode[:j]] for j in range(len(centerCode))]

    # Gather the activated rows of W_out once, as a (K, D) matrix, instead of
    # re-indexing W_out for every node on the path.
    outputVectors = W_out[activated]
    K, D = outputVectors.size()

    # Hidden layer: sum of the context word vectors, as a (D, 1) column.
    inputVector = torch.sum(W_in[contextWords].t(), dim=1, keepdim=True)

    # Target t_j is 1 where the code bit is '0' and 0 otherwise.
    targets = torch.tensor([1.0 if bit == '0' else 0.0 for bit in centerCode], device=device).reshape(K, 1)
    scores = torch.mm(outputVectors, inputVector)      # (K, 1)
    grad = sigmoid(scores) - targets                   # dLoss/dscore for each node

    # Loss = -sum_j log sigmoid(+/- score_j). Summing log-sigmoids avoids the
    # underflow of multiplying K probabilities together before taking the log.
    L = -torch.sum(torch.nn.functional.logsigmoid((2 * targets - 1) * scores)).reshape(1)

    G_out = grad * inputVector.t()                     # (K, D)
    G_in = torch.mm(grad.t(), outputVectors)           # (1, D)

    W_in[contextWords] -= learning_rate*G_in
    W_out[activated] -= learning_rate*G_out

    losses.append(L.item())
    if i%50000==0:
        # Report the mean loss and wall time since the previous report.
        avg_loss=sum(losses)/len(losses)
        elapsed_time = time.time() - start_time
        print("Loss : %f, Time : %f sec" %(avg_loss, elapsed_time,))
        losses=[]
        start_time = time.time()
        times.append(elapsed_time)

# of training samples
60431



HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Loss : 10.174491, Time : 0.066962 sec
Loss : 7.963453, Time : 1159.172956 sec



## CBOW NS

In [None]:
# CBOW with negative sampling, trained by SGD with batch size 1.

# Random normal initialisation of both weight matrices, scaled by 1/sqrt(D).
W_in = torch.randn(numwords, dimension).to(device) / (dimension**0.5)
W_out = torch.randn(numwords, dimension).to(device) / (dimension**0.5)
losses=[]

times = []
start_time = time.time()

print("# of training samples")
print(len(cinput_set))
print()

for i, (contextWords, output) in tqdm_notebook(enumerate(zip(cinput_set,ctarget_set)), total=len(cinput_set)):
    # Row 0 is the true target; the remaining rows are `ns` negative samples
    # drawn from the index table `freqtable` (not from the word Counter,
    # which holds strings rather than row indices).
    activated = [output] + sample(freqtable, ns)
    activatedIdx = torch.tensor(activated, dtype=torch.long, device=device)

    outputVectors = W_out[activatedIdx]                # (K, D)
    K, D = outputVectors.size()

    # Hidden layer: sum of the context word vectors, as a (D, 1) column.
    inputVector = torch.sum(W_in[contextWords].t(), dim=1, keepdim=True)

    # Label 1 for the target row, 0 for every negative sample.
    labels = torch.zeros(K, 1, device=device)
    labels[0] = 1.0
    scores = torch.mm(outputVectors, inputVector)      # (K, 1)
    grad = sigmoid(scores) - labels                    # dLoss/dscore for each row

    # Loss = -log sigmoid(score_0) - sum_k log sigmoid(-score_k).
    L = -torch.sum(torch.nn.functional.logsigmoid((2 * labels - 1) * scores)).reshape(1)

    G_out = grad * inputVector.t()                     # (K, D)
    G_in = torch.mm(grad.t(), outputVectors)           # (1, D)

    W_in[contextWords] -= learning_rate*G_in
    # The same row can occur more than once in `activated`; index_add_
    # accumulates every contribution instead of keeping an arbitrary one.
    W_out.index_add_(0, activatedIdx, -learning_rate*G_out)

    losses.append(L.item())
    if i%50000==0:
        # Report the mean loss and wall time since the previous report.
        avg_loss=sum(losses)/len(losses)
        elapsed_time = time.time() - start_time
        print("Loss : %f, Time : %f sec" %(avg_loss, elapsed_time,))
        losses=[]
        start_time = time.time()
        times.append(elapsed_time)

## skipgram HS

In [None]:
# Skip-gram with hierarchical softmax, trained by SGD with batch size 1.

# Random normal initialisation of both weight matrices, scaled by 1/sqrt(D).
W_in = torch.randn(numwords, dimension).to(device) / (dimension**0.5)
W_out = torch.randn(numwords, dimension).to(device) / (dimension**0.5)
losses=[]

times = []
start_time = time.time()

print("# of training samples")
print(len(sinput_set))
print()

for i, (centerWord, output) in tqdm_notebook(enumerate(zip(sinput_set,starget_set)), total=len(sinput_set)):
    # Huffman path of the context word: one inner node per code bit, looked
    # up by the code prefix that leads to it.
    contextCode = codes[output]
    activated = [nodes[contextCode[:j]] for j in range(len(contextCode))]

    # Gather the activated rows of W_out once, as a (K, D) matrix.
    outputVectors = W_out[activated]
    K, D = outputVectors.size()

    # Hidden layer: the centre word's vector, as a (D, 1) column.
    inputVector = W_in[centerWord].reshape(D, 1)

    # Target t_j is 1 where the code bit is '0' and 0 otherwise.
    targets = torch.tensor([1.0 if bit == '0' else 0.0 for bit in contextCode], device=device).reshape(K, 1)
    scores = torch.mm(outputVectors, inputVector)      # (K, 1)
    grad = sigmoid(scores) - targets                   # dLoss/dscore for each node

    # Loss = -sum_j log sigmoid(+/- score_j), summed in log space.
    L = -torch.sum(torch.nn.functional.logsigmoid((2 * targets - 1) * scores)).reshape(1)

    G_out = grad * inputVector.t()                     # (K, D)
    G_in = torch.mm(grad.t(), outputVectors)           # (1, D)

    W_in[centerWord] -= learning_rate*G_in.squeeze()
    W_out[activated] -= learning_rate*G_out

    losses.append(L.item())
    if i%50000==0:
        # Report the mean loss and wall time since the previous report.
        avg_loss=sum(losses)/len(losses)
        elapsed_time = time.time() - start_time
        print("Loss : %f, Time : %f sec" %(avg_loss, elapsed_time,))
        losses=[]
        start_time = time.time()
        times.append(elapsed_time)

## skipgram NS

In [None]:
# Skip-gram with negative sampling, trained by SGD with batch size 1.

# Random normal initialisation of both weight matrices, scaled by 1/sqrt(D).
W_in = torch.randn(numwords, dimension).to(device) / (dimension**0.5)
W_out = torch.randn(numwords, dimension).to(device) / (dimension**0.5)
losses=[]

times = []
start_time = time.time()

print("# of training samples")
print(len(sinput_set))
print()

for i, (centerWord, output) in tqdm_notebook(enumerate(zip(sinput_set,starget_set)), total=len(sinput_set)):
    # Row 0 is the true context word; the remaining rows are `ns` negative
    # samples drawn from the index table `freqtable`.
    activated = [output] + sample(freqtable, ns)
    activatedIdx = torch.tensor(activated, dtype=torch.long, device=device)

    outputVectors = W_out[activatedIdx]                # (K, D)
    K, D = outputVectors.size()

    # Hidden layer: the centre word's vector, as a (D, 1) column.
    inputVector = W_in[centerWord].reshape(D, 1)

    # Label 1 for the true context row, 0 for every negative sample.
    labels = torch.zeros(K, 1, device=device)
    labels[0] = 1.0
    scores = torch.mm(outputVectors, inputVector)      # (K, 1)
    grad = sigmoid(scores) - labels                    # dLoss/dscore for each row

    # Loss = -log sigmoid(score_0) - sum_k log sigmoid(-score_k).
    L = -torch.sum(torch.nn.functional.logsigmoid((2 * labels - 1) * scores)).reshape(1)

    G_out = grad * inputVector.t()                     # (K, D)
    G_in = torch.mm(grad.t(), outputVectors)           # (1, D)

    W_in[centerWord] -= learning_rate*G_in.squeeze()
    # The same row can occur more than once in `activated`; index_add_
    # accumulates every contribution instead of keeping an arbitrary one.
    W_out.index_add_(0, activatedIdx, -learning_rate*G_out)

    losses.append(L.item())
    if i%50000==0:
        # Report the mean loss and wall time since the previous report.
        avg_loss=sum(losses)/len(losses)
        elapsed_time = time.time() - start_time
        print("Loss : %f, Time : %f sec" %(avg_loss, elapsed_time,))
        losses=[]
        start_time = time.time()
        times.append(elapsed_time)

## Analogical Reasoning Task

In [None]:
# Analogy evaluation on questions-words.txt.
# Lines starting with ':' open a new section; every other line is a question
# "x1 y1 x2 y2". A question counts as correct when y2 is among the 10 words
# whose vectors are most cosine-similar to y1 - x1 + x2. Questions containing
# a word that is not in w2i are counted in the total but never as correct.

# NOTE(review): `embedding` was not defined anywhere in the notebook; the
# input-side matrix W_in of the most recent training run is assumed here.
embedding = W_in

with open("questions-words.txt", 'r') as question_file:
    questions = question_file.readlines()

# Unit-normalise every row once so that the similarity to all words is a
# single matrix-vector product per question. The clamp keeps a zero row from
# producing NaN.
normalized = embedding / torch.clamp(embedding.norm(dim=1, keepdim=True), min=1e-8)
top_k = min(10, embedding.size()[0])

total = [0]
answer = [0]
cnt = -1
for q in questions:
    if not q.strip():
        # Blank line: neither a section header nor a question.
        continue

    if q[0] == ':':
        if cnt != -1:
            # Close the previous section; max(..., 1) guards an empty section.
            print("Result:", answer[cnt], '/', total[cnt])
            print("Accuracy:", round(answer[cnt] / max(total[cnt], 1) * 100, 3), '%')
            print()
            total.append(0)
            answer.append(0)

        print(q[2:])
        cnt += 1
        continue

    tokens = q.split()
    if len(tokens) != 4:
        # Malformed question line: skip it rather than crash on unpacking.
        continue
    if any(token not in w2i for token in tokens):
        total[cnt] += 1
        continue

    x1, y1, x2, y2 = tokens

    # x1 : y1 :: x2 : y2  =>  y2 is expected near y1 - x1 + x2.
    vector = embedding[w2i[y1]] - embedding[w2i[x1]] + embedding[w2i[x2]]

    # Dividing by ||vector|| would not change the ranking, so it is omitted.
    similarity = torch.mv(normalized, vector)
    closest = torch.topk(similarity, top_k).indices.tolist()

    if w2i[y2] in closest:
        answer[cnt] += 1
    total[cnt] += 1

print("Result:", answer[cnt], '/', total[cnt])
print("Accuracy:", round(answer[cnt] / max(total[cnt], 1) * 100, 3), '%')
print()
print("Total Result:", sum(answer), '/', sum(total))
print("Total Accuracy:", round(sum(answer) / max(sum(total), 1) * 100, 3), '%')