In [1]:
import torch
from random import *
from collections import Counter
import argparse
from huffman import HuffmanCoding
import time
import math

In [4]:
# --- Experiment configuration ---
# Training architecture: "SG" for skip-gram, "CBOW" for CBOW.
mode = "SG"
# Corpus size: "part" trains on a prefix of the corpus, "full" on the whole corpus.
part = "part"
# Output layer: 0 selects hierarchical softmax; any other number is the
# number of negative samples.
ns = 0

# Seed Python's global RNG (`seed` comes from `from random import *`) so that
# the random() draws made by this notebook are repeatable after
# Restart Kernel -> Run All.
# NOTE(review): if later cells draw from torch's RNG, seed that as well
# (torch.manual_seed(SEED)) -- confirm against the training code.
SEED = 0
seed(SEED)

## Loading text8

In [6]:
# Load the corpus text from the file 'text8'. Only the first line of the file
# is used. In "part" mode at most 1,000,000 characters of that line are read
# (for debugging); in "full" mode the entire line is read.
if part not in ("part", "full"):
    # Fail loudly instead of calling exit(), which ends the session without a traceback.
    raise ValueError("Unknown argument : " + part)

# readline(size) stops after `size` characters; a negative size means "no limit".
char_limit = 1000000 if part == "part" else -1
with open('text8', mode='r') as corpus_file:  # `with` guarantees the handle is closed
    text = corpus_file.readline(char_limit)

loading...


## Preprocessing

In [8]:
# Subsample frequent words. A token of word w is discarded with probability
#     p(w) = 1 - sqrt(t / freq(w)),
# where freq(w) is the word's relative frequency in the raw token sequence.
word_seq = text.split()
t = 10e-5              # subsampling threshold; note that 10e-5 == 1e-4
f = Counter(word_seq)  # raw occurrence count of every distinct word
l = len(word_seq)      # total number of tokens before subsampling

# p(w) depends only on the word type, so compute it once per distinct word
# instead of once per token. For words with freq(w) < t the value is negative,
# which means those words are always kept by the comparison below.
discard_prob = {w: 1 - math.sqrt(t / (count / l)) for w, count in f.items()}

# Exactly one random() draw per token, in corpus order; a token is kept when
# the draw is at least its discard probability.
corpus = [word for word in word_seq if discard_prob[word] <= random()]

stats = Counter(corpus)  # occurrence counts after subsampling
words = []               # filled by the rare-word filtering step that follows

## Discard rare words

In [11]:
# Keep only the tokens whose count in the subsampled corpus (`stats`) is
# greater than 4; rarer words are dropped entirely.
# The list is built in a single expression rather than appended to a list
# created in another cell, so re-running this cell cannot duplicate entries.
words = [word for word in corpus if stats[word] > 4]

# Distinct surviving words.
vocab = set(words)

## Assign an integer index to each word

In [12]:
# word -> index. Index 0 is reserved for the placeholder token " ".
w2i = {" ": 0}

# Enumerate the vocabulary in sorted order. Iteration order of a set of
# strings can change between interpreter runs (string hash randomization),
# which would otherwise give every word a different index after each kernel
# restart; sorting makes the mapping stable.
w2i.update((word, index) for index, word in enumerate(sorted(vocab), start=1))

# index -> word, the inverse of w2i.
i2w = {index: word for word, index in w2i.items()}

## Huffman code dictionary for hierarchical softmax

In [13]:
# Frequency of every word index, keyed by index. Index 0 (the placeholder
# token) is given a fixed frequency of 10; every vocabulary word uses its
# count from `stats`.
freqdict = {0: 10}
freqdict.update((w2i[token], stats[token]) for token in vocab)

# Build the Huffman coding from the frequency dictionary.
# NOTE(review): presumably `codedict` holds per-word codes and `nodecode` the
# inner-node information -- confirm against huffman.HuffmanCoding.build.
codedict, nodecode = HuffmanCoding().build(freqdict)

## Frequency table for negative sampling

In [14]:
# Table of word indices in which every in-vocabulary word appears
# int(count ** 0.75) times, preceded by three entries of index 0.
freqtable = [0, 0, 0]
for token, count in stats.items():
    # Words without an index (not in w2i) are skipped. The membership test
    # is done once per word rather than once per appended entry.
    if token in w2i:
        freqtable.extend([w2i[token]] * int(count ** 0.75))

## Build training set

In [16]:
def _context_ids(ids, j, window_size, pad=0):
    """Return the 2 * window_size context ids around position ``j``.

    The result is the left context followed by the right context. Positions
    that fall outside ``ids`` are filled with ``pad``.

    Ordering of the returned ids:
      * left, near the start (j < window_size): padding first, then
        ids[0..j-1] in forward order;
      * left, otherwise: ids[j-1], ids[j-2], ..., ids[j-window_size];
      * right, near the end (j >= len(ids) - window_size): ids[len-1] down to
        ids[j+1], then padding;
      * right, otherwise: ids[j+1], ..., ids[j+window_size].

    Left and right are computed independently, so sequences shorter than
    2 * window_size are padded on both sides instead of raising IndexError.
    """
    n = len(ids)
    if j < window_size:
        left = [pad] * (window_size - j) + ids[:j]
    else:
        left = ids[j - window_size:j][::-1]
    if j >= n - window_size:
        right = ids[j + 1:][::-1] + [pad] * (j + window_size - n + 1)
    else:
        right = ids[j + 1:j + window_size + 1]
    return left + right


input_set = []
target_set = []
window_size = 5

# An unrecognised mode would otherwise leave both sets silently empty.
if mode not in ("CBOW", "SG"):
    raise ValueError("Unknown mode : " + str(mode))

# Translate the token sequence to indices once instead of on every lookup.
word_ids = [w2i[word] for word in words]

for j, center in enumerate(word_ids):
    context = _context_ids(word_ids, j, window_size)
    if mode == "CBOW":
        # One example per position: the whole context window is the input and
        # the center word is the target.
        input_set.append(context)
        target_set.append(center)
    else:
        # Skip-gram: 2 * window_size examples per position, each pairing the
        # center word (input) with one context id (target).
        input_set += [center] * (window_size * 2)
        target_set += context

In [17]:
# Report the number of entries in w2i (this count includes the index-0
# placeholder token), followed by a blank line.
print("Vocabulary size", len(w2i), "", sep="\n")

Vocabulary size
3971

