In [None]:
from reader_utils.corpus_builder_utils import *
from note_utils.pitch_dictionary import PitchDictionary
from reader_utils.dataset_builder import DatasetBuilder
from model_utils.model_utils import *
import io

In [None]:
# Tunable hyperparameters for the note-to-vec run.
window_size = 4 # 2, 4
num_ns = 10 # 2, 5, 10
embedding_layer_name = "n2v_embedding"

# NOTE: `pd` is the project's PitchDictionary here, not the usual pandas alias.
pd = PitchDictionary("dataset_objects/pitches_dict.txt")
vocab_size = pd.get_vocabulary_size()

In [None]:
# Construct the dataset builder from the path below
# (presumably the serialized full corpus -- TODO confirm against DatasetBuilder).
db = DatasetBuilder("dataset_objects/full_corpus")

In [None]:
# Build the training dataset for the note-to-vec model.
# Observed runtimes of this cell:
#   window_size = 2, num_ns = 2  -> around 24 mins
#   window_size = 2, num_ns = 10 -> around 25 mins
#   window_size = 4, num_ns = 10 -> around 52 mins
dataset = db.build_word_to_vec_dataset(
    vocab_size,
    window_size,
    num_ns,
    skip_amount=1,
    drop_limit=500,
)

In [None]:
# Embedding dimensionality handed to the model builder.
embedding_dim = 16

# Create the note-to-vec model; the embedding layer is registered under
# `embedding_layer_name` so its weights can be looked up by name after training.
n2v = build_note_to_vec_model(
    vocab_size,
    embedding_dim,
    num_ns,
    embedding_layer_name,
)

In [None]:
# Train the note-to-vec model on the prepared dataset.
# Observed runtime: around 12 mins for embedding_dim = 16, window_size = 2,
# num_ns = 10, epochs = 20
epochs = 20  # also encoded into the exported file names further down
n2v.fit(dataset, epochs=epochs)

In [None]:
# Vocabulary entries from the pitch dictionary; iterated alongside `weights`
# by position when the embeddings are exported.
vocab = pd.get_vocabulary()
# First weight array of the named embedding layer; rows are indexed by
# vocabulary position (presumably shape (vocab_size, embedding_dim) -- TODO confirm).
weights = n2v.get_layer(embedding_layer_name).get_weights()[0]

In [None]:
# Export the trained embeddings as two parallel TSV files:
#   vectors_<suffix>.tsv  -- one line per token, tab-separated vector components
#   metadata_<suffix>.tsv -- one line per token, the token label itself
# The suffix records the hyperparameters of this run so that exports from
# different configurations do not overwrite each other.
file_suffix = "ws{}_ns{}_ed{}_ep{}".format(
        window_size, num_ns, embedding_dim, epochs)
vectors_file_name = "vectors_" + file_suffix + ".tsv"
# Legacy misspelled name, kept in case a later cell still refers to it.
vactors_file_name = vectors_file_name
metadata_file_name = "metadata_" + file_suffix + ".tsv"

# Context managers guarantee both files are flushed and closed even if the
# loop raises part-way through (e.g. an index error or a non-string token).
with io.open(vectors_file_name, 'w', encoding='utf-8') as vectors_file, \
        io.open(metadata_file_name, 'w', encoding='utf-8') as metadata_file:
    for index, word in enumerate(vocab):
        if index in (0, 1): # <unk> and es
            continue
        vec = weights[index]
        vectors_file.write('\t'.join(str(x) for x in vec) + "\n")
        metadata_file.write(word + "\n")