# WORD EMBEDDINGS

# Pre processing

In the pre-processing step, the goal is to split the dataset into training, validation and testing sets. To achieve a better split, we first find the average number of words per description. Based on that, we compute how many descriptions are necessary for each set using the rule of thumb of an 80%-20% split.

After the split, we create the vocabulary from the training set only, remove rare words
and define the model's parameters.

In [24]:
# import libraries

import os 
import json 
import nltk
import random
import string
import collections
import numpy as np
from random import seed
from random import randint
from datetime import datetime
from nltk.corpus import stopwords

In [25]:
# this function is only for debugging purposes.
# it searches for a string in the descriptions and prints
# every description that contains it
def search_string(descriptions,word_to_search):
    """Print every description that contains ``word_to_search``.

    Debugging helper: for each match it prints the 1-based position of the
    description in ``descriptions`` followed by the description itself.
    """
    for issue_number, desc in enumerate(descriptions, start=1):
        if word_to_search not in desc:
            continue
        print("found the word on issue", issue_number)
        print(desc)

In [26]:
# the pre_processing function loads all the descriptions into a list. It splits every description into
# words, then cleans the data by removing stop words and punctuation and lower-casing all letters.

def pre_processing(dir_path,descriptions,all_stopwords):
    """Load and clean every issue description found in ``dir_path``.

    Every file in ``dir_path`` is parsed as JSON and is expected to hold a
    list of objects with a ``'name'`` and a ``'description'`` key.  Each
    description is cleaned with ``clean_data`` and appended to
    ``descriptions`` (the list is modified in place).

    Parameters:
        dir_path      -- directory that contains the JSON files
        descriptions  -- list that receives one list of words per issue
        all_stopwords -- collection of stop words forwarded to ``clean_data``

    Returns:
        ``(len(descriptions), mean_words)`` where ``mean_words`` is the mean
        number of words of the descriptions loaded by this call, or ``0.0``
        when no description was loaded.
    """
    total_words    = 0
    counter_issues = 0

    for counter,fname in enumerate(os.listdir(dir_path),start=1):
        # JSON is UTF-8 by specification, so do not rely on the locale default
        with open(os.path.join(dir_path,fname),encoding="utf-8") as json_file:
            print(counter,") reading file",fname)

            #load data in json format
            data = json.load(json_file)

        for issue in data:
            issue_name      = issue['name']
            counter_issues += 1
            print("  ",counter_issues,")",issue_name)

            clean_desc   = clean_data(issue['description'],all_stopwords)
            total_words += len(clean_desc)
            descriptions.append(clean_desc)

    # average over the descriptions read here: this avoids a division by zero
    # for an empty directory and stays correct if the list was not empty on entry
    mean_words = total_words/counter_issues if counter_issues else 0.0

    return len(descriptions),mean_words

In [27]:
def clean_data(description,all_stopwords):
    """Turn a description into a list of cleaned, lower-cased words.

    The items of ``description`` are joined with single spaces, every
    punctuation character is replaced by a space and the text is split on
    whitespace.  Words of length one and words whose lower-cased form is in
    ``all_stopwords`` are dropped.
    """
    # map every punctuation character to a blank
    punctuation_to_space = str.maketrans(dict.fromkeys(string.punctuation, ' '))

    # one sentence without punctuation
    text = ' '.join(description).translate(punctuation_to_space)

    cleaned = []
    for token in text.split():
        lowered = token.lower()
        if len(token) > 1 and lowered not in all_stopwords:
            cleaned.append(lowered)

    return cleaned

In [28]:
# this function splits the dataset into training validation and testing dataset.
# It randomly selects test_size descriptions for testing 
# and valid_size descriptions for validation. 
# Also keeps the initial indexes for debugging purposes.

def split_dataset(descriptions,train_set,valid_set,valid_size,
                  test_set,test_size,words_per_desc):
    """Randomly split ``descriptions`` into validation, test and training sets.

    Only descriptions with at least ``2*words_per_desc`` words may be chosen
    for validation or testing.  ``valid_size`` of them go to ``valid_set``,
    ``test_size`` different ones go to ``test_set`` and every remaining
    description is appended to ``train_set`` in its original order.  The three
    output lists are extended in place; ``descriptions`` itself is not modified.

    Raises:
        ValueError -- when fewer than ``valid_size + test_size`` descriptions
                      are long enough to be selected.
    """
    # seed() without an argument reseeds from the operating system / clock.
    # Passing a datetime object is rejected by random.seed on Python 3.11+.
    seed()

    # negative sizes select nothing, exactly like range() did before
    valid_size = max(valid_size,0)
    test_size  = max(test_size,0)

    # indexes of the descriptions that are long enough to be held out
    min_length = 2*words_per_desc
    eligible   = [i for i,desc in enumerate(descriptions) if len(desc) >= min_length]

    needed = valid_size + test_size
    if needed > len(eligible):
        raise ValueError("need %d descriptions with at least %d words but only %d are available"
                         % (needed,min_length,len(eligible)))

    # one draw without replacement keeps validation and test disjoint
    chosen      = random.sample(eligible,needed)
    valid_index = chosen[:valid_size]
    test_index  = chosen[valid_size:]

    for i in valid_index:
        valid_set.append(descriptions[i])

    for i in test_index:
        test_set.append(descriptions[i])

    # a set gives O(1) membership tests while collecting the training data
    held_out = set(chosen)
    for i,desc in enumerate(descriptions):
        if i not in held_out:
            train_set.append(desc)
    
    #debugging prints
    #print("the issues used for validation are")
    #for i in valid_index:
    #    print(i)
    #print("#############")
    #print("the issues used for testing are")
    #for i in test_index:
    #    print(i)

In [29]:
# same function as before but it's more efficient;
# it doesn't keep the indexes of the descriptions used for validation and testing.
# for debugging purposes the first one is used

def split_dataset2(descriptions,train_set,valid_set,valid_size,
                   test_set,test_size,words_per_desc):
    """Randomly split ``descriptions`` into validation, test and training sets.

    Only descriptions with at least ``2*words_per_desc`` words may be chosen
    for validation or testing.  The chosen descriptions are removed from
    ``descriptions`` (the caller's list is modified in place) and appended to
    ``valid_set`` / ``test_set``; whatever remains is appended to
    ``train_set``.

    Raises:
        ValueError -- when fewer than ``valid_size + test_size`` descriptions
                      are long enough to be selected.
    """
    # seed() without an argument reseeds from the operating system / clock.
    # Passing a datetime object is rejected by random.seed on Python 3.11+.
    seed()

    # negative sizes select nothing, exactly like range() did before
    valid_size = max(valid_size,0)
    test_size  = max(test_size,0)

    # draw only among descriptions that are long enough, so no draw is wasted
    # and both sets always reach the requested size
    min_length = 2*words_per_desc
    eligible   = [i for i,desc in enumerate(descriptions) if len(desc) >= min_length]

    needed = valid_size + test_size
    if needed > len(eligible):
        raise ValueError("need %d descriptions with at least %d words but only %d are available"
                         % (needed,min_length,len(eligible)))

    chosen = random.sample(eligible,needed)
    valid_set.extend(descriptions[i] for i in chosen[:valid_size])
    test_set.extend(descriptions[i] for i in chosen[valid_size:])

    # remove the held-out descriptions from the caller's list in a single pass
    held_out = set(chosen)
    descriptions[:] = [desc for i,desc in enumerate(descriptions) if i not in held_out]

    # fill the caller's list: rebinding the local name would leave it empty
    train_set.extend(descriptions)

In [30]:
# define necessary parameters
# NOTE(review): dir_path is an absolute, machine-specific path; point it to the
# local directory that holds the JSON issue files before running this cell.
dir_path        = '/home/kostas/Documents/thesis/data_1'
descriptions    = []

# the first time, the command below should be run to download the stopwords
#nltk.download('stopwords')
all_stopwords = set(stopwords.words('english'))

# the pre-processing stage starts: it fills `descriptions` and returns the number
# of issues together with the mean number of words per cleaned description
num_issues,mean_words    = pre_processing(dir_path,descriptions,all_stopwords) 

# for validation and testing we randomly choose words from every description;
# the number of words per description is one tenth of the mean description length
words_per_desc = int(mean_words // 10)

# the validation set should provide 100 words in total, so it needs
# valid_words // words_per_desc descriptions
# (this division fails if words_per_desc is 0, i.e. if mean_words is below 10)
valid_words = 100
valid_size  = int(valid_words // words_per_desc)

# the testing set should provide 200 words in total
test_words = 200
test_size  = int(test_words // words_per_desc)

# split dataset: the three lists below are filled in place by split_dataset

train_set = []
valid_set = []
test_set  = []

# search for a string in the descriptions, only for debugging purposes.
#search_string(descriptions,"cachedetailactivity")

split_dataset(descriptions,train_set,valid_set,valid_size,test_set,test_size,words_per_desc)


1 ) reading file data_word_emb72.json
   1 ) Samsung multi window no longer working
   2 ) Modified coords not shown
   3 ) feature request: option to allow caches in only 1 list
   4 ) Unable to run gradle on master branch
   5 ) blurred text, new behaviour - or HW accel setting changed/reset by update
   6 ) Czech opencaching?
   7 ) versioneye in pull requests
   8 ) NPE after editing personal note
   9 ) Support ignoring and watching caches
   10 ) Install Java 8 on "master" slave
   11 ) Not possible to search for GeoKrety code by scanning QR code from GK label
   12 ) Mapsforge Offline germany.map missing some tiles for zoomlevel 5-7
   13 ) Live caches vanish when disabling updates
   14 ) Need graphics for developer page
   15 ) Next bugfix release
   16 ) auto accept android licenses
   17 ) Execution failed for task ':main:verifyCgeoKeys'. > You must provide keys in main/res/values/keys.xml for cgeo to compile successfully. You can copy from main/templates/keys.xml and just a

   550 ) Cache type filtering fails on live map
23 ) reading file data_word_emb144.json
   551 ) OSM slow on Android 4.4
   552 ) Support Trackables With the QR Scanner Included in c:geo
   553 ) Waypoint description gets mixed up 
   554 ) WP from personal note doubled
   555 ) Waypoint description gets mixed up 
   556 ) Caches on map
   557 ) Default TB action only for founds/attended logs
   558 ) Bug with offline maps
   559 ) CZ Translation
   560 ) Importing .loc file has wrong progress dialog
   561 ) Use different visualisation for past events
   562 ) Auto extract coordinates from text
   563 ) Nearby search slow
   564 ) Close pop-up on deletion
   565 ) Limit static map loading to 50 requests per minute
   566 ) Upload personal note on refresh?
   567 ) Radar opened twice from waypoint tab
   568 ) cgeo crashes wit out of memory error
   569 ) GPX export failed
   570 ) Cache rating, popularity, TB info not exported to GPX file
   571 ) Sort by favorite percentage should be

53 ) reading file data_word_emb174.json
   1294 ) Light theme broken
   1295 ) Wrong "light theme" French translation
   1296 ) Support of Multi Window / Split Screen
   1297 ) Owner information not shown for OC.de caches
   1298 ) Don't store/replace static maps if requests are over quota
   1299 ) Live map doesn't populate with all caches
   1300 ) Map doesn't react on changes of caches
   1301 ) Feedback needed: "Soft save for offline"
   1302 ) Missing settings option on c:geo homepage
   1303 ) Use waypoint prefix (or lookup code) instead of id
   1304 ) Waypoints not loaded from website
   1305 ) change search by address
   1306 ) A geocache can only exist in one list ? 
   1307 ) Reproducible OutOfMemoryError when trying to download caches from map (Zoom level 1km) on ZTE Blade
   1308 ) Provide settings backup as part of database backup
   1309 ) Download from map is dismissed by rotate
   1310 ) Nearby search repeated on device rotation
   1311 ) Crash when resuming c:geo
   1

85 ) reading file data_word_emb34.json
   2094 ) support tags for caches
   2095 ) Logs not loading for basic members
   2096 ) Dont have proximity notification active by default
   2097 ) Crash on Theme options change
   2098 ) Mapsforge - Zoom level inconsistent between online and offline
   2099 ) Move to AndroidX
   2100 ) pocket query import results in wrong number of favorites
   2101 ) Slightly "dancing" GPS location triggers syncLayers
   2102 )                  Bug 
   2103 ) Setting backup location not shown and not findable
   2104 ) Program setting backup menu items not updated after execution
   2105 ) Can't log in to geocaching.com when using VPN
   2106 ) API to extremcaching.com not working
   2107 ) CI fails constantly for Geocoder test (GRPC failed)
   2108 ) Should favorite point display use "x" or not
   2109 ) Sliders in settings might are hard to set to distinct value
   2110 ) Temporary disable proximity tone or make it follow device settings
   2111 ) Update the

115 ) reading file data_word_emb213.json
   2844 ) Visual login and count on homescreen
   2845 ) Use plurals
   2846 ) Filter for attributes
   2847 ) empty log image title
   2848 ) send to c:geo missing in search nearest
   2849 ) download offline maps via user interface
   2850 ) Offline Map
   2851 ) Offline found count
   2852 ) mention standard GPX directory in progress dialog
   2853 ) replace term standard navigation
   2854 ) List doesn't refresh for cache dropped from detail view
   2855 ) Lists revert back to distance sort after viewing a cache
   2856 ) c:geo doesn't handle unknown trackables
   2857 ) Show day of week for events
   2858 ) Import waypoint GPX also for renamed downloaded files
   2859 ) Replace note-marker by more meaningful icon
   2860 ) Stored OC-caches marked as unreliable on live-map
   2861 ) Class not found - Market error new in 4.1.12
   2862 ) Sorting problem - market error
   2863 ) mark non GC caches as reliable
   2864 ) Holo theme
   2865 ) Imp

   3604 ) Disable projection field in waypoint edit for WPs with coords
   3605 ) Recommendation of Cyanogenmod in FAQ
   3606 ) Allow c:geo to be installed on more devices using play store
   3607 ) Timezone question
   3608 ) calendar time recognition requires space
   3609 ) The Log Date Format in System Information does not always provide the intended information
   3610 ) Route disappears
   3611 ) Route rendering
   3612 ) Mapsforge Beta - render performance
   3613 ) Streetview has stopped working....
   3614 ) ci.cgeo.org is out of service
   3615 ) Empty Logbook 
   3616 ) Crash while posting text-less log
   3617 ) Wrong URL used for smilies in logs
   3618 ) Crash when decoding JPEG image
146 ) reading file data_word_emb35.json
   3619 ) Support new GC cache types
   3620 ) Charter member unable to see PM-only caches on map
   3621 ) Proximity notification follow up
   3622 ) Icon not updated on change of waypoint visited state
   3623 ) Resort menu items after tts has been 

   4360 ) Empty log should be caught in log activity
   4361 ) Field notes export window does not remember my preference
   4362 ) Remove Field Note Progress
   4363 ) Coords in anywhere menu lost
   4364 ) samsung s3 using talking compass with google tts only prompts for language download and does not speak
   4365 ) Whitespace is inserted for signature placeholders
   4366 ) Application crash when rotating the phone while posting log
   4367 ) Nighly Build failing due to Null analysis by Eclipse JDT 
   4368 ) NPE when selected image cannot be decoded
176 ) reading file data_word_emb198.json
   4369 ) An option to Set trackables to discover
   4370 ) Wrong progress information when uploading field notes
   4371 ) In NB the new Button "favorite"is not visible
   4372 ) Wrong behaviour on "drop all and remove list"
   4373 ) "delete all and list" should not be enabled when filter is active
   4374 ) Exception deleting list content
   4375 ) EditWaypointActivity - finalDefined not alway

209 ) reading file data_word_emb203.json
   5194 ) IRC channel availability
   5195 ) HTTP Multipart POST implementation required
   5196 ) Cachelist context menu action on wrong cache
   5197 ) Map from stored list not showing all in list
   5198 ) Generic Export and Implementations
   5199 ) Live map initialization
   5200 ) Live map defaults to (same) location in Germany
   5201 ) 02.04.2012 NullPointer
   5202 ) Existing stored cached have orange circle in live map
   5203 ) Default Setting for StaticMaps should be OFF
   5204 ) statics maps not stored with caches
   5205 ) Nearby search for PM shown directional images
   5206 ) Black screen /Solid screen  instead of wallpaper in main-menu
   5207 ) Long descriptions not displayed?
   5208 ) Update download link on cgeo.org
   5209 ) Time schedule 
   5210 ) Also request more details in popup for known cache types
   5211 ) Support NFC tag-based logging
   5212 ) Custom date formats
   5213 ) Live map mixes up Traditional and Myste

240 ) reading file data_word_emb122.json
   5969 ) Error when no log present for an Okapi cache
   5970 ) Cannot erase personal note from server
   5971 ) Upload of personal note not working
   5972 ) NPE when refreshing OC.de cache
   5973 ) Failed logging on oc reports incorrect and misleading error
   5974 ) No caches returned from OC.de
   5975 ) New UI for talking compass needed
   5976 ) Login error message while still logging in
   5977 ) verify gradle debug with ProGuard enabled
   5978 ) Play service geo provider blocking on emulator
   5979 ) offline maps on sd card not found
   5980 ) Add option to edit log
   5981 ) Empty space on logbook
   5982 ) Notify when saved caches are nearby
   5983 ) "Partner mode" show caches a member of a group didn't found
   5984 ) 2014.08.07-NB-f3d487e crashes on app start
   5985 ) Error 400 and NPE to all OC-sites
   5986 ) Doubled "next feature release" text
   5987 ) CI won't build cgeo
   5988 ) HTML tags shown in logs on OC.pl
   5989 )

In [31]:
# print sets for debugging
# print(train_set)
# print(valid_set)
# print(test_set)

In [32]:
# summary of the pre-processing stage: one "label value" line per figure
summary = (
    ("total issues", num_issues),
    ("average number of words per description", mean_words),
    ("size of validation set", valid_size),
    ("size of test set", test_size),
)
for label, value in summary:
    print(label, value)
#print(descriptions)

total issues 5993
average number of words per description 41.79726347405306
size of validation set 25
size of test set 50


## Compute Word Embeddings

After the pre-processing step, it is time to create the vocabulary and the skip-gram pairs and to train the model. Every model parameter is given as a list of candidate values, so we compute embeddings for every combination. Each combination is evaluated on the validation set and we keep the model with the best results.

In [33]:
# candidate values for the grid search: the training loop further down runs
# model_def once for every combination of the five lists
embedding_dim_list = [25,50,100,150,200]   # number of columns of the embedding matrix
learning_rate_list = [0.01,0.1,1]          # learning rate of the gradient descent optimizer
skip_window_list   = [2,4,6,8]             # context words considered on each side of the target
num_sampled_list   = [32,64,100,128]       # num_sampled argument of the NCE loss
num_epochs_list    = [10,20,50,100]        # passes over the training set

# fixed settings shared by every run
unk_word      = "UNK"   # vocabulary entry for unknown words (stored with id -2)
batch_size    = 100     # (target, context) pairs per training batch
min_occurance = 5       # words that occur fewer times than this are dropped from the vocabulary
num_skips     = 2       # context words sampled for every target word

In [34]:
def save_vocabulary(word_dict):
    """Write the vocabulary to ``vocabulary.txt`` in the working directory.

    The file is overwritten and receives one ``word, id`` line per entry of
    ``word_dict``, in the iteration order of the dictionary.
    """
    # the context manager closes the file even if a write fails
    with open("vocabulary.txt","w") as vocab_file:
        for word,word_id in word_dict.items():
            vocab_file.write("%s, %s \n"%(word,str(word_id)))

In [35]:
#create vocabulary 
#remove rare words from the vocabulary, i.e. words that occur fewer times than min_occurance

#word2id     :dictionary that maps every vocabulary word to its int id
#              (unknown words are represented by the id -2)

# count every word of the training set; most_common() returns the
# (word, frequency) pairs sorted from the most to the least frequent word
count = collections.Counter(
    word for desc in train_set for word in desc
).most_common()

# because the pairs are sorted, the rare words are exactly the tail of the list
count = [(word, freq) for word, freq in count if freq >= min_occurance]

#compute the vocabulary size
vocabulary_size = len(count)

#assign an id to each word: the unknown word first, then the ids 0..vocabulary_size-1
word2id = {unk_word: -2}
word2id.update((word, idx) for idx, (word, _) in enumerate(count))

#list count is not needed any more
count = []

#express train, valid and test set using ids

#keep a training description only if it has more than num_skips known words
train_set_id = []
for desc in train_set:
    encoded = [word2id.get(word, -2) for word in desc]
    known_words = sum(1 for idx in encoded if idx != -2)
    if known_words > num_skips:
        train_set_id.append(encoded)

#list train_set is not needed any more
train_set = []

valid_set_id = [[word2id.get(word, -2) for word in desc] for desc in valid_set]

#list valid_set is not needed any more
valid_set = []

test_set_id = [[word2id.get(word, -2) for word in desc] for desc in test_set]

#list test_set is not needed any more
test_set = []

#save the vocabulary in a file
save_vocabulary(word2id)

In [36]:
#some prints for debugging purposes
#print(len(temp_sentences))

#print(word2id)

#for i in range(len(count)-1,-1,-1):
#    print(count[i])

#for i in range(len(count)-1,-1,-1):
#    if count[i][0] == '20cachetag':
#        print(count[i])

#total_words = 0
#for i in train_set_id:
#    total_words += len(i)
#print(total_words)

In [37]:
# for every description in descriptions_list, this function randomly chooses words_per_desc
# target words and creates num_skips skip-grams inside the skip_window for each of them.
# The only restriction is that the chosen word must be in the dictionary

def testing_skip_grams(list_grams,skip_window,num_skips,
                       descriptions_list,words_per_desc):
    """Create evaluation skip-grams for every description.

    For each description (a list of word ids in which -2 marks a word that is
    not in the dictionary) up to ``words_per_desc`` distinct known target
    words are picked at random positions.  For every target the context window
    is collected with ``find_context_words`` and ``num_skips`` context words
    are sampled from it.  One list ``[target, context_1, ..., context_k]`` per
    target is appended to ``list_grams``.

    A description with fewer than ``words_per_desc`` distinct known words
    contributes as many targets as it has.  A target with fewer than
    ``num_skips`` context words uses all of them, and a target without any
    context word is skipped.
    """
    #an important constraint
    assert num_skips<=skip_window

    span = 2*skip_window+1

    # seed() without an argument reseeds from the operating system / clock.
    # Passing a datetime object is rejected by random.seed on Python 3.11+.
    seed()

    for desc in descriptions_list:

        # visit the positions in random order: the next acceptable position is
        # then uniformly distributed, as with repeated random draws, but the
        # loop always terminates, even when too few known words exist
        positions = list(range(len(desc)))
        random.shuffle(positions)

        target_words = set()
        for position in positions:
            if len(target_words) >= words_per_desc:
                break

            word = desc[position]
            if word == -2 or word in target_words:
                continue
            target_words.add(word)

            # window[0] is the target word, the rest are its context words
            window = []
            find_context_words(desc,position,skip_window,span,window)

            context_positions = range(1,len(window))
            if not context_positions:
                # no known neighbour: nothing to evaluate for this target
                continue

            #take (at most) num_skips random samples
            words_to_use = random.sample(context_positions,
                                         min(num_skips,len(context_positions)))

            list_grams.append([window[0]] + [window[p] for p in words_to_use])

In [38]:
def find_context_words(description,word_index,skip_window,span,grams_list):
    """Collect the target word and its context words into ``grams_list``.

    The word at ``word_index`` is appended first.  Then the description is
    scanned to the left of the target and afterwards to the right of it; every
    id different from -2 is appended, ids equal to -2 are skipped without
    being counted.  ``taken`` counts the appended words (target included):
    the left scan runs while ``taken <= skip_window`` and the whole scan stops
    once ``taken`` reaches ``span`` or the description ends.
    """
    # the target word goes in the first place
    grams_list.append(description[word_index])

    taken = 1
    pos   = word_index - 1

    while taken < span:
        going_left = taken <= skip_window

        if going_left:
            if pos < 0:
                # nothing left on this side: continue on the right-hand side
                pos   = word_index + 1
                taken = skip_window + 1
                continue
        elif pos >= len(description):
            # ran past the end of the description
            break

        token = description[pos]
        if token == -2:
            # word is not in the dictionary: skip it without counting it
            pos += -1 if going_left else 1
            continue

        grams_list.append(token)
        taken += 1

        if going_left:
            # once the left side is complete, jump to the right of the target
            pos = word_index + 1 if taken > skip_window else pos - 1
        else:
            pos += 1

In [39]:
def save_grams_to_file(test_grams):
    """Write the skip-grams to ``test_set.txt`` in the working directory.

    The file is overwritten.  Every item of ``test_grams`` becomes one line in
    which each element is followed by a comma and a space.
    """
    lines = ("".join("%s, " % word_id for word_id in gram) + "\n"
             for gram in test_grams)
    with open('test_set.txt','w') as out_file:
        out_file.writelines(lines)

In [40]:
#create validation and test pairs
valid_grams = []
test_grams  = []

#create 2 pairs for every word in a window of size 2
#(skip_window = 2 and num_skips = 2 are passed as literals here).
#create skip grams for the test set
testing_skip_grams(test_grams, 2, 2, test_set_id, words_per_desc)

#save the test grams in a file in order not to overload memory.
save_grams_to_file(test_grams)

#the test grams are not needed in memory any more
test_grams = []

#create skip grams for the validation set; they stay in memory because
#model_def evaluates every trained model on them.
testing_skip_grams(valid_grams,2,2,valid_set_id,words_per_desc)

## Model Definition and Word Embedding Computation

In this section, all functions are associated with the definition of the model, the training process and, last but not least, the evaluation of the efficiency of the model.

In [41]:
import math
import time
import numpy as np
import tensorflow.compat.v1 as tf
tf.compat.v1.disable_eager_execution()

In [42]:
def generate_batch(batch_size,num_skips,skip_window,train_set_id,word_pointer,desc_pointer,epoch):
    """Build one training batch of (target, context) id pairs.

    Starting at word ``word_pointer`` of description ``desc_pointer``, the
    function walks through ``train_set_id`` and, for each of
    ``batch_size // num_skips`` target words, draws ``num_skips`` context
    words at random from the window collected by ``find_context_words``.

    Returns the tuple ``(batch, labels, epoch, word_pointer, desc_pointer)``:
    ``batch`` has shape ``(batch_size,)`` and holds the target ids,
    ``labels`` has shape ``(batch_size, 1)`` and holds the matching context
    ids, and the last three values are the updated cursor state to pass to the
    next call.  ``epoch`` is incremented every time the walk wraps around from
    the last description to the first one.
    """
    
    assert batch_size % num_skips == 0
    assert num_skips <= skip_window
    
    # the batch stores target words
    batch = np.ndarray(shape = (batch_size),dtype = np.int32)
    # labels are the context words=>(skip-grams)
    labels = np.ndarray(shape = (batch_size,1), dtype = np.int32)
    
    # the window holds the target plus skip_window words on each side;
    # the deque is reused (and cleared) for every target word
    span = 2*skip_window+1
    buffer = collections.deque(maxlen = span)
    
    for i in range(batch_size // num_skips):
         
        # skip negative ids (words outside the dictionary are stored as -2);
        # the cursor moves to the next description, or back to the first one
        # (which starts a new epoch), when the end of a description is reached
        while train_set_id[desc_pointer][word_pointer] <0:
            word_pointer += 1
            if word_pointer > len(train_set_id[desc_pointer])-1:
                word_pointer  = 0
                desc_pointer += 1
                if desc_pointer > len(train_set_id)-1:
                    desc_pointer =0
                    epoch += 1
                
        # buffer[0] becomes the target word, buffer[1:] its context words
        find_context_words(train_set_id[desc_pointer],word_pointer,skip_window,span,buffer)
        
        #take num_skips random samples (indexes into the buffer, the target at index 0 excluded)
        # NOTE(review): random.sample raises ValueError if the buffer holds fewer
        # than num_skips context words - confirm the training set rules this out
        context_words = [w for w in range(1,len(buffer))]
        words_to_use  = random.sample(context_words,num_skips)
        
        # print("description",desc_pointer, "target word",word_pointer,"words_to_use:",words_to_use,"buffer:",buffer)
        
        # update batch and labels: the same target is paired with each sampled context word
        for j,random_word in enumerate(words_to_use):
            batch[i*num_skips+j]    = buffer[0]
            labels[i*num_skips+j,0] = buffer[random_word]
            
        buffer.clear()
        
        # advance the cursor to the next word; after the last word of a
        # description continue with the next description (wrapping = new epoch)
        if word_pointer == len(train_set_id[desc_pointer])-1:
            
            word_pointer  = 0
            desc_pointer += 1
            if desc_pointer > len(train_set_id)-1:
                desc_pointer = 0
                epoch += 1
        else:
            word_pointer += 1
            
    return batch,labels,epoch,word_pointer,desc_pointer


In [43]:
def model_def(train_set_id,batch_size,embedding_dim,skip_window,
              num_skips,num_sampled,learning_rate,vocabulary_size,total_epochs,testing_grams):
    """Train a skip-gram model with NCE loss and return its normalized embeddings.

    The graph consists of an embedding matrix of shape
    ``[vocabulary_size, embedding_dim]``, the NCE weights and biases, and a
    gradient descent optimizer that minimizes the mean NCE loss.  Batches come
    from ``generate_batch`` and training stops once the epoch counter returned
    by it reaches ``total_epochs``.  The mean batch loss of every epoch is
    collected, the trained embeddings are divided by their row norms, scored
    with ``model_evaluation`` on ``testing_grams`` and the results are written
    with ``save_log``.

    Returns:
        the normalized embedding matrix (one row per word id).
    """
    start_time = time.time()

    # start from an empty graph: without the reset every call (e.g. each
    # combination of the grid search) would add another copy of the variables
    # and ops to the same default graph, which keeps growing
    tf.reset_default_graph()

    # Input data
    X_train = tf.placeholder(tf.int32, shape=[None])
    # Input label
    Y_train = tf.placeholder(tf.int32, shape=[None, 1])

    #ensure that the following ops & var are assigned to CPU
    with tf.device('/cpu:0'):

        #create the embedding variable which contains the weights
        embedding = tf.Variable(tf.random_normal([vocabulary_size,embedding_dim]))

        #create the lookup table for each sample in X_train=>avoiding to use one_hot encoder
        X_embed   = tf.nn.embedding_lookup(embedding,X_train)

        #create variables for the loss function
        nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size,embedding_dim],stddev=1.0))
        nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

    loss_func = tf.reduce_mean(tf.nn.nce_loss(weights = nce_weights,biases =nce_biases,
                                              labels = Y_train,inputs = X_embed,
                                              num_sampled = num_sampled,
                                              num_classes = vocabulary_size ))

    optimizer = tf.train.GradientDescentOptimizer(learning_rate)

    train_opt = optimizer.minimize(loss_func)

    #Define initializer for tensorflow variables
    init = tf.global_variables_initializer()


    with tf.Session() as sess:

        #actually initialize the variables
        sess.run(init)

        epoch        = 0
        average_loss = 0
        desc_pointer = 0
        word_pointer = 0
        step_counter = 0
        av_losses_list = []

        while epoch <= total_epochs-1:

            step_counter += 1

            #take new batch of data; the pointers returned here are fed back
            #into the next call so that the walk over the training set continues
            batch_x,batch_y,epoch_temp,word_pointer,desc_pointer = generate_batch(batch_size,num_skips,
                                                                                  skip_window,train_set_id,
                                                                                  word_pointer,desc_pointer,
                                                                                  epoch)

            _,loss = sess.run([train_opt,loss_func],feed_dict={X_train:batch_x, Y_train:batch_y})

            average_loss += loss

            #an epoch has finished: store its mean batch loss and restart the counters
            if epoch_temp != epoch:
                epoch = epoch_temp
                if step_counter > 0:
                    average_loss /= step_counter
                    av_losses_list.append(average_loss)
                    average_loss = 0
                    step_counter = 0

        #normalize embeddings before using them (every row is divided by its L2 norm)
        norm = tf.sqrt(tf.reduce_sum(tf.square(embedding),1,keepdims = True))
        normalized_embedding = embedding/norm
        normalized_embedding_matrix = sess.run(normalized_embedding)

    total_time = time.time() - start_time

    # evaluate the model.
    mean_loss = model_evaluation(testing_grams,normalized_embedding_matrix)

    save_log(av_losses_list,mean_loss,total_time,embedding_dim,skip_window,num_sampled,
             learning_rate,total_epochs)

    return normalized_embedding_matrix

In [84]:
# In order to measure the quality of the word embeddings we compute the cosine similarity for pairs of words
# that are true neighbors. We define the loss as the absolute value of 1 - cosine similarity.
# The total loss is the mean of these values.
# We want to minimize that loss, which means that words that are truly neighbors will be close in the
# embedding space

def model_evaluation(random_grams,embedding_matrix):
    """Return the mean of ``|1 - cosine similarity|`` over all skip-gram pairs.

    Every item of ``random_grams`` is a sequence of word ids: the first id is
    the target word and the remaining ids are its context words.  Each id is
    used as a row index of ``embedding_matrix``.
    """
    pair_losses = []

    for gram in random_grams:
        target_vec  = embedding_matrix[gram[0]]
        target_norm = np.sqrt(np.dot(target_vec,target_vec))

        for context_id in gram[1:]:
            context_vec  = embedding_matrix[context_id]
            context_norm = np.sqrt(np.dot(context_vec,context_vec))
            cosine       = np.dot(target_vec,context_vec)/(target_norm*context_norm)
            pair_losses.append(abs(1-cosine))

    return sum(pair_losses)/len(pair_losses)

In [85]:
def save_log(av_losses_list,mean_loss,total_time,embedding_dim,skip_window,
             num_sampled,learning_rate,total_epochs):
    """Append the results of one training run to ``logs.txt``.

    The entry consists of a header line with the hyper-parameters, one line
    per epoch with its average loss, and a final line with the training time
    in seconds followed by the evaluation result.
    """
    # the context manager closes the file even if a write fails
    with open("logs.txt","a+") as log_file:

        log_file.write("EMBEDDING MODEL: embedding_dim = %s, skip_window = %s , total epochs = %s, learning rate = %s, negative samples = %s \n"
                       % (str(embedding_dim),str(skip_window),str(total_epochs),str(learning_rate),str(num_sampled)))

        log_file.write("average losses per epoch \n")

        for epoch_number,value in enumerate(av_losses_list,start=1):
            log_file.write("epoch %s , average lost %s \n"%(str(epoch_number),str(value)))

        # training time and evaluation result share one line
        log_file.write("training time in seconds %s "%(str(total_time)))
        log_file.write("evaluation result: %s \n"%(str(mean_loss)))
    

In [None]:
# grid search: train one model for every combination of the candidate values.
# model_def appends the results of each run to the log file through save_log;
# embedding_matrix is overwritten on every iteration, so only the matrix of
# the last combination is still in memory when the loops finish.
for embedding_dim in embedding_dim_list:
    for learning_rate in learning_rate_list:
        for skip_window in skip_window_list:
            for num_sampled in num_sampled_list:
                for total_epochs in num_epochs_list:
                    
                    print("RUN EMBEDDINGS FOR: embedding dimension:",embedding_dim,"learning rate:",
                          learning_rate,"skip window:",skip_window,"num_sampled:",num_sampled,
                          "epochs:",total_epochs)
                    
                    # every run is evaluated on the validation grams
                    embedding_matrix = model_def(train_set_id,batch_size,embedding_dim,skip_window,
                                                    num_skips,num_sampled,learning_rate,vocabulary_size,
                                                    total_epochs,valid_grams)
                

In [86]:
# train a single model with one fixed set of hyper-parameters
# (batch_size, num_skips and vocabulary_size come from the earlier cells)
embedding_dim = 100
skip_window   = 4
num_sampled   = 64
learning_rate = 0.1
total_epochs  = 30
embedding_matrix = model_def(train_set_id,batch_size,embedding_dim,skip_window,
                                                    num_skips,num_sampled,learning_rate,vocabulary_size,
                                                    total_epochs,valid_grams)

# store the normalized embeddings: one row per word id, values written with 8 decimals
np.savetxt('word_embeddings.txt', embedding_matrix, fmt='%.8f')

In [None]:

def save_word_embeddings(embedding_matrix):
    """Write the embeddings to ``word_embeddings.txt`` in the working directory.

    The file is overwritten.  Every row of ``embedding_matrix`` becomes one
    line in which each value is followed by a comma.
    """
    # the context manager closes the file even if a write fails
    with open("word_embeddings.txt","w") as out_file:
        for w_e in embedding_matrix:
            for j in w_e:
                out_file.write("%s,"%(str(j)))
            out_file.write("\n")