In [2]:
# Create a word2vec model using the skip-gram approach with negative sampling.
# The implementation uses the TensorFlow library.

In [3]:
import os
import json
import nltk
import string
import random
import collections
import numpy as np
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
from nltk.corpus import stopwords

Instructions for updating:
non-resource variables are not supported in the long term


In [4]:
# Load the English stop-word list. On a fresh machine the corpus is not
# installed and NLTK raises LookupError; download it once and retry so the
# notebook survives "Restart Kernel -> Run All" without manual steps.
try:
    all_stopwords = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    all_stopwords = set(stopwords.words('english'))

# ---- configuration: everything a reader might want to tune ----

# Directory holding the JSON issue dumps. Can be overridden with the
# WORD2VEC_DATA_DIR environment variable; the default is the original path.
dir_path      = os.environ.get('WORD2VEC_DATA_DIR', '/home/kostas/Documents/thesis/data')

descriptions  = []        # flat token stream of all descriptions
delimiter     = "!n_s!"   # marker token inserted before every description
unk_word      = "UNK"     # placeholder for words pruned from the vocabulary
embedding_dim = 100       # size of each word vector
min_occurance = 2         # words seen fewer times than this are treated as unknown
num_sampled   = 64        # number of negative samples for the NCE loss
learning_rate = 0.1
num_steps     = 5         # number of training batches
batch_size    = 100
num_skips     = 2         # (target, context) pairs generated per target word
skip_window   = 3         # context words considered on each side of the target
data_pointer  = 0         # global cursor into the id stream used by the batch generator

# Seed Python's RNG so the sampled context words are the same on every run.
random_seed   = 42
random.seed(random_seed)

In [5]:
def clean_data(description,all_stopwords):
    """Tokenize one description into lower-cased words without stop words.

    Parameters
    ----------
    description : str, iterable of str, or None
        Either the full text, or the text split into lines. ``None``
        (e.g. a JSON ``null``) is treated as an empty description.
    all_stopwords : container of str
        Lower-cased words to drop.

    Returns
    -------
    list of str
        The lower-cased tokens, in order, with punctuation replaced by
        white space and with stop words and one-character tokens removed.
    """
    if description is None:
        return []

    # A plain string must not go through ' '.join(): iterating a str yields
    # single characters, which would all be discarded by the length filter.
    if isinstance(description, str):
        sentence = description
    else:
        #join all lines into one sentence
        sentence = ' '.join(description)

    #define translator to translate punctuation to white space
    translator = str.maketrans(string.punctuation, ' '*len(string.punctuation))

    words_sw = []
    for word in sentence.translate(translator).split():
        lowered = word.lower()
        if len(word) > 1 and lowered not in all_stopwords:
            words_sw.append(lowered)

    return words_sw

In [6]:
# Read every JSON document found in dir_path, clean and tokenize each issue
# description, and append its tokens to `descriptions`. Every description is
# preceded by the delimiter token so that context windows can later be
# stopped at description boundaries.

# Start from an empty list so that re-running this cell does not append the
# whole corpus a second time.
descriptions = []

# sorted(): os.listdir returns entries in arbitrary order; a fixed order keeps
# the token stream (and therefore the training batches) reproducible.
for fname in sorted(os.listdir(dir_path)):
    with open(os.path.join(dir_path,fname)) as json_file:
        #load data in json format
        issues = json.load(json_file)

    for p in issues:
        issue_desc = p['description']
        clean_desc = clean_data(issue_desc,all_stopwords)
        descriptions.append(delimiter)
        # extend() appends in place; `descriptions + clean_desc` copied the
        # whole list for every issue, which is quadratic in corpus size.
        descriptions.extend(clean_desc)

In [7]:
#create vocabulary
#remove rare words from the vocabulary which occur fewer times than min_occurance

#descriptions     : list which contains all descriptions as a flat token stream
#count            : list of (word, frequency) pairs kept after pruning (may include the delimiter)
#vocabulary_size  : number of real words, i.e. the number of rows of the embedding matrix
#word2id          : dictionary which maps each word to its int id
#id2word          : dictionary which maps each id back to its word
#data             : `descriptions` expressed as ids (-1 = delimiter, -2 = unknown word)

# most_common() is sorted by descending frequency, so filtering keeps exactly
# the leading entries whose frequency reaches min_occurance.
count = [(word, freq)
         for word, freq in collections.Counter(descriptions).most_common()
         if freq >= min_occurance]

# Real vocabulary = kept words without the delimiter marker.
vocab_words     = [word for word, _ in count if word != delimiter]
vocabulary_size = len(vocab_words)

#assign an id to each word
# Ids must be consecutive in [0, vocabulary_size) because they index the
# embedding and NCE weight matrices. Numbering with the position inside
# `count` (which still contains the delimiter) left a gap and produced an id
# equal to vocabulary_size, i.e. one past the last valid row.
word2id = {unk_word: -2, delimiter: -1}
for idx, word in enumerate(vocab_words):
    word2id[word] = idx

#create the opposite dictionary for easy search based on id
id2word = {idx: word for word, idx in word2id.items()}

#express descriptions using ids
# A single pass replaces the repeated descriptions.pop(0), which was quadratic
# and emptied `descriptions`, so the cell could not be run twice.
data = [word2id.get(token, word2id[unk_word]) for token in descriptions]

# Every id fed to the model must be a valid row of the embedding matrix.
assert all(token_id < vocabulary_size for token_id in data)

In [8]:
def create_skip_grams(batch_size,num_skips,skip_window,data):
    """Build one training batch of (target, context) skip-gram pairs.

    The function walks through `data` starting at the global `data_pointer`
    and advances that pointer, wrapping around at the end of `data`, so
    consecutive calls continue where the previous one stopped.

    Parameters
    ----------
    batch_size : int
        Number of (target, context) pairs to produce.
    num_skips : int
        Maximum number of context words sampled for one target word.
    skip_window : int
        Number of context words considered on each side of the target.
    data : sequence of int
        Token ids; negative ids (-1 delimiter, -2 unknown) are never used
        as targets.

    Returns
    -------
    batch : np.ndarray, shape (batch_size,), dtype int32
        Target word ids.
    labels : np.ndarray, shape (batch_size, 1), dtype int32
        Context word id paired with the target at the same position.

    Raises
    ------
    ValueError
        If `data` is empty or a full pass over it yields no pair.
    """
    global data_pointer

    #some important constraints
    assert batch_size % num_skips == 0
    # a full window offers 2*skip_window context words to sample from
    assert num_skips <= 2*skip_window

    data_len = len(data)
    if data_len == 0:
        raise ValueError("data is empty, cannot create skip-grams")

    # the batch stores target words
    batch  = np.empty(shape = (batch_size,), dtype = np.int32)
    # labels are the context words=>(skip-grams)
    labels = np.empty(shape = (batch_size,1), dtype = np.int32)

    #skip-gram=>you can see skip_window items right and left from current position
    span   = 2*skip_window+1
    buffer = collections.deque(maxlen = span)

    filled         = 0   # number of pairs written so far
    fruitless_hops = 0   # consecutive positions that produced no pair

    while filled < batch_size:
        # Guard against looping forever when no position can yield a pair.
        if fruitless_hops >= data_len:
            raise ValueError("data contains no word with a usable context word")

        # Keep the cursor inside data (it may come in equal to len(data)).
        data_pointer %= data_len

        #avoid tags with id -1,-2
        if data[data_pointer] < 0:
            data_pointer = (data_pointer + 1) % data_len
            fruitless_hops += 1
            continue

        #fill the buffer: target word first, then its context words
        buffer.clear()
        find_context_words(buffer,data,skip_window,span)

        # Short descriptions can offer fewer context words than num_skips;
        # sampling more than available would raise ValueError, so take what
        # is there (and never more than the room left in the batch).
        context_positions = list(range(1,len(buffer)))
        take = min(num_skips, len(context_positions), batch_size - filled)
        if take == 0:
            fruitless_hops += 1
        else:
            fruitless_hops = 0

        #update batch and labels
        for position in random.sample(context_positions,take):
            batch[filled]    = buffer[0]
            labels[filled,0] = buffer[position]
            filled += 1

        # move on to the next target word, wrapping around at the end
        data_pointer = (data_pointer + 1) % data_len

    return batch,labels

In [95]:
def find_context_words(buffer,data,skip_window,span,target_index=None):
    """Append a target word and its context words to `buffer`.

    The target id is appended first, then up to `skip_window` ids found to
    its left (nearest first), then ids found to its right until `buffer`
    holds `span` entries or a boundary is reached. Unknown words (-2) are
    skipped; a delimiter (-1) or either end of `data` stops the scan on
    that side.

    Parameters
    ----------
    buffer : collections.deque (or any object with ``append``)
        Receives the ids; the caller is expected to pass it empty.
    data : sequence of int
        Token ids, with -1 for the delimiter and -2 for unknown words.
    skip_window : int
        Maximum number of context words taken from the left side.
    span : int
        Maximum total number of ids (target + context) to collect.
    target_index : int, optional
        Position of the target word in `data`. Defaults to the global
        `data_pointer`, which is what the batch generator relies on.
    """
    if target_index is None:
        # fall back to the notebook-wide cursor
        target_index = data_pointer

    counter   = 1
    l_pointer = 1
    r_pointer = 1
    data_len  = len(data)

    #the target word in the first place
    buffer.append(data[target_index])

    while counter<span:
        #look left from target word
        if counter <=skip_window:
            left_index = target_index - l_pointer
            # Stop at the start of data as well: a negative index would
            # silently wrap around and pick words from the end of the corpus.
            #if encounter -1 => new description meaning that cannot take another word
            if left_index < 0 or data[left_index] == -1:
                counter = skip_window + 1
            #if encounter -2 => unknown word so we look at the next one
            elif data[left_index] == -2:
                l_pointer += 1
            else:
                buffer.append(data[left_index])
                l_pointer += 1
                counter   += 1
        #look right from target word
        else:
            right_index = target_index + r_pointer
            # Stop at the end of data too: the last description has no
            # trailing delimiter, so the scan used to raise IndexError.
            #if encounter -1 => new description meaning that cannot take another word
            if right_index >= data_len or data[right_index] == -1:
                counter = span
            #if encounter -2 => unknown word so we look at the next one
            elif data[right_index] == -2:
                r_pointer += 1
            else:
                buffer.append(data[right_index])
                r_pointer += 1
                counter   += 1

In [97]:
# train the model

# Start from an empty graph so that re-running this cell does not keep adding
# duplicate placeholders and variables to the default graph.
tf.reset_default_graph()

#input data: ids of the target words
X_train = tf.placeholder(tf.int32, shape=[None])

#input label: id of one context word per target word
Y_train = tf.placeholder(tf.int32,shape=[None,1])

#ensure that the following ops & var are assigned to CPU
with tf.device('/cpu:0'):

    #create the embedding variable which contains the weights (one row per word)
    embedding = tf.Variable(tf.random_normal([vocabulary_size,embedding_dim]))

    #create the lookup table for each sample in X_train=>avoiding to use one_hot encoder
    X_embed   = tf.nn.embedding_lookup(embedding,X_train)

    #create variables for the loss function
    nce_weights = tf.Variable(tf.random_normal([vocabulary_size,embedding_dim]))
    nce_biases  = tf.Variable(tf.random_normal([vocabulary_size]))

# noise-contrastive estimation loss, averaged over the batch
loss_func = tf.reduce_mean(tf.nn.nce_loss(weights = nce_weights,biases =nce_biases, labels = Y_train,
                                          inputs = X_embed, num_sampled = num_sampled,
                                          num_classes = vocabulary_size ))

optimizer = tf.train.GradientDescentOptimizer(learning_rate)

train_opt = optimizer.minimize(loss_func)

#normalize embeddings before using them: scale every row to unit L2 norm
# (`keepdims` replaces the deprecated `keep_dims` argument)
norm = tf.sqrt(tf.reduce_sum(tf.square(embedding),1,keepdims = True))
normalized_embedding = embedding/norm

#Define initializer for tensorflow variables
init = tf.global_variables_initializer()

with tf.Session() as sess:

    #actually initialize the variables
    sess.run(init)

    average_loss = 0
    for step in range(1,num_steps+1):

        #take new batch of data
        batch_x,batch_y = create_skip_grams(batch_size,num_skips,skip_window,data)

        #train
        _,loss = sess.run([train_opt,loss_func], feed_dict={X_train:batch_x, Y_train:batch_y})
        average_loss += loss

    # The losses were summed but never averaged or shown; turn the sum into
    # the mean loss per step.
    if num_steps > 0:
        average_loss /= num_steps

    # evaluate the normalized embeddings while the session is still open
    normalized_embedding_matrix = sess.run(normalized_embedding)

print("average loss over", num_steps, "steps:", average_loss)

In [99]:
# Show the unit-length embedding row learned for the word "geo".
example = normalized_embedding_matrix[word2id["geo"], :]
print(example)

[-0.00429874  0.01452199 -0.23464274 -0.06807403  0.14553358 -0.03239543
  0.03707422  0.00805677 -0.01922133 -0.11350194  0.0800465  -0.05834521
 -0.20336981 -0.09596892 -0.08024165 -0.2054867   0.05005193  0.10228378
 -0.07640604 -0.07165489  0.07979203  0.06147172 -0.00795392 -0.01453118
  0.07355349  0.10884865  0.07255135 -0.03741981 -0.07674964 -0.23107524
 -0.07061283 -0.05199072 -0.16668801  0.11865239  0.0188407   0.08940855
  0.10068807  0.02990313 -0.12422778  0.01374637  0.070717    0.3156952
  0.02327913  0.03987381 -0.17283702  0.04432922  0.02615609 -0.00494789
  0.00555748 -0.05504418  0.12788895 -0.16435462 -0.03153923 -0.0540764
 -0.05479822  0.05237039  0.03919997 -0.13805555  0.00802774  0.06373612
 -0.05269733 -0.02619855 -0.01170155  0.07926175  0.07835087 -0.20803055
  0.06597845  0.04542603 -0.13976602 -0.24700351 -0.04573974  0.02913193
  0.0525511  -0.05415343 -0.03480895 -0.0160954   0.049901    0.08804394
  0.02673271 -0.0305111  -0.12584852 -0.07824893  0.1